# Table of Contents
* [Load data](#Load-data)
* [Hierarchical edge bundling](#Hierarchical-edge-bundling)
	* [computing similarity](#computing-similarity)
	* [clustering](#clustering)
	* [splitting](#splitting)
	* [rendering](#rendering)


In [1]:
import numpy as np
import pandas as pd
import scipy as st

import pickle
from collections import defaultdict
import json
import os
import random
from copy import deepcopy

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load data

In [2]:
# Location of the TQA dataset release. The historical absolute path is kept as
# the default so existing runs are unaffected; set TQA_DATASET_ROOT to point the
# notebook at a different checkout without editing this cell.
dataset_root_dir = os.environ.get(
    'TQA_DATASET_ROOT',
    '/Users/schwenk/wrk/stb/dataset_releases/data_release_beta7/',
)
file_name = 'tqa_dataset_beta7_5.json'
# Full path of the dataset JSON file.
data_file = os.path.join(dataset_root_dir, file_name)

In [3]:
# Read the dataset from the path assembled in the configuration cell and keep
# the parsed JSON untouched in `ck12_combined_dataset_raw`; all later cells
# work on the deep copy.
with open(data_file, 'r') as dataset_fh:
    ck12_combined_dataset_raw = json.load(dataset_fh)
ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only load a glossary pickle that comes from a trusted source.
with open('ck_12_vocab_words.pkl', 'rb') as glossary_fh:
    glossary_terms = pickle.load(glossary_fh)

# Hierarchical edge bundling

## computing similarity

In [4]:
# Shared lemmatizer used by the tokenizer functions below.
lemmatizer = WordNetLemmatizer()
# English stop words followed by every ASCII punctuation character; tokens
# found in this list are dropped by the tokenizers.
cached_sw = [*stopwords.words('english'), *string.punctuation]
# Adjunct topic names that are skipped when collecting lesson text.
vocab_topics = ['Lesson Vocabulary', 'Vocabulary']

In [5]:
def collect_filtered_lesson_text(complete_ds, include_adjunct=False, include_descriptions=False):
    """Concatenate the text of every lesson into one string per lesson.

    Args:
        complete_ds: iterable of lesson dicts. Each lesson is read through its
            'globalID', 'lessonName' and 'topics' keys, plus 'adjunctTopics'
            and 'instructionalDiagrams' when the matching flag is set.
        include_adjunct: also append the text of adjunct topics whose name is
            not listed in the module-level `vocab_topics`.
        include_descriptions: also append the 'processedText' of each
            instructional diagram.

    Returns:
        A pair `(text_by_lesson, name_by_lesson)`: a `defaultdict(str)` mapping
        lesson globalID to its newline-separated text, and a dict mapping
        lesson globalID to the lesson name. A lesson that contributes no text
        at all gets no entry in the first mapping.
    """
    text_by_lesson = defaultdict(str)
    name_by_lesson = {}
    for lesson in complete_ds:
        lesson_id = lesson['globalID']
        name_by_lesson[lesson_id] = lesson['lessonName']

        # Regular topics are appended in order of their own globalID.
        ordered_topics = sorted(lesson['topics'].values(), key=lambda topic: topic['globalID'])
        for topic in ordered_topics:
            text_by_lesson[lesson_id] += topic['content']['text'] + '\n'

        if include_adjunct:
            for adjunct_name, adjunct in lesson['adjunctTopics'].items():
                if adjunct_name in vocab_topics:
                    continue
                text_by_lesson[lesson_id] += adjunct['content']['text'] + '\n'

        if include_descriptions:
            for diagram in lesson['instructionalDiagrams'].values():
                text_by_lesson[lesson_id] += diagram['processedText'] + '\n'
    return text_by_lesson, name_by_lesson

def tokenize_and_stem(text, stopwords=cached_sw):
    """Tokenize `text` and return the lemmas of its content words.

    Tokens are stripped and lower-cased. A token is kept only when it is longer
    than three characters, purely alphabetic and not in `stopwords`; kept
    tokens are passed through the module-level WordNet lemmatizer (despite the
    function name, this is lemmatization rather than stemming).

    Args:
        text: the raw string to tokenize.
        stopwords: container of tokens to discard (defaults to `cached_sw`).

    Returns:
        List of lemmas in their original order.
    """
    lemmas = []
    for raw_token in wordpunct_tokenize(text):
        candidate = raw_token.strip().lower()
        if len(candidate) <= 3:
            continue
        if candidate in stopwords or not candidate.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(candidate))
    return lemmas

def tokenize_and_mark_sci_terms(text, stopwords=cached_sw, science_terms=None):
    """Tokenize `text`, replacing known science terms with a placeholder.

    Tokens are stripped and lower-cased. A token is kept only when it is longer
    than three characters, not in `stopwords` and purely alphabetic. Each kept
    token is lemmatized; lemmas found in `science_terms` are emitted as the
    literal token '__CONCEPT__', all others are emitted unchanged.

    Args:
        text: the raw string to tokenize.
        stopwords: container of tokens to discard (defaults to `cached_sw`).
        science_terms: container of lemmas to mask. `None` (the default) means
            "no science terms", so nothing is masked.

    Returns:
        List of lemmas / '__CONCEPT__' placeholders in their original order.
    """
    # The default used to reach `stem in None` and raise TypeError; treat a
    # missing term list as empty instead.
    if science_terms is None:
        science_terms = ()
    tokens = wordpunct_tokenize(text)
    normalized_tokens = []
    for toke in tokens:
        cleaned_token = toke.strip().lower()
        if len(cleaned_token) > 3 and cleaned_token not in stopwords and cleaned_token.isalpha():
            stem = lemmatizer.lemmatize(cleaned_token)
            if stem in science_terms:
                normalized_tokens.append('__CONCEPT__')
            else:
                normalized_tokens.append(stem)
    return normalized_tokens

def tokenize_lesson(text):
    """Return the lower-cased alphabetic tokens of `text` that are not stop words.

    Unlike the other tokenizers in this notebook, no minimum length is applied
    and tokens are not lemmatized; the stop list is always `cached_sw`.
    """
    lowered = (piece.strip().lower() for piece in wordpunct_tokenize(text))
    return [
        word
        for word in lowered
        if word and word not in cached_sw and word.isalpha()
    ]

In [6]:
# Gather the full text of every lesson (adjunct topics and diagram descriptions
# included), then lay the lessons out in globalID order. The three lists below
# are parallel: position k refers to the same lesson in each of them.
lesson_text, lesson_name_lookup = collect_filtered_lesson_text(
    ck12_combined_dataset, include_adjunct=True, include_descriptions=True
)
lessons_in_id_order = sorted(lesson_text.items(), key=lambda id_and_text: id_and_text[0])
lesson_ids = [lesson_id for lesson_id, _ in lessons_in_id_order]
lesson_corp = [text for _, text in lessons_in_id_order]
lesson_names = [lesson_name_lookup[lesson_id] for lesson_id in lesson_ids]

In [7]:
# TF-IDF over 1- to 3-grams of lemmatized tokens. Terms that occur in fewer
# than 5% or more than 80% of the lessons are dropped.
tfizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: tokenize_and_stem(x, cached_sw),
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.8,
)
tfidf = tfizer.fit_transform(lesson_corp)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; support both so the cell runs on old
# and current releases. Either way `feature_names` is a list of strings.
if hasattr(tfizer, 'get_feature_names_out'):
    feature_names = tfizer.get_feature_names_out().tolist()
else:
    feature_names = tfizer.get_feature_names()

In [8]:
# Lesson-by-lesson similarity: the product of the TF-IDF matrix with its own
# transpose. Rows and columns follow the order of `lesson_ids`.
pairwise_similarity = tfidf.dot(tfidf.T)
# Dense copy, used later for plain [row][col] indexing.
pairwise_similar = pairwise_similarity.toarray()

## clustering

In [9]:
num_clusters = 20
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, so it is no
# longer passed. A fixed `random_state` makes the clustering, and therefore the
# train/val/test split derived from it, reproducible across kernel restarts;
# change the value to explore a different clustering.
km = KMeans(n_clusters=num_clusters, max_iter=10000, n_init=10, random_state=0)
km.fit(tfidf)
# Cluster label of each lesson, in the same order as `lesson_ids`.
clusters = km.labels_.tolist()

In [10]:
# Group lesson ids by the cluster they were assigned to. `clusters` and
# `lesson_ids` are parallel lists, one entry per lesson.
cluster_members = defaultdict(list)
for member_id, cluster_label in zip(lesson_ids, clusters):
    cluster_members[cluster_label].append(member_id)

In [11]:
# For every cluster, feature indices ordered from the largest centroid weight
# to the smallest.
top_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_idx in range(num_clusters):
    print("Cluster {} words:".format(cluster_idx))
    # Show the six highest-weighted terms of this cluster.
    for feature_idx in top_centroids[cluster_idx, :6]:
        print(' {}'.format(feature_names[feature_idx]))
    print('\n')

Cluster 0 words:
 matter
 volume
 liquid
 change
 water
 solution


Cluster 1 words:
 plate
 earthquake
 volcano
 ocean
 wind
 temperature


Cluster 2 words:
 wave
 earthquake
 travel
 energy
 speed
 medium


Cluster 3 words:
 blood
 system
 body
 disease
 cell
 organ


Cluster 4 words:
 earth
 light
 planet
 moon
 star
 solar


Cluster 5 words:
 acid
 protein
 molecule
 base
 cell
 sugar


Cluster 6 words:
 nucleus
 nuclear
 radiation
 energy
 element
 atom


Cluster 7 words:
 force
 object
 motion
 gravity
 distance
 greater


Cluster 8 words:
 current
 electric
 magnetic
 field
 pole
 device


Cluster 9 words:
 water
 soil
 pollution
 waste
 ocean
 river


Cluster 10 words:
 science
 scientific
 scientist
 theory
 research
 experiment


Cluster 11 words:
 energy
 heat
 transfer
 solar
 fuel
 matter


Cluster 12 words:
 cell
 membrane
 organism
 nucleus
 bacteria
 reproduction


Cluster 13 words:
 specie
 animal
 food
 organism
 population
 human


Cluster 14 words:
 atom
 electron
 

## splitting

In [9]:
def dict_key_extract(key, var):
    """Yield every value stored under `key` anywhere inside a nested structure.

    `var` is walked depth-first. Any object exposing `.items()` is inspected;
    values that are dicts are searched recursively, and values that are lists
    have each element searched. A matching value is yielded before its own
    contents are searched. Objects without `.items()` yield nothing.
    """
    if not hasattr(var, 'items'):
        return
    for name, value in var.items():
        if name == key:
            yield value
        if isinstance(value, dict):
            yield from dict_key_extract(key, value)
        elif isinstance(value, list):
            for element in value:
                yield from dict_key_extract(key, element)

def compute_split_stats(test_train_assignments):
    """Count questions, topics and lessons per split and their fractions.

    Lessons are looked up by globalID in the module-level
    `ck12_combined_dataset`. For every lesson in each split, the first value
    found (via `dict_key_extract`) under 'nonDiagramQuestions',
    'diagramQuestions' and 'topics' is measured with `len` and added to that
    split's counter.

    Args:
        test_train_assignments: mapping with the keys 'train', 'test' and
            'val', each holding a collection of lesson globalIDs.

    Returns:
        Dict keyed by 'text_questions', 'diagram_questions', 'topics' and
        'n_lessons'. Every entry holds the integer counts under 'train',
        'test' and 'val', the searched key under 'id_to_find', and
        '<split>_fraction' strings formatted to three decimals. A statistic
        whose total is zero reports 'nan' for its fractions.

    Raises:
        KeyError: if an assigned lesson id is not present in the dataset.
        IndexError: if a lesson lacks one of the searched keys.
    """
    split_names = ['test', 'train', 'val']
    stat_counts = {
        'text_questions': {
            'train': 0,
            'test': 0,
            'val': 0,
            'id_to_find': 'nonDiagramQuestions'
        },
        'diagram_questions': {
            'train': 0,
            'test': 0,
            'val': 0,
            'id_to_find': 'diagramQuestions'
        },
        'topics': {
            'train': 0,
            'test': 0,
            'val': 0,
            'id_to_find': 'topics'
        },
    }

    # Index the dataset once instead of scanning it for every (lesson, stat)
    # pair. setdefault keeps the first lesson for a given id, matching the
    # previous "first match" behaviour.
    lessons_by_id = {}
    for lesson in ck12_combined_dataset:
        lessons_by_id.setdefault(lesson['globalID'], lesson)

    for split in split_names:
        for lesson_id in test_train_assignments[split]:
            lesson_content = lessons_by_id[lesson_id]
            for stats in stat_counts.values():
                matches = list(dict_key_extract(stats['id_to_find'], lesson_content))
                stats[split] += len(matches[0])

    stat_counts['n_lessons'] = {
        "test": len(test_train_assignments['test']),
        "train": len(test_train_assignments['train']),
        "val": len(test_train_assignments['val']),
        'id_to_find': 'n_lessons'
    }

    for stat in stat_counts.values():
        total = stat['train'] + stat['test'] + stat['val']
        for split in ('train', 'test', 'val'):
            # Guard against an empty statistic (e.g. no diagram questions at
            # all), which used to raise ZeroDivisionError.
            fraction = stat[split] / total if total else float('nan')
            stat[split + '_fraction'] = "{0:.3f}".format(fraction)
    return stat_counts

In [14]:
# Lessons whose split is fixed by hand, keyed by lesson name. The split
# assignment cell applies these instead of its random draw.
manual_assignments = {
    pinned_lesson: pinned_split
    for pinned_split, pinned_lessons in (
        ("train", (
            "covalent bonds",
            "the sun and the earthmoon system",
            "the senses",
            "the nervous system",
            "introduction to the solar system",
            "nuclear energy",
            "the digestive system",
            "seasons",
            "eclipses",
        )),
        ("val", (
            "introduction to plants",
            "inside earth",
            "volcanic eruptions",
        )),
    )
    for pinned_lesson in pinned_lessons
}

In [20]:
# Assign whole clusters to a split so that similar lessons never straddle
# train/val/test. A dedicated, seeded generator makes the split reproducible;
# change the seed to draw a different split.
split_rng = random.Random(0)

tt_assignments_id = defaultdict(list)
for c, members in cluster_members.items():
    member_names = [lesson_name_lookup[m] for m in members]
    assigned_split = 'train'
    # Drawn for every cluster, even manually assigned ones, so that adding or
    # removing a manual assignment does not shift the draws of other clusters.
    rand_n = split_rng.random()
    # Sorted so that the lesson deciding the split is chosen deterministically
    # (the previous set.pop() picked an arbitrary member).
    man_assigned = sorted(set(member_names).intersection(manual_assignments))
    if man_assigned:
        man_splits = sorted({manual_assignments[name] for name in man_assigned})
        assigned_split = manual_assignments[man_assigned[0]]
        if len(man_splits) > 1:
            # The cluster contains lessons pinned to different splits; the
            # whole cluster can only go to one of them, so report it.
            print('WARNING: cluster {} has conflicting manual assignments {}; using {!r}'.format(
                c, man_splits, assigned_split))
    elif rand_n > 0.8:
        assigned_split = 'val'
    elif rand_n > 0.55:
        assigned_split = 'test'
    tt_assignments_id[assigned_split] += members

tot_len = sum([len(v) for v in tt_assignments_id.values()])
print([(k, len(v)) for k, v in tt_assignments_id.items()])
print([(k, len(v) / tot_len) for k, v in sorted(tt_assignments_id.items())])

[('val', 310), ('train', 460), ('test', 306)]
[('test', 0.2843866171003718), ('train', 0.4275092936802974), ('val', 0.28810408921933084)]


In [21]:
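# Uncomment to persist the split computed above. The "rendering" section reloads
# './new_tt_assignments.json', so that file must exist before those cells run.
# with open('./new_tt_assignments.json', 'w') as f:
#     json.dump(tt_assignments_id, f)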

In [25]:
# Invert the split -> lesson ids mapping into lesson id -> split.
split_lookup = {
    lesson_id: split_name
    for split_name, split_lesson_ids in tt_assignments_id.items()
    for lesson_id in split_lesson_ids
}

In [26]:
# One "trial" is one train/val/test assignment; only the current one is scored.
computed_stats_non_diagram = [compute_split_stats(tt_assignments_id)]

split_trials_train_fracts = [{k: v['train_fraction'] for k, v in trial.items()} for trial in computed_stats_non_diagram]
split_trials_train_counts = [{k: v['train'] for k, v in trial.items()} for trial in computed_stats_non_diagram]
split_trials_test_fracts = [{k: v['test_fraction'] for k, v in trial.items()} for trial in computed_stats_non_diagram]
split_trials_test_counts = [{k: v['test'] for k, v in trial.items()} for trial in computed_stats_non_diagram]
split_trials_val_fracts = [{k: v['val_fraction'] for k, v in trial.items()} for trial in computed_stats_non_diagram]
split_trials_val_counts = [{k: v['val'] for k, v in trial.items()} for trial in computed_stats_non_diagram]

# Row order shared by the fraction and count tables.
split_order = ['train', 'val', 'test']

# DataFrame.append was removed in pandas 2.0; build each table in one step from
# the row dicts instead (same rows, same order, same index labels).
split_trial_df = pd.DataFrame(
    split_trials_train_fracts + split_trials_val_fracts + split_trials_test_fracts,
    index=split_order,
)
split_counts_df = pd.DataFrame(
    split_trials_train_counts + split_trials_val_counts + split_trials_test_counts,
    index=split_order,
)

# Fractions on the left, raw counts on the right (the column names repeat).
split_stats_df = pd.concat([split_trial_df, split_counts_df], axis=1, join='inner')
# The fractions are formatted strings; convert every column to numbers.
split_stats_df = split_stats_df.apply(pd.to_numeric)

split_stats_df

Unnamed: 0,diagram_questions,n_lessons,text_questions,topics,diagram_questions.1,n_lessons.1,text_questions.1,topics.1
train,0.715,0.704,0.689,0.693,8985,758,13036,3473
val,0.157,0.161,0.173,0.171,1977,173,3279,859
test,0.128,0.135,0.138,0.136,1605,145,2614,683


## rendering

In [13]:
# Replace the in-memory split with the one previously saved to disk, so the
# rendering below reflects the persisted assignment.
tt_assignments_file = './new_tt_assignments.json'
with open(tt_assignments_file, 'r') as assignments_fh:
    tt_assignments_id = json.load(assignments_fh)

In [14]:
# Rebuild the lesson id -> split lookup from the assignments that were just
# loaded from disk.
split_lookup = dict(
    (lesson_id, split_name)
    for split_name, split_lesson_ids in tt_assignments_id.items()
    for lesson_id in split_lesson_ids
)

In [174]:
# Row/column of every lesson in the similarity matrix. `pairwise_similar` is
# ordered like `lesson_ids` (sorted globalIDs), which is not necessarily the
# order of `ck12_combined_dataset` and certainly not once 'skip' lessons are
# filtered out, so the position in `all_lessons` must not be used as an index.
similarity_index = {lesson_id: position for position, lesson_id in enumerate(lesson_ids)}

# Every lesson that has a split other than 'skip'. (The old trailing
# `... or True` filter was always true and has been dropped.)
all_lessons = [
    {
        'lname': lesson['lessonName'],
        'tta': split_lookup[lesson['globalID']],
        # None when the lesson had no text and therefore no similarity row.
        'simIndex': similarity_index.get(lesson['globalID']),
    }
    for lesson in ck12_combined_dataset
    if split_lookup[lesson['globalID']] != 'skip'
]
for lesson in all_lessons:
    # "<split>.<lesson name>": the split acts as the parent group of the node.
    lesson['importName'] = lesson['tta'] + '.' + lesson['lname']

connectivity_threshold = 0.80
lesson_connections = []
lessons_to_show = all_lessons

affinity_to_use = pairwise_similar

# Names as written to the JSON file. 'vs.' is rewritten for node names AND for
# the imports that reference them, so the two always match.
# NOTE(review): presumably needed because '.' separates hierarchy levels in the
# viewer - confirm against index_ia.html.
export_names = [entry['importName'].replace('vs.', 'vs') for entry in lessons_to_show]

for i, this_lesson in enumerate(lessons_to_show):
    row = this_lesson['simIndex']
    if row is None:
        continue
    connected_lessons = []
    for j, other_lesson in enumerate(lessons_to_show):
        col = other_lesson['simIndex']
        if i == j or col is None:
            continue
        if affinity_to_use[row][col] > connectivity_threshold and other_lesson['importName'] != this_lesson['importName']:
            connected_lessons.append(export_names[j])
    this_entry = {
        'name': export_names[i],
        'imports': connected_lessons,
        'size': '300',
    }
    # Lessons without any sufficiently similar neighbour are left out.
    if this_entry['imports']:
        lesson_connections.append(this_entry)

with open('lesson_connections.json', 'w') as f:
    json.dump(lesson_connections, f)

In [175]:
%%HTML
<iframe width="100%" height="500" src="index_ia.html?inline=false"></iframe>