# Table of Contents
* [Load data](#Load-data)
* [Hierarchical edge bundling](#Hierarchical-edge-bundling)
	* [computing similarity](#computing-similarity)
	* [clustering](#clustering)
	* [splitting](#splitting)
	* [rendering](#rendering)


In [8]:
import numpy as np
import pandas as pd

import pickle
from collections import defaultdict
import json
import os
import random
from copy import deepcopy

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load data

In [9]:
# Directory of the dataset release and the name of the JSON file inside it.
dataset_root_dir = '/Users/schwenk/wrk/stb/dataset_releases/data_release_beta7/'
file_name = 'tqa_dataset_beta7_5.json'

# Full path to the dataset file.
data_file = os.path.join(dataset_root_dir, file_name)

In [10]:
# Parse the dataset JSON. The raw parse is kept untouched and all later work
# happens on a deep copy. The encoding is given explicitly because JSON text
# is UTF-8 and the platform default encoding is not guaranteed to be.
with open(data_file, 'r', encoding='utf-8') as f:
    ck12_combined_dataset_raw = json.load(f)
ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

# NOTE(review): unpickling can execute arbitrary code stored in the file --
# only load 'ck_12_vocab_words.pkl' from a trusted source.
with open('ck_12_vocab_words.pkl', 'rb') as f:
    glossary_terms = pickle.load(f)

# Hierarchical edge bundling

## computing similarity

In [11]:
# Adjunct topic names that are left out when lesson text is collected.
vocab_topics = ['Lesson Vocabulary', 'Vocabulary']

# Tokens dropped by the tokenizers: English stop words plus every single
# punctuation character.
cached_sw = [*stopwords.words('english'), *string.punctuation]

# Shared lemmatizer instance used by the tokenizers.
lemmatizer = WordNetLemmatizer()

In [12]:
def collect_filtered_lesson_text(complete_ds, include_adjunct=False, include_descriptions=False):
    """Concatenate the text of every lesson into one newline-separated string.

    Args:
        complete_ds: iterable of lesson dicts. Each lesson is read for
            'globalID', 'lessonName' and 'topics', and additionally for
            'adjunctTopics' / 'instructionalDiagrams' when the matching flag
            is set.
        include_adjunct: also append the text of adjunct topics whose name is
            not listed in the module-level ``vocab_topics``.
        include_descriptions: also append the 'processedText' of every
            instructional diagram.

    Returns:
        A ``(text_by_lesson, name_by_lesson)`` tuple, both keyed by the lesson
        'globalID'. ``text_by_lesson`` is a ``defaultdict(str)`` and only
        contains lessons that contributed at least one piece of text.
    """
    filtered_lesson_text = defaultdict(str)
    lesson_names = {}
    for lesson in complete_ds:
        lesson_key = lesson['globalID']
        lesson_names[lesson_key] = lesson['lessonName']

        # Regular topics come first, ordered by their own globalID.
        ordered_topics = sorted(lesson['topics'].values(), key=lambda t: t['globalID'])
        pieces = [topic['content']['text'] for topic in ordered_topics]

        if include_adjunct:
            pieces.extend(
                topic['content']['text']
                for name, topic in lesson['adjunctTopics'].items()
                if name not in vocab_topics
            )
        if include_descriptions:
            pieces.extend(
                diagram['processedText']
                for diagram in lesson['instructionalDiagrams'].values()
            )

        # Only touch the defaultdict when there is something to add, so that
        # lessons without any text do not get an (empty) entry.
        if pieces:
            filtered_lesson_text[lesson_key] += ''.join(piece + '\n' for piece in pieces)
    return filtered_lesson_text, lesson_names

def tokenize_and_stem(text, stopwords=cached_sw):
    """Tokenize *text* and return the lemmas of its content words.

    A token is kept when, after stripping and lower-casing, it is longer than
    three characters, purely alphabetic and not contained in *stopwords*.
    Despite the function name, kept tokens are lemmatized (via the
    module-level ``lemmatizer``), not stemmed.

    Args:
        text: the string to tokenize.
        stopwords: container of tokens to discard; defaults to ``cached_sw``.

    Returns:
        List of lemmatized tokens in their original order.
    """
    candidates = (raw.strip().lower() for raw in wordpunct_tokenize(text))
    return [
        lemmatizer.lemmatize(word)
        for word in candidates
        if len(word) > 3 and word not in stopwords and word.isalpha()
    ]

def tokenize_and_mark_sci_terms(text, stopwords=cached_sw, science_terms=None):
    """Tokenize *text*, replacing science terms with a ``'__CONCEPT__'`` marker.

    Tokens are filtered exactly as in ``tokenize_and_stem``: after stripping
    and lower-casing, a token is kept when it is longer than three characters,
    purely alphabetic and not in *stopwords*. Each kept token is lemmatized;
    if the lemma is in *science_terms* the marker is emitted instead of the
    lemma.

    Args:
        text: the string to tokenize.
        stopwords: container of tokens to discard; defaults to ``cached_sw``.
        science_terms: container of lemmas to mask. ``None`` (the default)
            means "no science terms", so nothing is masked.

    Returns:
        List of lemmas / ``'__CONCEPT__'`` markers in their original order.
    """
    # The default used to reach the membership test as None and raise
    # TypeError; treat it as an empty collection instead.
    if science_terms is None:
        science_terms = ()

    normalized_tokens = []
    for toke in wordpunct_tokenize(text):
        cleaned_token = toke.strip().lower()
        if len(cleaned_token) > 3 and cleaned_token not in stopwords and cleaned_token.isalpha():
            stem = lemmatizer.lemmatize(cleaned_token)
            normalized_tokens.append('__CONCEPT__' if stem in science_terms else stem)
    return normalized_tokens

def tokenize_lesson(text):
    """Tokenize *text* into lower-cased alphabetic words, without lemmatizing.

    A token is kept when, after stripping and lower-casing, it is non-empty,
    purely alphabetic and not in the module-level ``cached_sw``. Unlike the
    other tokenizers there is no minimum token length.

    Returns:
        List of cleaned tokens in their original order.
    """
    lowered = (raw.strip().lower() for raw in wordpunct_tokenize(text))
    return [
        word
        for word in lowered
        if word and word not in cached_sw and word.isalpha()
    ]

In [13]:
# Collect the text of every lesson, including adjunct topics and diagram
# descriptions.
lesson_text, lesson_name_lookup = collect_filtered_lesson_text(
    ck12_combined_dataset, include_adjunct=True, include_descriptions=True
)

# Fix one ordering (ascending lesson id) and derive the parallel lists from it:
# lesson_ids[k], lesson_corp[k] and lesson_names[k] all describe the same lesson.
lesson_ids = sorted(lesson_text)
lesson_corp = [lesson_text[lid] for lid in lesson_ids]
lesson_names = [lesson_name_lookup[lid] for lid in lesson_ids]

In [14]:
# TF-IDF over 1- to 3-grams of the lemmatized lesson tokens. With float values,
# min_df / max_df are document-frequency proportions (see the scikit-learn
# TfidfVectorizer documentation).
tfizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: tokenize_and_stem(x, cached_sw),
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.8,
)
tfidf = tfizer.fit_transform(lesson_corp)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the notebook runs on old and new
# versions. Either way feature_names ends up as a plain list of strings.
if hasattr(tfizer, 'get_feature_names_out'):
    feature_names = tfizer.get_feature_names_out().tolist()
else:
    feature_names = tfizer.get_feature_names()

In [15]:
# Lesson-by-lesson similarity: the TF-IDF matrix multiplied by its transpose.
# Row/column k corresponds to lesson_ids[k].
# NOTE(review): entries are cosine similarities only because TfidfVectorizer
# L2-normalizes rows by default -- confirm if the `norm` argument is changed.
pairwise_similarity = tfidf.dot(tfidf.T)

# Dense copy for plain [row][col] indexing.
pairwise_similar = pairwise_similarity.toarray()

## clustering

In [43]:
# Cluster the lessons on their TF-IDF vectors.
num_clusters = 20

# The former `n_jobs=7` argument is dropped: it was deprecated in scikit-learn
# 0.23 and removed in 1.0, where passing it raises TypeError.
# NOTE(review): no random_state is set, so cluster labels (and the train/test
# split derived from them) differ from run to run.
km = KMeans(n_clusters=num_clusters, max_iter=10000, n_init=10)
km.fit(tfidf)

# clusters[k] is the cluster label of lesson_corp[k].
clusters = km.labels_.tolist()

In [44]:
# Group lesson ids by the cluster label they were assigned.
cluster_members = defaultdict(list)
for lesson_id, cluster_label in zip(lesson_ids, clusters):
    cluster_members[cluster_label].append(lesson_id)

In [45]:
# For every centroid, feature indices ordered from largest to smallest weight.
top_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Print the six highest-weighted terms of each cluster.
for i in range(num_clusters):
    print("Cluster {} words:".format(i))
    for ind in top_centroids[i, :6]:
        print(' {}'.format(feature_names[ind]))
    print('\n')

Cluster 0 words:
 earthquake
 plate
 volcano
 crust
 continent
 ocean


Cluster 1 words:
 science
 scientific
 scientist
 theory
 research
 experiment


Cluster 2 words:
 plant
 food
 soil
 bacteria
 organism
 photosynthesis


Cluster 3 words:
 blood
 system
 body
 disease
 cell
 organ


Cluster 4 words:
 star
 light
 system
 away
 billion
 solar system


Cluster 5 words:
 pressure
 temperature
 change
 fossil
 wind
 matter


Cluster 6 words:
 wave
 sound
 travel
 speed
 energy
 medium


Cluster 7 words:
 atom
 electron
 element
 compound
 carbon
 bond


Cluster 8 words:
 light
 visible
 object
 ray
 color
 wave


Cluster 9 words:
 magnetic
 field
 pole
 north
 earth
 south


Cluster 10 words:
 electric
 current
 device
 flow
 field
 power


Cluster 11 words:
 rock
 mineral
 sediment
 layer
 form
 surface


Cluster 12 words:
 cell
 protein
 membrane
 organism
 molecule
 nucleus


Cluster 13 words:
 earth
 planet
 moon
 orbit
 solar system
 solar


Cluster 14 words:
 water
 ocean
 pollu

## splitting

In [46]:
# Lesson name -> split. A cluster containing one of these lessons is assigned
# to the listed split instead of being assigned at random.
manual_assignments = {
    "insects and other arthropods": "train",
    "covalent bonds": "train",
    "the sun and the earthmoon system": "train",
    "the senses": "train",
    "the respiratory system": "test",
    "evolution and classification of plants": "test",
    "the nervous system": "train",
    "parts leaf": "train",
    "volcanic eruptions": "train",
    "introduction to the solar system": "train",
    "nuclear energy": "train",

    "seasons": "test",
    "introduction to plants": "test",
    "cell structures": "train",
    "inside the atom": "test",
    "inside earth": "test",
    "vision and the eye": "test",
    "the digestive system": "train",
    "eclipses": "test",
    "flow of energy": "test",
}

In [47]:
# Assign whole clusters to a split so that similar lessons never straddle
# train and test. A cluster that contains a manually assigned lesson takes
# that lesson's split; otherwise it goes to 'test' with probability ~0.05 and
# to 'train' the rest of the time.
tt_assignments_id = defaultdict(list)
for c, members in cluster_members.items():
    member_names = {lesson_name_lookup[m] for m in members}
    # Drawn for every cluster, used or not, so the sequence of random draws
    # does not depend on which clusters have manual assignments.
    rand_n = random.random()

    # Sorted so the choice is reproducible: set.pop() returns an arbitrary
    # element, which made the result vary between runs whenever a cluster
    # held several manually assigned lessons.
    man_assigned = sorted(member_names.intersection(manual_assignments))
    if man_assigned:
        assigned_split = manual_assignments[man_assigned[0]]
        requested_splits = {manual_assignments[name] for name in man_assigned}
        if len(requested_splits) > 1:
            print('WARNING: cluster {} contains manual assignments to {}; using {!r} (from {!r})'.format(
                c, sorted(requested_splits), assigned_split, man_assigned[0]))
    elif rand_n > 0.95:
        assigned_split = 'test'
    else:
        assigned_split = 'train'
    tt_assignments_id[assigned_split] += members

# Report the split sizes and, when there is at least one lesson, proportions.
n_test = len(tt_assignments_id['test'])
n_train = len(tt_assignments_id['train'])
tot_len = n_test + n_train
print(n_test, n_train)
if tot_len:
    print(n_test / tot_len, n_train / tot_len)

202 874
0.18773234200743494 0.8122676579925651


In [48]:
# Persist the split -> lesson ids mapping as JSON.
with open('./new_tt_assignments.json', 'w') as out_file:
    json.dump(tt_assignments_id, out_file)

In [49]:
# Invert the split -> lesson ids mapping into lesson id -> split.
split_lookup = {
    lesson_id: split
    for split, assigned_ids in tt_assignments_id.items()
    for lesson_id in assigned_ids
}

## rendering

In [50]:
# Rows/columns of the similarity matrix follow lesson_ids (ascending lesson
# id), which need not match the order of lessons in the dataset. Map each
# lesson id to its matrix position so similarities are looked up for the
# right lesson instead of by position in the dataset.
matrix_row = {lid: idx for idx, lid in enumerate(lesson_ids)}

# Lessons to draw, with lesson_rows[k] holding the matrix position of
# all_lessons[k]. Lessons without a split (e.g. no text was collected for
# them), marked 'skip', or absent from the matrix are left out.
all_lessons = []
lesson_rows = []
for lesson in ck12_combined_dataset:
    lesson_id = lesson['globalID']
    split = split_lookup.get(lesson_id)
    if split is None or split == 'skip' or lesson_id not in matrix_row:
        continue
    all_lessons.append({
        'lname': lesson['lessonName'],
        'tta': split,
        # "<split>.<lesson name>" is the node name used in the output.
        'importName': split + '.' + lesson['lessonName'],
    })
    lesson_rows.append(matrix_row[lesson_id])

# Two lessons are connected when their similarity exceeds this threshold.
connectivity_threshold = 0.8
lesson_connections = []
lessons_to_show = all_lessons

affinity_to_use = pairwise_similar

for this_lesson, row in zip(lessons_to_show, lesson_rows):
    similarities = affinity_to_use[row]
    connected_lessons = [
        other['importName']
        for other, col in zip(lessons_to_show, lesson_rows)
        if col != row
        and similarities[col] > connectivity_threshold
        and other['importName'] != this_lesson['importName']
    ]
    # Lessons without any connection are not written out.
    if connected_lessons:
        lesson_connections.append({
            'name': this_lesson['importName'],
            'imports': connected_lessons,
            'size': '300',
        })

# NOTE(review): presumably read by the edge-bundling page embedded in the
# next cell -- confirm.
with open('lesson_connections.json', 'w') as f:
    json.dump(lesson_connections, f)

In [51]:
%%HTML
<!-- Embed the local page index_ia.html in the notebook output. -->
<iframe width="100%" height="500" src="index_ia.html?inline=false"></iframe>