# Table of Contents
* [Load data](#Load-data)
* [Hierarchical edge bundling](#Hierarchical-edge-bundling)
	* [computing similarity](#computing-similarity)
	* [clustering](#clustering)
	* [splitting](#splitting)
	* [rendering](#rendering)


In [41]:
import numpy as np
import pandas as pd
import scipy as st

import pickle
from collections import defaultdict
import json
import os
import random
from copy import deepcopy

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load data

In [2]:
# Local directory holding the dataset release, and the JSON file inside it.
dataset_root_dir = '/Users/schwenk/wrk/stb/dataset_releases/data_release_beta7/'
file_name = 'tqa_dataset_beta7_5.json'

# Full path of the dataset file.
data_file = os.path.join(dataset_root_dir, file_name)

In [3]:
# Load the lesson dataset from the path built in the previous cell. JSON is
# UTF-8 by specification, so the encoding is pinned instead of being left to
# the platform default.
with open(data_file, 'r', encoding='utf-8') as f:
    ck12_combined_dataset_raw = json.load(f)
# Work on a deep copy so the freshly loaded structure stays untouched.
ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

# NOTE(security): unpickling can execute arbitrary code. Only load this file
# when it comes from a trusted source.
with open('ck_12_vocab_words.pkl', 'rb') as f:
    glossary_terms = pickle.load(f)

# Hierarchical edge bundling

## computing similarity

In [4]:
# Adjunct topic names that are skipped when collecting lesson text.
vocab_topics = ['Lesson Vocabulary', 'Vocabulary']

# Tokens to discard: NLTK's English stopwords followed by every ASCII
# punctuation character.
cached_sw = [*stopwords.words('english'), *string.punctuation]

# Shared lemmatizer instance used by the tokenizer functions.
lemmatizer = WordNetLemmatizer()

In [5]:
def collect_filtered_lesson_text(complete_ds, include_adjunct=False, include_descriptions=False):
    """Concatenate the text of each lesson into a single string per lesson.

    Args:
        complete_ds: iterable of lesson dicts. Each one is read for
            'globalID', 'lessonName' and 'topics'; 'adjunctTopics' and
            'instructionalDiagrams' are read only when the matching flag is on.
        include_adjunct: also append the text of adjunct topics whose name is
            not listed in ``vocab_topics``.
        include_descriptions: also append the 'processedText' of every
            instructional diagram.

    Returns:
        A pair ``(filtered_lesson_text, lesson_names)``: a ``defaultdict(str)``
        mapping lesson globalID to its text (each piece followed by a
        newline), and a dict mapping lesson globalID to lesson name. A lesson
        that contributes no text gets a name entry but no text entry.
    """
    filtered_lesson_text = defaultdict(str)
    lesson_names = {}

    for lesson in complete_ds:
        # The lesson's globalID alone is used as the key.
        lesson_key = lesson['globalID']
        lesson_names[lesson_key] = lesson['lessonName']

        # Regular topics come first, ordered by their own globalID.
        ordered_topics = sorted(lesson['topics'].values(), key=lambda topic: topic['globalID'])
        pieces = [topic['content']['text'] for topic in ordered_topics]

        if include_adjunct:
            pieces.extend(
                topic['content']['text']
                for topic_name, topic in lesson['adjunctTopics'].items()
                if topic_name not in vocab_topics
            )

        if include_descriptions:
            pieces.extend(
                diagram['processedText']
                for diagram in lesson['instructionalDiagrams'].values()
            )

        # Only touch the defaultdict when there is something to add, so that
        # text-less lessons do not get an empty entry.
        if pieces:
            filtered_lesson_text[lesson_key] += ''.join(piece + '\n' for piece in pieces)

    return filtered_lesson_text, lesson_names

def tokenize_and_stem(text, stopwords=cached_sw):
    """Tokenize ``text`` and return the lemmatized tokens that pass the filter.

    Each token from ``wordpunct_tokenize`` is stripped and lower-cased. It is
    kept only if it is longer than three characters, is not in ``stopwords``
    and is purely alphabetic. Kept tokens are passed through the shared
    ``lemmatizer``.

    Args:
        text: string to tokenize.
        stopwords: container of tokens to drop (defaults to ``cached_sw``).

    Returns:
        List of lemmatized tokens in their original order.
    """
    normalized_tokens = []
    for raw_token in wordpunct_tokenize(text):
        candidate = raw_token.strip().lower()
        # Reject short tokens, stopwords and anything non-alphabetic.
        if len(candidate) <= 3 or candidate in stopwords or not candidate.isalpha():
            continue
        normalized_tokens.append(lemmatizer.lemmatize(candidate))
    return normalized_tokens

def tokenize_and_mark_sci_terms(text, stopwords=cached_sw, science_terms=None):
    """Tokenize ``text`` and replace known science terms with a placeholder.

    Tokens are filtered and lemmatized as follows: each token from
    ``wordpunct_tokenize`` is stripped and lower-cased, then kept only if it
    is longer than three characters, not in ``stopwords`` and purely
    alphabetic. A kept token whose lemma is in ``science_terms`` is emitted as
    the literal ``'__CONCEPT__'``; every other kept token is emitted as its
    lemma.

    Args:
        text: string to tokenize.
        stopwords: container of tokens to drop (defaults to ``cached_sw``).
        science_terms: container of lemmas to mask. ``None`` (the default)
            means "mask nothing"; previously the default raised ``TypeError``
            on the first token that survived filtering.

    Returns:
        List of lemmas / placeholders in their original order.
    """
    if science_terms is None:
        # An empty container makes the membership test below always False.
        science_terms = ()

    normalized_tokens = []
    for toke in wordpunct_tokenize(text):
        cleaned_token = toke.strip().lower()
        if len(cleaned_token) > 3 and cleaned_token not in stopwords and cleaned_token.isalpha():
            stem = lemmatizer.lemmatize(cleaned_token)
            normalized_tokens.append('__CONCEPT__' if stem in science_terms else stem)
    return normalized_tokens

def tokenize_lesson(text):
    """Tokenize ``text`` into lower-cased alphabetic tokens, without lemmatizing.

    Each token from ``wordpunct_tokenize`` is stripped and lower-cased. It is
    kept when it is non-empty, not in ``cached_sw`` and purely alphabetic.

    Args:
        text: string to tokenize.

    Returns:
        List of kept tokens in their original order.
    """
    lowered = (raw.strip().lower() for raw in wordpunct_tokenize(text))
    return [
        token
        for token in lowered
        if token and token not in cached_sw and token.isalpha()
    ]

In [6]:
# Collect lesson text, including adjunct topics and diagram descriptions.
lesson_text, lesson_name_lookup = collect_filtered_lesson_text(ck12_combined_dataset, True, True)

# Fix one ordering (sorted lesson id) and derive the three parallel lists
# from it: ids, texts and names.
lesson_ids = sorted(lesson_text)
lesson_corp = [lesson_text[lid] for lid in lesson_ids]
lesson_names = [lesson_name_lookup[lid] for lid in lesson_ids]

In [7]:
# TF-IDF over word 1- to 3-grams produced by the custom tokenizer. Terms that
# appear in fewer than 5% or more than 80% of the lessons are dropped.
tfizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: tokenize_and_stem(x, cached_sw),
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.8,
)
tfidf = tfizer.fit_transform(lesson_corp)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`. Use whichever the installed version provides, and
# keep a plain list either way.
if hasattr(tfizer, 'get_feature_names_out'):
    feature_names = list(tfizer.get_feature_names_out())
else:
    feature_names = tfizer.get_feature_names()

In [8]:
# Dot product between every pair of lesson TF-IDF rows, kept sparse.
# (Equals cosine similarity if the rows are L2-normalised, which is
# TfidfVectorizer's default -- confirm if the vectorizer settings change.)
pairwise_similarity = tfidf.dot(tfidf.T)

# Dense copy for plain [i][j] indexing later in the notebook.
pairwise_similar = pairwise_similarity.toarray()

## clustering

In [9]:
num_clusters = 20

# `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where it raises TypeError. A fixed `random_state` makes the
# clustering, and therefore the split derived from it, reproducible.
km = KMeans(n_clusters=num_clusters, max_iter=10000, n_init=10, random_state=0)
km.fit(tfidf)

# One cluster label per lesson, in the same order as the rows of `tfidf`.
clusters = km.labels_.tolist()

In [10]:
# Group lesson ids by the cluster label assigned to them. `clusters` and
# `lesson_ids` are parallel lists.
cluster_members = defaultdict(list)
for lesson_id, cluster in zip(lesson_ids, clusters):
    cluster_members[cluster].append(lesson_id)

In [11]:
# For each cluster, feature indices ordered from largest to smallest centroid
# weight.
top_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Show the six heaviest features of every cluster.
for cluster_idx in range(num_clusters):
    print("Cluster {} words:".format(cluster_idx))
    for feature_idx in top_centroids[cluster_idx, :6]:
        print(' {}'.format(feature_names[feature_idx]))
    print('\n')

Cluster 0 words:
 reaction
 chemical
 chemical reaction
 equation
 product
 energy


Cluster 1 words:
 specie
 animal
 organism
 population
 food
 mammal


Cluster 2 words:
 force
 object
 gravity
 motion
 greater
 work


Cluster 3 words:
 electron
 atom
 element
 metal
 compound
 nucleus


Cluster 4 words:
 mineral
 rock
 chemical
 water
 element
 surface


Cluster 5 words:
 current
 electric
 magnetic
 field
 pole
 device


Cluster 6 words:
 cell
 membrane
 organism
 body
 bacteria
 nucleus


Cluster 7 words:
 earth
 planet
 moon
 orbit
 solar system
 solar


Cluster 8 words:
 energy
 fuel
 heat
 nuclear
 fossil fuel
 power


Cluster 9 words:
 wave
 sound
 travel
 speed
 energy
 medium


Cluster 10 words:
 blood
 muscle
 system
 cell
 body
 heart


Cluster 11 words:
 rock
 earthquake
 plate
 volcano
 crust
 sediment


Cluster 12 words:
 acid
 protein
 base
 molecule
 compound
 cell


Cluster 13 words:
 light
 matter
 pressure
 volume
 change
 object


Cluster 14 words:
 system
 disea

## splitting

In [240]:
# Lesson name -> split that is forced by hand. When a cluster contains one of
# these lessons, the whole cluster is placed in the named split.
manual_assignments = {
    "covalent bonds": "train",
    "the sun and the earthmoon system": "train",
    "the senses": "train",
    "evolution and classification of plants": "test",
    "the nervous system": "train",
    "introduction to the solar system": "train",
    "nuclear energy": "train",
    "the digestive system": "train",
    "seasons": "train",
    "eclipses": "train",

    "introduction to plants": "test",
    "inside earth": "test",
    "volcanic eruptions": "test",

    # Currently disabled entries:
    #   "flow of energy": "test",
    #   "parts leaf": "val",
    #   "cell structures": "val",
}

In [285]:
# Assign every cluster, as a whole, to one split so that similar lessons never
# straddle train/val/test. Result: split name -> list of lesson ids.
tt_assignments_id = defaultdict(list)
for c, members in cluster_members.items():
    member_names = [lesson_name_lookup[m] for m in members]

    # Drawn for every cluster, including manually pinned ones, so the random
    # stream is consumed the same way regardless of the manual list.
    rand_n = random.random()

    # Sorted so the choice below does not depend on set iteration order
    # (the previous `set.pop()` picked an arbitrary member).
    man_assigned = sorted(set(member_names).intersection(manual_assignments))
    if man_assigned:
        requested_splits = {manual_assignments[name] for name in man_assigned}
        if len(requested_splits) > 1:
            # Two pinned lessons landed in the same cluster but ask for
            # different splits; report it instead of resolving it silently.
            print('WARNING: cluster {} has conflicting manual assignments: {}'.format(
                c, {name: manual_assignments[name] for name in man_assigned}))
        assigned_split = manual_assignments[man_assigned[0]]
    elif rand_n > 0.8:
        assigned_split = 'test'
    elif rand_n > 0.5:
        assigned_split = 'val'
    else:
        assigned_split = 'train'
    tt_assignments_id[assigned_split] += members

# Report absolute and relative split sizes.
tot_len = sum(len(v) for v in tt_assignments_id.values())
print([(k, len(v)) for k, v in tt_assignments_id.items()])
print([(k, len(v) / tot_len) for k, v in sorted(tt_assignments_id.items())])

[('train', 758), ('test', 175), ('val', 143)]
[('test', 0.16263940520446096), ('train', 0.7044609665427509), ('val', 0.13289962825278812)]


In [286]:
# Persist the split -> lesson ids mapping as JSON.
with open('./new_tt_assignments.json', 'w') as out_file:
    json.dump(tt_assignments_id, out_file)

In [287]:
# Invert the split -> lesson ids mapping into lesson id -> split.
split_lookup = {
    lesson_id: split_name
    for split_name, lesson_id_list in tt_assignments_id.items()
    for lesson_id in lesson_id_list
}

## rendering

In [288]:
# Rows and columns of the similarity matrix follow the order of `lesson_ids`
# (sorted lesson id), not the order of lessons in the dataset. Look rows up by
# globalID instead of assuming both orders coincide.
affinity_row = {lid: row for row, lid in enumerate(lesson_ids)}

# One record per lesson that is not marked 'skip'. 'importName' is
# "<split>.<lesson name>", the node name used in the exported graph.
all_lessons = []
for lesson in ck12_combined_dataset:
    global_id = lesson['globalID']
    split = split_lookup[global_id]
    if split == 'skip':
        # NOTE(review): nothing in this notebook assigns 'skip'; kept for
        # assignments that may be loaded from elsewhere -- confirm.
        continue
    all_lessons.append({
        'lname': lesson['lessonName'],
        'tta': split,
        'importName': split + '.' + lesson['lessonName'],
        'globalID': global_id,
    })

connectivity_threshold = 0.8
lesson_connections = []
lessons_to_show = all_lessons

affinity_to_use = pairwise_similar

for i, this_lesson in enumerate(lessons_to_show):
    this_row = affinity_to_use[affinity_row[this_lesson['globalID']]]
    # Every other lesson whose similarity exceeds the threshold. Lessons that
    # would map to the same node name are not linked.
    connected_lessons = [
        other['importName']
        for j, other in enumerate(lessons_to_show)
        if j != i
        and this_row[affinity_row[other['globalID']]] > connectivity_threshold
        and other['importName'] != this_lesson['importName']
    ]
    # Lessons without any connection are left out of the export.
    if connected_lessons:
        lesson_connections.append({
            'name': this_lesson['importName'],
            'imports': connected_lessons,
            'size': '300',
        })

with open('lesson_connections.json', 'w') as f:
    json.dump(lesson_connections, f)

In [289]:
%%HTML
<iframe width="100%" height="500" src="index_ia.html?inline=false"></iframe>