# Table of Contents
* [Load data](#Load-data)
* [dataset validation](#dataset-validation)
* [dataset stats](#dataset-stats)
	* [topic names](#topic-names)
	* [question type dist](#question-type-dist)
	* [looking for missing values](#looking-for-missing-values)
* [html rendering](#html-rendering)
* [building science word corpus](#building-science-word-corpus)
* [End](#End)


In [1]:
%%capture
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict, OrderedDict

import cv2
import pprint
import pickle
import json
import requests
import io
import sys
import os
from binascii import b2a_hex
import base64
from wand.image import Image as WImage
from IPython.display import display
import PIL.Image as Image
from copy import deepcopy
import glob

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import jsonschema
# from pdfextraction.ck12_new_schema import ck12_schema


In [2]:
%%capture
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
%base16_mplrc light solarized
plt.rcParams['grid.linewidth'] = 0
plt.rcParams['figure.figsize'] = (16.0, 10.0)

# Load data

In [174]:
# Root directory of the dataset release. Set the TQA_DATASET_ROOT environment variable to
# point at a different checkout instead of editing this cell; the fallback is the original
# author's local path.
dataset_root_dir = os.environ.get(
    'TQA_DATASET_ROOT',
    '/Users/schwenk/wrk/stb/dataset_releases/data_release_beta6/',
)
data_file = 'tqa_dataset_beta6.json'
# data_file = 'beta6_final.json'


# Keep the parsed JSON untouched in *_raw; later cells work on a deep copy of it.
with open(os.path.join(dataset_root_dir, data_file), 'r') as f:
    ck12_combined_dataset_raw = json.load(f)

In [107]:
ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

# dataset validation

In [111]:
from pdfextraction.validate_and_split import DataSetIntegrityChecker

In [112]:
# Run the integrity checks against the release file loaded above.
dataset_validator = DataSetIntegrityChecker(dataset_root_dir, data_file)
terr = dataset_validator.validate_dataset() 
# Only the success message is printed here. Later cells index `terr` by key
# ('image_paths', 'schema'), so presumably a failed validation returns a dict of
# errors rather than a string — TODO confirm against DataSetIntegrityChecker.
if terr == 'all validation test passed':
    print(terr)



In [113]:
gids_seen = dataset_validator.check_global_counts()



In [114]:
terr['image_paths']

['/Users/schwenk/wrk/stb/dataset_releases/data_release_beta6/textbook_images/biotechnology_in_agriculture_21732.png',
 '/Users/schwenk/wrk/stb/dataset_releases/data_release_beta6/textbook_images/chemical_properties_of_matter_22718.png']

In [115]:
len(terr['schema'])

0

In [185]:
img_path_errors = [err for err in terr['image_paths'] if 'textbook' not in err]

In [186]:
len(img_path_errors)

504

# test train splits

In [269]:
from pdfextraction.validate_and_split import TestTrainSplitter
test_train_splitter = TestTrainSplitter(dataset_root_dir, data_file)

*insects and other arthropods to train*

In [251]:
# NOTE(review): `manual_assignments` is only defined in a later cell, so this line fails on a
# fresh top-to-bottom run. Also, split_and_save() unpacks perform_split() into two values
# (split, debug) while this cell keeps the whole return value — confirm which shape is current.
tt_split = test_train_splitter.perform_split(manual_assignments=manual_assignments)

In [312]:
trial_counter = 1

In [313]:
exp_dir = 'test_train_split_trials/'

In [321]:
# Hand-picked split assignments, keyed by lesson ID; passed to
# perform_split(manual_assignments=...). Each value carries the lesson name and the
# split ("train" or "test") the lesson is pinned to.
manual_assignments = {
    "ML_0260": {"lesson_name": "insects and other arthropods", "split": "train"},
    "ML_0310": {"lesson_name": "covalent bonds", "split": "train"},
    # FIXME(review): "ML_0243" used to appear twice in this literal — first for
    # "sun earth moon system", then for "the sun". A dict literal keeps only the last
    # value of a repeated key, so "the sun" is what was actually in effect and is kept
    # here; "sun earth moon system" was silently dropped and probably needs its own ID.
    "ML_0243": {"lesson_name": "the sun", "split": "train"},
    "ML_0288": {"lesson_name": "the senses", "split": "train"},
    "ML_0006": {"lesson_name": "respiratory system", "split": "train"},
    # lesson_name is reproduced verbatim (including its spelling).
    "ML_0344": {"lesson_name": "evolution and classifiaction plants", "split": "train"},
    "ML_0023": {"lesson_name": "the nervous system", "split": "train"},
    "ML_0368": {"lesson_name": "seasons", "split": "test"},
    "ML_0164": {"lesson_name": "introduction to plants", "split": "test"},
    "ML_0440": {"lesson_name": "cell structures", "split": "test"},
    "ML_0423": {"lesson_name": "inside the atom", "split": "test"},
    "ML_0490": {"lesson_name": "vision and the eye", "split": "test"},
    # FIXME(review): "ML_0403" was also listed twice — first for "the digestive system",
    # then with an empty lesson name. The later (empty-name) entry is the one that took
    # effect and is kept here; confirm the intended ID for "the digestive system".
    "ML_0403": {"lesson_name": "", "split": "test"},
    "ML_0434": {"lesson_name": "flow of energy", "split": "test"},
}

In [319]:
from pdfextraction.validate_and_split import TestTrainSplitter

def write_trial_split(tt_split, exp_dir, multirun_idx=0):
    """Write one split trial to a JSON file in ``exp_dir`` and advance the trial counter.

    Args:
        tt_split: JSON-serialisable split record to dump.
        exp_dir: directory that receives the ``record_splits_*`` file.
        multirun_idx: index of this run inside a multi-run batch. Any falsy value
            (including the default 0) produces a file name with no index after the
            trailing underscore.

    Side effects:
        Prints a "saving trial" line and increments the module-level ``trial_counter``.
    """
    global trial_counter
    # Falsy index -> 'record_splits_<n>_.json'; otherwise 'record_splits_<n>_<idx>.json'.
    name_tail = '_.json' if not multirun_idx else '_' + str(multirun_idx) + '.json'
    record_filename = os.path.join(exp_dir, 'record_splits_' + str(trial_counter) + name_tail)
    with open(record_filename, 'w') as out_file:
        json.dump(tt_split, out_file)
    trial_label = str(trial_counter) + '_' + str(multirun_idx)
    print('saving trial ' + trial_label)
    trial_counter += 1

def split_and_save(manual_assignments, multirun_idx=0):
    """Run one test/train split with a fresh splitter, record it on disk, and return it.

    Args:
        manual_assignments: forwarded to ``perform_split(manual_assignments=...)``.
        multirun_idx: forwarded to ``write_trial_split`` as the run index.

    Returns:
        The first element of the pair returned by ``perform_split``.
    """
    # A new splitter is built on every call from the notebook-level dataset settings.
    splitter = TestTrainSplitter(dataset_root_dir, data_file)
    split_result = splitter.perform_split(manual_assignments=manual_assignments)
    # perform_split yields (split, debug info); only the split is kept.
    tt_split, _debug = split_result
    write_trial_split(tt_split, exp_dir, multirun_idx)
    return tt_split

In [320]:
n_trials = 5
split_trials = [split_and_save(manual_assignments, i) for i in range(n_trials)]  

saving trial 11_0
saving trial 12_1
saving trial 13_2
saving trial 14_3
saving trial 15_4


In [297]:
computed_stats_non_diagram = [test_train_splitter.compute_split_stats(this_split) for this_split in split_trials]
computed_stats_diagram = [test_train_splitter.compute_split_stats(this_split, True) for this_split in split_trials]

In [296]:
# Stats of the first trial, diagram variant. `computed_stats` is never assigned in this
# notebook; the output recorded for this cell (n_lessons test = 14) matches the first row
# of split_stats_df_diagram.
computed_stats_diagram[0]

{'n_diagram_questions': {'id_to_find': 'diagramQuestions',
  'test': 2418,
  'test_fraction': '0.192',
  'train': 10149},
 'n_instructional_diagrams': {'id_to_find': 'instructionalDiagrams',
  'test': 46,
  'test_fraction': '0.167',
  'train': 230},
 'n_lessons': {'id_to_find': 'n_lessons',
  'test': 14,
  'test_fraction': '0.165',
  'train': 71},
 'n_text_questions': {'id_to_find': 'nonDiagramQuestions',
  'test': 389,
  'test_fraction': '0.174',
  'train': 1843},
 'n_topics': {'id_to_find': 'topics',
  'test': 142,
  'test_fraction': '0.181',
  'train': 642}}

In [294]:
# Stats of the first trial, non-diagram variant. `computed_stats` is never assigned in this
# notebook; the output recorded for this cell (n_lessons test = 216) matches the first row
# of split_stats_df_non_diagram.
computed_stats_non_diagram[0]

{'n_diagram_questions': {'id_to_find': 'diagramQuestions',
  'test': 2418,
  'test_fraction': '0.192',
  'train': 10149},
 'n_instructional_diagrams': {'id_to_find': 'instructionalDiagrams',
  'test': 46,
  'test_fraction': '0.167',
  'train': 230},
 'n_lessons': {'id_to_find': 'n_lessons',
  'test': 216,
  'test_fraction': '0.201',
  'train': 860},
 'n_text_questions': {'id_to_find': 'nonDiagramQuestions',
  'test': 2567,
  'test_fraction': '0.190',
  'train': 10970},
 'n_topics': {'id_to_find': 'topics',
  'test': 949,
  'test_fraction': '0.189',
  'train': 4066}}

In [299]:
def _test_split_stats_frame(computed_stats):
    """Tabulate the 'test' side of several split trials, one row per trial.

    Args:
        computed_stats: list with one stats dict per trial; every value in a stats dict
            must itself be a dict carrying 'test_fraction' and 'test' entries.

    Returns:
        A numeric DataFrame holding the test fractions followed by the test counts.
        Both halves use the same stat names as column labels, so labels repeat.
    """
    test_fractions = pd.DataFrame(
        [{k: v['test_fraction'] for k, v in trial.items()} for trial in computed_stats]
    ).apply(pd.to_numeric)  # fractions arrive as strings such as '0.192'
    test_counts = pd.DataFrame(
        [{k: v['test'] for k, v in trial.items()} for trial in computed_stats]
    )
    return pd.concat([test_fractions, test_counts], axis=1).apply(pd.to_numeric)


split_stats_df_diagram = _test_split_stats_frame(computed_stats_diagram)
split_stats_df_non_diagram = _test_split_stats_frame(computed_stats_non_diagram)

# `split_stats_df` previously ended up holding the non-diagram table; keep that name
# available as an independent copy so renaming its columns does not touch the frame above.
split_stats_df = split_stats_df_non_diagram.copy()

In [300]:
split_stats_df_non_diagram

Unnamed: 0,n_diagram_questions,n_instructional_diagrams,n_lessons,n_text_questions,n_topics,n_diagram_questions.1,n_instructional_diagrams.1,n_lessons.1,n_text_questions.1,n_topics.1
0,0.192,0.167,0.201,0.19,0.189,2418,46,216,2567,949


In [301]:
split_stats_df_diagram

Unnamed: 0,n_diagram_questions,n_instructional_diagrams,n_lessons,n_text_questions,n_topics,n_diagram_questions.1,n_instructional_diagrams.1,n_lessons.1,n_text_questions.1,n_topics.1
0,0.192,0.167,0.165,0.174,0.181,2418,46,14,389,142


## hide

In [None]:
# One label per column of the concatenated stats table: five test fractions followed by
# five test counts, in the column order shown in the table outputs above. The previous
# eight-name list omitted the two lessons columns and could not be assigned to ten columns.
split_stats_df.columns = [
    'fract_diagram_questions',
    'fract_instructional_diagrams',
    'fract_lessons',
    'fract_text_questions',
    'fract_topics',
    'n_diagram_questions',
    'n_instructional_diagrams',
    'n_lessons',
    'n_text_questions',
    'n_topics',
]

In [319]:
len(tt_split['test'] + tt_split['train'])

1076

In [126]:
# sorted(debug_tt_splits['train'], key= lambda x: x[1])

# sorted(debug_tt_splits['test'], key= lambda x: x[1])

In [315]:
print('L_0575' in tt_split['test'])
print('L_0575' in tt_split['train'])

test_st = 'te'
test_st.format()


manual_assignments.keys()

test_train_splitter.compute_split_stats(tt_split)

False
True


In [261]:
test_st = 'te'
test_st.format()

manual_assignments = {"ML_0289": {"lesson_name": "flow of energy", "split": "test"}, "ML_0505": {"lesson_name": "insect reproduction and life cycle", "split": "train"}}

manual_assignments.keys()

'te'

In [8]:
def record_validation_errors(dataset, schema):
    """Remove questions that fail schema validation from ``dataset`` and return them.

    Args:
        dataset: nested dataset, indexed here as
            dataset[subject][lesson][quest][question_class][q_number]. Mutated in place.
        schema: JSON schema handed to ``jsonschema.Draft4Validator``.

    Returns:
        List of the question records that were removed, in the order encountered.

    Notes:
        * ``subject`` is read from the enclosing (notebook) scope; it is not a parameter.
        * NOTE(review): the lesson/question keys are taken from ``absolute_schema_path``
          while the sort key uses ``absolute_path`` — confirm the schema path is really
          the one that carries the dataset keys.
        * Sorting by ``absolute_path[0]`` assumes every error has a non-empty path.
    """
    qs_removed = []
    validator = jsonschema.Draft4Validator(schema)
    for error in sorted(validator.iter_errors(dataset), key=lambda x: x.absolute_path[0]):
        print(error)
        lesson, quest, question_class, q_number = list(error.absolute_schema_path)[:4]
        problem_q_section = dataset[subject][lesson][quest][question_class]
        if q_number in problem_q_section:
            # Pop exactly once: the record is both shown and collected. Popping a second
            # time (as before) raised KeyError because the key was already gone.
            removed_question = problem_q_section.pop(q_number)
            print(removed_question)
            qs_removed.append(removed_question)
    return qs_removed

In [167]:
# ml_ids_train, ml_ids_test, debug_train = test_train_splitter.perform_split()

# ml_ids_train, ml_ids_test, debug_train = test_train_splitter.perform_split()

In [9]:
# NOTE(review): `new_schema` is not defined anywhere in this notebook (the ck12_schema
# import in the first cell is commented out), so this cell raises NameError on a fresh kernel.
validator = jsonschema.Draft4Validator(new_schema)
# Print each validation error's message and its path, ordered by the first path element;
# the sort key assumes every error has a non-empty absolute_path — TODO confirm.
for error in sorted(validator.iter_errors(ck12_combined_dataset_raw), key=lambda x: x.absolute_path[0]):
    print(error.message)
    print(error.absolute_path)
    print()

NameError: name 'new_schema' is not defined

# dataset stats

## topic names

In [None]:
es_lesson_names = [item for sublist in [val['topics'].keys() for val in ck12_combined_dataset['earth-science'].values()] for item in sublist]
ps_lesson_names = [item for sublist in [val['topics'].keys() for val in ck12_combined_dataset['physical-science'].values()] for item in sublist]
ls_lesson_names = [item for sublist in [val['topics'].keys() for val in ck12_combined_dataset['life-science'].values()] for item in sublist]

combined_topics = es_lesson_names + ps_lesson_names + ls_lesson_names
topic_series = pd.Series(combined_topics).value_counts()
# topic_series[:18]
# topic_series[18:40]

In [None]:
topic_series[:20]

In [None]:
len(combined_topics)

In [None]:
len(topic_series)

__earth science topics__

In [None]:
sorted(ck12_combined_dataset['earth-science'].keys())

__life science topics__

In [None]:
sorted(ck12_combined_dataset['life-science'].keys())

__physical science topics__

In [None]:
sorted(ck12_combined_dataset['physical-science'].keys())

In [None]:
 all_lessons = ck12_combined_dataset['physical-science'].keys() + ck12_combined_dataset['life-science'].keys() + ck12_combined_dataset['physical-science'].keys() +ck12_combined_dataset['earth-science'].keys()
print len(all_lessons)

In [None]:
ck12_combined_dataset.values()

In [None]:
topic_list = [subject[lesson]['topics'].keys() for subject in ck12_combined_dataset.values() for lesson in subject]
flattened_topics = [topic for lesson in topic_list for topic in lesson]
len(flattened_topics)

In [None]:
len(set(flattened_topics))

## question type dist

prior to quizzes

In [None]:
# q_types = []
# for subject, flexbook in ck12_combined_dataset.items():
#     for lesson in flexbook.values():
#         for question in lesson['questions']['nonDiagramQuestions'].values():
#             q_types.append(question['type'])
# question_counts = pd.Series(q_types).value_counts()
# print 'total number of questions = ' + str(question_counts.sum())
# question_counts

post quizzes

In [82]:
6590 + 5401 + 1711

13702

In [116]:
# Tally the non-diagram questions of every lesson by their 'questionSubType'.
# Here the dataset is iterated directly as a sequence of lesson dicts (flat format).
q_types = []
for lesson in ck12_combined_dataset:
    for question in lesson['questions']['nonDiagramQuestions'].values():
        q_types.append(question['questionSubType'])
# value_counts() orders the sub-types from most to least frequent.
question_counts = pd.Series(q_types).value_counts()
print('total number of questions = ' + str(question_counts.sum()))
question_counts

total number of questions = 13537


Multiple Choice      6590
Fill in the Blank    3643
Matching             1711
Short Answer         1593
dtype: int64

In [None]:
_ = question_counts.plot(kind="barh")
plt.title('Question Format Distribution', fontsize=50, verticalalignment='bottom', color = b16_colors.b)
plt.ylabel("Question type", fontsize=30, labelpad=10, color = b16_colors.b)
plt.xlabel("Number of unique questions", fontsize=30, labelpad=10, color = b16_colors.b)
plt.tick_params(axis='x', which='major', labelsize=20)
plt.tick_params(axis='y', which='major', labelsize=20)

In [None]:
# Keep the entries at positions 0, 1 and 3 of question_counts, i.e. drop the third most
# frequent type (Matching in the counts shown above). Series.append was removed in
# pandas 2.0, so the two slices are joined with pd.concat instead.
usable_questions = pd.concat([question_counts.iloc[:2], question_counts.iloc[3:4]])

In [None]:
# print() call instead of the Python 2 print statement, matching the executed cells above.
print('questions usable immedeatly = ' + str(usable_questions.sum()))
usable_questions

In [None]:
# Per-subject breakdown of non-diagram question types.
# NOTE(review): this cell walks the dataset as {subject: {lesson: ...}} and reads
# question['type'], whereas the cell above iterates the dataset as a flat list and reads
# 'questionSubType' — confirm which layout the loaded file actually has before running.
# It also rebinds `q_types` and `question_counts` on every iteration.
for subject, flexbook in ck12_combined_dataset.items():
    q_types = []
    for lesson in flexbook.values():
        for question in lesson['questions']['nonDiagramQuestions'].values():
            q_types.append(question['type'])
    question_counts = pd.Series(q_types).value_counts()
    print('total number of ' + subject + ' questions = ' + str(question_counts.sum()))
    print(question_counts)
    print()

## looking for missing values

In [None]:
# For each subject, count how many lessons have each number of non-diagram questions,
# and name the lessons that have exactly 7.
# NOTE(review): `flexbook_ds` is not defined in the cells shown in this notebook.
for subject, flexbook in flexbook_ds.items():
    q_len = []
    for lesson_name, lesson in flexbook.items():
        q_len.append(len(lesson['questions']['nonDiagramQuestions'].values()))
        if q_len[-1] == 7:
            print(subject, lesson_name)
#             pprint.pprint(lesson['questions']['nonDiagramQuestions'])
    q_lengths = pd.Series(q_len).value_counts()
    print('total number of ' + subject + ' lessons = ' + str(q_lengths.sum()))
    print(q_lengths)
    print()

The lessons with fewer questions appear to be genuine: their question counts match the number of questions in the workbook.

# html rendering

### old nested format

In [None]:
from IPython.core.display import HTML

In [None]:
for subject, flexbook in ck12_combined_dataset.items():
    for lesson_name, lesson in flexbook.items():
        if lesson['questions']['diagramQuestions']:
            lesson_html = display_lesson_html(ck12_combined_dataset[subject], lesson_name, 'questions')
            with open(os.path.join('review_diagram_questions', lesson_name + '.html'), 'w') as f:
                f.write(lesson_html.encode('ascii', 'ignore').decode('utf-8'))

In [None]:
for subject, flexbook in ck12_combined_dataset.items():
    for lesson_name, lesson in flexbook.items():
        if 'instructionalDiagrams' in lesson.keys():
            lesson_html = display_lesson_html(ck12_combined_dataset[subject], lesson_name, 'descriptions')
            with open(os.path.join('review_diagram_descriptions', lesson_name + '.html'), 'w') as f:
                f.write(lesson_html.encode('ascii', 'ignore').decode('utf-8'))

### new flat format

In [None]:
# display_lesson_html(ck12_combined_dataset[20], ck12_combined_dataset[20]['lessonName'], 'lessons', out_path)

In [None]:
%%writefile render_html.py
def render_html_from_dataset(path_to_data_json, out_path='../html_renders'):
    """Render the lessons of a flat-format dataset file to static HTML pages.

    Args:
        path_to_data_json: path of the dataset JSON file. It is iterated as a sequence of
            lesson dicts, each providing 'lessonName' and
            lesson['questions']['diagramQuestions'].
        out_path: root output directory. Pages are written to
            <out_path>/<render type>/<lesson name with spaces replaced by '_'>.html.

    Notes:
        * The file named by ``path_to_data_json`` is what gets loaded; previously the
          argument was ignored in favour of an undefined ``output_dir`` variable.
        * 'lessons' pages are written for every lesson; 'questions' and 'descriptions'
          pages only for lessons whose 'diagramQuestions' entry is non-empty.
        * NOTE(review): this calls display_lesson_html(lesson, name, render, out_path) and
          expects a string back, which is not the two-argument version defined at the end
          of this notebook — confirm which implementation is meant to be in scope.
    """
    # Imported locally so the module written by %%writefile does not depend on
    # notebook-level imports.
    import json
    import os

    with open(path_to_data_json, 'r') as f:
        ck12_combined_dataset = json.load(f)

    render_types = ['lessons', 'questions', 'descriptions']
    for render in render_types:
        render_dir = os.path.join(out_path, render)
        # Create the per-view directory up front so the writes below cannot fail on it.
        if not os.path.isdir(render_dir):
            os.makedirs(render_dir)
        for lesson in ck12_combined_dataset:
            if render != 'lessons' and not lesson['questions']['diagramQuestions']:
                continue
            lesson_html = display_lesson_html(lesson, lesson['lessonName'], render, out_path)
            file_name = lesson['lessonName'].replace(' ', '_') + '.html'
            with open(os.path.join(render_dir, file_name), 'w') as f:
                # Non-ASCII characters are dropped before writing.
                f.write(lesson_html.encode('ascii', 'ignore').decode('utf-8'))

In [None]:
render_html_from_dataset('./output_data_from_nbs/build_v5f.json')

# building science word corpus

In [None]:
# Peek at the first topic of the first life-science lesson. dict.values() cannot be
# indexed on Python 3, so take the first element through an iterator instead.
# NOTE(review): `flexbook_ds` is not defined in the cells shown in this notebook.
first_lesson = next(iter(flexbook_ds['life-science'].values()))
next(iter(first_lesson['topics'].values()))

In [None]:
# Gather the lower-cased entries of every topic whose name marks it as a vocabulary section.
vocab_sections = ['Lesson Vocabulary', 'Vocabulary']
flexbook_vocab = set()

for subject in flexbook_ds.values():
    for lesson in subject.values():
        for topic_name, topic in lesson['topics'].items():
            if topic_name in vocab_sections:
                vocab_section = topic['content']['text']
                if '\n' in vocab_section:
                    # One entry per line (entries may contain spaces).
                    flexbook_vocab.update(vocab_section.lower().split('\n'))
                elif ' ' in vocab_section:
                    # Single-line section: fall back to whitespace-separated words.
                    flexbook_vocab.update(vocab_section.lower().split())
                else:
                    # Text with neither a newline nor a space is reported and skipped.
                    print('no vocab')

In [None]:
len(flexbook_vocab)

In [None]:
from nltk.tokenize import wordpunct_tokenize
cached_sw = stopwords.words("english") + list(string.punctuation)

In [None]:
vocab_list = list(flexbook_vocab)

In [None]:
seperated_list = [word.replace('(', ' ').replace(')', ' ').split() for word in vocab_list]

In [None]:
flattened_list = [item for sublist in seperated_list for item in sublist if len(item) > 3 and item not in cached_sw]

In [None]:
len(flattened_list)

In [None]:
# pickle writes bytes, so the file must be opened in binary mode ('wb'); text mode 'w'
# raises TypeError on Python 3.
with open('ck_12_vocab_words.pkl', 'wb') as f:
    pickle.dump(flattened_list, f)

Use entire corpus

In [None]:
flexbook_vocab = ''

for subject in flexbook_ds.values():
    for lesson in subject.values():
        for topic_name, topic in lesson['topics'].items():
            vocab_section = topic['content']['text']
            flexbook_vocab += ' ' + vocab_section.lower()

In [None]:
len(flexbook_vocab)

In [None]:
fb_tokens = wordpunct_tokenize(flexbook_vocab)
normalized_tokens = [toke.strip().lower().encode('ascii', 'ignore').decode() for toke in fb_tokens if toke not in cached_sw]

In [None]:
tb_freq_d = nltk.FreqDist(normalized_tokens)
most_common_fb_words = tb_freq_d.most_common()

In [None]:
fb_entire_corpus_vb = set(normalized_tokens)

In [None]:
# pickle writes bytes, so the file must be opened in binary mode ('wb'); text mode 'w'
# raises TypeError on Python 3.
with open('ck_12_all_words.pkl', 'wb') as f:
    pickle.dump(fb_entire_corpus_vb, f)

In [None]:
len(fb_entire_corpus_vb)

# End

In [None]:
import jinja2
from IPython.core.display import HTML

jnjenv = jinja2.Environment()

%%writefile lesson_viz.py
def make_lesson_data(lesson_json):
    """Return a lesson's topics as (topic name, topic text) pairs ordered by 'orderID'.

    Args:
        lesson_json: lesson dict whose 'topics' entry maps each topic name to a dict
            holding an 'orderID' and the text under ['content']['text'].

    Returns:
        List of (topic name, text) tuples sorted by each topic's 'orderID'.
    """
    # Index into the (name, content) pair instead of using a tuple parameter in the
    # lambda: `lambda (k, v): ...` is a syntax error on Python 3.
    ordered_topics = sorted(lesson_json['topics'].items(), key=lambda item: item[1]['orderID'])
    return [(topic, content['content']['text']) for topic, content in ordered_topics]

def make_page_html(lesson_data, page_html):
    """Fill the ``page_html`` template with one lesson's data and return the result.

    Args:
        lesson_data: sequence whose first item is passed to the template as ``lesson``
            and whose second item is passed as ``topics``.
        page_html: template source compiled with the notebook-level ``jnjenv`` environment.
    """
    template = jnjenv.from_string(page_html)
    lesson_name = lesson_data[0]
    topic_entries = lesson_data[1]
    return template.render(lesson=lesson_name, topics=topic_entries)

def display_lesson_html(flexbook, lesson):
    """Render one lesson of ``flexbook`` with the global ``page_html`` template.

    Args:
        flexbook: mapping from lesson name to lesson JSON.
        lesson: key of the lesson to render; also passed to the template as the lesson value.

    Returns:
        The rendered page wrapped in ``HTML(...)``.
    """
    topic_texts = make_lesson_data(flexbook[lesson])
    rendered_page = make_page_html((lesson, topic_texts), page_html)
    return HTML(rendered_page)

page_html = """
<!DOCTYPE html>
<html>
  <head>
    <style type="text/css">
    </style>
  </head>
  <body>
    <div class="container">
      <h1>Lesson: {{lesson}}</h1>
      <ul>
        {% for topic in topics %}
        <p>
        </p>
        <h3>{{topic.0}}</h3>
        <p>{{
        topic.1
        }}</p>
        {% endfor %}
      </ul>
    </div>
    <script src="http://code.jquery.com/jquery-1.10.2.min.js"></script>
    <script src="http://netdna.bootstrapcdn.com/bootstrap/3.0.0/js/bootstrap.min.js"></script>
  </body>
</html>
"""

In [None]:

# Build an HTML table by string concatenation: one row per key of `stats_counter`, showing
# its count and the matching `stats_fract` value formatted to two decimals.
# NOTE(review): `stats_counter` and `stats_fract` are not defined in the cells shown in this
# notebook. Opening tags are never closed; rendering relies on the browser tolerating that.
stat_data = {'Number of Entities':stats_counter, 'Average Number per image': stats_fract}
count = 2
html = "<table>"
# add header row

html += "<tr><th>"
for k in stat_data.keys():
    html += "<th>"+k

# second header row: a label cell followed by `count` empty header cells
html += "<tr><th>Entity Category"
for j in range(count):
    html += "<th>"

# one body row per entity category
for k, v in stats_counter.items():
    html += "<tr><th>"+k
#     for j in range(count):
    html += "<td>" + str(v)
    html += "<td>" + "%.2f" % stats_fract[k]
html += '<tr>'
    
# for k, v in stats_fract.items():
#     html += "<tr><th>"+k
#     for j in range(count):
#         html += "<td>" + str(v)
        
html += "</table>"
HTML(html)

In [None]:
# NOTE(review): this rebinds `page_html`, the global that display_lesson_html() passes to
# make_page_html(). Once this cell has run, lessons render with this placeholder page
# instead of the lesson template assigned earlier in the notebook.
page_html = """
<html>
<head>
<title>{{ title }}</title>
</head>
<body>
Hello.
</body>
</html>
"""

In [None]:
page_template = """
<html>
 <head>
  <title>KB HIT</title>
  <meta content='text/html'/>
  <script type='text/javascript' src='https://s3.amazonaws.com/mturk-public/externalHIT_v1.js'></script>
 </head>
 <body>
    <p>We are constructing a large knowledge base (KB) about elementary science and commonsense knowledge, to help computers answer questions more reliably. We are planning to release the KB as a free, open source resource for the community when it is complete. Your work here will help us assemble this KB and contribute to this effort.</p>

    <p>Below, the computer has automatically extracted some candidate facts from text for possible inclusion in the KB. However, some are weird, false, or nonsensical. This task will help us distinguish the good facts, to include in the KB, from the bad.</p>
     <form name='mturk_form' method='post' id='mturk_form' action='https://workersandbox.mturk.com/mturk/externalSubmit'>
      <input type='hidden' value='' name='assignmentId' id='assignmentId'/>		 
      <table>
        <tr><th></th><th>Commonsense Knowledge</th></tr>
        {% for n in input_data %}
            <tr><td>{{n.sentence}}</td><td nowrap>
            <!--these break-->
            <!--<input type="hidden" name="{{n.sentence_id}}" id="assignmentId" value="ASSIGNMENT_NOT_AVAILABLE" />-->
            <!--<input type="hidden" name="assignmentId" id="assignmentId" value="ASSIGNMENT_NOT_AVAILABLE" />-->
            <!--this is in the official documentation but breaks anyway!-->
            <!--<input type='hidden' value='' name='assignmentId' id='assignmentId'/>-->
            <!--this works:-->
            <input name="{{n.sentence_id}}" type="radio" value="true-act" />EXPECTED ACTION
            <input name="{{n.sentence_id}}" type="radio" value="false-act" />RARE/FALSE ACTION
            <input name="{{n.sentence_id}}" type="radio" value="true-prop" />TRUE PROPERTY
            <input name="{{n.sentence_id}}" type="radio" value="false-prop" />RARE/FALSE PROPERTY
            <input name="{{n.sentence_id}}" type="radio" value="nonsense" />NONSENSE
            <input name="{{n.sentence_id}}" type="radio" value="unknown" />DON'T KNOW
            </td></tr>
        {% endfor %}
      </table>
      <p><input type="submit" id="submitButton" value="Submit" /></p>
   </form>
  <script language="Javascript">turkSetAssignmentID();</script>
 </body>
</html>

"""

In [None]:
# Render one randomly chosen lesson of the selected subject.
# NOTE(review): `flexbook_ds` is not defined in the cells shown in this notebook, and no
# random seed is set, so the chosen lesson changes from run to run.
subject = 'life-science'
# lesson = '15.1 Understanding Animal Behavior'
# np.random.choice needs a real sequence; a dict keys view (Python 3) is not accepted.
random_lesson = np.random.choice(list(flexbook_ds[subject].keys()), 1)[0]
display_lesson_html(flexbook_ds[subject], random_lesson)