# Table of Contents
* &nbsp;
	* [imports](#imports)
	* [simple functions](#simple-functions)
* [Load data](#Load-data)
	* [setting paths](#setting-paths)
	* [parsed content and raw questions and descriptions](#parsed-content-and-raw-questions-and-descriptions)
	* [localization and recognition](#localization-and-recognition)
	* [building spellings and grammar](#building-spellings-and-grammar)
* [Clean and prepare data](#Clean-and-prepare-data)
	* [extract media links](#extract-media-links)
	* [remove non-conforming content](#remove-non-conforming-content)
		* [code](#code)
		* [run](#run)
	* [remove recognition and localization errors](#remove-recognition-and-localization-errors)
* [Add image annotations](#Add-image-annotations)
	* [localization](#localization)
	* [recognition](#recognition)
		* [code](#code)
		* [run](#run)
		* [hide](#hide)
* [Integrate diagram questions and descriptions](#Integrate-diagram-questions-and-descriptions)
	* [match diagram topics to lessons](#match-diagram-topics-to-lessons)
		* [code](#code)
		* [run](#run)
		* [hide](#hide)
	* [merge questions](#merge-questions)
		* [code](#code)
		* [run](#run)
		* [hide](#hide)
	* [merge descriptions](#merge-descriptions)
		* [hide](#hide)
	* [Apply spelling and grammar fixes](#Apply-spelling-and-grammar-fixes)
		* [code](#code)
		* [run](#run)
		* [hide](#hide)
* [Topic key collisions](#Topic-key-collisions)
* [Refinements to make](#Refinements-to-make)
* [End](#End)


## imports

In [172]:
%%capture
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
# %base16_mplrc light solarized
%base16_mplrc dark solarized
plt.rcParams['grid.linewidth'] = 0
plt.rcParams['figure.figsize'] = (16.0, 10.0)

import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats.mstats import mode

import itertools
import math
from collections import Counter, defaultdict, OrderedDict
%load_ext autoreload
%autoreload 2

import cv2
import pprint
import pickle
import json
import requests
import io
import sys
import os
from binascii import b2a_hex
import base64
from wand.image import Image as WImage
from IPython.display import display
from IPython.core.display import HTML
import PIL.Image as Image
from copy import deepcopy
import glob
import random

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
import language_check
import enchant
import difflib
import diff_match_patch
import fuzzywuzzy.fuzz as fuzz
import re
import jsonschema
from pdfextraction.ck12_schema import ck12_schema as schema

## simple functions

In [173]:
def write_file(filename, data_dict, output_dir='output_data_from_nbs'):
    with open(os.path.join(output_dir, filename), 'w') as f:
        json.dump(data_dict, f, indent=4, sort_keys=True)
        
def get_img_n(image_name):
    return [re.findall("[0-9]+", image_name)][0][0]

def clean_list(dir_path):
    hidden_removed = filter(lambda f: not f.startswith('.'), os.listdir(dir_path))
    return [topic.replace('_diagram', '') for topic in hidden_removed]

# Load data

load last_checkpoint

In [174]:
# with open('build_v5.pkl', 'r') as f:
#     latest_checkpoint = pickle.load(f)

## setting paths

In [175]:
output_dir = 'output_data_from_nbs/'
raw_data_dir = '../spare5_produced_data/data/'
raw_dq_file = 'ai2_testquestions_20161005.csv'
s5_raw_decriptions = 'ai2_diagramdescriptions_20161018.csv'
ai2_raw_decriptions = 'our_description.csv'
glossary_path = os.path.join(output_dir, 'flexbook_glossary.pkl')

turk_proc_dir = '/Users/schwenk/wrk/stb/diagram_questions/turk_processing/'
metadata_dir = turk_proc_dir + 'store_hit_results_metadata/'
lc_results_dir = 'loc_group_3'
box_loc_joined = 'loc_annotations'
recog_results_dir = 'group_latest_combined'

box_choices_1_dir = 'final_text_boxes_fixed'
box_choices_2_dir = 'final_text_boxes_pass_2'
none_agree = 'no_turkers_agree_lookup.pkl'
two_agree_lookup = 'two_turkers_agree_lookup.pkl'
all_agree_lookup = 'user_diag_loopkup.pkl'

recog_performed = '/Users/schwenk/wrk/stb/diagram_questions/turk_processing/final_diagrams/'
all_dir = '/Users/schwenk/wrk/stb/ai2-vision-textbook-dataset/diagrams/tqa_diagrams_v0.9/'
pruned_dir = '/Users/schwenk/wrk/stb/ai2-vision-textbook-dataset/diagrams/dataset_Sep_27/tqa_diagrams_v0.9_question_images/'
description_dir = '/Users/schwenk/wrk/stb/spare5_produced_data/tqa_diagrams_v0.9_inbook/'

## parsed content and raw questions and descriptions

In [176]:
%%capture
# load complete text v 3.5, raw diagram questions and descriptions
with open(output_dir + 'ck12_dataset_beta_v3_5.json', 'r') as f:
    ck12_combined_dataset_raw = json.load(f)
with open(output_dir + 'ck12_flexbook_only_beta_v3.json', 'r') as f:
    flexbook_ds = json.load(f)
with open(output_dir + 'ck12_lessons_only_beta_v3.json', 'r') as f:
    lessons_ds = json.load(f)

# loading questions
desc_df = pd.read_csv(raw_data_dir + s5_raw_decriptions, encoding='latin-1')
desc_df['diagram'] = desc_df['reference_id'].apply(lambda x: x.split('/')[-1])

ai2_raw_decriptions_df = pd.read_csv(raw_data_dir + ai2_raw_decriptions, encoding='latin-1')
ai2_written_df_completed = ai2_raw_decriptions_df[['Topic', 'Image Path', 'Description']]
ai2_written_df_completed['diagram'] = ai2_written_df_completed['Image Path'].apply(lambda x: x.split('/')[-1])
ai2_written_df_completed['topic'] = ai2_written_df_completed['Topic']
del  ai2_written_df_completed['Topic']

#loading questions
q_col = '03_write_question'
r_ans_col = '04_write_right_answer'
w_ans_col = '05_write_wrong_answers'
data_cols = [q_col, r_ans_col, w_ans_col]
raw_dq_df = pd.read_csv(raw_data_dir + raw_dq_file, encoding='latin-1')
dr_proc_df = raw_dq_df.copy()
dr_proc_df['wac_list'] = dr_proc_df[w_ans_col].apply(lambda x: json.loads(x))
dr_proc_df['diagram'] = dr_proc_df['reference_id'].apply(lambda x: x.split('/')[-1])
dr_proc_df['topic'] = dr_proc_df['reference_id'].apply(lambda x: x.split('/')[-1].rsplit('_', maxsplit=1)[0])

with open('../diagram_questions/topic_match_terms.json', 'r') as f:
    topic_term_match = json.load(f)    

with open(glossary_path, 'rb') as f:
    flexbook_glossary = pickle.load(f)

### localization and recognition

In [177]:
loc_res_df = pd.read_pickle(os.path.join(metadata_dir, lc_results_dir, 'complete_df.pkl'))
recog_res_df = pd.read_pickle(os.path.join(metadata_dir, recog_results_dir, 'recog_df.pkl'))

## building spellings and grammar

In [178]:
# loading spelling defs
with open(output_dir + 'ck_12_vocab_words.pkl', 'rb') as f:
    ck_12_vocab = set(pickle.load(f))
with open(output_dir + 'ck_12_all_words.pkl', 'rb') as f:
    ck_12_corp = set(pickle.load(f))
    
with open(output_dir + 'spellings_to_rev.txt', 'r') as f:
    whitelisted_words = f.read().split('\n')[:-1]    
with open(output_dir + './desc_spellings_to_rev.txt', 'r') as f:
    whitelisted_words += f.read().split('\n')[:-1]
with open(output_dir + './ck_12_spelling_rev.txt', 'r') as f:
    whitelisted_words += f.read().split('\n')[:-1]
with open('diagram_rec_corp.pkl', 'rb') as f:
    diagram_rec_corpus = pickle.load(f)
    
ck_12_corp.update(ck_12_vocab)
ck_12_corp.update(whitelisted_words)
ck_12_corp.update(diagram_rec_corpus)

# build spelling dict updated with words from science corpus
edict = enchant.Dict("en_US")
anglo_edict = enchant.Dict("en_UK")
cached_sw = stopwords.words("english") + list(string.punctuation)
for word in ck_12_corp:
    if word.isalpha() and len(word) > 3:
        edict.add(word)
        
# grammaer checker
gram_checker = language_check.LanguageTool('en-US')
gram_checker.disabled = set(['SENT_START_CONJUNCTIVE_LINKING_ADVERB_COMMA', 'POSSESSIVE_APOSTROPHE', 'A_PLURAL'])
gram_checker.disable_spellchecking()

# Clean and prepare data

## extract media links

In [179]:
ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

In [180]:
pat_str = "(?:https?:\/\/(?:www\.).*?\s)"
web_link_patern=re.compile(pat_str)

def clean_content_text(content_str, web_link_patern):
    removed_links = web_link_patern.findall(content_str)
    if not removed_links:
        return '', ''
    split_txt = web_link_patern.split(content_str)
    cleaned_text = ' '.join([txt for txt in split_txt if txt])
    return cleaned_text, [link.strip() for link in removed_links]

def extract_links(complete_ds):
    for subject, lessons in complete_ds.items():
        for lesson_title, lesson in lessons.items():
            for topic, content in lesson['topics'].items():
                content_str = content['content']['text']
                new_text, links = clean_content_text(content_str, web_link_patern)
                content['content']['mediaLinks'] = []
                if links:
                    content['content']['text'] = new_text
                    content['content']['mediaLinks'].extend(links)

In [181]:
extract_links(ck12_combined_dataset)

## remove non-conforming content

### code

In [182]:
def validate_schema(dataset_json):
    errors = []
    try:
        validator = jsonschema.Draft4Validator(schema)
        for error in sorted(validator.iter_errors(dataset_json), key=lambda x: x.absolute_path[0]):
            errors.append([error.message, list(error.absolute_path)[:4]])
    except jsonschema.ValidationError as e:
        errors.append("Error in schema --%s-", e.message)
    return errors

def validate_dataset(dataset_json):
    for subject, flexbook in dataset_json.items():
        schema_errors = validate_schema(flexbook)
        for lesson_name, lesson in flexbook.items():
            ac_errors = check_ac_counts(lesson, subject, lesson_name)
        all_errors = schema_errors + ac_errors
        if not all_errors:
            return 'all validation test passed'
        else:
            return all_errors

def check_ac_counts(lesson_content, subject, lesson_name):
    errors = []
    for qid, question in lesson_content['questions']['nonDiagramQuestions'].items():
        if question['type'] == 'Multiple Choice':
            if len(question['answerChoices']) != 4:
                errors.append([subject, lesson_name, qid + ' mc error'])
        if question['type'] == 'True or False':
            if len(question['answerChoices']) != 2:
                errors.append([subject, lesson_name, qid + ' tf error'])
    return errors

def record_validation_errors(dataset):
    qs_removed = []
    for subject, flexbook in dataset.items():
        validator = jsonschema.Draft4Validator(schema)
        for error in sorted(validator.iter_errors(flexbook), key=lambda x: x.absolute_path[0]):
            lesson, quest, question_class, q_number = list(error.absolute_path)[:4]
            problem_q_section = dataset[subject][lesson][quest][question_class]
            if q_number in problem_q_section.keys():
#                 print(dataset[subject][lesson][quest][question_class].pop(q_number))
                qs_removed.append(dataset[subject][lesson][quest][question_class].pop(q_number))
    return qs_removed

### run

In [183]:
validate_dataset(ck12_combined_dataset)

'all validation test passed'

In [184]:
qs_rem = record_validation_errors(ck12_combined_dataset)
len(qs_rem)

0

## remove recognition and localization errors

In [185]:
diagram_image_names = clean_list(recog_performed)

rec_files = glob.glob(all_dir + '*/*')
more_paths = glob.glob(all_dir + '*/*')
pruned_paths = glob.glob(pruned_dir + '*/*')
more_files = [fp.split('/')[-1] for fp in more_paths]
pruned_files = [fp.split('/')[-1] for fp in pruned_paths]
desc_paths = glob.glob(description_dir + '*/*')
desc_files = [fp.split('/')[-1] for fp in desc_paths]


pruned_nums = set([get_img_n(name) for name in pruned_files])
all_nums = set([get_img_n(name) for name in more_files])
rec_nums = set([get_img_n(name) for name in diagram_image_names])
desc_nums = set([get_img_n(name) for name in desc_files])

removed_images = all_nums.difference(pruned_nums.union(desc_nums))

removed_image_names = []
for img_n in removed_images:
    for image_name in more_files:
        if img_n == get_img_n(image_name):
            removed_image_names.append(image_name)

name_change_lookup = {}
for image_name in more_files:
    img_n = get_img_n(image_name)
    for newer_name in pruned_files:
        if img_n == get_img_n(newer_name) and newer_name != image_name:
            name_change_lookup[image_name] = newer_name

removed_image_names = sorted(removed_image_names)

In [186]:
blacklisted_topics = ['periodic_table', 'em_spectrum', 'hydrocarbons', 'geologic_time'] + ['lewis_dot_idapgrams', 'circuits']  # correct this mispelling in future round

In [187]:
len(removed_image_names)

9

# Add image annotations

## localization

In [188]:
loc_res_df.head(1)

Unnamed: 0,diagram,rectangle,hit_id,assignment_id,worker_id
0,parts_cell_1182.png,"[[283, 192], [447, 238]]",3SA4EMRVJV39U1MGLCYP6KPFULH0PX,3BDCF01OGXVJNV1XRULS5F5Z4B6LYG,A1017VP86SLXRB


In [189]:
loc_anno = clean_list(os.path.join(turk_proc_dir, box_loc_joined))
loc_anno_images = [fig.split('.json')[0]  for fig in loc_anno]
keep_figures = [fig for fig in loc_anno_images if fig not in removed_image_names]

loc_box_path = os.path.join(turk_proc_dir, box_loc_joined)

diag_loc_annotations = {}
for diagram_name in keep_figures:
    anno_file_path = os.path.join(loc_box_path, diagram_name + '.json')
    if not os.path.exists(anno_file_path):
        diagram_name = diagram_name.replace('optics_rays', 'optics_ray_diagrams')
        anno_file_path = os.path.join(loc_box_path, diagram_name  + '.json')
    with open(anno_file_path, 'r') as f:
        diag_loc_annotations[diagram_name] = json.load(f)

combined_master_file_list = pruned_files + desc_files
combined_master_file_list_whitelisted = [file for file in combined_master_file_list if file in keep_figures]
files_still_needing_localisation = sorted(list(set(combined_master_file_list).difference(set(diag_loc_annotations))))
len(files_still_needing_localisation)

142

## recognition

### code

In [190]:
def most_common_strict(image_response):
    """
    returns the consensus response of the three raw response strings for a given image
    """
    most_common = image_response[1]['raw_text'].mode()
    if most_common.empty:
        most_common = 'nonconsensus'
        noncon.append(image_response[1]['raw_text'])
    else:
        most_common = most_common.values[0]
    return most_common

def most_common_lax(image_response, strings_denoting_missing_image=[]):
    """
    returns the consensus response after stripping white space and converting the reponses to lower case
    """
    simple_sanitizer = lambda x : x.lower().strip().lstrip()
    ind_responses = image_response[1]['raw_text'].values
    probobly_blanks = [response for response in ind_responses if response in strings_denoting_missing_image]
    if probobly_blanks:
        return 'skip'
    most_common = image_response[1]['raw_text'].apply(simple_sanitizer).mode()
    if most_common.empty:
        most_common = 'no consensus'
        noncon[image_response[0][0]].extend(image_response[1]['raw_text'])
    else:
        most_common = most_common.values[0]
    return most_common

def find_transcriptions_matches(batch_results_df, response_matcher):
    """
    returns a pandas series with the consunsus response for each image
    """
    agreed_responses = pd.DataFrame()
    for image_response in batch_results_df.groupby(['diagram', 'box_diag_idx']):
        diagram_and_idx = image_response[0]
        most_common = response_matcher(image_response, strings_denoting_missing_image=[])
        if most_common == 'skip':
            continue
        this_row = pd.DataFrame(list(diagram_and_idx) + [most_common, image_response[1]['rectangle'].iloc[0], image_response[1]['assignment_id'].iloc[0]]).T
        agreed_responses = pd.concat([agreed_responses, this_row])
        # The reindex below is needed to match the original df index after the groupby operation
    agreed_responses.columns = ['diagram', 'box_diag_idx', 'consensus_res', 'rectangle', 'assignment_id']
    return agreed_responses

### run

In [191]:
recog_performed_on = set(pd.unique(recog_res_df['diagram']).tolist())
len(recog_performed_on)

2190

In [192]:
files_still_needing_recognition = sorted(list(set(pruned_files).difference(set(recog_performed_on))))
print(len(files_still_needing_recognition))
file_with_loc_no_recog = set(files_still_needing_recognition).difference(files_still_needing_localisation)
print(len(file_with_loc_no_recog))

131
7


In [193]:
noncon = defaultdict(list)
transcription_results_lax = find_transcriptions_matches(recog_res_df, most_common_lax)

In [194]:
noncon_entries = [entries for entries in noncon.values()]
flattened_noncon = [item for sublist in noncon_entries for item in sublist]

In [195]:
curated_no_image_strings = set(['*no image showing*', '', ' ', 'NA', '?', 'na', '0', 'No image found', 'blank', 'Nothing showing', "where is the images , i can't see anything", 'NO IMAGE', ''])

In [196]:
non_blank_no_consensus = {d_name: rec_res for d_name, rec_res in noncon.items() if not curated_no_image_strings.intersection(set(rec_res))}
blank_no_consensus = {d_name: rec_res for d_name, rec_res in noncon.items() if curated_no_image_strings.intersection(set(rec_res))}
print(len(non_blank_no_consensus))
print(len(blank_no_consensus))

408
99


In [197]:
flattened_noncon_no_blank = [item for sublist in non_blank_no_consensus.values() for item in sublist]
build_diagram_rec_corpus  = [words.split() for words in transcription_results_lax['consensus_res'].values.tolist()]
diagram_rec_corpus = set([item.lower().strip() for sublist in build_diagram_rec_corpus for item in sublist if item.isalpha() and len(item) > 3])

### hide

In [198]:
# with open('diagram_rec_corp.pkl', 'wb') as f:
#     pickle.dump(diagram_rec_corpus, f)

In [199]:
# strings_denoting_missing_image = list(pd.Series(flattened_noncon).value_counts()[:20].index)
# Image.open('../ai2-vision-textbook-dataset/diagrams/turk_data/optics_ray_diagrams_9170.png')

In [200]:
len(diagram_rec_corpus)

4073

# Integrate diagram questions and descriptions

## match diagram topics to lessons

first need to match diagram topics to flexbook lessons

### code

In [201]:
def make_topic_matches(topic_list, combined_topics):
    topic_matches = {}
    for diagram_topic in topic_list:
        topic_matches[diagram_topic] = []
        for terms in topic_term_match[diagram_topic]:
            lev_dist_threshed = [topic for topic in combined_topics.keys() if fuzz.ratio(topic, terms) > 85]
            topic_matches[diagram_topic] += lev_dist_threshed
        if not topic_matches[diagram_topic]:
                for terms in topic_term_match[diagram_topic]:
                    lev_dist_threshed = [topic for topic in combined_topics.keys() if fuzz.token_set_ratio(topic, terms) > 80]
                    topic_matches[diagram_topic] += lev_dist_threshed
    return topic_matches

def make_lesson_matches(ck12_dataset, diagram_topic_name, topic_matches):
    lesson_matches = defaultdict(list)
    lessons_seen = set()
    content_topics =  topic_matches[diagram_topic_name]
    for topic in sorted(content_topics):
        associated_lesson =combined_topics[topic]['lesson']
        if associated_lesson not in lessons_seen:
            lessons_seen.add(associated_lesson)
            lesson_matches[diagram_topic_name].append(associated_lesson)
    return dict(lesson_matches)

### run

The pruned directory is the tqa 0.91 set assmbled by Ani on Sept 27th. It should be treated as definitive

In [202]:
diagram_topic_list = clean_list(pruned_dir)

In [203]:
es_lesson_names = [item for sublist in [val['topics'].keys() for val in ck12_combined_dataset['earth-science'].values()] for item in sublist]
ps_lesson_names = [item for sublist in [val['topics'].keys() for val in ck12_combined_dataset['physical-science'].values()] for item in sublist]
ls_lesson_names = [item for sublist in [val['topics'].keys() for val in ck12_combined_dataset['life-science'].values()] for item in sublist]

combined_lessons = es_lesson_names + ps_lesson_names + ls_lesson_names
topic_series = pd.Series(combined_lessons).value_counts()
# the 17 here found by inspection- any "topic" appearing many times is something general like review, vocab, etc
topics_to_remove = list(topic_series[:17].index)

In [204]:
combined_topics = defaultdict(dict)
for subject, book in ck12_combined_dataset.items():
    for lesson, material in book.items():
        for topic, text in material['topics'].items():
            if topic in topics_to_remove:
                continue
            combined_topics[topic.lower()]['lesson'] = lesson

In [205]:
topic_matches = make_topic_matches(diagram_topic_list, combined_topics)
missing= []
for k, v in topic_matches.items():
    if not v:
        missing.append(k)

In [206]:
matching_lessons = {}
for topic in diagram_topic_list:
    matched_lessons = make_lesson_matches(ck12_combined_dataset, topic, topic_matches)
    matching_lessons.update(matched_lessons)

In [207]:
diagram_lesson_lookup = {}
for d_topic, lessons in matching_lessons.items():
    diagram_lesson_lookup[d_topic] = sorted(lessons)[0]

In [208]:
#manually correct name changes made since diagrams were assembled
diagram_lesson_lookup['lewis_dot_diagrams'] = diagram_lesson_lookup['lewis_dots']
diagram_lesson_lookup['optics_ray_diagrams'] = diagram_lesson_lookup['optics_rays']

### hide

In [209]:
lessons_seen = []
dupe_lessons = []
for k, v in diagram_lesson_lookup.items():
    if v not in lessons_seen:
        lessons_seen.append(v)
    else:
        dupe_lessons.append(v)

In [210]:
dupe_topics = defaultdict(list)
for k, v in diagram_lesson_lookup.items():
    if v in dupe_lessons:
        dupe_topics[v].append(k)
# dupe_topics

In [211]:
missing

[]

In [212]:
len(diagram_lesson_lookup.keys())

len(set(diagram_lesson_lookup.values()))

for k, v in sorted(matching_lessons.items()):
    print(k)
    print(sorted(v))
    print()

acid_rain_formation
['22.2 Effects of Air Pollution']

aquifers
['13.3 Groundwater']

atomic_mass_number
['5.1 Inside the Atom', 'atomic number', 'plasma']

atomic_structure
['5.1 Inside the Atom']

biomes
['climate zones and biomes']

blastocyst
['22.3 Reproduction and Life Stages']

cell_division
['5.1 Cell Division']

cellular_respiration
['9.4 Biochemical Reactions']

chemical_bonding_covalent
['7.3 Covalent Bonds']

chemical_bonding_ionic
['7.2 Ionic Bonds']

circuits
['23.3 Electric Circuits']

continental_drift
['6.2 Continental Drift']

convection_of_air
['18.2 Transfer of Thermal Energy']

cycle_carbon
['18.2 Cycles of Matter']

cycle_nitrogen
['18.2 Cycles of Matter']

cycle_rock
['4.1 Types of Rocks', 'rocks and processes of the rock cycle']

cycle_water
['importance of the oceans']

dna
['nucleic acid classification']

earth_day_night
['rotation of earth']

earth_eclipses
['24.4 The Sun and the EarthMoon System']

earth_magnetic_field
['24.2 Earth as a Magnet']

earth_moon_

In [213]:
# pprint.pprint(dict(dupe_topics))

## merge questions

### code

In [214]:
dq_image_folder = 'diagram-question-images/'
td_image_folder = 'diagram-teaching-images/'

def make_question_entry(qdf_row):
    ask = qdf_row[qdf_row.index == '03_write_question'].values[0]
    answer = qdf_row[qdf_row.index == '04_write_right_answer'].values[0]
    wrong_answers = qdf_row[qdf_row.index == 'wac_list'].values[0]
    q_topic = qdf_row[qdf_row.index == 'lesson_assigned_to'].values[0]
    image_uri = qdf_row[qdf_row.index == 's3_uri'].values[0]
    image_name = qdf_row[qdf_row.index == 'diagram'].values[0]
    
    def make_answer_choices(answer_choices):
        build_answer_choices = {}
        letter_options = list('abcd')
        random.shuffle(answer_choices)
        for idx, answer_choice in enumerate(answer_choices):
            answer_choice_dict = {
                "idStructural": letter_options[idx] + '.',
                "rawText": answer_choice,
                "processedText": answer_choice
            }
            build_answer_choices[letter_options[idx]] = answer_choice_dict
        return build_answer_choices
    a_choices = make_answer_choices(wrong_answers + [answer])
    single_q_dict = {
        "id": 'q',
        "type": 
            "Diagram Multiple Choice",
        "beingAsked": {
            "rawText": ask,
            "processedText": ask.encode('ascii', 'ignore').decode('utf-8')
        },
        "correctAnswer": {
            "rawText": answer,
            "processedText": answer.encode('ascii', 'ignore').decode('utf-8')
        },
        "answerChoices": a_choices,
        "imageUri": image_uri,
        "imageName": image_name
    }
    build_questions[q_topic].append(single_q_dict)
    
    
def refine_question_formats(raw_questions):
    reformatted_dq_ds = {}
    for topic, topic_questions in raw_questions.items():
        reformatted_topic = {topic: {'questions': {'diagramQuestions': {}}}}
        reformatted_questions = {}
        for idx, question in enumerate(topic_questions):
            question = deepcopy(question)
            question['id'] += str(idx + 1).zfill(4)
            reformatted_questions[question['id']] = question
        reformatted_topic[topic]['questions']['diagramQuestions'] = reformatted_questions
        reformatted_dq_ds.update(reformatted_topic)
    return reformatted_dq_ds

s3_base = 'https://s3.amazonaws.com/ai2-vision-textbook-dataset/diagrams/' + dq_image_folder
s3_base_descriptions = 'https://s3.amazonaws.com/ai2-vision-textbook-dataset/diagrams/' + td_image_folder

def make_image_link(old_url, s3_base=s3_base):
    image_name = old_url.split('/')[-1]
    new_url = s3_base + image_name
    return new_url

### run

In [215]:
dr_proc_df['s3_uri'] = dr_proc_df['reference_id'].apply(make_image_link)
dr_proc_df['lesson_assigned_to'] = dr_proc_df['topic'].apply(lambda x: diagram_lesson_lookup[x])

In [216]:
build_questions = defaultdict(list)
_ = dr_proc_df.apply(make_question_entry, axis=1)

In [217]:
refined_questions = refine_question_formats(build_questions)

In [218]:
for subject, lessons in ck12_combined_dataset.items():
    for l_name, lesson in lessons.items():
        if l_name in refined_questions.keys():        
            lesson['questions']['diagramQuestions'] = refined_questions[l_name]['questions']['diagramQuestions']
        else:
            lesson['questions']['diagramQuestions']  = {}

### hide

In [219]:
refined_questions = dict(refine_question_formats(build_questions))

refined_questions['10.4 Erosion and Deposition by Glaciers']['questions'].keys()

dict_keys(['diagramQuestions'])

In [220]:
refined_questions['10.4 Erosion and Deposition by Glaciers']

len(ck12_combined_dataset['earth-science']['10.4 Erosion and Deposition by Glaciers']['questions']['diagramQuestions'])

len(ck12_combined_dataset['earth-science']['10.4 Erosion and Deposition by Glaciers']['questions']['nonDiagramQuestions'])

val_counts=dr_proc_df['lesson_assigned_to'].value_counts()

In [221]:
val_counts

10.1 Introduction to Plants                     1146
24.1 Flow of Energy                             1006
3.2 Cell Structures                              916
12.4 Insects and Other Arthropods                719
17.3 The Digestive System                        569
6.1 Inside Earth                                 405
24.4 The Sun and the EarthMoon System            389
24.1 Planet Earth                                361
20.1 The Nervous System                          337
19.1 The Respiratory System                      291
10.2 Evolution and Classification of Plants      281
22.3 Vision                                      267
8.3 Types of Volcanoes                           213
25.1 Introduction to the Solar System            197
22.2 Optics                                      195
11.3 Nuclear Energy                              192
18.1 Overview of the Cardiovascular System       181
4.2 Photosynthesis                               177
22.1 Male Reproductive System                 

## merge descriptions

In [222]:
def make_description_entry(qdf_row):
    description = qdf_row[qdf_row.index == 'Description'].values[0]
    q_topic = qdf_row[qdf_row.index == 'lesson_assigned_to'].values[0]
    image_uri = qdf_row[qdf_row.index == 's3_uri'].values[0]
    image_name = qdf_row[qdf_row.index == 'diagram'].values[0]
    image_key = image_name.replace('.png', '')
    single_desc_dict = {
        "imageUri": image_uri,
        "imageName": image_name,
        "rawText": description,
        "processedText": description.encode('ascii', 'ignore').decode('utf-8')
        }
    if image_key not in build_descriptions[q_topic].keys():
        build_descriptions[q_topic].update({image_key: single_desc_dict})
    # I've found the longest description is usually best
    elif len(single_desc_dict['processedText']) > len(build_descriptions[q_topic][image_key]['processedText']):
        build_descriptions[q_topic].update({image_key: single_desc_dict})

In [223]:
%%capture
ai2_written_df_completed['lesson_assigned_to'] = ai2_written_df_completed['topic'].apply(lambda x: diagram_lesson_lookup[x])
ai2_written_df_completed['s3_uri'] = ai2_written_df_completed['Image Path'].apply(make_image_link)
ai2_written_df_completed = ai2_written_df_completed.dropna()

desc_df['topic'] = desc_df['diagram'].apply(lambda x: x.rsplit('_', maxsplit=1)[0])
desc_df['lesson_assigned_to'] = desc_df['topic'].apply(lambda x: diagram_lesson_lookup[x])
desc_df['s3_uri'] = desc_df['reference_id'].apply(make_image_link)
desc_df['Description'] = desc_df['01_write_description']             

In [224]:
build_descriptions = defaultdict(dict)
_ = desc_df.apply(make_description_entry, axis=1)
_ = ai2_written_df_completed.apply(make_description_entry, axis=1)

In [225]:
# this adds the descriptions to the combined dataset
for subject, lessons in ck12_combined_dataset.items():
    for l_name, lesson in lessons.items():
        if l_name in build_descriptions.keys():
            lesson['instructionalDiagrams'] = build_descriptions[l_name]
        else:
            lesson['instructionalDiagrams'] = {}

### hide

In [226]:
pd.unique(desc_df['lesson_assigned_to']).shape

(82,)

In [227]:
build_descriptions.keys()

dict_keys(['6.2 Continental Drift', '7.1 Stress in Earths Crust', '4.1 Types of Rocks', '21.3 First Two Lines of Defense', '21.3 The Electromagnetic Spectrum', '14.1 Introduction to the Oceans', '20.1 The Nervous System', 'processes of the water cycle', '8.3 Types of Volcanoes', '10.1 Introduction to Plants', '9.2 Fungi', '20.2 The Senses', '6.1 Inside Earth', '22.2 Optics', '9.1 Protists', '6.4 Theory of Plate Tectonics', '7.1 Introduction to Chemical Bonds', '16.2 The Integumentary System', '22.3 Reproduction and Life Stages', 'rotation of earth', '22.1 Male Reproductive System', '23.3 Electric Circuits', 'nails and hair', '7.2 Ionic Bonds', '13.3 Groundwater', '9.4 Biochemical Reactions', '25.2 Using Electromagnetism', '22.2 Effects of Air Pollution', '7.3 Covalent Bonds', '17.3 The Digestive System', '24.1 Planet Earth', '17.1 Climate and Its Causes', '22.1 Air Pollution', 'magnetic evidence for seafloor spreading', 'clouds', '14.2 Ocean Movements', '25.1 Introduction to the Solar 

In [228]:
# with open(output_dir + 'ck12_dataset_beta_v4.json', 'w') as f:
#     json.dump(ck12_combined_dataset, f, indent=4, sort_keys=True)

In [229]:
# with open(output_dir + 'ck12_dataset_beta_v4.json', 'r') as f:
#     ck12_combined_dataset = json.load(f)

## Test spelling and grammar fixes

### code

In [662]:
def check_mispelled(word):
    return word and word.isalpha() and not (edict.check(word) or anglo_edict.check(word) or edict.check(word[0].upper() + word[1:]))

def correct_spelling_error(misspelled_word, suggested_spellings):
    highest_ratio = 0
    closest_match = None
    for word in suggested_spellings:
        match_r = fuzz.ratio(misspelled_word, word)
        if match_r >= highest_ratio and (word[0] == misspelled_word[0] or not check_mispelled(word[0] + misspelled_word)) and len(misspelled_word) <= len(word):
            highest_ratio = match_r
            closest_match = word
            break
    spell_changes[misspelled_word] = closest_match
    return closest_match

def apply_spelling_fix(orig_text):
    orig_text_tokens = orig_text.split()
    processed_tokens = []
    for token in orig_text_tokens:
        norm_token = token.lower()
        if len(norm_token) < 4:
            processed_tokens.append(token)
            continue
        if check_mispelled(norm_token):
            suggested_replacements = edict.suggest(token)
            replacement_text = correct_spelling_error(norm_token, suggested_replacements)
            if replacement_text:
                if norm_token[0].isupper():
                    replacement_text = upper(replacement_text[0]) + replaced_text[1:]
                processed_tokens.append(replacement_text)
            else:
                processed_tokens.append(token)
        else:
            processed_tokens.append(token)
    return ' '.join(processed_tokens)

def diff_corrected_text(orig_text, corrected_text):
    diff = dmp.diff_main(orig_text, corrected_text)
    return HTML(dmp.diff_prettyHtml(diff))

def specify_lesson_q_path(lesson):
    pass
    


### run

In [663]:
dmp = diff_match_patch.diff_match_patch()

In [664]:
ck12_spell_gramm_fix_test = deepcopy(ck12_combined_dataset)

In [665]:
gram_checker = language_check.LanguageTool('en-US')
gram_checker.disabled = set(['SENT_START_CONJUNCTIVE_LINKING_ADVERB_COMMA', 'POSSESSIVE_APOSTROPHE', 'A_PLURAL'])
gram_checker.disable_spellchecking()

punc_set_space = set([',', ':', ';', '/"'])
punc_set_nospace = set(['-', '\'', '-', '?', '.', '!'])
question_enders = set(['.', '?', ':'])

In [666]:
#check descriptions
spell_changes = {}
unaltered_text = []
replaced_text = []
for lesson in list(ck12_spell_gramm_fix_test['life-science'].values()):
    if lesson['instructionalDiagrams']:
        for diagram, description in lesson['instructionalDiagrams'].items():
            orig_text = description['processedText']
            spell_fixed_text = apply_spelling_fix(orig_text)
            for punc_char in punc_set_nospace:
                spell_fixed_text = spell_fixed_text.replace(' ' + punc_char + ' ' , punc_char)
            for punc_char in punc_set_space:
                spell_fixed_text = spell_fixed_text.replace(' ' + punc_char + ' ' , punc_char + ' ')
            gram_fixed = gram_checker.correct(spell_fixed_text)
            if gram_fixed != orig_text:
                unaltered_text.append(orig_text)
                replaced_text.append(gram_fixed)

In [688]:
spell_changes = {}
unaltered_text = []
replaced_text = []
for lesson in list(ck12_spell_gramm_fix_test['life-science'].values()):
    if lesson['questions']['nonDiagramQuestions']:
        for diagram, description in lesson['questions']['diagramQuestions'].items():
            orig_text = description['beingAsked']['processedText']
            spell_fixed_text = apply_spelling_fix(orig_text)
            gram_fixed = gram_checker.correct(spell_fixed_text)
            for punc_char in punc_set_nospace:
                gram_fixed = gram_fixed.replace(' ' + punc_char + ' ' , punc_char)
                gram_fixed = gram_fixed.replace(' ' + punc_char, punc_char)
            for punc_char in punc_set_space:
                gram_fixed = gram_fixed.replace(' ' + punc_char + ' ' , punc_char + ' ')
            if gram_fixed[-1] not in question_enders:
                if gram_fixed.split()[0] in ['Identify', 'Name'] or '__' in gram_fixed:
                    gram_fixed += '.'
                else:
                    gram_fixed += '?'
            if gram_fixed != orig_text:
                unaltered_text.append(orig_text)
                replaced_text.append(gram_fixed)

In [691]:
comp_text = list(zip(unaltered_text, replaced_text))

print(len(spell_changes))
print(len(comp_text))
# spell_changes

21
1189


In [711]:
rand_idx = np.random.randint(len(comp_text))
print(unaltered_text[rand_idx])
print()
print(replaced_text[rand_idx])
diff_corrected_text(*comp_text[rand_idx])

how many parts of the plant cell are labeled?

How many parts of the plant cell are labeled?


### hide

In [None]:
# with open(output_dir + 'ck12_dataset_beta_v4.json', 'r') as f:
#     ck12_combined_dataset = json.load(f)

# Topic key collisions

In [240]:
flexbook_ds.keys()

dict_keys(['life-science', 'earth-science', 'physical-science'])

In [241]:
build_website_lessons = [list(lesson.keys()) for lesson in lessons_ds.values()]
website_lessons= sorted([item for sublist in build_website_lessons for item in sublist])

build_flexbook_lessons = [list(lesson.keys()) for lesson in flexbook_ds.values()]
flexbook_lessons= [item for sublist in build_flexbook_lessons for item in sublist]
flexbook_lessons = sorted([lesson.split(maxsplit=1)[1].strip().lower() for lesson in flexbook_lessons])
fbls = set(flexbook_lessons)
wsls = set(website_lessons)

In [242]:
len(flexbook_lessons)

247

In [243]:
print(len(flexbook_lessons))
print(len(set(flexbook_lessons)))

247
243


In [244]:
print(len(website_lessons))
print(len(set(website_lessons)))

829
829


In [245]:
len(set(website_lessons).union(set(flexbook_lessons)))

1024

# Refinements to make

### todo

global ids to everything

linking lessons

remove ck12 lesson numbers from keys and image names

### file i/o

In [660]:
write_file('build_v5.json', full_test_ds)

In [657]:
# with open('build_v5.pkl', 'wb') as f:
#     pickle.dump(ck12_combined_dataset ,f)

### code

In [757]:
#print(topics_to_remove) #specicied in match topics section above, explictly set here

structural_topics = ['Summary', 'Review', 'References', 'Explore More', 'Lesson Summary', 'Lesson Objectives', 'Points to Consider', 'Introduction',
                    'Recall', 'Apply Concepts', 'Think Critically', 'Resources', 'Explore More II', 'Explore More I', 'Explore More III']

vocab_topics = ['Lesson Vocabulary', 'Vocabulary']

In [988]:
def iterate_over_all_material(complete_ds, apply_function):
    lesson_returns = []
    for subject, lessons in complete_ds.items():
        for lesson_name, lesson_content in lessons.items():
            response = apply_function(lesson_name, lesson_content)
            if response:
                lesson_returns.append(response)
    return lesson_returns
            
def apply_fixes(lesson_name, lesson_content):
    struct_content, vocab_content = iterate_over_text(lesson_content['topics'])
    lesson_content['adjunctTopics'] = struct_content
    lesson_content['adjunctTopics']['Vocabulary'] = vocab_content 
    iterate_over_text_questions(lesson_content['questions']['nonDiagramQuestions'])
    if lesson_content['instructionalDiagrams']:
        if not lesson_content['questions']['diagramQuestions']:
            print(lesson_name + ' missing questions')
        iterate_over_diagram_questions(lesson_content['questions']['diagramQuestions'])
        iterate_over_diagram_descriptions(lesson_content['instructionalDiagrams'])
            
def iterate_over_text(topic_sections):
    structural_content = {}
    vocab_section = {}
    topics_to_remove = []
    for topic, content in topic_sections.items():
        if content['content']['figures']:
            iterate_over_textbook_figs(content['content']['figures'])
        if topic in vocab_topics:
            vocab_section.update(add_defintions_to_vocab(content))
            topics_to_remove.append(topic)
        elif topic in structural_topics:
            structural_content[topic] = content
            topics_to_remove.append(topic)
    for topic in topics_to_remove:
        topic_sections.pop(topic)
    return structural_content, vocab_section
    

def iterate_over_text_questions(text_questions):
    for qid, question in text_questions.items():
        replace_local_ids_w_global(question, 'text', True)
            
def iterate_over_diagram_questions(diagram_questions):
    for qid, question in diagram_questions.items():
        replace_uri_with_path(question, 'question_images')
        orig_question= question['beingAsked']['processedText']
        fixed_question = apply_spelling_and_grammar_fixes(orig_question)
        if fixed_question:
            question['beingAsked']['processedText'] = fixed_question
        if detect_abc_question(question):
            standardize_abc_question(question)
    replace_local_ids_w_global(question, 'diagram', True)

def iterate_over_diagram_descriptions(diagram_descriptions, description_path_prefix=None):
    for diagram_name, diagram_content in diagram_descriptions.items():
        replace_uri_with_path(diagram_content, 'teaching_images')
        orig_description = diagram_content['processedText']
        fixed_description = apply_spelling_and_grammar_fixes(orig_description)
        if fixed_description:
            diagram_content['processedText'] = fixed_description
            
def add_defintions_to_vocab(vocab_section): #done
    lesson_vocab = {}
    for word in vocab_section['content']['text'].split('\n'):
        if word in flexbook_glossary.keys():
            lesson_vocab[word] = flexbook_glossary[word]
        elif word:
            lesson_vocab[word] = ''
    return lesson_vocab

def add_global_ids(data_object, object_type, zero_padding=6): #done
    id_prefix = {'text': 'NDQ_', 'diagram': 'DQ_', 'lesson': 'L_', 'description': 'DD_', 'topics': 'T_'}
    global_ids_counters[object_type] += 1
    data_object['globalID'] = id_prefix[object_type] + str(global_ids_counters[object_type]).zfill(zero_padding)

def detect_abc_question(question):
    abc_choices = [ac['processedText'] for ac in question['answerChoices'].values() if ac['processedText'] in list(string.ascii_letters)]
    return len(abc_choices) == 4

def standardize_abc_question(question): #done
    question['correctAnswer']['processedText'] = question['correctAnswer']['processedText'].upper()
    for ac in question['answerChoices'].values():
        ac['processedText'] = ac['processedText'].upper()
    
def iterate_over_textbook_figs(figure_content): # done
    for figure in figure_content:
        replace_uri_with_path(figure, 'textbook_images')
        
def replace_uri_with_path(image_content, path_prefix): # done
    image_key_str = 'imageUri'
    if 'image_uri' in image_content.keys():
        image_key_str = 'image_uri'
    image_name = image_content[image_key_str].split('/')[-1]
    image_content.pop(image_key_str)
    image_content['imagePath'] = os.path.join(path_prefix, image_name)
      
def apply_spelling_and_grammar_fixes(orig_text): #done
    spell_fixed_text = apply_spelling_fix(orig_text)
    for punc_char in punc_set_nospace:
        spell_fixed_text = spell_fixed_text.replace(' ' + punc_char + ' ' , punc_char)
    for punc_char in punc_set_space:
        spell_fixed_text = spell_fixed_text.replace(' ' + punc_char + ' ' , punc_char + ' ')
    gram_fixed = gram_checker.correct(spell_fixed_text)
    if gram_fixed != orig_text:
        return gram_fixed
    else:
        return None

def replace_local_ids_w_global(data_unit, unit_type, keys_too=False, id_key='orderID'):
    for k, v in sorted(data_unit.items(), key=lambda x: x[1][id_key]):
        add_global_ids(v, unit_type, zero_padding=4)
        v.pop(id_key)
        if keys_too:
            data_unit[v['globalID']] = v
            data_unit.pop(k)
            if unit_type == 'topics':
                name_field = unit_type[:-1] + 'Name'
                v[name_field] = k
    pass

def rename_lesson(lesson_name):
    if lesson_name[0].isdigit():
        return lesson_name.split(maxsplit=1)[1].strip().lower()
    else:
        return lesson_name

def flatten_complete_ds(lesson_name, lesson_content):
    lesson_content['lessonName'] =lesson_name
    add_global_ids(lesson_content, 'lesson', 4)
    lesson_content['lessonName'] = rename_lesson(lesson_name)
    obj_key = 'topics'
    replace_local_ids_w_global(lesson_content[obj_key], obj_key, True)
    return lesson_content

### run

In [989]:
# full_test_ds = deepcopy(ck12_combined_dataset)
test_vals= {k: v  for k, v in list(full_test_ds['earth-science'].items())[:20]}
buid_test_cds = {}
buid_test_cds['earth-science'] = test_vals
test_cds = deepcopy(buid_test_cds)

In [990]:
dq_gid = 0
ndq_gid = 0
l_id = 0
t_id = 0

global_ids_counters = {'text': ndq_gid, 'diagram': ndq_gid, 'lesson': l_id, 'topics': t_id}
_ = iterate_over_all_material(test_cds, apply_fixes)

TypeError: string indices must be integers

In [991]:
complete_flat_ds = iterate_over_all_material(test_cds, flatten_complete_ds)

In [982]:
complete_flat_ds[0].keys()

dict_keys(['globalID', 'topics', 'questions', 'lessonName', 'hidden', 'adjunctTopics', 'instructionalDiagrams'])

In [985]:
sorted(complete_flat_ds[0]['topics'].items(), key=lambda x: x[1]['globalID'])

[('T_0001',
  {'content': {'figures': [],
    'mediaLinks': [],
    'text': 'Earth formed at the same time as the other planets. The history of Earth is part of the history of the Solar System. '},
   'globalID': 'T_0001',
   'topicName': 'Formation of Earth'}),
 ('T_0002',
  {'content': {'figures': [],
    'mediaLinks': [],
    'text': 'Earth came together (accreted) from the cloud of dust and gas known as the solar nebula nearly 4.6 billion years ago, the same time the Sun and the rest of the solar system formed. Gravity caused small bodies of rock and metal orbiting the proto-Sun to smash together to create larger bodies. Over time, the planetoids got larger and larger until they became planets. '},
   'globalID': 'T_0002',
   'topicName': 'Planets Form'}),
 ('T_0003',
  {'content': {'figures': [],
    'mediaLinks': [],
    'text': 'When Earth first came together it was really hot, hot enough to melt the metal elements that it contained. Earth was so hot for three reasons: Gravitati

In [986]:
complete_flat_ds[0]['questions']

{'diagramQuestions': {},
 'nonDiagramQuestions': {'q01': {'answerChoices': {'a': {'idStructural': 'a)',
     'processedText': 'at the same time as the other planets.',
     'rawText': 'a) At the same time as the other planets.'},
    'b': {'idStructural': 'b)',
     'processedText': 'about 500 million years later than the sun.',
     'rawText': 'b) About 500 million years later than the sun.'},
    'c': {'idStructural': 'c)',
     'processedText': 'at the same time as the moon.',
     'rawText': 'c) At the same time as the moon.'},
    'd': {'idStructural': 'd)',
     'processedText': 'all of the above',
     'rawText': 'd) All of the above'}},
   'beingAsked': {'processedText': 'earth formed',
    'rawText': '1) Earth formed'},
   'correctAnswer': {'processedText': 'at the same time as the other planets.'},
   'globalID': 'NDQ_000002',
   'id': 'q01',
   'idStructural': '1)',
   'type': 'Multiple Choice'},
  'q02': {'answerChoices': {'a': {'idStructural': 'a)',
     'processedText': '

In [987]:
sorted(complete_flat_ds[0]['questions']['nonDiagramQuestions'].items(), key=lambda x: x[1]['globalID'])

[('q08',
  {'answerChoices': {'a': {'idStructural': 'a)',
     'processedText': 'zircon crystals',
     'rawText': 'a) Zircon crystals'},
    'b': {'idStructural': 'b)',
     'processedText': 'meteorites',
     'rawText': 'b) Meteorites'},
    'c': {'idStructural': 'c)',
     'processedText': 'lunar rocks',
     'rawText': 'c) Lunar rocks'},
    'd': {'idStructural': 'd)',
     'processedText': 'all of the above',
     'rawText': 'd) All of the above'}},
   'beingAsked': {'processedText': 'what material helps scientists describe the geologic composition of early earth?',
    'rawText': '8) What material helps scientists describe the geologic composition of early Earth?'},
   'correctAnswer': {'processedText': 'all of the above'},
   'globalID': 'NDQ_000001',
   'id': 'q08',
   'idStructural': '8)',
   'type': 'Multiple Choice'}),
 ('q01',
  {'answerChoices': {'a': {'idStructural': 'a)',
     'processedText': 'at the same time as the other planets.',
     'rawText': 'a) At the same time

In [820]:
# complete_flat_ds[0]['adjunctTopics']

In [819]:
# list(complete_flat_ds[0]['topics'].items())[5]

In [874]:
complete_flat_ds[9]

{'adjunctTopics': {'Apply Concepts': {'content': {'figures': [],
    'mediaLinks': [],
    'text': '6. An ocean current flows from north to south off the western coast of a continent. The current flows close to land at 50 N latitude. Predict how the current affects the climate of the coast at that latitude. Explain your prediction. '},
   'orderID': 't_18'},
  'Introduction': {'content': {'figures': [{'caption': 'FIGURE 17.1 Cold and snow are typical for Chicago in the winter.',
      'imagePath': 'textbook_images/17.1_Climate_and_Its_Causes_Introduction_0438_fig_17.1.png'}],
    'mediaLinks': [],
    'text': 'One winter day in Chicago, the temperature hit 20 C (68 F). This would be normal for Miami in the winter, but in Chicago, it felt like a heat wave. The scene in Figure 17.1 is more typical for Chicago in the winter. The heat wave on that winter day is an example of weather. The typical temperature for that day is part of Chicagos climate. '},
   'orderID': 't_03'},
  'Lesson Obje

### hide

In [397]:
t_ids_seen = []
q_ids_seen = []
for lesson in list(list(test_cds.values())[0].values()):
    for que in lesson['questions']['nonDiagramQuestions'].values():
        t_ids_seen.append(que['globalID'])  
for lesson in list(list(test_cds.values())[0].values()):
    for que in lesson['questions']['diagramQuestions'].values():
        q_ids_seen.append(que['globalID'])  

In [None]:
list(test_cds['earth-science'].values())[0].keys()

In [None]:
# list(test_cds['earth-science'].values())[0]['adjunctTopics']

In [None]:
#             print(lesson_content['topics'].keys())

# Splitting experiments

# End

In [None]:
# flexbook_glossary.keys()

In [None]:
# print(list(diagram_lesson_lookup.values()))

In [None]:
test_lesson = ck12_combined_dataset['earth-science']['24.1 Planet Earth']

In [None]:
# pprint.pprint(test_lesson['instructionalDiagrams'])

In [None]:
list(test_lesson['instructionalDiagrams'].values())[0]['processedText']

In [None]:
test_lesson['topics'].keys()

In [None]:
test_vocab = test_lesson['topics']['Vocabulary']['content']['text'].split('\n')

In [None]:
for word in test_vocab:
    if word in flexbook_glossary:
        print(flexbook_glossary[word])
        print()

In [None]:
# write_file('ck12_v4_5.json', ck12_combined_dataset, 'experimental_output')