# Table of Contents
* [move diagrams to 0.91](#move-diagrams-to-0.91)
* [load all localization](#load-all-localization)
* [load all recognition](#load-all-recognition)
* [load diagram questions and descriptions](#load-diagram-questions-and-descriptions)


In [1]:
%%capture
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict, OrderedDict
%load_ext autoreload
%autoreload 2

import cv2
import pprint
import pickle
import json
import requests
import io
import sys
import os
from binascii import b2a_hex
import base64
from wand.image import Image as WImage
from IPython.display import display
import PIL.Image as Image
from copy import deepcopy
import glob

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import re
import os
import jsonschema
from pdfextraction.ck12_schema import ck12_schema as schema

In [2]:
def write_file(filename, date, output_dir='output_data_from_nbs'):
    """Dump ``date`` to ``output_dir/filename`` as pretty-printed JSON.

    Args:
        filename: Name of the file to create inside ``output_dir``.
        date: JSON-serialisable object to write (the parameter name is kept
            as-is for backward compatibility with keyword callers).
        output_dir: Directory that receives the file; it must already exist.
    """
    target_path = os.path.join(output_dir, filename)
    with open(target_path, 'w') as out_file:
        # Sorted keys + 4-space indent keep the output stable and diffable.
        json.dump(date, out_file, sort_keys=True, indent=4)

In [53]:
def validate_schema(dataset_json):
    """Collect JSON-schema violations for one flexbook.

    Args:
        dataset_json: The object to validate against the module-level
            ``schema`` (``validate_dataset`` passes one flexbook at a time).

    Returns:
        A list with one ``[message, path]`` entry per violation, where
        ``path`` is the first four components of the error's absolute path.
        If jsonschema raises ``ValidationError`` while validating, a single
        formatted string describing it is appended instead.
    """
    errors = []
    try:
        validator = jsonschema.Draft4Validator(schema)
        # Validate the argument, not whatever global `flexbook` happens to be
        # left over from another notebook cell.
        # The key is sliced rather than indexed so an error raised at the
        # document root (empty path) sorts first instead of raising IndexError.
        schema_errors = sorted(
            validator.iter_errors(dataset_json),
            key=lambda err: list(err.absolute_path)[:1],
        )
        for error in schema_errors:
            errors.append([error.message, list(error.absolute_path)[:4]])
    except jsonschema.ValidationError as e:
        # list.append takes exactly one argument, so format the message first.
        errors.append("Error in schema --%s-" % e.message)
    return errors

def validate_dataset(dataset_json):
    """Run schema and answer-choice validation over the whole dataset.

    Args:
        dataset_json: Mapping of subject -> flexbook, where each flexbook
            maps lesson name -> lesson content.

    Returns:
        The string ``'all validation test passed'`` when no problem is found
        in any subject, otherwise the list of all collected errors (schema
        errors and answer-choice count errors, in subject/lesson order).
    """
    all_errors = []
    for subject, flexbook in dataset_json.items():
        all_errors.extend(validate_schema(flexbook))
        for lesson_name, lesson in flexbook.items():
            # Accumulate per lesson; assigning here would keep only the
            # last lesson's errors and leave the name unbound for an empty
            # flexbook.
            all_errors.extend(check_ac_counts(lesson, subject, lesson_name))
    # Decide only after every subject has been visited.
    if not all_errors:
        return 'all validation test passed'
    return all_errors

def check_ac_counts(lesson_content, subject, lesson_name):
    """Flag non-diagram questions with an unexpected number of answer choices.

    'Multiple Choice' questions must have exactly 4 answer choices and
    'True or False' questions exactly 2; other question types are ignored.

    Returns:
        A list of ``[subject, lesson_name, '<qid> mc error' | '<qid> tf error']``
        entries, one per offending question.
    """
    # (question type, required number of choices, suffix used in the report)
    expected_counts = (
        ('Multiple Choice', 4, ' mc error'),
        ('True or False', 2, ' tf error'),
    )
    non_diagram_qs = lesson_content['questions']['nonDiagramQuestions']
    return [
        [subject, lesson_name, qid + suffix]
        for qid, question in non_diagram_qs.items()
        for q_type, n_choices, suffix in expected_counts
        if question['type'] == q_type
        and len(question['answerChoices']) != n_choices
    ]

# load text

In [23]:
# Directory holding the JSON datasets produced by earlier notebooks.
output_dir = 'output_data_from_nbs/'

# Flexbook-only portion of the dataset.
with open(os.path.join(output_dir, 'ck12_flexbook_only_beta_v3.json'), 'r') as f:
    flexbook_ds = json.load(f)

# Lessons-only portion of the dataset.
with open(os.path.join(output_dir, 'ck12_lessons_only_beta_v3.json'), 'r') as f:
    lessons_ds = json.load(f)

# Combined dataset; this is the one cleaned and validated further down.
with open(os.path.join(output_dir, 'ck12_dataset_beta_v3.json'), 'r') as f:
    ck12_combined_dataset = json.load(f)

In [24]:
# Matches an http(s) link that starts with "www." up to and including the
# first whitespace character that follows it. Written as a raw string so the
# regex escapes (\/ \. \s) are not treated as (invalid) string escapes; the
# resulting pattern text is unchanged.
pat_str = r"(?:https?:\/\/(?:www\.).*?\s)"
web_link_patern = re.compile(pat_str)

def clean_content_text(content_str, web_link_patern):
    """Strip web links out of ``content_str``.

    Args:
        content_str: Text that may contain links.
        web_link_patern: Compiled regex that matches one link.

    Returns:
        ``(cleaned_text, links)`` when at least one link is found: the text
        fragments around the links joined with single spaces, and the matched
        links with surrounding whitespace stripped.
        ``('', '')`` when no link is found, so callers must test the second
        element before using the first.
    """
    link_matches = web_link_patern.findall(content_str)
    if link_matches:
        # Splitting on the link pattern leaves only the surrounding text;
        # empty fragments (e.g. a link at the very start) are dropped.
        text_pieces = [piece for piece in web_link_patern.split(content_str) if piece]
        stripped_links = [link.strip() for link in link_matches]
        return ' '.join(text_pieces), stripped_links
    return '', ''

In [25]:
def extract_links(complete_ds):
    """Move web links out of every topic's text, in place.

    For each topic of each lesson of each subject in ``complete_ds`` the
    topic's ``content`` dict gets a ``mediaLinks`` list (empty when the text
    has no links). When links are found, ``content['text']`` is replaced by
    the link-free text. Nothing is returned.
    """
    for lessons in complete_ds.values():
        for lesson in lessons.values():
            for topic_entry in lesson['topics'].values():
                body = topic_entry['content']
                cleaned, found_links = clean_content_text(body['text'], web_link_patern)
                # Always (re)create the list so every topic carries the key.
                body['mediaLinks'] = list(found_links) if found_links else []
                if found_links:
                    body['text'] = cleaned

In [26]:
ck12_combined_dataset_cleaned = deepcopy(ck12_combined_dataset)

In [27]:
extract_links(ck12_combined_dataset_cleaned)

In [36]:
write_file('ck12_dataset_v3_5.json', ck12_combined_dataset_cleaned)

In [54]:
def record_validation_errors(dataset):
    """Remove every question that fails schema validation and return them.

    ``dataset`` is modified in place: for each schema error whose path is
    ``lesson / section / question class / question id``, that question is
    popped from the dataset.

    Args:
        dataset: Mapping of subject -> flexbook.

    Returns:
        The list of removed question objects, in the order they were removed.
    """
    qs_removed = []
    # The validator does not depend on the subject, so build it once.
    validator = jsonschema.Draft4Validator(schema)
    for subject, flexbook in dataset.items():
        # sorted() materialises all errors before anything is popped, so the
        # mutation below cannot disturb validation. The key is sliced so an
        # error with an empty path does not raise IndexError.
        schema_errors = sorted(
            validator.iter_errors(flexbook),
            key=lambda err: list(err.absolute_path)[:1],
        )
        for error in schema_errors:
            path = list(error.absolute_path)
            if len(path) < 4:
                # Not an error inside a single question; nothing to remove.
                continue
            lesson, quest, question_class, q_number = path[:4]
            problem_q_section = flexbook[lesson][quest][question_class]
            # A question may produce several errors; pop it only once.
            if isinstance(problem_q_section, dict) and q_number in problem_q_section:
                qs_removed.append(problem_q_section.pop(q_number))
    return qs_removed

In [None]:
qs_rem = record_validation_errors(ck12_combined_dataset_cleaned)
len(qs_rem)

In [55]:
validate_dataset(ck12_combined_dataset_cleaned)

'all validation test passed'

# move diagrams to 0.91

In [58]:
import glob

def get_img_n(image_name):
    """Return the first run of digits found in ``image_name`` as a string.

    Raises:
        IndexError: if ``image_name`` contains no digits.
    """
    digit_runs = re.findall("[0-9]+", image_name)
    return digit_runs[0]

def clean_list(dir_path):
    """List the entries of ``dir_path`` without hidden files.

    Entries whose name starts with '.' are skipped, and every occurrence of
    '_diagram' is removed from the remaining names.
    """
    return [
        entry.replace('_diagram', '')
        for entry in os.listdir(dir_path)
        if not entry.startswith('.')
    ]

# Hard-coded local paths: diagrams that went through recognition, the full
# v0.9 diagram set, and the pruned set of question images.
recog_performed = '/Users/schwenk/wrk/stb/diagram_questions/turk_processing/final_diagrams/'
all_dir = '/Users/schwenk/wrk/stb/ai2-vision-textbook-dataset/diagrams/tqa_diagrams_v0.9/'
pruned_dir = '/Users/schwenk/wrk/stb/ai2-vision-textbook-dataset/diagrams/dataset_Sep_27/tqa_diagrams_v0.9_question_images/'
diagram_image_names = clean_list(recog_performed)

# NOTE(review): rec_files globs the same directory as more_paths and is not
# used below -- presumably it was meant to read recog_performed; confirm.
rec_files = glob.glob(all_dir + '*/*')
more_paths = glob.glob(all_dir + '*/*')
pruned_paths = glob.glob(pruned_dir + '*/*')
more_files = [fp.split('/')[-1] for fp in more_paths]
pruned_files = [fp.split('/')[-1] for fp in pruned_paths]

# Image numbers (first digit run of each file name) present in each set.
pruned_nums = {get_img_n(name) for name in pruned_files}
all_nums = {get_img_n(name) for name in more_files}
rec_nums = {get_img_n(name) for name in diagram_image_names}

# Numbers that exist in the full set but not in the pruned set.
removed_images = all_nums.difference(pruned_nums)

# File names in the full set whose number was pruned away. A single pass with
# a set lookup replaces the nested scan over removed_images x more_files.
removed_image_names = sorted(
    image_name for image_name in more_files
    if get_img_n(image_name) in removed_images
)

# Index the pruned names by image number so each full-set name is matched by
# lookup rather than by scanning every pruned name.
pruned_names_by_num = defaultdict(list)
for newer_name in pruned_files:
    pruned_names_by_num[get_img_n(newer_name)].append(newer_name)

# Old name -> new name for images that kept their number but were renamed.
# When several pruned names share the number, the last one in pruned_files
# wins.
name_change_lookup = {}
for image_name in more_files:
    for newer_name in pruned_names_by_num.get(get_img_n(image_name), ()):
        if newer_name != image_name:
            name_change_lookup[image_name] = newer_name

In [145]:
name_change_lookup

{'electromagnetism_9087.png': 'em_spectrum_9087.png',
 'lewis_dot_idagrams_9131.png': 'lewis_dot_diagrams_9131.png',
 'lewis_dot_idagrams_9132.png': 'lewis_dot_diagrams_9132.png',
 'lewis_dot_idagrams_9133.png': 'lewis_dot_diagrams_9133.png',
 'lewis_dot_idagrams_9134.png': 'lewis_dot_diagrams_9134.png',
 'lewis_dot_idagrams_9135.png': 'lewis_dot_diagrams_9135.png',
 'lewis_dot_idagrams_9136.png': 'lewis_dot_diagrams_9136.png',
 'lewis_dot_idagrams_9137.png': 'lewis_dot_diagrams_9137.png',
 'lewis_dot_idagrams_9138.png': 'lewis_dot_diagrams_9138.png',
 'lewis_dot_idagrams_9139.png': 'lewis_dot_diagrams_9139.png',
 'lewis_dot_idagrams_9141.png': 'lewis_dot_diagrams_9141.png',
 'lewis_dot_idagrams_9142.png': 'lewis_dot_diagrams_9142.png',
 'lewis_dot_idagrams_9143.png': 'lewis_dot_diagrams_9143.png',
 'lewis_dot_idagrams_9144.png': 'lewis_dot_diagrams_9144.png',
 'lewis_dot_idagrams_9145.png': 'lewis_dot_diagrams_9145.png',
 'lewis_dot_idagrams_9147.png': 'lewis_dot_diagrams_9147.png',
 

In [146]:
len(removed_image_names)

285

# load all localization

In [166]:
# Base directory of the turk processing artifacts; the names below are joined
# onto it (or onto metadata_dir) when files are loaded.
turk_proc_dir = '/Users/schwenk/wrk/stb/diagram_questions/turk_processing/'
# Sub-directory from which the pickled result DataFrames are read.
metadata_dir = turk_proc_dir + 'store_hit_results_metadata/'
# Folder under metadata_dir that contains 'complete_df.pkl' (localisation results).
lc_results_dir = 'loc_group_3'
# NOTE(review): the two box_choices_* names are not used in the visible cells --
# presumably folders from earlier text-box passes; confirm before removing.
box_choices_1_dir = 'final_text_boxes_fixed'
box_choices_2_dir = 'final_text_boxes_pass_2'
# Folder under turk_proc_dir holding one '<diagram name>.json' annotation file per diagram.
box_loc_joined = 'loc_annotations'

In [167]:
# File names of pickled lookups, presumably grouped by how many turkers agreed
# on a diagram -- TODO confirm. Only all_agree_lookup is loaded in the visible
# cells (from turk_proc_dir).
none_agree = 'no_turkers_agree_lookup.pkl'
two_agree_lookup = 'two_turkers_agree_lookup.pkl'
# NOTE(review): 'loopkup' looks like a typo but is a runtime file name; it
# presumably matches the file on disk, so do not correct it without renaming
# the file.
all_agree_lookup = 'user_diag_loopkup.pkl'

In [75]:
loc_res_df = pd.read_pickle(os.path.join(metadata_dir, lc_results_dir, 'complete_df.pkl'))

In [77]:
loc_res_df.head(1)

Unnamed: 0,diagram,rectangle,hit_id,assignment_id,worker_id
0,parts_cell_1182.png,"[[283, 192], [447, 238]]",3SA4EMRVJV39U1MGLCYP6KPFULH0PX,3BDCF01OGXVJNV1XRULS5F5Z4B6LYG,A1017VP86SLXRB


In [76]:
pd.unique(loc_res_df['diagram']).shape

(2307,)

In [86]:
with open(os.path.join(turk_proc_dir, all_agree_lookup), 'rb') as f:
    box_ns = pickle.load(f)

In [168]:
# Annotation files present in the joined localisation folder (hidden files
# skipped), then the same names with the '.json' suffix cut off.
loc_anno = clean_list(os.path.join(turk_proc_dir, box_loc_joined))
# Iterate the listing computed above; the previously referenced
# `figures_with_locs` is not defined anywhere in this notebook.
loc_anno_images = [fig.split('.json')[0] for fig in loc_anno]

In [169]:
keep_figures = [fig for fig in loc_anno_images if fig not in removed_image_names]

In [191]:
# Load the localisation annotation JSON of every kept figure into
# diag_loc_annotations, keyed by the diagram name that was actually found.
loc_box_path = os.path.join(turk_proc_dir, box_loc_joined)
diag_loc_annotations = {}

for diagram_name in keep_figures:
    anno_file_path = os.path.join(loc_box_path, diagram_name + '.json')
    if not os.path.exists(anno_file_path):
        # Fallback for a renamed topic: retry with 'optics_rays' replaced by
        # 'optics_ray_diagrams'. The loop variable is rebound on purpose so the
        # dictionary key below uses the name under which the file exists.
        diagram_name = diagram_name.replace('optics_rays', 'optics_ray_diagrams')
        anno_file_path = os.path.join(loc_box_path, diagram_name  + '.json')
        # NOTE(review): if the fallback path is missing too, the open() below
        # raises instead of skipping the figure.
    with open(anno_file_path, 'r') as f:
        diag_loc_annotations[diagram_name] = json.load(f)

In [190]:
# sorted(diag_loc_annotations)[1000:]

In [192]:
len(diag_loc_annotations)

1935

In [217]:
files_still_needing_localisation = sorted(list(set(pruned_files).difference(set(diag_loc_annotations))))
len(files_still_needing_localisation)

124

# load all recognition

In [201]:
recog_output = '/Users/schwenk/wrk/stb/diagram_questions/turk_processing/'
recog_results_dir = 'group_latest_combined'

In [224]:
from scipy.stats.mstats import mode
# Raw response series of every (diagram, box) group that reached no consensus;
# appended to by both matcher functions below.
noncon = []

def _modal_response(responses):
    """
    returns the value occurring most often in the series, or None when no value
    occurs at least twice; ties are resolved to the smallest value
    """
    counts = responses.value_counts()
    # value_counts sorts by descending count, so the first entry is the maximum.
    # Counting explicitly keeps "nothing repeats" detectable: Series.mode() in
    # current pandas returns every value in that case instead of an empty series.
    if counts.empty or counts.iloc[0] < 2:
        return None
    tied_for_first = counts[counts == counts.iloc[0]].index
    return min(tied_for_first)

def most_common_strict(image_response, strings_denoting_missing_image=None):
    """
    returns the consensus response of the three raw response strings for a given image

    image_response is a (group key, group DataFrame) pair as yielded by groupby;
    responses are compared verbatim. Returns 'nonconsensus' (and records the raw
    responses in noncon) when no response occurs at least twice.
    strings_denoting_missing_image is accepted only so this matcher can be
    called the same way as most_common_lax; it is ignored.
    """
    raw_responses = image_response[1]['raw_text']
    most_common = _modal_response(raw_responses)
    if most_common is None:
        most_common = 'nonconsensus'
        noncon.append(raw_responses)
    return most_common

def most_common_lax(image_response, strings_denoting_missing_image=None):
    """
    returns the consensus response after stripping white space and converting the responses to lower case

    Returns 'skip' when any raw response is one of strings_denoting_missing_image,
    and 'no consensus' (recording the raw responses in noncon) when no sanitized
    response occurs at least twice.
    """
    missing_markers = strings_denoting_missing_image or ()
    raw_responses = image_response[1]['raw_text']
    # The missing-image check is done on the raw, unsanitized responses.
    if any(response in missing_markers for response in raw_responses.values):
        return 'skip'
    sanitized = raw_responses.apply(lambda text: text.lower().strip())
    most_common = _modal_response(sanitized)
    if most_common is None:
        most_common = 'no consensus'
        noncon.append(raw_responses)
    return most_common

def find_transcriptions_matches(batch_results_df, response_matcher):
    """
    returns a pandas DataFrame with the consensus response for each image

    One row per (diagram, box_diag_idx) group with the columns diagram,
    box_diag_idx, consensus_res, rectangle and assignment_id; rectangle and
    assignment_id are taken from the first response of the group. Groups for
    which response_matcher returns 'skip' are left out. An empty frame with
    the same columns is returned when no group produces a row.
    """
    column_names = ['diagram', 'box_diag_idx', 'consensus_res', 'rectangle', 'assignment_id']
    rows = []
    for image_response in batch_results_df.groupby(['diagram', 'box_diag_idx']):
        diagram_and_idx, responses = image_response
        most_common = response_matcher(image_response, strings_denoting_missing_image=[])
        if most_common == 'skip':
            continue
        row_values = list(diagram_and_idx) + [
            most_common,
            responses['rectangle'].iloc[0],
            responses['assignment_id'].iloc[0],
        ]
        rows.append(pd.DataFrame(row_values).T)
    if not rows:
        return pd.DataFrame(columns=column_names)
    # Concatenate once instead of growing the frame inside the loop.
    agreed_responses = pd.concat(rows)
    agreed_responses.columns = column_names
    return agreed_responses

In [203]:
recog_res_df = pd.read_pickle(os.path.join(metadata_dir, recog_results_dir, 'recog_df.pkl'))

In [210]:
recog_performed_on = set(pd.unique(recog_res_df['diagram']).tolist())

In [213]:
recog_performed_on_keep = [image for image in recog_performed_on if image in keep_figures]

In [214]:
len(recog_performed_on_keep)

1915

In [215]:
files_still_needing_recognition = sorted(list(set(pruned_files).difference(set(recog_performed_on_keep))))

In [219]:
file_with_loc_no_recog = set(files_still_needing_recognition).difference(files_still_needing_localisation)

In [222]:
# file_with_loc_no_recog

In [226]:
transcription_results_lax = find_transcriptions_matches(recog_res_df, most_common_lax)

In [274]:
noncon_entries = [entries.values.tolist() for entries in noncon]
flattened_noncon = [item for sublist in noncon_entries for item in sublist]

In [277]:
len(noncon_entries)

1358

# load diagram questions and descriptions

# End

In [None]:
# Debug pass: for every subject, print each remaining schema violation -- its
# message, the first four components of its path, then blank lines as a
# separator.
for subject, flexbook in ck12_combined_dataset_cleaned.items():
    # NOTE(review): the validator does not depend on the loop variables and
    # could be created once before the loop.
    validator = jsonschema.Draft4Validator(schema)
    # Errors are ordered by the first component of their path; an error with
    # an empty path would presumably make the key lookup fail -- confirm.
    for error in sorted(validator.iter_errors(flexbook), key=lambda x: x.absolute_path[0]):
        print(error.message)
        print(list(error.absolute_path)[:4])
        print('\n' * 2)