# Table of Contents
* [Load data](#Load-data)
* [dataset validation](#dataset-validation)
	* [schema validation](#schema-validation)
	* [other validation test](#other-validation-test)
* [exploring dataset](#exploring-dataset)
* [html viz](#html-viz)
* [End](#End)


In [1]:
%%capture
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict, OrderedDict
%load_ext autoreload
%autoreload 2

import cv2
import pprint
import pickle
import json
import requests
import io
import sys
import os
from binascii import b2a_hex
import base64
from wand.image import Image as WImage
from IPython.display import display
import PIL.Image as Image
from copy import deepcopy
import glob

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pdfextraction.ck12_flex_extract as ck_ex

# Load data

__pieces from flexbooks and website lessons, kept separate__

In [9]:
# Load the two source-specific halves of the dataset from JSON files in the
# current working directory: flexbook-derived content and website-lesson content.
with open('ck12_flexbook_only_beta_v1.json', 'r') as f:
    flexbook_ds = json.load(f)
with open('ck12_lessons_only_beta_v1.json', 'r') as f:
    lessons_ds = json.load(f)

__combined dataset__

load or assemble from new pieces

In [10]:
# Alternative to loading from disk: assemble the combined dataset by merging, per
# key of lessons_ds, its dict with the matching flexbook_ds entry (flexbook keys
# override on collision). Left disabled; the pre-built file below is used instead.
# ck12_combined_dataset = {k: dict(v, **flexbook_ds[k]) for k, v in lessons_ds.items()}
with open('ck12_dataset_beta_v1.json', 'r') as f:
    ck12_combined_dataset = json.load(f)

# dataset validation

## schema validation

In [14]:
# Run schema validation once per top-level key (subject) of the combined
# dataset, printing the subject name before its results and a blank line after.
# NOTE(review): validate_schema appears to print any violations itself (see the
# cell output) — confirm against pdfextraction.ck12_flex_extract.
for subject, joined_content in ck12_combined_dataset.items():
    # a fresh assembler is created for every subject, as in the original cell
    ds_assembler = ck_ex.CK12DataSetAssembler()
    # print(...) with a single argument produces the same output on Python 2
    # (parenthesised print statement) and Python 3 (print function)
    print(subject)
    ds_assembler.validate_schema(joined_content)
    print('')

earth-science
Additional properties are not allowed (u'Air Pollution in the U.S.' was unexpected)
Additional properties are not allowed (u'Energy Use in the U.S.' was unexpected)

life-science

physical-science
'correctAnswer' is a required property



## other validation test

# exploring dataset

In [124]:
# Collect every topic name that occurs in any earth-science flexbook lesson
# (one entry per occurrence), then show the 11 most frequent names.
lesson_names = [
    item
    for val in flexbook_ds['earth-science'].values()
    for item in val['topics'].keys()
]
pd.Series(lesson_names).value_counts().head(11)

Points to Consider            87
Lesson Summary                87
Vocabulary                    87
Lesson Objectives             87
Apply Concepts                86
Recall                        86
Think Critically              85
Introduction                  85
Lesson Review Questions       85
Divergent Plate Boundaries     3
Meteorites                     2
dtype: int64

# html viz

In [24]:
import jinja2
from IPython.core.display import HTML

# Jinja2 environment created with default settings; make_page_html compiles its
# template strings through this shared object.
jnjenv = jinja2.Environment()

In [203]:
def make_lesson_data(lesson_json):
    """Return a lesson's topics as ``(topic_name, text)`` pairs in display order.

    Parameters
    ----------
    lesson_json : dict
        Lesson record containing a ``'topics'`` mapping. Each topic value must
        provide an ``'orderID'`` and a nested ``['content']['text']`` entry.

    Returns
    -------
    list of tuple
        ``(topic_name, text)`` pairs sorted by ascending ``orderID``; an empty
        list when the lesson has no topics.

    Raises
    ------
    KeyError
        If ``'topics'``, ``'orderID'``, ``'content'`` or ``'text'`` is missing.
    """
    # Index into the (name, topic) pair rather than unpacking it in the lambda
    # signature: tuple parameters are Python 2-only syntax (removed by PEP 3113),
    # whereas this form runs unchanged on both Python 2 and Python 3.
    ordered_topics = sorted(
        lesson_json['topics'].items(),
        key=lambda name_and_topic: name_and_topic[1]['orderID'],
    )
    return [(topic, content['content']['text']) for topic, content in ordered_topics]

In [187]:
def make_page_html(lesson_data, page_html):
    """Render a Jinja2 page template for a single lesson.

    ``lesson_data`` is indexed positionally: element 0 is handed to the
    template as ``lesson`` and element 1 as ``topics``. ``page_html`` is the
    template source, compiled through the notebook-level ``jnjenv``
    environment. Returns the rendered page as a string.
    """
    template = jnjenv.from_string(page_html)
    context = {'lesson': lesson_data[0], 'topics': lesson_data[1]}
    return template.render(**context)

In [190]:
def display_lesson_html(flexbook, lesson):
    """Build a displayable HTML object for one lesson of a flexbook.

    ``flexbook`` maps lesson names to lesson records and ``lesson`` is the
    name to look up. The lesson's topics are extracted with
    ``make_lesson_data`` and rendered with ``make_page_html``; the result is
    wrapped in ``IPython``'s ``HTML`` so the notebook shows it as a page.

    The template is not a parameter: the global ``page_html`` is looked up
    when the function is called, so it must be defined before the first call.
    """
    topic_pairs = make_lesson_data(flexbook[lesson])
    rendered = make_page_html((lesson, topic_pairs), page_html)
    return HTML(rendered)

In [204]:
# Render a single life-science lesson as an HTML page in the cell output.
display_lesson_html(flexbook_ds['life-science'], '15.1 Understanding Animal Behavior')

In [184]:
# Jinja2 template for a lesson page: shows the lesson name as a heading, then a
# heading and paragraph for each (name, text) pair in `topics`.
# NOTE(review): display_lesson_html reads this global when it is called, so this
# cell has to be executed before any cell that calls it.
page_html = """
<!DOCTYPE html>
<html>
  <head>
    <style type="text/css">
    </style>
  </head>
  <body>
    <div class="container">
      <h1>Lesson: {{lesson}}</h1>
      <ul>
        {% for topic in topics %}
        <p>
        </p>
        <h3>{{topic.0}}</h3>
        <p>{{
        topic.1
        }}</p>
        {% endfor %}
      </ul>
    </div>
    <script src="http://code.jquery.com/jquery-1.10.2.min.js"></script>
    <script src="http://netdna.bootstrapcdn.com/bootstrap/3.0.0/js/bootstrap.min.js"></script>
  </body>
</html>
"""

# End

In [None]:

# Build an HTML summary table with one row per entity category, showing its
# total count and its average count per image.
# NOTE(review): `stats_counter` and `stats_fract` are not defined in the visible
# part of this notebook — confirm where they come from before running this cell.

# An OrderedDict pins the header order so it always matches the order in which
# the data cells are written below (count first, then average); a plain dict
# gives no ordering guarantee on Python 2.
stat_data = OrderedDict([
    ('Number of Entities', stats_counter),
    ('Average Number per image', stats_fract),
])
# number of statistic columns, derived from the headers instead of hard-coded
count = len(stat_data)

html = "<table>"

# header row: an empty corner cell followed by one heading per statistic
html += "<tr><th>"
for k in stat_data.keys():
    html += "<th>" + k

# second header row: labels the category column, with blank cells under the statistics
html += "<tr><th>Entity Category"
for j in range(count):
    html += "<th>"

# one row per entity category; stats_fract must have an entry for every key of
# stats_counter, otherwise the lookup raises KeyError
for k, v in stats_counter.items():
    html += "<tr><th>" + k
    html += "<td>" + str(v)
    html += "<td>" + "%.2f" % stats_fract[k]

# (the original appended a stray empty "<tr>" here, which added a blank row)
html += "</table>"
HTML(html)

In [25]:
# Minimal placeholder template: only {{ title }} is substituted, the body is fixed.
# NOTE(review): this rebinds the same global `page_html` that display_lesson_html
# reads, so running this cell after the lesson-template cell replaces that template.
page_html = """
<html>
<head>
<title>{{ title }}</title>
</head>
<body>
Hello.
</body>
</html>
"""

In [18]:
# Jinja2 template for a Mechanical Turk external HIT page. For every item in
# `input_data` it renders the item's `sentence` next to a group of six radio
# buttons named after the item's `sentence_id`. The form posts to the MTurk
# worker *sandbox* submit URL.
# NOTE(review): nothing in the visible notebook renders this template.
page_template = """
<html>
 <head>
  <title>KB HIT</title>
  <meta content='text/html'/>
  <script type='text/javascript' src='https://s3.amazonaws.com/mturk-public/externalHIT_v1.js'></script>
 </head>
 <body>
    <p>We are constructing a large knowledge base (KB) about elementary science and commonsense knowledge, to help computers answer questions more reliably. We are planning to release the KB as a free, open source resource for the community when it is complete. Your work here will help us assemble this KB and contribute to this effort.</p>

    <p>Below, the computer has automatically extracted some candidate facts from text for possible inclusion in the KB. However, some are weird, false, or nonsensical. This task will help us distinguish the good facts, to include in the KB, from the bad.</p>
     <form name='mturk_form' method='post' id='mturk_form' action='https://workersandbox.mturk.com/mturk/externalSubmit'>
      <input type='hidden' value='' name='assignmentId' id='assignmentId'/>		 
      <table>
        <tr><th></th><th>Commonsense Knowledge</th></tr>
        {% for n in input_data %}
            <tr><td>{{n.sentence}}</td><td nowrap>
            <!--these break-->
            <!--<input type="hidden" name="{{n.sentence_id}}" id="assignmentId" value="ASSIGNMENT_NOT_AVAILABLE" />-->
            <!--<input type="hidden" name="assignmentId" id="assignmentId" value="ASSIGNMENT_NOT_AVAILABLE" />-->
            <!--this is in the official documentation but breaks anyway!-->
            <!--<input type='hidden' value='' name='assignmentId' id='assignmentId'/>-->
            <!--this works:-->
            <input name="{{n.sentence_id}}" type="radio" value="true-act" />EXPECTED ACTION
            <input name="{{n.sentence_id}}" type="radio" value="false-act" />RARE/FALSE ACTION
            <input name="{{n.sentence_id}}" type="radio" value="true-prop" />TRUE PROPERTY
            <input name="{{n.sentence_id}}" type="radio" value="false-prop" />RARE/FALSE PROPERTY
            <input name="{{n.sentence_id}}" type="radio" value="nonsense" />NONSENSE
            <input name="{{n.sentence_id}}" type="radio" value="unknown" />DON'T KNOW
            </td></tr>
        {% endfor %}
      </table>
      <p><input type="submit" id="submitButton" value="Submit" /></p>
   </form>
  <script language="Javascript">turkSetAssignmentID();</script>
 </body>
</html>

"""