# Table of Contents
* [Investigating programatically recovering hierarchy](#Investigating-programatically-recovering-hierarchy)
* [End](#End)


In [1]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict, OrderedDict
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import json
import glob
import pprint

In [3]:
import pdfextraction.merge as me
import pdfextraction.unmerge as um
import pdfextraction.amt_boto_modules as amt

# Investigating programatically recovering hierarchy

In [310]:
base_path = 'ai2-vision-turk-data/textbook-annotation-test/labeled-questions/'
test_path = 'ai2-vision-turk-data/textbook-annotation-test/hierarchy-ext-test/'

In [5]:
question_pages_df = pd.read_pickle('pages_w_questions.pkl')
question_pages = question_pages_df['page'].tolist()

In [6]:
book_breakdowns, page_ranges = amt.load_book_info()

In [7]:
to_review = pd.read_pickle('rev_seq.pkl').tolist()

In [16]:
verified_pages = to_review
verified_pages_minus_bad = [page for page in verified_pages if page not in bad_pages.tolist()]
print len(verified_pages)
print len(verified_pages_minus_bad)

856
468


In [284]:
def print_contents(box):
    print ''
    print 'actual ' + box['group_n']
    print 'predicted ' + str(box['predicted_group_n'])
    print box['contents']
    print ''
    
def make_box_row(row_dict, pos_idx):
    new_row = row_dict.copy()
    rect = row_dict['rectangle']
    new_row['start_x'] = rect[0][0]
    new_row['start_y'] = rect[0][1]
    new_row['end_x'] = rect[1][0]
    new_row['end_y'] = rect[1][1]
    new_row['vert_order'] = pos_idx
    del new_row['rectangle']
    return new_row

def assign_group_numbers(ordered_question_boxes):
    current_group_n = 0
    current_outer_indent = ordered_question_boxes[0]['start_x']
    last_seen_question_type = 'Unlabeled'
    indent_tolerance = 20
    last_box_end_y = 0
    
    for idx, box in enumerate(ordered_question_boxes):
        if box['category'] == 'Question' or box['category'] == 'Unlabeled':
            box['predicted_group_n'] = 0
            last_seen_question_type = box['category']
            last_box_end_y = box['end_y']
            continue
#             return 0
        
        type_changed = last_seen_question_type != box['category']
        last_seen_question_type = box['category'] if type_changed else last_seen_question_type
        vertically_seperated = box['start_y'] > last_box_end_y
        last_box_end_y = box['end_y']
#         print 'vs ' + str(vertically_seperated)
#         print 'tc ' + str(type_changed)

        if vertically_seperated and (type_changed or box['start_x'] - indent_tolerance < current_outer_indent ):
            current_group_n += 1
            
        box['predicted_group_n'] = current_group_n
    return ordered_question_boxes

def check_group_numbers(ordered_boxes_w_pred):
    n_wrong = 0 
    for box in ordered_boxes_w_pred:
#         print_contents(box)
        if int(box['group_n']) != box['predicted_group_n']:
            n_wrong += 1
    return n_wrong, len(ordered_boxes_w_pred), 1- n_wrong / float(len(ordered_boxes_w_pred))

In [285]:
def predict_and_verify_groups(pages):
    results_by_page = []
    total_num_boxes = 0
    for page in pages:
        page_file_path = base_path + page.replace('jpeg', 'json')
        with open(page_file_path) as f:
            page_boxes = json.load(f)

        for qn, qv in page_boxes['question'].items():
            total_num_boxes += 1
            del qv['source']
            del qv['score']
            del qv['v_dim']
            
        q_series = page_boxes['question']
        vertically_ordered_question = sorted(q_series.values(), key=lambda x: (x['rectangle'][0][1], x['rectangle'][0][0]))
        vertically_ordered_question_feat = [make_box_row(box, idx) for idx, box in enumerate(vertically_ordered_question)]
        boxes_w_predicts = assign_group_numbers(vertically_ordered_question_feat)
        if boxes_w_predicts:
            n_wrong, box_n_this_page, fract_right = check_group_numbers(boxes_w_predicts) 
            total_num_boxes += box_n_this_page
            if fract_right:
                results_by_page.append((page, n_wrong, fract_right, boxes_w_predicts))

    total_wrong = sum([res[1] for res in results_by_page])
    overall_accuracy = 1 - total_wrong / float(total_num_boxes)
    return overall_accuracy, total_wrong, total_num_boxes, results_by_page

In [312]:
def write_predicted_groups(page_results):
    page = page_results[0]
    pred_boxes = page_results[-1]
    page_file_path = base_path + page.replace('jpeg', 'json')
    new_file_path =  test_path + page.replace('jpeg', 'json')

    with open(page_file_path) as f:
        all_page_boxes = json.load(f)    

    for box in pred_boxes:
        all_page_boxes['question'][box['box_id']]['group_n'] = box['predicted_group_n']

    with open(new_file_path, 'w') as f:
        json.dump(all_page_boxes, f)

In [286]:
accuracy_total, errors_total, box_totals, ind_page_results = predict_and_verify_groups(verified_pages_minus_bad)
print errors_total
print box_totals
print accuracy_total

996
11602
0.914152732288


In [313]:
for page_res in ind_page_results:
    write_predicted_groups(page_res)

In [243]:
problem_pages = [page[0] for page in ind_page_results if page[2] < 1.0]

In [257]:
problem_pages[:5]

[u'Daily_Science_Grade_6_(Daily_Practice_Books)_Evan_Moor_181.jpeg',
 u'Daily_Science_Grade_5_Evan_Moor_54.jpeg',
 u'Daily_Science_Grade_2_Evan_Moor_77.jpeg',
 u'Daily_Science_Grade_4_Evan_Moor_63.jpeg',
 u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_184.jpeg']

In [258]:
problem_pagess = pd.Series(problem_pages)
problem_pagess.to_pickle('problem_pages.pkl')

In [265]:
page_n = 0

In [281]:
cur_page = problem_pages[page_n:page_n + 1]
accuracy_total, errors_total, box_totals, ind_page_results = predict_and_verify_groups(cur_page)
page_n += 1
print ''
print cur_page
print '{} {}'.format('page', page_n)
print '{} {}'.format('number wrong=', errors_total)
print '{} {}'.format('out of', box_totals)
print '{} {}'.format('accuracy', accuracy_total)


actual 1
predicted 1
A. Write true or false.


actual 1
predicted 1
1. A conductor allows electric current to


actual 1
predicted 1
flow through it easily


actual 1
predicted 1
2. Electrical energy can make heat energy.


actual 1
predicted 1
3. Resistors convert less energy into heat


actual 1
predicted 1
than conductors do


actual 1
predicted 1
4. All metals conduct electricity


actual 1
predicted 1
equally we


actual 2
predicted 2
B. What is the main difference between a conductor and a resistor?


actual 2
predicted 3
C. How did resistors help the people who invented toasters?


actual 2
predicted 3
Explain your answer


[u'Daily_Science_Grade_4_Evan_Moor_131.jpeg']
page 9
number wrong= 2
out of 22
accuracy 0.909090909091


## Dataframe stuff

In [118]:
# single_page_df = pd.DataFrame(vertically_ordered_question)

In [119]:
# single_page_df['rectangle'].iloc[0]

In [10]:
qa_df = pd.read_pickle('store_hit_results_metadata/question_anno/group_latest_combined/consensus_df.pkl')

In [11]:
bad_pages = pd.unique(qa_df.query('category == "No Consensus" or group_n == "No Consensus"')['page'])

In [12]:
consensus_only_df = qa_df[~qa_df['page'].isin(bad_pages)]

In [13]:
pd.unique(consensus_only_df['page']).shape

(468,)

In [15]:
consensus_only_df.head(4)

Unnamed: 0,page,box_id,category,hit_id,group_n
20,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q11,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
21,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q13,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
22,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q14,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
23,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q15,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1


# End

In [363]:
# vertically_ordered_question = sorted(q_series.values(), key=lambda x: x['rectangle'][0][1])

# vertically_ordered_question_feat = [make_box_row(box, idx) for idx, box in enumerate(vertically_ordered_question)]

# boxes_w_predicts = assign_group_numbers(vertically_ordered_question_feat) 