# Table of Contents
* [Investigating programmatically recovering hierarchy](#Investigating-programatically-recovering-hierarchy)
* [End](#End)


In [3]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict, OrderedDict
%load_ext autoreload
%autoreload 2

In [4]:
import glob
import json
import os
import pickle
import pprint

In [5]:
import pdfextraction.merge as me
import pdfextraction.unmerge as um
import pdfextraction.amt_boto_modules as amt

# Investigating programmatically recovering hierarchy<a id="Investigating-programatically-recovering-hierarchy"></a>

In [6]:
# Path prefix for the per-page annotation JSON files; predict_and_verify_groups
# concatenates it directly with a file name, so the trailing slash is required.
base_path = 'ai2-vision-turk-data/textbook-annotation-test/labeled-questions/'

In [7]:
# Load a pickled DataFrame and pull its 'page' column out as a plain list.
# NOTE(review): presumably these are the pages that contain questions (going by
# the file name) -- confirm. Unpickling runs arbitrary code, so only load
# pickle files from a trusted source.
question_pages_df = pd.read_pickle('pages_w_questions.pkl')
question_pages = question_pages_df['page'].tolist()

In [8]:
# Two values returned by the project's AMT helper module; their structure is
# not visible in this notebook -- see pdfextraction.amt_boto_modules.
book_breakdowns, page_ranges = amt.load_book_info()

In [9]:
# Pickled review sequence converted to a plain Python list so it can be sliced
# below. NOTE(review): elements appear to be page image file names, judging by
# how verified_pages is used later -- confirm.
to_review = pd.read_pickle('rev_seq.pkl').tolist()

In [10]:
# Five-entry slice (indices 22 through 26) of the review sequence, used as the
# evaluation set for the grouping heuristic below.
# NOTE(review): the bounds 22:27 are unexplained magic numbers; the name
# suggests these pages were checked by hand -- confirm and document the choice.
verified_pages = to_review[22:27]

In [107]:
def make_box_row(row_dict, pos_idx):
    """Flatten one annotated box into a single-level feature row.

    The nested ``'rectangle'`` entry is replaced by four scalar coordinates
    and the box's position in the caller's ordering is recorded.

    Args:
        row_dict: Box annotation containing a ``'rectangle'`` entry that is
            indexable as ``rect[corner][axis]``. It is not modified.
        pos_idx: Rank of the box in the caller's ordering; stored under
            ``'vert_order'``.

    Returns:
        A shallow copy of ``row_dict`` with ``'rectangle'`` removed and
        ``'start_x'``, ``'start_y'``, ``'end_x'``, ``'end_y'`` and
        ``'vert_order'`` added.
    """
    flat_row = row_dict.copy()
    # Take the rectangle out of the copy; the caller's dict keeps its own.
    corners = flat_row.pop('rectangle')
    flat_row.update([
        ('start_x', corners[0][0]),
        ('start_y', corners[0][1]),
        ('end_x', corners[1][0]),
        ('end_y', corners[1][1]),
        ('vert_order', pos_idx),
    ])
    return flat_row

def assign_group_numbers(ordered_question_boxes, indent_tolerance=20):
    """Predict a group number for every box from its category and indent.

    Boxes are visited in the order given. A box whose category is
    ``'Question'`` always receives group 0 and does not affect the running
    state. For every other box the running group counter is incremented when
    either

    * its category differs from the category of the previous
      non-``'Question'`` box, or
    * its ``'start_x'`` minus ``indent_tolerance`` is smaller than the
      ``'start_x'`` of the first box in the list (i.e. it is not indented
      clearly further than that reference box),

    and the box is then labelled with the counter's current value.

    Args:
        ordered_question_boxes: List of box dicts, each with ``'category'``
            and ``'start_x'`` keys. The dicts are modified in place.
        indent_tolerance: How far (in the units of ``'start_x'``) a box may
            sit to the right of the reference indent and still count as
            starting a new group. Defaults to 20, the value that was
            previously hard-coded.

    Returns:
        The same list object, with ``'predicted_group_n'`` set on every box.
        An empty list is returned unchanged.
    """
    if not ordered_question_boxes:
        # No boxes on the page: nothing to label. Indexing [0] below would
        # otherwise raise IndexError.
        return ordered_question_boxes

    current_group_n = 0
    # Reference indent is taken from the first box, whatever its category.
    current_outer_indent = ordered_question_boxes[0]['start_x']
    last_seen_question_type = None

    for box in ordered_question_boxes:
        category = box['category']
        if category == 'Question':
            box['predicted_group_n'] = 0
            continue

        if not last_seen_question_type and category:
            # First typed box: adopt its category so that it is not counted
            # as a category change.
            last_seen_question_type = category

        type_changed = last_seen_question_type != category
        if type_changed:
            last_seen_question_type = category

        # 'start_x' is only consulted when the category did not change.
        if type_changed or box['start_x'] - indent_tolerance < current_outer_indent:
            current_group_n += 1

        box['predicted_group_n'] = current_group_n
    return ordered_question_boxes

def check_group_numbers(ordered_boxes_w_pred):
    """Compare predicted group numbers against the annotated ones.

    Args:
        ordered_boxes_w_pred: Sequence of box dicts carrying both
            ``'group_n'`` (annotated; converted with ``int()`` before the
            comparison) and ``'predicted_group_n'``.

    Returns:
        A tuple ``(n_wrong, fraction_right)`` where ``n_wrong`` is the number
        of boxes whose prediction differs from the annotation and
        ``fraction_right`` is ``1 - n_wrong / len(boxes)``. For an empty
        sequence ``(0, 1.0)`` is returned: there is nothing to get wrong, and
        this avoids the ZeroDivisionError the division would raise.

    Raises:
        ValueError: If a ``'group_n'`` value cannot be converted by ``int()``.
    """
    n_boxes = len(ordered_boxes_w_pred)
    if n_boxes == 0:
        return 0, 1.0

    n_wrong = sum(
        1 for box in ordered_boxes_w_pred
        if int(box['group_n']) != box['predicted_group_n']
    )
    # float() keeps this a true division even without the __future__ import.
    return n_wrong, 1 - n_wrong / float(n_boxes)

In [108]:
def predict_and_verify_groups(pages):
    """Run the grouping heuristic on each page and score it against the labels.

    For every page name the matching annotation file is read from
    ``base_path`` (same file name, ``.json`` extension). The boxes under the
    file's ``'question'`` key are sorted by the second coordinate of their
    rectangle's first corner, flattened with ``make_box_row``, labelled with
    ``assign_group_numbers`` and scored with ``check_group_numbers``.

    Args:
        pages: Iterable of page image file names (e.g. ``'..._176.jpeg'``).

    Returns:
        A list with one ``(page, n_wrong, fraction_right)`` tuple per page, in
        input order.
    """
    results_by_page = []
    for page in pages:
        # Swap only the trailing extension: str.replace('jpeg', 'json') would
        # also rewrite a 'jpeg' substring elsewhere in the file name.
        page_stem, _ext = os.path.splitext(page)
        page_file_path = base_path + page_stem + '.json'
        with open(page_file_path) as f:
            page_boxes = json.load(f)

        question_boxes = page_boxes['question']
        for box in question_boxes.values():
            # Drop fields the heuristic does not use; tolerate files in which
            # some of them are already absent.
            for unused_key in ('source', 'score', 'v_dim'):
                box.pop(unused_key, None)

        vertically_ordered_question = sorted(
            question_boxes.values(), key=lambda box: box['rectangle'][0][1])
        vertically_ordered_question_feat = [
            make_box_row(box, idx)
            for idx, box in enumerate(vertically_ordered_question)
        ]
        boxes_w_predicts = assign_group_numbers(vertically_ordered_question_feat)
        n_wrong, fract_right = check_group_numbers(boxes_w_predicts)
        results_by_page.append((page, n_wrong, fract_right))
    return results_by_page

In [110]:
# Score the heuristic on the selected pages; each result tuple is
# (page, number of wrongly grouped boxes, fraction grouped correctly).
predict_and_verify_groups(verified_pages)

[(u'Daily_Science_Grade_6_(Daily_Practice_Books)_Evan_Moor_176.jpeg', 0, 1.0),
 (u'Daily_Science_Grade_4_Evan_Moor_89.jpeg', 0, 1.0),
 (u'Daily_Science_Grade_3_(Daily_Practice_Books)_Evan_Moore_137.jpeg', 0, 1.0),
 (u'Daily_Science_Grade_1_Evan_Moor_40.jpeg', 0, 1.0),
 (u'Daily_Science_Grade_4_Evan_Moor_88.jpeg', 0, 1.0)]

## Dataframe stuff

In [150]:
# NOTE(review): no notebook-level assignment of `vertically_ordered_question`
# is visible above -- it is a local inside predict_and_verify_groups and
# otherwise only appears in the commented-out cell at the end. On a fresh
# kernel this cell would therefore raise NameError; it relies on state left
# over from an earlier session. Recreate the variable explicitly before use.
single_page_df = pd.DataFrame(vertically_ordered_question)

In [155]:
# Inspect the 'rectangle' value of the first row.
single_page_df['rectangle'].iloc[0]

[123, 750]

In [50]:
# Load the pickled consensus table of question annotations (the preview
# further down shows columns page, box_id, category, hit_id and group_n).
# Unpickling runs arbitrary code, so only load files from a trusted source.
qa_df = pd.read_pickle('store_hit_results_metadata/question_anno/group_latest_combined/consensus_df.pkl')

In [86]:
# Distinct pages with at least one row whose category or group_n is the
# literal string "No Consensus".
bad_pages = pd.unique(qa_df.query('category == "No Consensus" or group_n == "No Consensus"')['page'])

In [93]:
# Keep only rows belonging to pages that have no "No Consensus" row at all.
consensus_only_df = qa_df[~qa_df['page'].isin(bad_pages)]

In [95]:
# Number of distinct pages left after the filter (shape of the unique array).
pd.unique(consensus_only_df['page']).shape

(468,)

In [104]:
# Preview the first 40 rows of the filtered table.
consensus_only_df.head(40)

Unnamed: 0,page,box_id,category,hit_id,group_n
20,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q11,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
21,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q13,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
22,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q14,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
23,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q15,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
24,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q16,Fill-in-the-Blank,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,1
25,Daily_Science_Grade_1_Evan_Moor_100.jpeg,Q17,Question,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,0
26,Daily_Science_Grade_1_Evan_Moor_100.jpeg,T1,unlabeled,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,0
27,Daily_Science_Grade_1_Evan_Moor_100.jpeg,T10,Definition,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,0
28,Daily_Science_Grade_1_Evan_Moor_100.jpeg,T12,Definition,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,0
29,Daily_Science_Grade_1_Evan_Moor_100.jpeg,T18,unlabeled,3VDVA3ILIDG3ODEZECC2QGUXV88G1P,0


# End

In [363]:
# vertically_ordered_question = sorted(q_series.values(), key=lambda x: x['rectangle'][0][1])

# vertically_ordered_question_feat = [make_box_row(box, idx) for idx, box in enumerate(vertically_ordered_question)]

# boxes_w_predicts = assign_group_numbers(vertically_ordered_question_feat) 