In [1]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

import json
import os
from copy import deepcopy
from tqa_utils import Evaluator


In [25]:
# Every column of the target question schema, kept for reference.
all_headers = [
    "id",
    "parentId",
    "isArchived",
    "questionText",
    "answerText",
    "points",
    "isTest",
    "isMultipleChoice",
    "hasDiagram",
    "examName",
    "examGrade",
    "examYear",
    "notes",
    "tags",
    "legacyId",
    "importedQuestionId",
]

# Columns (and their order) selected for the exported frame.
headers = [
    "questionText",
    "answerText",
    "points",
    "isTest",
    "isMultipleChoice",
    "hasDiagram",
    "examName",
    "examGrade",
    "examYear",
    "notes",
    "legacyId",
    "lessonName",
    "importedQuestionId",
]

In [3]:
# Directory holding the TQA dataset release. Set the TQA_DATASET_ROOT
# environment variable to point at a different checkout; when it is unset the
# original local path is used, so existing behaviour is unchanged.
dataset_root_dir = os.environ.get(
    'TQA_DATASET_ROOT',
    '/Users/schwenk/wrk/stb/dataset_releases/data_release_beta6/',
)
file_name = 'tqa_dataset_beta7_5.json'
# os.path.join yields the same path whether or not the root ends with a
# separator, unlike plain string concatenation.
data_file = os.path.join(dataset_root_dir, file_name)
# Evaluator comes from the project-local tqa_utils module and is given the
# dataset path.
quest_evaluator = Evaluator(data_file)

In [4]:
# Parse the dataset JSON once and keep the parsed object untouched.
dataset_path = os.path.join(dataset_root_dir, file_name)
with open(dataset_path, 'r') as dataset_file:
    ck12_combined_dataset_raw = json.load(dataset_file)

# Independent deep copy, so changes to it never reach the raw object.
ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

In [5]:
# Non-diagram questions, taken from the evaluator's by-type lookup.
ndq_lookup = quest_evaluator.build_question_lookup(by_type=True)
all_ndqs = ndq_lookup['nonDiagramQuestions']

# The 'Multiple Choice' subtype group of the non-diagram questions.
ndqs_by_subtype = quest_evaluator.build_questions_by_subtype(all_ndqs)
mc_questions = ndqs_by_subtype['Multiple Choice']

# Diagram questions, taken from a fresh by-type lookup.
dq_lookup = quest_evaluator.build_question_lookup(by_type=True)
all_diagram_qs = dq_lookup['diagramQuestions']

In [26]:
# Flatten every lesson's questions into one list. Each question dict is
# tagged in place with the name and global ID of the lesson it came from.
all_questions = []
for lesson in ck12_combined_dataset:
    questions_by_kind = lesson['questions']
    non_diagram = list(questions_by_kind['nonDiagramQuestions'].values())
    diagram = list(questions_by_kind['diagramQuestions'].values())
    # Non-diagram questions come first within each lesson, then diagram ones.
    for question in non_diagram + diagram:
        question['lessonName'] = lesson['lessonName']
        question['lessonID'] = lesson['globalID']
        all_questions.append(question)

In [27]:
# Total number of questions (non-diagram and diagram) gathered across all lessons.
len(all_questions)

31496

In [44]:
# URL prefix under which the diagram images are referenced.
s3_prefix = 'https://s3.amazonaws.com/aristo-data/images/TQA/'


def build_aristo_row(question, image_url_prefix):
    """Convert one TQA question dict into a row of the exported table.

    Args:
        question: question dict from the dataset; it must already carry the
            'lessonName' and 'lessonID' keys in addition to its own fields.
        image_url_prefix: string prepended to the diagram's file name to
            form the image reference embedded in the question text.

    Returns:
        dict mapping column name to value for this question.

    Raises:
        KeyError: if a required field of the question is missing.
    """
    # The question text is assembled from optional pieces. Only pieces that
    # exist are joined, so a question without a diagram does not start with
    # a stray space (joining an empty image tag would produce one).
    text_parts = []

    has_diagram = 'imagePath' in question
    if has_diagram:
        # Only the file name of the image path is kept; the tag is the
        # prefixed name wrapped in braces.
        image_name = question['imagePath'].split('/')[-1]
        text_parts.append('{' + image_url_prefix + image_name + '}')

    text_parts.append(question['beingAsked']['processedText'])

    if 'answerChoices' in question:
        # Choices are listed in sorted key order as "(A) text", "(B) text", ...
        for choice_id, choice in sorted(question['answerChoices'].items()):
            text_parts.append('(' + choice_id.upper() + ') ' + choice['processedText'])

    question_type = question['questionType']
    # The notes column carries the sub type, falling back to the type when a
    # question has no sub type.
    if 'questionSubType' in question:
        notes = question['questionSubType']
    else:
        notes = question_type

    return {
        "questionText": ' '.join(text_parts),
        "answerText": question['correctAnswer']['processedText'],
        "points": 1,
        "isTest": False,
        "isMultipleChoice": question_type in ['Multiple Choice', 'Diagram Multiple Choice'],
        "hasDiagram": has_diagram,
        "examName": "TQA",
        "examGrade": "8",
        "examYear": "2016",
        "legacyId": '_'.join(["TQA", question['lessonID'], question['globalID']]),
        "lessonName": question['lessonName'],
        "importedQuestionId": question['globalID'],
        "notes": notes,
    }


# One row dict per question, then a single DataFrame construction.
build_frame = [build_aristo_row(question, s3_prefix) for question in all_questions]
qdf = pd.DataFrame(build_frame)
# Select and order the export columns. Despite the name, this frame holds
# diagram questions as well as non-diagram ones.
ndqdf = qdf[headers]

In [32]:
# Export a sample: the last 200 of the first 300 rows, without the index column.
sample_rows = ndqdf.head(300).tail(200)
sample_rows.to_csv('tqa_aristo_format_v1.csv', index=False)

In [38]:
# Export every converted question, without the index column.
full_export_path = 'tqa_aristo_format_v2.csv'
ndqdf.to_csv(full_export_path, index=False)

In [33]:
# Preview the slice written to tqa_aristo_format_v1.csv:
# the last 200 of the first 300 rows.
ndqdf.head(300).tail(200)

Unnamed: 0,questionText,answerText,points,isTest,isMultipleChoice,hasDiagram,examName,examGrade,examYear,notes,legacyId,lessonName,importedQuestionId
100,Erosion is always followed by (A) deposition....,a,1,False,True,False,TQA,8,2016,Multiple Choice,TQA_L_0003_NDQ_000102,erosion and deposition by flowing water,NDQ_000102
101,Large curves eroded in the channel of a slow-...,meanders,1,False,False,False,TQA,8,2016,Fill in the Blank,TQA_L_0003_NDQ_000093,erosion and deposition by flowing water,NDQ_000093
102,hole on the surface of the ground that forms ...,f,1,False,True,False,TQA,8,2016,Matching,TQA_L_0003_NDQ_000098,erosion and deposition by flowing water,NDQ_000098
103,"When flowing water slows down, which of the f...",a,1,False,True,False,TQA,8,2016,Multiple Choice,TQA_L_0003_NDQ_000105,erosion and deposition by flowing water,NDQ_000105
104,Which statement about stalactites is false? (...,a,1,False,True,False,TQA,8,2016,Multiple Choice,TQA_L_0003_NDQ_000106,erosion and deposition by flowing water,NDQ_000106
105,Floodplains are poor places for growing crops...,b,1,False,True,False,TQA,8,2016,True or False,TQA_L_0003_NDQ_000084,erosion and deposition by flowing water,NDQ_000084
106,Sinkholes are caused by groundwater erosion. ...,a,1,False,True,False,TQA,8,2016,True or False,TQA_L_0003_NDQ_000086,erosion and deposition by flowing water,NDQ_000086
107,{https://s3.amazonaws.com/aristo-data/images/T...,c,1,False,True,True,TQA,8,2016,Diagram Multiple Choice,TQA_L_0003_DQ_000026,erosion and deposition by flowing water,DQ_000026
108,{https://s3.amazonaws.com/aristo-data/images/T...,d,1,False,True,True,TQA,8,2016,Diagram Multiple Choice,TQA_L_0003_DQ_000002,erosion and deposition by flowing water,DQ_000002
109,{https://s3.amazonaws.com/aristo-data/images/T...,c,1,False,True,True,TQA,8,2016,Diagram Multiple Choice,TQA_L_0003_DQ_000015,erosion and deposition by flowing water,DQ_000015


In [40]:
# Number of non-diagram questions whose sub type is 'Multiple Choice'.
# Each comparison is a bool, and summing bools counts the True values.
count = sum(
    question['questionSubType'] == 'Multiple Choice'
    for question in all_ndqs.values()
)

In [41]:
# Non-diagram questions with sub type 'Multiple Choice' plus all diagram questions.
count + len(all_diagram_qs)

19149

In [42]:
# (rows, columns) of the exported frame: one row per entry of all_questions
# and one column per entry of headers.
ndqdf.shape

(31496, 13)