# Table of Contents
* [Explore TQA](#Explore-TQA)
	* [load](#load)
* [extract sentences](#extract-sentences)
	* [counts](#counts)
* [Explore existing](#Explore-existing)


In [1]:
import numpy as np
import pandas as pd
import scipy as st

import pickle
from collections import defaultdict
import json
import os
import random
from copy import deepcopy

In [2]:
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
%%capture
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
%base16_mplrc light solarized
#%base16_mplrc dark solarized
plt.rcParams['grid.linewidth'] = 0
plt.rcParams['figure.figsize'] = (16.0, 10.0)

# Explore TQA

## load

In [4]:
dataset_root_dir = './data_sources/'
tqa_file = 'tqa_v10_no_sa.json'
data_path =  os.path.join(dataset_root_dir, tqa_file)

with open(os.path.join(data_path), 'r') as f:
    ck12_combined_dataset_raw = json.load(f)

ck12_combined_dataset = deepcopy(ck12_combined_dataset_raw)

## extract sentences

In [5]:
def dict_key_extract(key, var):
    if hasattr(var, 'items'):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, dict):
                for result in dict_key_extract(key, v):
                    yield result
            elif isinstance(v, list):
                for d in v:
                    for result in dict_key_extract(key, d):
                        yield result

In [207]:
vocabs = [list(dict_key_extract('Vocabulary', lesson))[0] for lesson in ck12_combined_dataset]

vocab_dict = dict(pair for d in vocabs for pair in d.items())
vocab_with_def = {w: d for w, d in vocab_dict.items() if d}

# with open('tqa_vocab_dict.json', 'w') as f:
#     json.dump(vocab_with_def, f, sort_keys=True, indent=4)

In [600]:
import copy

In [601]:
preserve_lesson_paragraphs = copy.deepcopy(lessons_paragraphs)

In [6]:
lessons_paragraphs = {lesson['globalID']: list(dict_key_extract('text', lesson)) for lesson in ck12_combined_dataset}

In [None]:
cleaned_lesso

In [9]:
for topic in list(lessons_paragraphs.values()):
    for par in topic:
        topic_sentences = sent_tokenize(par)
        topic_sentences  = [sent for sent in topic_sentences if  len(sent) > 10 and sent[-1] == '.']
        all_sentences += topic_sentences

In [193]:
cause_phrases = [' cause', 'leads to', 'produce', 'generates', ' create', 'due to']

In [12]:
leads_sents = [sent for sent in all_sentences if ' leads to' in sent.lower()]

In [13]:
produces_sents = [sent for sent in all_sentences if ' produces' in sent.lower()]

In [14]:
generates_sents = [sent for sent in all_sentences if 'generates' in sent.lower()]

In [15]:
all_sents = [sent for sent in all_sentences if any(phrase in sent.lower() for phrase in cause_phrases)]
len(all_sents)

2242

In [346]:
cause_sents = []
cause_and_proceed = []
for idx, sent in enumerate(all_sentences):
    if ' cause' in sent.lower() and any(x in sent.lower().split()[0] for x in ['this', 'it ']):
        cause_and_proceed.append((all_sentences[idx - 1], sent))
    elif ' causes' in sent.lower() or ' caused' in sent.lower():
        cause_sents.append(sent)

In [347]:
def build_cause_effect_rel(cause_sentences):
    effect = ' '.join(cause_sentences[1].split()[2:])
    cause = cause_sentences[0]
    return {'cause': cause, 'effect': effect, 'source': ' '.join(cause_sentences)}

def build_cause_effect_rel_1(cause_sentence):
    sent_parts = cause_sentence.lower().split(' cause')
    print(sent_parts)
    if sent_parts[1][0] == 's':
        cause = sent_parts[0]
        effect = sent_parts[1][2:]
    elif sent_parts[1][0] == 'd':
        effect = sent_parts[0]
        cause = sent_parts[1][2:]
    else:
        return {}
    return {'cause': cause, 'effect': effect, 'source': cause_sentence}

In [350]:
c_and_e_relationships = [build_cause_effect_rel(ce) for ce in cause_and_proceed]
len(c_and_e_relationships)

112

In [378]:
all_causes_and_effects = c_and_e_relationships + c_and_e_relationships_1

In [380]:
tqa_c_and_e_df = pd.DataFrame(all_causes_and_effects)

In [381]:
tqa_c_and_e_df.to_csv('tqa_exctracted_rel_v0.tsv', index=False, sep='\t')

In [599]:
tqa_c_and_e_df

Unnamed: 0,cause,effect,source
0,A mountain stream flows very quickly because o...,a lot of erosion and very little deposition.,A mountain stream flows very quickly because o...
1,The water erodes the softer rock faster than t...,"the stream bed to drop down, like a step, crea...",The water erodes the softer rock faster than t...
2,Most waves strike the shore at an angle.,longshore drift.,Most waves strike the shore at an angle. This ...
3,Dust flew into the atmosphere and blocked sunl...,a deep freeze and ended photosynthesis.,Dust flew into the atmosphere and blocked sunl...
4,As a result: Water on the side of Earth facing...,a bulge of water on that side of Earth.,As a result: Water on the side of Earth facing...
5,"If the water freezes, it expands.",causes the rocks to crack.,"If the water freezes, it expands. This eventua..."
6,Earth is spinning as air moves over its surface.,the Coriolis effect.,Earth is spinning as air moves over its surfac...
7,An ice age is a period when temperatures are c...,glaciers to spread to lower latitudes.,An ice age is a period when temperatures are c...
8,Figure 17.23 shows how much less sea ice there...,cause sea level to rise even higher.,Figure 17.23 shows how much less sea ice there...
9,"During an El Nio, the western Pacific Ocean is...",the trade winds to change direction.,"During an El Nio, the western Pacific Ocean is..."


## extract_questions

In [549]:
build_ndqs = [list(dict_key_extract('nonDiagramQuestions', lesson))[0] for lesson in ck12_combined_dataset]
build_dqs = [list(dict_key_extract('diagramQuestions', lesson))[0] for lesson in ck12_combined_dataset]

In [550]:
ndqs = dict(pair for d in build_ndqs for pair in d.items())
dqs = dict(pair for d in build_dqs for pair in d.items())

In [584]:
def filter_causaulity_question(question):
    if question['questionType'] == 'Multiple Choice' and question['questionSubType'] != 'Multiple Choice':
        return {}
    question_text = question['beingAsked']['processedText']
    answer_options = ' '.join([ac['rawText'] for ac in question['answerChoices'].values()])
    if any(st in question_text for st in cause_phrases):
           return {'question_text': question_text, 'answer_options': answer_options, 'correct_answer': question['correctAnswer']['processedText']}
    else:
        return {}

In [589]:
tqa_questions = [filter_causaulity_question(q) for q in ndqs.values()]
tqa_questions += [filter_causaulity_question(q) for q in dqs.values()]
tqa_questions = [q for q in tqa_questions if q]

In [591]:
tqa_q_df = pd.DataFrame(tqa_questions)

In [592]:
tqa_q_df = tqa_q_df[['question_text', 'answer_options', 'correct_answer']]

In [593]:
tqa_q_df.to_csv('tqa_causality_questions.tsv', index=False, sep='\t')

In [598]:
tqa_q_list = tqa_q_df['question_text'].tolist()

# Explore Omnibus Questions

In [132]:
dataset_root_dir = './data_sources/'
omni_file = 'Omnibus 4 NDMC Train Topic+Category Performance [For a good cause] - Aristo per 2016%2F06%2F24.csv'
omni_path =  os.path.join(dataset_root_dir, omni_file)

omni_df = pd.read_csv(omni_path, error_bad_lines=False)
omni_df = omni_df.fillna('NA')
omni_cause_df

omni_cause_df = omni_df[omni_df['Category'].str.contains('causality')]
omni_df_working = omni_cause_df[['QID', 'AnswerKey', 'QuestionText']]

In [128]:
omni_cause_df_simple = omni_df[omni_df['Category'] == 'causality']
omni_df_working_simple = omni_cause_df_simple[['QID', 'AnswerKey', 'QuestionText']]

In [130]:
# omni_df_working_simple

In [135]:
q_sample = np.random.choice(omni_df_working_simple['QuestionText'], 5).tolist()
_ = [print(q, '\n') for q in q_sample]

As Chita pumps air into her beach ball, the ball gets larger and becomes round. This best explanation for this is that air ___. (A) expands when it gets warm (B) takes up space (C) is lighter than oxygen (D) has definite shape 

Which results in a chemical change? (A) A student smells a flower. (B) A teacher lights a candle. (C) A student colors a paper blue. (D) A teacher feels a rough cloth. 

Automobiles move as a result of which type of energy transformation? (A) Chemical to electrical (B) Mechanical to heat (C) Chemical to mechanical (D) Kinetic to potential 

Which force causes a marble to sink to the bottom of a glass of water? (A) gravity (B) friction (C) magnetism (D) electricity 

A squirrel gathering nuts helps trees (A) grow. (B) reproduce. (C) resist disease. (D) become stronger. 



# Explore other existing questions

In [192]:
%%capture
simple_cause_q_df = pd.read_csv(dataset_root_dir + 'causality-questions-simple-list.tsv', sep='(', header=None, error_bad_lines=False)

In [171]:
simple_cause_q_df.head(2)

Unnamed: 0,0,1,2,3,4
0,"When ice melts, it becomes a",A) gas.,B) solid.,C) liquid.,D) plasma.\t
1,What can a flower become?,A) a fruit,B) a leaf,C) a stem,D) a branch\t


In [143]:
tushar_questions = pd.read_csv( dataset_root_dir + 'causal_questions_with_extracted_relations.tsv', sep='\t')

In [145]:
tushar_questions.shape

(371, 5)

In [150]:
tushar_questions_contains_cause = tushar_questions[tushar_questions['Question text'].str.contains('cause')]

In [153]:
tushar_questions_contains_cause.shape

(86, 5)

In [156]:
# tushar_questions_contains_cause['Complete question']

In [158]:
q_sample = np.random.choice(tushar_questions_contains_cause['Question text'], 5).tolist()
_ = [print(q, '\n') for q in q_sample]

Which would cause the most soil to erode from a river bank in just a few days? 

The Grand Canyon is a perfect example of ___, over millions of years, caused by the flow of the Colorado River. 

What does a mirror do to light that causes objects to appear backwards? 

Desert environments receive very little rainfall each year and water is a very limited resource. Because of this limited amount of water, we would expect 

Physical weathering is caused by water freezing and thawing because water ___. 



Rules:




# Explore existing KBs

In [74]:
cause_rel_tables_df = pd.read_csv(dataset_root_dir + 'cause_relations_from_tables.tsv', sep='\t', header=None)
cause_rel_tables_df.columns = ['cause', '_', 'effect', 'source']
cause_rel_tables_df = cause_rel_tables_df[['cause', 'effect', 'source']]

In [82]:
cause_ferret = pd.read_csv(dataset_root_dir + 'CAUSE-extracted-by-Ferret-from-WebSentences.tsv', sep='\t', header=None)
effect_ferret = pd.read_csv(dataset_root_dir + 'EFFECT-extracted-by-Ferret-from-WebSentences.tsv', sep='\t', header=None)
cause_ferret.columns = ['cause', '_', 'effect', 'source']
cause_ferret = cause_ferret[['cause', 'effect', 'source']]

In [88]:
concept_net = pd.read_csv(dataset_root_dir + 'conceptnet-causal-db.tsv', sep='\t', header=None)
concept_net.columns = ['cause', '_', 'effect']
concept_net = concept_net[['cause', 'effect']]

In [177]:
cause_rel_tables_df.shape

(308, 3)