In [2]:
import sys, os, time
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
import csv
import json
import util
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 

In [27]:
# Download every NLTK resource this notebook relies on, so it also runs on a
# machine where nothing has been fetched yet:
#   wordnet                     -> WordNetLemmatizer
#   stopwords                   -> stopwords.words('english')
#   punkt                       -> word_tokenize / sent_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
# NOTE(review): recent NLTK releases may instead look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — add those names if a LookupError appears.
for resource in ('wordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /home/fleeb/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [28]:
# Set of English stop words from the NLTK stopwords corpus.
stop_words = set(stopwords.words('english'))

# Small sample paragraph used to try out the tokenizer.
txt = (
    "Sukanya, Rajib and Naba are my good friends. "
    "Sukanya is getting married next year. "
    "Marriage is a big step in one’s life. "
    "It is both exciting and frightening. "
    "But friendship is a sacred bond between people. "
    "It is a special kind of love between us. "
    "Many of you must have tried searching for a friend "
    "but never found the right one."
)

# Word-level tokens of the sample paragraph.
tokens = word_tokenize(txt)

In [29]:
# Directory holding the question files, relative to the notebook's working
# directory (no machine-specific absolute path, so the notebook is portable).
root = r'aristo-mini/questions'
# JSON-lines question sets: elementary level first, 8th grade second.
filenames = ['AI2-Elementary-NDMC-Feb2016-Dev.jsonl', 'AI2-8thGr-NDMC-Feb2016-Dev.jsonl']

In [30]:
# Load the first question file: one JSON object per line (JSON-lines format).
questions = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(os.path.join(root, filenames[0]), 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of reading every line into a list first.
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        if line.strip():
            questions.append(json.loads(line))
len(questions)

84

In [31]:
# Show the first loaded record to inspect the structure of a question.
questions[0]

{'id': '89629',
 'question': {'stem': 'Which of the following groups of materials would most likely be used to build an electromagnet?',
  'choices': [{'label': 'A', 'text': 'bare wire, plastic rod, battery'},
   {'label': 'B', 'text': 'bare wire, iron rod, light bulb'},
   {'label': 'C', 'text': 'insulated wire, iron rod, battery'},
   {'label': 'D', 'text': 'insulated wire, plastic rod, light bulb'}]},
 'answerKey': 'C'}

In [32]:
# The question text itself lives under question -> stem.
questions[0]['question']['stem']

'Which of the following groups of materials would most likely be used to build an electromagnet?'

In [33]:

def is_noun(tag):
    """Return True when ``tag`` is one of the noun tags NN, NNS, NNP or NNPS."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True when ``tag`` is one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb tags RB, RBR or RBS."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective tags JJ, JJR or JJS."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

def penn_to_wn(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Adjective, noun, adverb and verb tags map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively; any other tag falls back to
    ``wn.NOUN``.
    """
    # Checked in order; the first predicate that accepts the tag decides.
    checks = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for matches, wn_pos in checks:
        if matches(tag):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wn.NOUN

def normalize_text(text):
    """Tokenize, POS-tag and lemmatize ``text``.

    Args:
        text: The string to normalize.

    Returns:
        list: One lemma per token, in token order. Each token is lemmatized
        with the WordNet part of speech derived from its POS tag by
        ``penn_to_wn``.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    # pos_tag yields (word, tag) pairs for the tokenized text.
    tagged = nltk.pos_tag(word_tokenize(text))
    return [lemmatizer.lemmatize(word, penn_to_wn(pos)) for word, pos in tagged]

In [34]:
def filter_tokens(tok):
    """Return True for tokens worth keeping.

    Tokens of length 0 or 1 are rejected, as is the literal blank
    placeholder ``'___'``.
    """
    if len(tok) <= 1:
        return False
    return tok != '___'

def bag_text(text):
    """Build a bag of words (``Counter``) for ``text``.

    The text is lower-cased, normalized with ``normalize_text`` and every
    token rejected by ``filter_tokens`` is dropped before counting.
    """
    bag = Counter()
    for tok in normalize_text(text.lower()):
        if filter_tokens(tok):
            bag[tok] += 1
    return bag

def bag_question(q):
    """Build one bag of words covering a question's stem and all its choices.

    ``q`` is a question record with ``q['question']['stem']`` and a list of
    choices under ``q['question']['choices']``, each carrying a ``'text'``.
    """
    question = q['question']
    # Stem first, then each answer choice in order.
    texts = [question['stem']]
    texts.extend(choice['text'] for choice in question['choices'])

    bag = Counter()
    for text in texts:
        bag += bag_text(text)
    return bag

def tfidf(bag, full):
    """Rank the terms of ``bag`` by their count relative to the corpus count.

    Args:
        bag: Mapping of term -> count for the text being scored.
        full: Mapping of term -> count accumulated over the whole corpus.

    Returns:
        list: ``(term, score)`` tuples sorted by descending score, where
        ``score`` is ``bag[term] / full[term]``. A term that is missing from
        ``full`` (or has a zero count there) scores ``1.0`` instead of
        raising. Terms with equal scores keep the iteration order of ``bag``.
    """
    terms = []
    for term, tf in bag.items():
        # .get works for both dict and Counter and never raises on unseen terms.
        df = full.get(term, 0)
        # Guard the division: an unseen term would otherwise raise
        # ZeroDivisionError (Counter) or KeyError (plain dict).
        score = tf / df if df else 1.0
        terms.append((term, score))
    return sorted(terms, key=lambda pair: pair[1], reverse=True)


In [52]:
#bag_question(questions[0])

In [36]:
# Display the first question record again for reference.
questions[0]

{'id': '89629',
 'question': {'stem': 'Which of the following groups of materials would most likely be used to build an electromagnet?',
  'choices': [{'label': 'A', 'text': 'bare wire, plastic rod, battery'},
   {'label': 'B', 'text': 'bare wire, iron rod, light bulb'},
   {'label': 'C', 'text': 'insulated wire, iron rod, battery'},
   {'label': 'D', 'text': 'insulated wire, plastic rod, light bulb'}]},
 'answerKey': 'C'}

In [37]:
# One bag of words per question, in the same order as `questions`.
bags = list(map(bag_question, questions))
len(bags)

84

In [38]:
# Corpus-wide term counts: fold every per-question bag into a single Counter.
full = Counter()
for bag in bags:
    full.update(bag)
# Number of distinct terms seen across all questions.
len(full)

1056

In [44]:
# Pick one question to inspect. A fixed seed keeps the chosen question (and the
# output printed below) reproducible across kernel restarts; set SEED = None to
# draw a different question on every run.
SEED = 0
idx = np.random.RandomState(SEED).randint(len(questions))
q, bag = questions[idx], bags[idx]
print(q['question']['stem'])
# Score only the stem's terms against the corpus-wide counts.
values = tfidf(bag_text(q['question']['stem']), full)
# Number of scored terms, then the ten highest-scoring ones.
print(len(values), values[:10])

Solar panels are used to absorb sunlight. Which color panel would absorb the most sunlight?
12 [('absorb', 0.6666666666666666), ('panel', 0.5), ('sunlight', 0.4), ('color', 0.2), ('solar', 0.14285714285714285), ('would', 0.047619047619047616), ('most', 0.045454545454545456), ('use', 0.043478260869565216), ('which', 0.018518518518518517), ('to', 0.011764705882352941)]


In [None]:
# Score the whole question — stem plus every answer choice — against the
# corpus-wide counts. bag_question builds exactly that combined bag, which
# completes the previously dangling `+` operand that made this cell unparsable.
values = tfidf(bag_question(q), full)
# Number of scored terms, then the ten highest-scoring ones.
print(len(values), values[:10])

In [53]:
# Terms of the question stem; they are added to the answer choice's terms below.
bag = bag_text(q['question']['stem'])
# Only the first answer choice is examined: the [:1] slice runs the body at
# most once, exactly like a loop that ends in an unconditional break.
for choice in q['question']['choices'][:1]:
    # Combine the choice's terms with the stem's terms before scoring.
    # (An alternative that subtracted the stem terms instead was left disabled.)
    ans = bag_text(choice['text']) + bag
    vals = tfidf(ans, full)
    # The ten best-scoring terms ...
    top = [term for term, score in vals[:10]]
    # ... versus every term whose score exceeds 0.3.
    critical = [term for term, score in vals if score > 0.3]
    # Keep whichever list is longer; on a tie `critical` wins.
    if len(top) > len(critical):
        picks = top
    else:
        picks = critical

In [54]:
# Show the terms selected for the first answer choice.
picks

['black',
 'absorb',
 'panel',
 'sunlight',
 'color',
 'solar',
 'would',
 'most',
 'use',
 'which']