In [1]:
import io
import operator
import re
import string
from pprint import pprint
import math
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import json
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import xml

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

input_para = "I went into my bedroom and flipped the light switch. Oh, I see that the ceiling lamp is not turning on. It must be that the light bulb needs replacement. I go through my closet and find a new light bulb that will fit this lamp and place it in my pocket. I also get my stepladder and place it under the lamp. I make sure the light switch is in the off position. I climb up the ladder and unscrew the old light bulb. I place the old bulb in my pocket and take out the new one. I then screw in the new bulb. I climb down the stepladder and place it back into the closet. I then throw out the old bulb into the recycling bin. I go back to my bedroom and turn on the light switch. I am happy to see that there is again light in my room."

vocab = []

# sw = stopwords.words('english')
punc = string.punctuation

In [3]:
def preprocess(text):
    global vocab
    texts = sent_tokenize(text)
    all_text = []
    # remove stop words
    for text in texts:
        # text = text.replace('.', '')
        text = re.findall(r"[\w']+|[.,!?;]", text)
        # print(len(text))
        text = [wordnet_lemmatizer.lemmatize(word.lower()) for word in text]
        text = [word.lower() for word in text if word and (word not in punc)]  # and (word.lower() not in sw)
        # print(len(text))
        all_text += text
    # print((len(set(all_text))))
    # print(sorted(list(set(all_text))))
    all_text = list(all_text)
    vocab += all_text

    return all_text

In [4]:
def read_json_data(path='all_scripts.json'):
    all_scripts = {}
    with io.open('all_scripts.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        for key in tqdm(sorted(data.keys())):
            scripts = data[key]
            for script in scripts:
                all_scripts[key + '.' + script['id']] = preprocess(script['text'])
                # print(all_scripts)
    return all_scripts

In [5]:
def idf(corpus_data, train_data, vocab):
    idf_values = {}
    """
    for k, v in data.items():
        all_tokens_set += list(set([item for item in v]))
    all_tokens_set = set(all_tokens_set)
    """
    for tkn in tqdm(vocab):
        contains_token = 0
        for k, v in corpus_data.items():
            contains_token += v.count(tkn)

        for k, v in train_data.items():
            contains_token += v.count(tkn)
        # contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        if contains_token == 0:
            contains_token = 1
        idf_values[tkn] = 1 + math.log((len(train_data.keys()) + len(corpus_data.keys())) / contains_token)
    return idf_values


def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection) / len(union)


def tf(term, tokenized_document):
    return tokenized_document.count(term)


def sublinear_term_frequency(term, doc):
    count  = doc.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)


def augmented_term_frequency(term, doc):
    max_count = max([tf(t, doc) for t in doc])
    return 0.5 + ((0.5 * tf(term, doc)) / max_count)


def tf_idf(corpus_data, train_data):
    idf_val = idf(corpus_data, train_data, vocab)
    tfidf_corpus = {}
    tfidf_train = {}
    for doc_id, doc in tqdm(corpus_data.items()):
        doc_tfidf = []
        for term in idf_val.keys():
            tf = sublinear_term_frequency(term, doc)
            doc_tfidf.append(tf * idf_val[term])
        tfidf_corpus[doc_id] = doc_tfidf
    for doc_id, doc in tqdm(train_data.items()):
        doc_tfidf = []
        for term in idf_val.keys():
            tf = sublinear_term_frequency(term, doc)
            doc_tfidf.append(tf * idf_val[term])
        tfidf_train[doc_id] = doc_tfidf
    return tfidf_corpus, tfidf_train


def cosine_similarity(vector1, vector2):
    dot_product = sum(p * q for p, q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val ** 2 for val in vector1])) * math.sqrt(sum([val ** 2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product / magnitude

In [6]:
def load_train_data(path='data/train-data.xml'):
    root = xml.etree.ElementTree.parse(path).getroot()
    """
    <data>
      <instance id="0">
        <text>I went into my bedroom and flipped the light switch. Oh, I see that the ceiling lamp is not turning on. It must be that the light bulb needs replacement. I go through my closet and find a new light bulb that will fit this lamp and place it in my pocket. I also get my stepladder and place it under the lamp. I make sure the light switch is in the off position. I climb up the ladder and unscrew the old light bulb. I place the old bulb in my pocket and take out the new one. I then screw in the new bulb. I climb down the stepladder and place it back into the closet. I then throw out the old bulb into the recycling bin. I go back to my bedroom and turn on the light switch. I am happy to see that there is again light in my room.</text>
        <questions>
          <question id="0" text="Which room did the light go out in?">
            <answer correct="False" id="0" text="Kitchen."/>
            <answer correct="True" id="1" text="Bedroom."/>
          </question>
        </questions>
      </instance>
    </data>
    """
    data = {}
    for instance in tqdm(root.findall('instance')):
        data[instance.get('id')] = preprocess(instance.find('text').text)
    return data

In [7]:
corpus_data = read_json_data()

100%|██████████| 248/248 [00:03<00:00, 63.59it/s]


In [8]:
train_data = load_train_data()

100%|██████████| 1470/1470 [00:01<00:00, 1003.97it/s]


In [9]:
vocab = set(vocab)
print("Len of vocab: ", len(set(vocab)))

Len of vocab:  9626


In [10]:
train_data['1']

['the',
 'weather',
 'wa',
 'so',
 'nice',
 'today',
 'that',
 'i',
 'decided',
 'to',
 'have',
 'a',
 'barbecue',
 'i',
 'called',
 'up',
 'some',
 'of',
 'my',
 'friend',
 'to',
 'invite',
 'them',
 'over',
 'i',
 'set',
 'up',
 'some',
 'chair',
 'and',
 'table',
 'outside',
 'for',
 'my',
 'friend',
 'to',
 'sit',
 'around',
 'i',
 'then',
 'took',
 'out',
 'some',
 'hot',
 'dog',
 'from',
 'my',
 'fridge',
 'and',
 'brought',
 'them',
 'over',
 'to',
 'my',
 'barbecue',
 'my',
 'barbecue',
 'is',
 'a',
 'gas',
 'barbecue',
 'so',
 'all',
 'i',
 'had',
 'to',
 'do',
 'wa',
 'press',
 'the',
 'knob',
 'and',
 'turn',
 'it',
 'to',
 'ignite',
 'it',
 'once',
 'i',
 'got',
 'a',
 'little',
 'fire',
 'going',
 'i',
 'put',
 'my',
 'hot',
 'dog',
 'on',
 'the',
 'grill',
 'over',
 'it',
 'to',
 'cook',
 'i',
 'closed',
 'the',
 'lid',
 'and',
 'left',
 'my',
 'hot',
 'dog',
 'to',
 'cook',
 'for',
 'about',
 'ten',
 'minute',
 'after',
 'ten',
 'minute',
 'i',
 'decided',
 'to',
 'check

In [11]:
idf_val = {}
print(len(idf_val.items()))

0


In [12]:
print(jaccard_similarity(corpus_data['access_the_internet.xml.1'], corpus_data['baking a cake.new.xml.2']))

0.05555555555555555


In [13]:
tfidf_corpus, tfidf_train = tf_idf(corpus_data, train_data)

100%|██████████| 9626/9626 [01:16<00:00, 125.85it/s]
100%|██████████| 13175/13175 [01:11<00:00, 183.61it/s]
100%|██████████| 1470/1470 [00:29<00:00, 49.73it/s]


In [14]:
tfidf_corpus['access_the_internet.xml.1']

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.6577890062615677,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 

In [15]:
print(cosine_similarity(tfidf_corpus['access_the_internet.xml.1'], tfidf_corpus['baking a cake.new.xml.2']))

0.021007136191537233


In [16]:
len(tfidf_corpus['access_the_internet.xml.1'])

9626

In [20]:
print(cosine_similarity(tfidf_corpus['access_the_internet.xml.1'], tfidf_corpus['access_the_internet.xml.2']))

0.9999894907962942


In [18]:
len(tfidf_corpus.keys()), len(tfidf_train.keys())

(13175, 1470)

In [19]:
print(cosine_similarity(tfidf_corpus['access_the_internet.xml.1'], tfidf_train['1']))

0.0202966679777311


In [37]:
"""
sim = {}
t = tqdm(list(tfidf_train.keys())[:5])
for x in t:
    sim_x = {}
    i = 0
    for y in tfidf_corpus.keys():
        i += 1
        similarity = cosine_similarity(tfidf_train[x], tfidf_corpus[y])
        sim_x[y] = similarity
        #t.set_description("Iteration: %i" % i)
    sim[x] = sim_x

sim
"""

'\nsim = {}\nt = tqdm(list(tfidf_train.keys())[:5])\nfor x in t:\n    sim_x = {}\n    i = 0\n    for y in tfidf_corpus.keys():\n        i += 1\n        similarity = cosine_similarity(tfidf_train[x], tfidf_corpus[y])\n        sim_x[y] = similarity\n        #t.set_description("Iteration: %i" % i)\n    sim[x] = sim_x\n\nsim\n'

In [31]:
type(tfidf_corpus['access_the_internet.xml.1']), type(tfidf_train['1'])

(list, list)

In [32]:
import numpy as np
from scipy import spatial

In [35]:
sim = {}
t = tqdm(list(tfidf_train.keys())[:5])
for x in t:
    sim_x = {}
    i = 0
    for y in tfidf_corpus.keys():
        i += 1
        #similarity = 1 - spatial.distance.cosine(np.asarray(tfidf_train[x], dtype=np.float32), np.asarray(tfidf_corpus[y], dtype=np.float32))
        similarity = 1 - spatial.distance.cosine(tfidf_train[x], tfidf_corpus[y])
        #similarity = cosine_similarity(tfidf_train[x], tfidf_corpus[y])
        sim_x[y] = similarity
        #t.set_description("Iteration: %i" % i)
    sim[x] = sim_x
sim

100%|██████████| 5/5 [00:28<00:00,  5.67s/it]


{'0': {'access_the_internet.xml.1': 0.016687646169071035,
  'access_the_internet.xml.2': 0.016696614263143394,
  'access_the_internet.xml.3': 0.010128879658336531,
  'access_the_internet.xml.4': 0.01207575665058469,
  'access_the_internet.xml.5': 0.010799081875212835,
  'access_the_internet.xml.6': 0.011607362612709005,
  'access_the_internet.xml.7': 0.010254463407398107,
  'access_the_internet.xml.8': 0.017893729356510457,
  'access_the_internet.xml.9': 0.017743524166975888,
  'access_the_internet.xml.10': 0.016746979821175789,
  'access_the_internet.xml.11': 0.011986412510902822,
  'access_the_internet.xml.12': 0.016536760019475905,
  'access_the_internet.xml.13': 0.0,
  'access_the_internet.xml.14': 0.016509937180699263,
  'access_the_internet.xml.15': 0.0091794804707452382,
  'access_the_internet.xml.16': 0.011094461878771611,
  'access_the_internet.xml.17': 0.00077024681265369654,
  'access_the_internet.xml.18': 0.028918172677823928,
  'access_the_internet.xml.19': 0.0081189313078