In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from collections import Counter
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [2]:
# same readData from STS.py
def readData(fileName):

    first_sentence = []
    second_sentence = []
    score = []
    file = open(fileName, encoding="utf8")
    text = file.readline()
    text = file.read()
    # loop to extract a set of two sentences
    for sentence in text.split('\n'):
        # creating two separate lists of the sentences
        # '.rstrip('.') only removes the last period in the sentence
        first_sentence.insert(len(first_sentence),
                              (sentence.split('\t')[1].lower()).rstrip('.'))
        second_sentence.insert(len(first_sentence),
                               (sentence.split('\t')[2].lower()).rstrip('.'))
        # inserting the score as a separate lists
        score.insert(len(first_sentence), (sentence.split('\t')[3]))

    # print(first_sentence)
    return first_sentence, second_sentence, score


def preprocess(fileName):

    first_sentence, second_sentence, score = readData(fileName)
    first_sentence_tokens = []
    second_sentence_tokens = []

    # tokenizing and tagging
    first_sentence_tags = []
    second_sentence_tags = []

    for sentence in first_sentence:
        tokens = nltk.word_tokenize(sentence)
        first_sentence_tokens.insert(len(first_sentence_tokens), tokens)
        first_sentence_tags.insert(
            len(first_sentence_tags), nltk.pos_tag(tokens))
        # print(first_sentence_tokens)

    for sentence in second_sentence:
        tokens = nltk.word_tokenize(sentence)
        second_sentence_tokens.insert(len(second_sentence_tokens), tokens)
        second_sentence_tags.insert(
            len(second_sentence_tags), nltk.pos_tag(tokens))

        # print(second_sentence_tokens)

    # lemmatizing
    first_sentence_lemmas = []
    second_sentence_lemmas = []
    lemmatizer = WordNetLemmatizer()
    for sentence in first_sentence_tokens:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        first_sentence_lemmas.insert(
            len(first_sentence_lemmas), sentence_components)

    for sentence in second_sentence_tokens:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        second_sentence_lemmas.insert(
            len(second_sentence_lemmas), sentence_components)

    return first_sentence, second_sentence, score, first_sentence_tokens, second_sentence_tokens


In [3]:
s1_arr_train, s2_arr_train, scores_train, s1_tokens_train, s2_tokens_train = preprocess("./data/train-set.txt")

## Feature Engineering

This section includes all the code/functions to create features.

### Cosine Similarity (TF-IDF)

In [4]:
def calc_cosine_similarity(sentence1, sentence2):

    # remove the stopwords, transform into TF-IDF matrix, then
    tfidf_matrix = TfidfVectorizer(
        stop_words="english").fit_transform([sentence1, sentence2])
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
#     print(tfidf_matrix.toarray())

    cos_sim = cos_sim_matrix[0][1]

    return cos_sim

### Smooth Inverse Frequency (SIF)

In [5]:
def frequency_distribution(s1_tokens_array, s2_tokens_array):
    freq_dist = FreqDist()
    for i in range(len(s1_tokens_array)):
        for token in (s1_tokens_array[i] + s2_tokens_array[i]):
            freq_dist[token.lower()] += 1
    return freq_dist

In [6]:
freq_dist = frequency_distribution(s1_tokens_train, s2_tokens_train)
print(freq_dist.most_common(40))

[('the', 5169), (',', 3690), ('of', 2497), ('to', 2133), ('and', 1716), ('a', 1615), ('in', 1573), ('is', 891), ('that', 831), ('on', 820), ('for', 756), ('it', 587), ('this', 579), ('we', 531), ('with', 464), ('be', 459), ('by', 443), ('i', 425), ('which', 403), ('have', 384), ('not', 366), ('at', 343), ('as', 334), ('are', 333), ('has', 319), ('said', 316), ('was', 304), ('european', 287), ("'s", 280), ('from', 261), ('``', 252), ("''", 242), ('will', 233), ('.', 229), ('also', 223), ('its', 194), ('but', 193), ('would', 191), ('all', 188), ('percent', 187)]


In [7]:
def calc_sif_similarity(s1, s2, a = .001):
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform([s1, s2])
    X_arr = X.toarray()
    sif_matrix = []
    for i in range(0, len(X_arr)):
        sif_arr = []
        for j in range(0, len(X_arr[i])):
            word = vectorizer.get_feature_names()[j]
            w = a / (a + freq_dist[word])
            v = X_arr[i][j]
            sif_arr.append(v*w)
        sif_matrix.append(sif_arr)
    sif_cos_sim_matrix = cosine_similarity(sif_matrix, sif_matrix)
    sif_cos_sim = sif_cos_sim_matrix[0][1]
    return sif_cos_sim

In [8]:
calc_sif_similarity('I like some apples', 'I like the pears')

1.4515545128534995e-10

### Normalized Synset+ Overlap

In [9]:
stop_words = set(stopwords.words('english'))
tokenized_sentence_list = s1_tokens_train+s2_tokens_train
words_filtered = []

# print(words)

# looking through I've noticed there are a number of stop-words that can be added to the set
stop_words.add(',')
stop_words.add('``')
stop_words.add("n't")

for tsl in tokenized_sentence_list:
    for w in tsl:
        if w not in stop_words and w not in words_filtered:
            words_filtered.append(w)

In [10]:
def remove_stopwords(token_list):
    blank_list = []
    for w in token_list:
        if w not in stop_words:
            blank_list.append(w)
    return blank_list

In [11]:
def remove_duplicate_tokens(token_list):
    blank_list = []
    for w in token_list:
        if w not in blank_list:
            blank_list.append(w)
    return blank_list

In [12]:
s1 = s1_tokens_train[0]
s2 = s2_tokens_train[0]

s1 = remove_stopwords(s1)
s1 = remove_duplicate_tokens(s1)

s2 = remove_stopwords(s2)
s2 = remove_duplicate_tokens(s2)

print(s1)
print(s2)

overlap = 0
encountered_indexes = []
for word in (s1+s2):
    word_index = words_filtered.index(word)
    if word_index in encountered_indexes: # we know we have found an overlap
        overlap += 1
    encountered_indexes.append(word_index)
    
print(overlap)
avg_sentence_len = len(s1+s2) / 2

print(overlap / avg_sentence_len)

['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'hoped', 'see', 'bidders', 'interested', 'individual', 'assets', 'team']
['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'next', 'day', 'two']
9
0.6428571428571429


In [44]:
def calc_basic_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    overlap = 0
    encountered_indexes = []
    for word in (s1_tokens+s2_tokens):
        try:
            word_index = words_filtered.index(word)
            if word_index in encountered_indexes: # we know we have found an overlap
                overlap += 1
            encountered_indexes.append(word_index)
        except ValueError:
            print(word+'not found in lexicon. Skipping...')

    avg_sentence_len = len(s1_tokens+s2_tokens) / 2
    
    overlap_normlalized = overlap / avg_sentence_len
    return overlap, overlap_normlalized

## Pipeline

In this section we run the data through the pipeline to get it into the form necessary to create our models.

In [45]:
def pipeline(s1_array, s2_array, s1_tokens, s2_tokens):
    # TODO add a check to ensure the lengths of these arrays are the same
    data = []
    for i in range(0, len(s1_array)):
        cos_sim = calc_cosine_similarity(s1_array[i], s2_array[i])
        sif_sim = calc_sif_similarity(s1_array[i], s2_array[i])
        overlap, normalized_overlap = calc_basic_overlap(s1_tokens[i], s2_tokens[i])
        data.append([i, cos_sim, sif_sim, overlap, normalized_overlap])
    return data

In [46]:
data = pipeline(s1_arr_train, s2_arr_train, s1_tokens_train, s2_tokens_train)
print(data[0:5])

[[0, 0.5949218057093537, 0.4043188683415115, 9, 0.6428571428571429], [1, 0.474330706497194, 0.37040524322972224, 6, 0.631578947368421], [2, 0.392181175971253, 0.1358693286767868, 5, 0.5], [3, 0.668348418668298, 0.6935512636502701, 11, 0.7333333333333333], [4, 0.12170566815950139, 4.979960298599938e-10, 3, 0.24]]


In [47]:
scores_train[0:5]

['4', '4', '3', '3', '2']

## Models

In this section we fit our feature set to a model.

### Decision Tree

In [48]:
dt_classifier = DecisionTreeClassifier(random_state=14)
dt_classifier.fit(data,scores_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=14,
            splitter='best')

In [49]:
print(f'Nodes: {dt_classifier.tree_.node_count}')
print(f'Max Depth: {dt_classifier.tree_.max_depth}')
print(f'Accuracy: {dt_classifier.score(data, scores_train)}')

Nodes: 1095
Max Depth: 26
Accuracy: 1.0


### XGBoost

In [50]:
# s1_arr_dev, s2_arr_dev, scores_dev, s1_tokens_dev, s2_tokens_dev = preprocess("./data/dev-set.txt")
# dev_data = pipeline(s1_arr_dev, s2_arr_dev)
# dev_predictions = dt_classifier.predict(dev_data)

# xgboost_model = xgb.XGBRegressor(booster='gbtree', 
#                        n_estimators=1000,
#                        n_jobs=4,
#                        learning_rate=.05,
#                        max_depth=3,
#                        random_state=42,
#                        gamma=.05,
#                        early_stopping_rounds = 5)

# xgboost_model.fit(data, scores_train,
# #                   eval_set=[(X_train, y_train), (X_test, y_test)],
#                   eval_metric='logloss')

# Testing

In [54]:
s1_arr_dev, s2_arr_dev, scores_dev, s1_tokens_dev, s2_tokens_dev = preprocess("./data/dev-set.txt")
dev_data = pipeline(s1_arr_dev, s2_arr_dev, s1_tokens_dev, s2_tokens_dev)
dev_predictions = dt_classifier.predict(dev_data)
dev_data[0:5]

aujourdnot found in lexicon. Skipping...
huinot found in lexicon. Skipping...
lucknot found in lexicon. Skipping...
referencesnot found in lexicon. Skipping...
paragraphsnot found in lexicon. Skipping...
proposingnot found in lexicon. Skipping...
referencesnot found in lexicon. Skipping...
paragraphsnot found in lexicon. Skipping...
alliesnot found in lexicon. Skipping...
ferventnot found in lexicon. Skipping...
alliesnot found in lexicon. Skipping...
5.30not found in lexicon. Skipping...
5.30pmnot found in lexicon. Skipping...
fishermennot found in lexicon. Skipping...
inactivenot found in lexicon. Skipping...
tirednot found in lexicon. Skipping...
fishermennot found in lexicon. Skipping...
inactivenot found in lexicon. Skipping...
tirednot found in lexicon. Skipping...
revertnot found in lexicon. Skipping...
increasesnot found in lexicon. Skipping...
expensenot found in lexicon. Skipping...
smallnot found in lexicon. Skipping...
augmentingnot found in lexicon. Skipping...
potencynot 

losersnot found in lexicon. Skipping...
competitivenessnot found in lexicon. Skipping...
regressnot found in lexicon. Skipping...
increasesnot found in lexicon. Skipping...
expensenot found in lexicon. Skipping...
smallnot found in lexicon. Skipping...
expensenot found in lexicon. Skipping...
smallernot found in lexicon. Skipping...
revertnot found in lexicon. Skipping...
attendsnot found in lexicon. Skipping...
sittingsnot found in lexicon. Skipping...
faithfullynot found in lexicon. Skipping...
honourablynot found in lexicon. Skipping...
latvianot found in lexicon. Skipping...
latvianot found in lexicon. Skipping...
selectivenot found in lexicon. Skipping...
grassnot found in lexicon. Skipping...
subsidynot found in lexicon. Skipping...
assistancesnot found in lexicon. Skipping...
premiumnot found in lexicon. Skipping...
grassnot found in lexicon. Skipping...
attendsnot found in lexicon. Skipping...
sittingsnot found in lexicon. Skipping...
faithfullynot found in lexicon. Skipping...

presidenot found in lexicon. Skipping...
increasesnot found in lexicon. Skipping...
expensenot found in lexicon. Skipping...
smallnot found in lexicon. Skipping...
detrimentnot found in lexicon. Skipping...
smallnot found in lexicon. Skipping...
alliesnot found in lexicon. Skipping...
ferventnot found in lexicon. Skipping...
alliesnot found in lexicon. Skipping...
ardentnot found in lexicon. Skipping...
5.30not found in lexicon. Skipping...
5.30not found in lexicon. Skipping...
selectivenot found in lexicon. Skipping...
grassnot found in lexicon. Skipping...
subsidynot found in lexicon. Skipping...
occasionalnot found in lexicon. Skipping...
grassnot found in lexicon. Skipping...
selectivenot found in lexicon. Skipping...
grassnot found in lexicon. Skipping...
subsidynot found in lexicon. Skipping...
premiumnot found in lexicon. Skipping...
grassnot found in lexicon. Skipping...
pontificatenot found in lexicon. Skipping...
pridenot found in lexicon. Skipping...
tunisianot found in lexi

increasesnot found in lexicon. Skipping...
expensenot found in lexicon. Skipping...
smallnot found in lexicon. Skipping...
detrimentnot found in lexicon. Skipping...
smallnot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
hatrednot found in lexicon. Skipping...
partingnot found in lexicon. Skipping...
rancunenot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
hatrednot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
ill-willnot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
hatrednot found in lexicon. Skipping...
divergenot found in lexicon. Skipping...
owingnot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
vengeancenot found in lexicon. Skipping...
alliesnot found in lexicon. Skipping...
ferventnot found in lexicon. Skipping...
alliesnot found in lexicon. Skipping...
ardentnot found in lexicon. Skipping...
selectivenot found in lexicon. Skipping...

catastrophenot found in lexicon. Skipping...
avoidnot found in lexicon. Skipping...
losersnot found in lexicon. Skipping...
competitivenessnot found in lexicon. Skipping...
regressnot found in lexicon. Skipping...
catastrophenot found in lexicon. Skipping...
avoidnot found in lexicon. Skipping...
fishermennot found in lexicon. Skipping...
inactivenot found in lexicon. Skipping...
tirednot found in lexicon. Skipping...
fishermennot found in lexicon. Skipping...
inactivenot found in lexicon. Skipping...
tirednot found in lexicon. Skipping...
vannot found in lexicon. Skipping...
ordennot found in lexicon. Skipping...
a5-0241/2000not found in lexicon. Skipping...
vannot found in lexicon. Skipping...
ordennot found in lexicon. Skipping...
a5-0241not found in lexicon. Skipping...
tunisianot found in lexicon. Skipping...
tunisianot found in lexicon. Skipping...
pontificatenot found in lexicon. Skipping...
increasesnot found in lexicon. Skipping...
expensenot found in lexicon. Skipping...
smal

5.30not found in lexicon. Skipping...
5.30not found in lexicon. Skipping...
pontificatenot found in lexicon. Skipping...
pridenot found in lexicon. Skipping...
scarcelynot found in lexicon. Skipping...
comparablenot found in lexicon. Skipping...
transferablenot found in lexicon. Skipping...
hardlynot found in lexicon. Skipping...
comparablenot found in lexicon. Skipping...
transferablenot found in lexicon. Skipping...
fishermennot found in lexicon. Skipping...
inactivenot found in lexicon. Skipping...
tirednot found in lexicon. Skipping...
fishermennot found in lexicon. Skipping...
inactivenot found in lexicon. Skipping...
tirednot found in lexicon. Skipping...
underlyingnot found in lexicon. Skipping...
rootnot found in lexicon. Skipping...
underlyingnot found in lexicon. Skipping...
appropriatenot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
hatrednot found in lexicon. Skipping...
accumulatednot found in lexicon. Skipping...
resentmentnot found in lexico

15,000not found in lexicon. Skipping...
300not found in lexicon. Skipping...
databasenot found in lexicon. Skipping...
softnot found in lexicon. Skipping...
pepsiconot found in lexicon. Skipping...
pepnot found in lexicon. Skipping...
spotlightnot found in lexicon. Skipping...
aluminumnot found in lexicon. Skipping...
alcoanot found in lexicon. Skipping...
pepnot found in lexicon. Skipping...
virtualnot found in lexicon. Skipping...
pcnot found in lexicon. Skipping...
boughtnot found in lexicon. Skipping...
connectixnot found in lexicon. Skipping...
virtualnot found in lexicon. Skipping...
pcnot found in lexicon. Skipping...
developernot found in lexicon. Skipping...
connectixnot found in lexicon. Skipping...
taylornot found in lexicon. Skipping...
departurenot found in lexicon. Skipping...
liberianot found in lexicon. Skipping...
mediatorsnot found in lexicon. Skipping...
departurenot found in lexicon. Skipping...
bloodshednot found in lexicon. Skipping...
liberianot found in lexicon.

opinionsnot found in lexicon. Skipping...
listeningnot found in lexicon. Skipping...
melissanot found in lexicon. Skipping...
berryhillnot found in lexicon. Skipping...
lordnot found in lexicon. Skipping...
falconernot found in lexicon. Skipping...
hailednot found in lexicon. Skipping...
beginningnot found in lexicon. Skipping...
crownnot found in lexicon. Skipping...
prosecutionnot found in lexicon. Skipping...
beginningnot found in lexicon. Skipping...
crownnot found in lexicon. Skipping...
prosecutionnot found in lexicon. Skipping...
certifiablenot found in lexicon. Skipping...
mid-julynot found in lexicon. Skipping...
schoolsnot found in lexicon. Skipping...
wontnot found in lexicon. Skipping...
185.9not found in lexicon. Skipping...
schoolsnot found in lexicon. Skipping...
wontnot found in lexicon. Skipping...
carolenot found in lexicon. Skipping...
keetonnot found in lexicon. Skipping...
hartfordnot found in lexicon. Skipping...
2.88not found in lexicon. Skipping...
46.50not foun

frogsnot found in lexicon. Skipping...
mid-1800snot found in lexicon. Skipping...
1926not found in lexicon. Skipping...
frogsnot found in lexicon. Skipping...
mid-1800snot found in lexicon. Skipping...
brandeisnot found in lexicon. Skipping...
heartrendingnot found in lexicon. Skipping...
heartrendingnot found in lexicon. Skipping...
brandeisnot found in lexicon. Skipping...
finishednot found in lexicon. Skipping...
mixednot found in lexicon. Skipping...
directionlessnot found in lexicon. Skipping...
beginsnot found in lexicon. Skipping...
indicatorsnot found in lexicon. Skipping...
spotlightnot found in lexicon. Skipping...
grabbednot found in lexicon. Skipping...
beginsnot found in lexicon. Skipping...
indicatorsnot found in lexicon. Skipping...
spotlightnot found in lexicon. Skipping...
dolphinnot found in lexicon. Skipping...
damagingnot found in lexicon. Skipping...
loggingnot found in lexicon. Skipping...
monitoringnot found in lexicon. Skipping...
controlsnot found in lexicon. S

mandalaynot found in lexicon. Skipping...
eightnot found in lexicon. Skipping...
29.42not found in lexicon. Skipping...
mandalaynot found in lexicon. Skipping...
29.42not found in lexicon. Skipping...
closelynot found in lexicon. Skipping...
decreasednot found in lexicon. Skipping...
reportedlynot found in lexicon. Skipping...
durstnot found in lexicon. Skipping...
plottednot found in lexicon. Skipping...
identitynot found in lexicon. Skipping...
durstnot found in lexicon. Skipping...
murderednot found in lexicon. Skipping...
identitynot found in lexicon. Skipping...
objectionnot found in lexicon. Skipping...
stockholdersnot found in lexicon. Skipping...
attemptingnot found in lexicon. Skipping...
arrangenot found in lexicon. Skipping...
dncnot found in lexicon. Skipping...
mcauliffenot found in lexicon. Skipping...
layoffsnot found in lexicon. Skipping...
arrangenot found in lexicon. Skipping...
mcauliffenot found in lexicon. Skipping...
layoffsnot found in lexicon. Skipping...
staffe

135.13not found in lexicon. Skipping...
135.26not found in lexicon. Skipping...
castlenot found in lexicon. Skipping...
delawarenot found in lexicon. Skipping...
patientnot found in lexicon. Skipping...
62-year-oldnot found in lexicon. Skipping...
babylonnot found in lexicon. Skipping...
suffolknot found in lexicon. Skipping...
baucusnot found in lexicon. Skipping...
disparitynot found in lexicon. Skipping...
baucusnot found in lexicon. Skipping...
reaffirmednot found in lexicon. Skipping...
dominatednot found in lexicon. Skipping...
steelmakernot found in lexicon. Skipping...
16.2not found in lexicon. Skipping...
aknot found in lexicon. Skipping...
16.2not found in lexicon. Skipping...
39not found in lexicon. Skipping...
15.59not found in lexicon. Skipping...
15.44not found in lexicon. Skipping...
2.04not found in lexicon. Skipping...
incorrectnot found in lexicon. Skipping...
timothynot found in lexicon. Skipping...
murisnot found in lexicon. Skipping...
writtennot found in lexicon. 

hormonenot found in lexicon. Skipping...
therapynot found in lexicon. Skipping...
combinesnot found in lexicon. Skipping...
progestinnot found in lexicon. Skipping...
doublesnot found in lexicon. Skipping...
hormonenot found in lexicon. Skipping...
therapynot found in lexicon. Skipping...
increasesnot found in lexicon. Skipping...
combinationnot found in lexicon. Skipping...
progestinnot found in lexicon. Skipping...
surprisinglynot found in lexicon. Skipping...
impassionednot found in lexicon. Skipping...
steelworkersnot found in lexicon. Skipping...
endorsementnot found in lexicon. Skipping...
pickednot found in lexicon. Skipping...
endorsementnot found in lexicon. Skipping...
steelworkersnot found in lexicon. Skipping...
eightnot found in lexicon. Skipping...
eightnot found in lexicon. Skipping...
adoranot found in lexicon. Skipping...
obinot found in lexicon. Skipping...
nwezenot found in lexicon. Skipping...
naacpnot found in lexicon. Skipping...
conclusionnot found in lexicon. Sk

habitsnot found in lexicon. Skipping...
seriouslynot found in lexicon. Skipping...
rethinknot found in lexicon. Skipping...
business-as-usualnot found in lexicon. Skipping...
avoidnot found in lexicon. Skipping...
complacencynot found in lexicon. Skipping...
habitsnot found in lexicon. Skipping...
seriouslynot found in lexicon. Skipping...
rethinknot found in lexicon. Skipping...
business-as-usualnot found in lexicon. Skipping...
four-monthnot found in lexicon. Skipping...
olivetnot found in lexicon. Skipping...
kerkowskinot found in lexicon. Skipping...
fassettnot found in lexicon. Skipping...
olivetnot found in lexicon. Skipping...
vindicatednot found in lexicon. Skipping...
ultimatelynot found in lexicon. Skipping...
vindicatednot found in lexicon. Skipping...
ultimatelynot found in lexicon. Skipping...
despicablenot found in lexicon. Skipping...
faithnot found in lexicon. Skipping...
hatenot found in lexicon. Skipping...
despicablenot found in lexicon. Skipping...
faithnot found in

draxnot found in lexicon. Skipping...
banksnot found in lexicon. Skipping...
hedgingnot found in lexicon. Skipping...
goldmannot found in lexicon. Skipping...
draxnot found in lexicon. Skipping...
banksnot found in lexicon. Skipping...
hedgingnot found in lexicon. Skipping...
thoracicnot found in lexicon. Skipping...
thoracicnot found in lexicon. Skipping...
dissentnot found in lexicon. Skipping...
rehnquistnot found in lexicon. Skipping...
clarencenot found in lexicon. Skipping...
rehnquistnot found in lexicon. Skipping...
justicesnot found in lexicon. Skipping...
antoninnot found in lexicon. Skipping...
scalianot found in lexicon. Skipping...
clarencenot found in lexicon. Skipping...
dissentednot found in lexicon. Skipping...
penaltiesnot found in lexicon. Skipping...
penaltiesnot found in lexicon. Skipping...
575,926not found in lexicon. Skipping...
signaturenot found in lexicon. Skipping...
575,926not found in lexicon. Skipping...
aircraftnot found in lexicon. Skipping...
pilotsnot

expandnot found in lexicon. Skipping...
expandnot found in lexicon. Skipping...
enlistsnot found in lexicon. Skipping...
church-basednot found in lexicon. Skipping...
youthnot found in lexicon. Skipping...
anti-drugnot found in lexicon. Skipping...
programsnot found in lexicon. Skipping...
commercialnot found in lexicon. Skipping...
2.6.0not found in lexicon. Skipping...
kernelnot found in lexicon. Skipping...
commercialnot found in lexicon. Skipping...
releasesnot found in lexicon. Skipping...
kernelnot found in lexicon. Skipping...
distributorsnot found in lexicon. Skipping...
977not found in lexicon. Skipping...
6.52not found in lexicon. Skipping...
1,645.66not found in lexicon. Skipping...
300not found in lexicon. Skipping...
hallanot found in lexicon. Skipping...
briannot found in lexicon. Skipping...
hallanot found in lexicon. Skipping...
natseminot found in lexicon. Skipping...
115.9not found in lexicon. Skipping...
115.9not found in lexicon. Skipping...
mesanot found in lexicon

restrictionsnot found in lexicon. Skipping...
clintonnot found in lexicon. Skipping...
then-presidentnot found in lexicon. Skipping...
clintonnot found in lexicon. Skipping...
vetoednot found in lexicon. Skipping...
talkingnot found in lexicon. Skipping...
talkingnot found in lexicon. Skipping...
high-risknot found in lexicon. Skipping...
cargonot found in lexicon. Skipping...
high-risknot found in lexicon. Skipping...
cargonot found in lexicon. Skipping...
nielsnot found in lexicon. Skipping...
interviewernot found in lexicon. Skipping...
outlivenot found in lexicon. Skipping...
nielsennot found in lexicon. Skipping...
interviewernot found in lexicon. Skipping...
outlivenot found in lexicon. Skipping...
captnot found in lexicon. Skipping...
dougnot found in lexicon. Skipping...
criticalnot found in lexicon. Skipping...
20-year-oldnot found in lexicon. Skipping...
sisternot found in lexicon. Skipping...
allysonnot found in lexicon. Skipping...
severelynot found in lexicon. Skipping...


threatensnot found in lexicon. Skipping...
alienatenot found in lexicon. Skipping...
ashcroftnot found in lexicon. Skipping...
stancenot found in lexicon. Skipping...
threatensnot found in lexicon. Skipping...
alienatenot found in lexicon. Skipping...
ashcroftnot found in lexicon. Skipping...
caseloadnot found in lexicon. Skipping...
4.7not found in lexicon. Skipping...
hivnot found in lexicon. Skipping...
aidsnot found in lexicon. Skipping...
4.7not found in lexicon. Skipping...
hivnot found in lexicon. Skipping...
aidsnot found in lexicon. Skipping...
caseloadnot found in lexicon. Skipping...
eliasnot found in lexicon. Skipping...
tuccinot found in lexicon. Skipping...
emcnot found in lexicon. Skipping...
venturesnot found in lexicon. Skipping...
eliasnot found in lexicon. Skipping...
tuccinot found in lexicon. Skipping...
emcnot found in lexicon. Skipping...
commandernot found in lexicon. Skipping...
masakanot found in lexicon. Skipping...
commandingnot found in lexicon. Skipping...

296not found in lexicon. Skipping...
mainlandnot found in lexicon. Skipping...
5,200not found in lexicon. Skipping...
mainlandnot found in lexicon. Skipping...
300not found in lexicon. Skipping...
5,270not found in lexicon. Skipping...
redesignednot found in lexicon. Skipping...
colourednot found in lexicon. Skipping...
labelsnot found in lexicon. Skipping...
customizednot found in lexicon. Skipping...
dynamicnot found in lexicon. Skipping...
browsingnot found in lexicon. Skipping...
macnot found in lexicon. Skipping...
supportsnot found in lexicon. Skipping...
colourednot found in lexicon. Skipping...
labelsnot found in lexicon. Skipping...
dynamicnot found in lexicon. Skipping...
browsingnot found in lexicon. Skipping...
macnot found in lexicon. Skipping...
torchnot found in lexicon. Skipping...
conceptsnot found in lexicon. Skipping...
marsdennot found in lexicon. Skipping...
repeatednot found in lexicon. Skipping...
torchnot found in lexicon. Skipping...
jetbluenot found in lexicon

msnbcnot found in lexicon. Skipping...
msnbc.comnot found in lexicon. Skipping...
guynot found in lexicon. Skipping...
ashamednot found in lexicon. Skipping...
laughternot found in lexicon. Skipping...
guynot found in lexicon. Skipping...
ashamednot found in lexicon. Skipping...
blamenot found in lexicon. Skipping...
ricknot found in lexicon. Skipping...
photonot found in lexicon. Skipping...
hollingworthnot found in lexicon. Skipping...
examinednot found in lexicon. Skipping...
photographnot found in lexicon. Skipping...
drnot found in lexicon. Skipping...
hollingworthnot found in lexicon. Skipping...
examinednot found in lexicon. Skipping...
395,000not found in lexicon. Skipping...
atkinsonnot found in lexicon. Skipping...
salarynot found in lexicon. Skipping...
361,400not found in lexicon. Skipping...
395,000not found in lexicon. Skipping...
atkinsonnot found in lexicon. Skipping...
salarynot found in lexicon. Skipping...
361,400not found in lexicon. Skipping...
saudisnot found in l

4.1not found in lexicon. Skipping...
attorneysnot found in lexicon. Skipping...
expensesnot found in lexicon. Skipping...
plaintiffsnot found in lexicon. Skipping...
attorneysnot found in lexicon. Skipping...
4.1not found in lexicon. Skipping...
killnot found in lexicon. Skipping...
killnot found in lexicon. Skipping...
expansionnot found in lexicon. Skipping...
660not found in lexicon. Skipping...
afghanistannot found in lexicon. Skipping...
660not found in lexicon. Skipping...
guantanamonot found in lexicon. Skipping...
baynot found in lexicon. Skipping...
afghanistannot found in lexicon. Skipping...
dvoraknot found in lexicon. Skipping...
coversnot found in lexicon. Skipping...
816not found in lexicon. Skipping...
234-7743not found in lexicon. Skipping...
jdvoraknot found in lexicon. Skipping...
@not found in lexicon. Skipping...
kctar.comnot found in lexicon. Skipping...
bradnot found in lexicon. Skipping...
coopernot found in lexicon. Skipping...
johnsonnot found in lexicon. Skipp

governmentsnot found in lexicon. Skipping...
agreementsnot found in lexicon. Skipping...
interviewnot found in lexicon. Skipping...
healeynot found in lexicon. Skipping...
criminologistnot found in lexicon. Skipping...
sentimentnot found in lexicon. Skipping...
legislatorsnot found in lexicon. Skipping...
warinessnot found in lexicon. Skipping...
interviewnot found in lexicon. Skipping...
healeynot found in lexicon. Skipping...
criminologistnot found in lexicon. Skipping...
warinessnot found in lexicon. Skipping...
judge-executivenot found in lexicon. Skipping...
teresanot found in lexicon. Skipping...
bartonnot found in lexicon. Skipping...
firefighternot found in lexicon. Skipping...
lightningnot found in lexicon. Skipping...
frankfortnot found in lexicon. Skipping...
firefighternot found in lexicon. Skipping...
lightningnot found in lexicon. Skipping...
frankfortnot found in lexicon. Skipping...
brignot found in lexicon. Skipping...
gen.not found in lexicon. Skipping...
edwardnot fo

401not found in lexicon. Skipping...
knot found in lexicon. Skipping...
retireesnot found in lexicon. Skipping...
beneficiariesnot found in lexicon. Skipping...
comprisednot found in lexicon. Skipping...
401not found in lexicon. Skipping...
knot found in lexicon. Skipping...
portfoliosnot found in lexicon. Skipping...
overcamenot found in lexicon. Skipping...
hurdlenot found in lexicon. Skipping...
asbestosnot found in lexicon. Skipping...
exposurenot found in lexicon. Skipping...
overcamenot found in lexicon. Skipping...
hurdlenot found in lexicon. Skipping...
asbestosnot found in lexicon. Skipping...
exposurenot found in lexicon. Skipping...
obstaclesnot found in lexicon. Skipping...
certifynot found in lexicon. Skipping...
117.4not found in lexicon. Skipping...
2004-05not found in lexicon. Skipping...
certifynot found in lexicon. Skipping...
overestimatenot found in lexicon. Skipping...
dangersnot found in lexicon. Skipping...
remotenot found in lexicon. Skipping...
rpcnot found in 

criticalnot found in lexicon. Skipping...
criticalnot found in lexicon. Skipping...
1-3/32not found in lexicon. Skipping...
4.30not found in lexicon. Skipping...
4.35not found in lexicon. Skipping...
1500not found in lexicon. Skipping...
3.40not found in lexicon. Skipping...
puretunesnot found in lexicon. Skipping...
puretunesnot found in lexicon. Skipping...
locatednot found in lexicon. Skipping...
suitnot found in lexicon. Skipping...
iacnot found in lexicon. Skipping...
2.81not found in lexicon. Skipping...
7.6not found in lexicon. Skipping...
34.19not found in lexicon. Skipping...
interactivecorpnot found in lexicon. Skipping...
34.19not found in lexicon. Skipping...
2.81not found in lexicon. Skipping...
7.6not found in lexicon. Skipping...
speakingnot found in lexicon. Skipping...
motivesnot found in lexicon. Skipping...
deskboundnot found in lexicon. Skipping...
assumesnot found in lexicon. Skipping...
garbnot found in lexicon. Skipping...
warriornot found in lexicon. Skipping...

miodragnot found in lexicon. Skipping...
zivkovicnot found in lexicon. Skipping...
liberalnot found in lexicon. Skipping...
alliancenot found in lexicon. Skipping...
31not found in lexicon. Skipping...
dragannot found in lexicon. Skipping...
hajdukovicnot found in lexicon. Skipping...
miodragnot found in lexicon. Skipping...
zivkovicnot found in lexicon. Skipping...
pro-independencenot found in lexicon. Skipping...
liberalnot found in lexicon. Skipping...
alliancenot found in lexicon. Skipping...
31not found in lexicon. Skipping...
carlsonnot found in lexicon. Skipping...
recusenot found in lexicon. Skipping...
carlsonnot found in lexicon. Skipping...
recusenot found in lexicon. Skipping...
13.60not found in lexicon. Skipping...
redwoodnot found in lexicon. Skipping...
shores-basednot found in lexicon. Skipping...
13.62not found in lexicon. Skipping...
pearsonnot found in lexicon. Skipping...
pre-taxnot found in lexicon. Skipping...
reportingnot found in lexicon. Skipping...
three-term

[[0, 0.4111217451482082, 0.0001332665010484381, 4, 0.42105263157894735],
 [1, 0.7168117414430619, 0.8164963796577869, 4, 0.5714285714285714],
 [2, 0.34464214103805474, 0.707106779665172, 2, 0.2857142857142857],
 [3, 0.4112070550676187, 9.042867432144338e-05, 3, 0.6],
 [4, 1.0, 1.0, 1, 0.25]]

In [52]:
# make sure our lengths match up
print(len(scores_dev))
print(len(dev_predictions))

1209
1209


In [53]:
correct = 0
for i in range(0, len(dev_predictions)): 
    if dev_predictions[i] == scores_dev[i]:
        correct += 1
    
#     print(dev_predictions[i], scores_dev[i], (dev_predictions[i] == scores_dev[i]))
    
print(correct / len(dev_predictions))

0.19437551695616212
