In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from collections import Counter
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# import xgboost as xgb

In [45]:
# same readData from STS.py
def readData(fileName):

    first_sentence = []
    second_sentence = []
    score = []
    file = open(fileName, encoding="utf8")
    text = file.readline()
    text = file.read()
    # loop to extract a set of two sentences
    for sentence in text.split('\n'):
        # creating two separate lists of the sentences
        # '.rstrip('.') only removes the last period in the sentence
        first_sentence.insert(len(first_sentence),
                              (sentence.split('\t')[1].lower()).rstrip('.'))
        second_sentence.insert(len(first_sentence),
                               (sentence.split('\t')[2].lower()).rstrip('.'))
        # inserting the score as a separate lists
        score.insert(len(first_sentence), (sentence.split('\t')[3]))

    # print(first_sentence)
    return first_sentence, second_sentence, score


def preprocess(fileName):

    first_sentence, second_sentence, score = readData(fileName)
    first_sentence_tokens = []
    second_sentence_tokens = []

    # tokenizing and tagging
    first_sentence_tags = []
    second_sentence_tags = []

    for sentence in first_sentence:
        tokens = nltk.word_tokenize(sentence)
        first_sentence_tokens.insert(len(first_sentence_tokens), tokens)
        first_sentence_tags.insert(
            len(first_sentence_tags), nltk.pos_tag(tokens))
        # print(first_sentence_tokens)

    for sentence in second_sentence:
        tokens = nltk.word_tokenize(sentence)
        second_sentence_tokens.insert(len(second_sentence_tokens), tokens)
        second_sentence_tags.insert(
            len(second_sentence_tags), nltk.pos_tag(tokens))

        # print(second_sentence_tokens)

    # lemmatizing
    first_sentence_lemmas = []
    second_sentence_lemmas = []
    lemmatizer = WordNetLemmatizer()
    for sentence in first_sentence_tokens:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        first_sentence_lemmas.insert(
            len(first_sentence_lemmas), sentence_components)

    for sentence in second_sentence_tokens:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        second_sentence_lemmas.insert(
            len(second_sentence_lemmas), sentence_components)

    return first_sentence, second_sentence, score, first_sentence_tokens, second_sentence_tokens


In [46]:
s1_arr_train, s2_arr_train, scores_train, s1_tokens_train, s2_tokens_train = preprocess("./data/train-set.txt")

## Data Inspection

In [48]:
score_list = [0,0,0,0,0,0]
for s in scores_train:
    score_list[int(s)] += 1
for i in range(0, len(score_list)):
    print(i, ': ', score_list[i], score_list[i])

[8, 37, 95, 310, 616, 418]


## Feature Engineering

This section includes all the code/functions to create features.

### Cosine Similarity (TF-IDF)

In [4]:
def calc_cosine_similarity(sentence1, sentence2):

    # remove the stopwords, transform into TF-IDF matrix, then
    tfidf_matrix = TfidfVectorizer(
        stop_words="english").fit_transform([sentence1, sentence2])
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
#     print(tfidf_matrix.toarray())

    cos_sim = cos_sim_matrix[0][1]

    return cos_sim

### Smooth Inverse Frequency (SIF)

In [5]:
def frequency_distribution(s1_tokens_array, s2_tokens_array):
    freq_dist = FreqDist()
    for i in range(len(s1_tokens_array)):
        for token in (s1_tokens_array[i] + s2_tokens_array[i]):
            freq_dist[token.lower()] += 1
    return freq_dist

In [6]:
freq_dist = frequency_distribution(s1_tokens_train, s2_tokens_train)
print(freq_dist.most_common(40))

[('the', 5169), (',', 3690), ('of', 2497), ('to', 2133), ('and', 1716), ('a', 1615), ('in', 1573), ('is', 891), ('that', 831), ('on', 820), ('for', 756), ('it', 587), ('this', 579), ('we', 531), ('with', 464), ('be', 459), ('by', 443), ('i', 425), ('which', 403), ('have', 384), ('not', 366), ('at', 343), ('as', 334), ('are', 333), ('has', 319), ('said', 316), ('was', 304), ('european', 287), ("'s", 280), ('from', 261), ('``', 252), ("''", 242), ('will', 233), ('.', 229), ('also', 223), ('its', 194), ('but', 193), ('would', 191), ('all', 188), ('percent', 187)]


In [7]:
def calc_sif_similarity(s1, s2, a = .001):
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform([s1, s2])
    X_arr = X.toarray()
    sif_matrix = []
    for i in range(0, len(X_arr)):
        sif_arr = []
        for j in range(0, len(X_arr[i])):
            word = vectorizer.get_feature_names()[j]
            w = a / (a + freq_dist[word])
            v = X_arr[i][j]
            sif_arr.append(v*w)
        sif_matrix.append(sif_arr)
    sif_cos_sim_matrix = cosine_similarity(sif_matrix, sif_matrix)
    sif_cos_sim = sif_cos_sim_matrix[0][1]
    return sif_cos_sim

In [8]:
calc_sif_similarity('I like some apples', 'I like the pears')

1.4515545128534995e-10

### Simple Overlap

Unique words that are in both sentences divided by the total number of words in both sentences. Does not include stop words.

In [9]:
stop_words = set(stopwords.words('english'))
tokenized_sentence_list = s1_tokens_train+s2_tokens_train
words_filtered = []

# print(words)

# looking through I've noticed there are a number of stop-words that can be added to the set
stop_words.add(',')
stop_words.add('``')
stop_words.add("n't")

for tsl in tokenized_sentence_list:
    for w in tsl:
        if w not in stop_words and w not in words_filtered:
            words_filtered.append(w)

In [10]:
def remove_stopwords(token_list):
    blank_list = []
    for w in token_list:
        if w not in stop_words:
            blank_list.append(w)
    return blank_list

In [11]:
def remove_duplicate_tokens(token_list):
    blank_list = []
    for w in token_list:
        if w not in blank_list:
            blank_list.append(w)
    return blank_list

In [12]:
def calc_basic_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    overlap = 0
    encountered_indexes = []
    for word in (s1_tokens+s2_tokens):
        try:
            word_index = words_filtered.index(word)
            if word_index in encountered_indexes: # we know we have found an overlap
                overlap += 1
            encountered_indexes.append(word_index)
        except ValueError:
            print(word + ' not found in lexicon. Skipping...')

    avg_sentence_len = len(s1_tokens+s2_tokens) / 2
    
    overlap_normlalized = overlap / avg_sentence_len
    return overlap, overlap_normlalized

### Synset Overlap

In [13]:
def calc_synset_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    print(s2_tokens)
    print(s1_tokens)

    s1_spread = []
    s2_spread = []
    
    for word in s1_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s1_spread:
                    s1_spread.append(syn_word)

    for word in s2_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s2_spread:
                    s2_spread.append(syn_word)         
    
    return calc_basic_overlap(s1_spread, s2_spread)
    
calc_synset_overlap(s1_tokens_train[0], s2_tokens_train[0])

['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'next', 'day', 'two']
['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'hoped', 'see', 'bidders', 'interested', 'individual', 'assets', 'team']
beginning not found in lexicon. Skipping...
root not found in lexicon. Skipping...
rootage not found in lexicon. Skipping...
source not found in lexicon. Skipping...
seed not found in lexicon. Skipping...
germ not found in lexicon. Skipping...
generator not found in lexicon. Skipping...
reservoir not found in lexicon. Skipping...
stopping_point not found in lexicon. Skipping...
finale not found in lexicon. Skipping...
finis not found in lexicon. Skipping...
finish not found in lexicon. Skipping...
conclusion not found in lexicon. Skipping...
closing_curtain not found in lexicon. Skipping...
close_up not found in lexicon. Skipping...
shut_down not found in lexicon. Skipping...
close_down not found in lexicon. Skipping...
come_toge

(46, 0.23173803526448364)

## Pipeline

In this section we run the data through the pipeline to get it into the form necessary to create our models.

In [33]:
def pipeline(s1_array, s2_array, s1_tokens, s2_tokens):
    # TODO add a check to ensure the lengths of these arrays are the same
    # or add the basic processing to pipeline
    data = []
    for i in range(0, len(s1_array)):
        cos_sim = calc_cosine_similarity(s1_array[i], s2_array[i])
        sif_sim = calc_sif_similarity(s1_array[i], s2_array[i])
        overlap, normalized_overlap = calc_basic_overlap(s1_tokens[i], s2_tokens[i])
#         syn_overlap, normalized_syn_overlap = calc_synset_overlap(s1_tokens[i], s2_tokens[i])
        data.append([i, normalized_overlap, sif_sim, cos_sim])
    return data

In [34]:
data = pipeline(s1_arr_train, s2_arr_train, s1_tokens_train, s2_tokens_train)
print(data[0:5])

[[0, 0.6428571428571429, 0.4043188683415115, 0.5949218057093537], [1, 0.631578947368421, 0.37040524322972224, 0.474330706497194], [2, 0.5, 0.1358693286767868, 0.392181175971253], [3, 0.7333333333333333, 0.6935512636502701, 0.668348418668298], [4, 0.24, 4.979960298599938e-10, 0.12170566815950139]]


In [35]:
scores_train[0:5]

['4', '4', '3', '3', '2']

## Models

In this section we fit our feature set to a model.

### Decision Tree

In [36]:
dt_classifier = DecisionTreeClassifier(random_state=14, max_depth=8)
dt_classifier.fit(data,scores_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=14,
            splitter='best')

In [37]:
print(f'Nodes: {dt_classifier.tree_.node_count}')
print(f'Max Depth: {dt_classifier.tree_.max_depth}')
print(f'Accuracy: {dt_classifier.score(data, scores_train)}')

Nodes: 229
Max Depth: 8
Accuracy: 0.6691374663072777


### XGBoost

In [38]:
s1_arr_dev, s2_arr_dev, scores_dev, s1_tokens_dev, s2_tokens_dev = preprocess("./data/dev-set.txt")
dev_data = pipeline(s1_arr_dev, s2_arr_dev)
dev_predictions = dt_classifier.predict(dev_data)

xgboost_model = xgb.XGBRegressor(booster='gbtree', 
                       n_estimators=1000,
                       n_jobs=4,
                       learning_rate=.05,
                       max_depth=3,
                       random_state=42,
                       gamma=.05,
                       early_stopping_rounds = 5)



In [None]:
xgboost_model.fit(data, scores_train,
#                   eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='logloss')

### Random Forest

In [39]:
rf_classifier = RandomForestClassifier(random_state=14, n_estimators=100)
rf_classifier.fit(data,scores_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=14, verbose=0, warm_start=False)

# Testing

In [40]:
s1_arr_dev, s2_arr_dev, scores_dev, s1_tokens_dev, s2_tokens_dev = preprocess("./data/dev-set.txt")
dev_data = pipeline(s1_arr_dev, s2_arr_dev, s1_tokens_dev, s2_tokens_dev)
dt_dev_predictions = dt_classifier.predict(dev_data)
rf_dev_predictions = rf_classifier.predict(dev_data)
dev_data[0:5]

aujourd not found in lexicon. Skipping...
hui not found in lexicon. Skipping...
luck not found in lexicon. Skipping...
references not found in lexicon. Skipping...
paragraphs not found in lexicon. Skipping...
proposing not found in lexicon. Skipping...
references not found in lexicon. Skipping...
paragraphs not found in lexicon. Skipping...
allies not found in lexicon. Skipping...
fervent not found in lexicon. Skipping...
allies not found in lexicon. Skipping...
5.30 not found in lexicon. Skipping...
5.30pm not found in lexicon. Skipping...
fishermen not found in lexicon. Skipping...
inactive not found in lexicon. Skipping...
tired not found in lexicon. Skipping...
fishermen not found in lexicon. Skipping...
inactive not found in lexicon. Skipping...
tired not found in lexicon. Skipping...
revert not found in lexicon. Skipping...
increases not found in lexicon. Skipping...
expense not found in lexicon. Skipping...
small not found in lexicon. Skipping...
augmenting not found in lexicon.

wide not found in lexicon. Skipping...
arbitrariness not found in lexicon. Skipping...
opens not found in lexicon. Skipping...
arbitrary not found in lexicon. Skipping...
tunisia not found in lexicon. Skipping...
tunisia not found in lexicon. Skipping...
fishermen not found in lexicon. Skipping...
inactive not found in lexicon. Skipping...
tired not found in lexicon. Skipping...
fishermen not found in lexicon. Skipping...
inactive not found in lexicon. Skipping...
tired not found in lexicon. Skipping...
disappointment not found in lexicon. Skipping...
attends not found in lexicon. Skipping...
sittings not found in lexicon. Skipping...
faithfully not found in lexicon. Skipping...
increases not found in lexicon. Skipping...
expense not found in lexicon. Skipping...
small not found in lexicon. Skipping...
expense not found in lexicon. Skipping...
smaller not found in lexicon. Skipping...
scarcely not found in lexicon. Skipping...
comparable not found in lexicon. Skipping...
transferable n

losers not found in lexicon. Skipping...
competitiveness not found in lexicon. Skipping...
régresseront not found in lexicon. Skipping...
van not found in lexicon. Skipping...
orden not found in lexicon. Skipping...
a5-0241/2000 not found in lexicon. Skipping...
horsebox not found in lexicon. Skipping...
orden not found in lexicon. Skipping...
a5-0241 not found in lexicon. Skipping...
accumulated not found in lexicon. Skipping...
hatred not found in lexicon. Skipping...
grudge not found in lexicon. Skipping...
tunisia not found in lexicon. Skipping...
tunisia not found in lexicon. Skipping...
pontificate not found in lexicon. Skipping...
losers not found in lexicon. Skipping...
régresseront not found in lexicon. Skipping...
competitiveness not found in lexicon. Skipping...
unanimous not found in lexicon. Skipping...
inherent not found in lexicon. Skipping...
incapacity not found in lexicon. Skipping...
norm not found in lexicon. Skipping...
latent not found in lexicon. Skipping...
inab

cashman not found in lexicon. Skipping...
summarised not found in lexicon. Skipping...
bureaucracy not found in lexicon. Skipping...
cashman not found in lexicon. Skipping...
summarised not found in lexicon. Skipping...
follows not found in lexicon. Skipping...
bureaucracy not found in lexicon. Skipping...
references not found in lexicon. Skipping...
paragraphs not found in lexicon. Skipping...
references not found in lexicon. Skipping...
paragraphs not found in lexicon. Skipping...
appropriate not found in lexicon. Skipping...
allies not found in lexicon. Skipping...
fervent not found in lexicon. Skipping...
anxious not found in lexicon. Skipping...
allies not found in lexicon. Skipping...
fervent not found in lexicon. Skipping...
selective not found in lexicon. Skipping...
grass not found in lexicon. Skipping...
subsidy not found in lexicon. Skipping...
punctual not found in lexicon. Skipping...
grass not found in lexicon. Skipping...
latvia not found in lexicon. Skipping...
latvia n

scarcely not found in lexicon. Skipping...
comparable not found in lexicon. Skipping...
transferable not found in lexicon. Skipping...
transferable not found in lexicon. Skipping...
gets not found in lexicon. Skipping...
fast not found in lexicon. Skipping...
increases not found in lexicon. Skipping...
expense not found in lexicon. Skipping...
small not found in lexicon. Skipping...
expense not found in lexicon. Skipping...
smaller not found in lexicon. Skipping...
balance not found in lexicon. Skipping...
balance not found in lexicon. Skipping...
5.30 not found in lexicon. Skipping...
5.30 not found in lexicon. Skipping...
latvia not found in lexicon. Skipping...
latvia not found in lexicon. Skipping...
pontificate not found in lexicon. Skipping...
pride not found in lexicon. Skipping...
references not found in lexicon. Skipping...
paragraphs not found in lexicon. Skipping...
references not found in lexicon. Skipping...
paragraphs not found in lexicon. Skipping...
unanimous not found 

adopts not found in lexicon. Skipping...
increases not found in lexicon. Skipping...
expense not found in lexicon. Skipping...
small not found in lexicon. Skipping...
detriment not found in lexicon. Skipping...
small not found in lexicon. Skipping...
attends not found in lexicon. Skipping...
sittings not found in lexicon. Skipping...
faithfully not found in lexicon. Skipping...
wide not found in lexicon. Skipping...
arbitrariness not found in lexicon. Skipping...
opens not found in lexicon. Skipping...
arbitrary not found in lexicon. Skipping...
underlying not found in lexicon. Skipping...
located not found in lexicon. Skipping...
modification not found in lexicon. Skipping...
maij-weggen not found in lexicon. Skipping...
a5-0323/2000 not found in lexicon. Skipping...
maij-weggen not found in lexicon. Skipping...
a5-0323/2000 not found in lexicon. Skipping...
allies not found in lexicon. Skipping...
fervent not found in lexicon. Skipping...
recalling not found in lexicon. Skipping...
a

gray not found in lexicon. Skipping...
follows not found in lexicon. Skipping...
youths not found in lexicon. Skipping...
62 not found in lexicon. Skipping...
62 not found in lexicon. Skipping...
missiles not found in lexicon. Skipping...
gaza not found in lexicon. Skipping...
rained not found in lexicon. Skipping...
missiles not found in lexicon. Skipping...
gaza not found in lexicon. Skipping...
recruiters not found in lexicon. Skipping...
finding not found in lexicon. Skipping...
seasoned not found in lexicon. Skipping...
merchandising not found in lexicon. Skipping...
trickiest not found in lexicon. Skipping...
hires not found in lexicon. Skipping...
dearth not found in lexicon. Skipping...
recruiters not found in lexicon. Skipping...
finding not found in lexicon. Skipping...
seasoned not found in lexicon. Skipping...
merchandising not found in lexicon. Skipping...
trickiest not found in lexicon. Skipping...
hires not found in lexicon. Skipping...
aaa not found in lexicon. Skipping

none not found in lexicon. Skipping...
sin not found in lexicon. Skipping...
gilroy not found in lexicon. Skipping...
gehring not found in lexicon. Skipping...
cooperative not found in lexicon. Skipping...
gehring not found in lexicon. Skipping...
cooperative not found in lexicon. Skipping...
scribner not found in lexicon. Skipping...
portage not found in lexicon. Skipping...
bay not found in lexicon. Skipping...
lived not found in lexicon. Skipping...
houseboat not found in lexicon. Skipping...
kayaker not found in lexicon. Skipping...
scribner not found in lexicon. Skipping...
houseboat not found in lexicon. Skipping...
portage not found in lexicon. Skipping...
bay not found in lexicon. Skipping...
lunch not found in lexicon. Skipping...
ethel not found in lexicon. Skipping...
passer-by not found in lexicon. Skipping...
ben not found in lexicon. Skipping...
hiding not found in lexicon. Skipping...
dirt not found in lexicon. Skipping...
fork not found in lexicon. Skipping...
taped not

attempting not found in lexicon. Skipping...
arrange not found in lexicon. Skipping...
dnc not found in lexicon. Skipping...
mcauliffe not found in lexicon. Skipping...
layoffs not found in lexicon. Skipping...
arrange not found in lexicon. Skipping...
mcauliffe not found in lexicon. Skipping...
layoffs not found in lexicon. Skipping...
staffers not found in lexicon. Skipping...
headquarters not found in lexicon. Skipping...
giovanni not found in lexicon. Skipping...
bisignani not found in lexicon. Skipping...
geneva-based not found in lexicon. Skipping...
geneva-based not found in lexicon. Skipping...
transport not found in lexicon. Skipping...
director-general not found in lexicon. Skipping...
giovanni not found in lexicon. Skipping...
bisignani not found in lexicon. Skipping...
germany not found in lexicon. Skipping...
poland not found in lexicon. Skipping...
psi not found in lexicon. Skipping...
germany not found in lexicon. Skipping...
poland not found in lexicon. Skipping...
116.

link not found in lexicon. Skipping...
operated not found in lexicon. Skipping...
e2 not found in lexicon. Skipping...
hollins not found in lexicon. Skipping...
royce not found in lexicon. Skipping...
link not found in lexicon. Skipping...
operated not found in lexicon. Skipping...
e2 not found in lexicon. Skipping...
135.13 not found in lexicon. Skipping...
135.26 not found in lexicon. Skipping...
castle not found in lexicon. Skipping...
delaware not found in lexicon. Skipping...
patient not found in lexicon. Skipping...
62-year-old not found in lexicon. Skipping...
babylon not found in lexicon. Skipping...
suffolk not found in lexicon. Skipping...
baucus not found in lexicon. Skipping...
disparity not found in lexicon. Skipping...
baucus not found in lexicon. Skipping...
reaffirmed not found in lexicon. Skipping...
dominated not found in lexicon. Skipping...
steelmaker not found in lexicon. Skipping...
16.2 not found in lexicon. Skipping...
ak not found in lexicon. Skipping...
16.2 n

rollover not found in lexicon. Skipping...
crashes not found in lexicon. Skipping...
82 not found in lexicon. Skipping...
rollover not found in lexicon. Skipping...
crashes not found in lexicon. Skipping...
82 not found in lexicon. Skipping...
6.1 not found in lexicon. Skipping...
discounted not found in lexicon. Skipping...
slight not found in lexicon. Skipping...
6.1 not found in lexicon. Skipping...
pound not found in lexicon. Skipping...
highs not found in lexicon. Skipping...
1.6789 not found in lexicon. Skipping...
pound not found in lexicon. Skipping...
flexed not found in lexicon. Skipping...
1.6672 not found in lexicon. Skipping...
schools not found in lexicon. Skipping...
joel not found in lexicon. Skipping...
accountability not found in lexicon. Skipping...
schools not found in lexicon. Skipping...
joel not found in lexicon. Skipping...
skin not found in lexicon. Skipping...
tripled not found in lexicon. Skipping...
1950s not found in lexicon. Skipping...
norway not found in

instant not found in lexicon. Skipping...
messaging not found in lexicon. Skipping...
communicating not found in lexicon. Skipping...
inputting not found in lexicon. Skipping...
instant-messaging not found in lexicon. Skipping...
conversation not found in lexicon. Skipping...
typing not found in lexicon. Skipping...
o'neill not found in lexicon. Skipping...
hyundai not found in lexicon. Skipping...
successor not found in lexicon. Skipping...
o'neill not found in lexicon. Skipping...
hyundai not found in lexicon. Skipping...
habits not found in lexicon. Skipping...
seriously not found in lexicon. Skipping...
rethink not found in lexicon. Skipping...
business-as-usual not found in lexicon. Skipping...
avoid not found in lexicon. Skipping...
complacency not found in lexicon. Skipping...
habits not found in lexicon. Skipping...
seriously not found in lexicon. Skipping...
rethink not found in lexicon. Skipping...
business-as-usual not found in lexicon. Skipping...
four-month not found in le

immediate not found in lexicon. Skipping...
immediate not found in lexicon. Skipping...
aug. not found in lexicon. Skipping...
aug. not found in lexicon. Skipping...
juvenile not found in lexicon. Skipping...
yeates not found in lexicon. Skipping...
protective not found in lexicon. Skipping...
gillespie not found in lexicon. Skipping...
leslie not found in lexicon. Skipping...
moonves not found in lexicon. Skipping...
asking not found in lexicon. Skipping...
historical not found in lexicon. Skipping...
disclaimer not found in lexicon. Skipping...
ed not found in lexicon. Skipping...
gillespie not found in lexicon. Skipping...
leslie not found in lexicon. Skipping...
moonves not found in lexicon. Skipping...
white not found in lexicon. Skipping...
ari not found in lexicon. Skipping...
fleischer not found in lexicon. Skipping...
iranians not found in lexicon. Skipping...
capturing not found in lexicon. Skipping...
al-qa'ida not found in lexicon. Skipping...
insufficient not found in lexi

chosen not found in lexicon. Skipping...
commercial not found in lexicon. Skipping...
aircraft not found in lexicon. Skipping...
shoulder-fired not found in lexicon. Skipping...
thwart not found in lexicon. Skipping...
shoulder-fired not found in lexicon. Skipping...
commercial not found in lexicon. Skipping...
aircraft not found in lexicon. Skipping...
tamil not found in lexicon. Skipping...
blame not found in lexicon. Skipping...
liberation not found in lexicon. Skipping...
tigers not found in lexicon. Skipping...
eelam not found in lexicon. Skipping...
ltte not found in lexicon. Skipping...
killings not found in lexicon. Skipping...
revive not found in lexicon. Skipping...
liberation not found in lexicon. Skipping...
tigers not found in lexicon. Skipping...
tamil not found in lexicon. Skipping...
eelam not found in lexicon. Skipping...
ltte not found in lexicon. Skipping...
britons not found in lexicon. Skipping...
sandy not found in lexicon. Skipping...
mitchell not found in lexico

glover not found in lexicon. Skipping...
peculiar not found in lexicon. Skipping...
slipknot not found in lexicon. Skipping...
ligature not found in lexicon. Skipping...
strangle not found in lexicon. Skipping...
glover not found in lexicon. Skipping...
peculiar not found in lexicon. Skipping...
slipknot not found in lexicon. Skipping...
coaxial not found in lexicon. Skipping...
extension not found in lexicon. Skipping...
strangle not found in lexicon. Skipping...
jane not found in lexicon. Skipping...
doe not found in lexicon. Skipping...
verizon not found in lexicon. Skipping...
withhold not found in lexicon. Skipping...
subpoena not found in lexicon. Skipping...
jane not found in lexicon. Skipping...
doe not found in lexicon. Skipping...
subpoena not found in lexicon. Skipping...
verizon not found in lexicon. Skipping...
withhold not found in lexicon. Skipping...
radar not found in lexicon. Skipping...
pitch not found in lexicon. Skipping...
supportive not found in lexicon. Skipping

yeager not found in lexicon. Skipping...
suspect not found in lexicon. Skipping...
tendencies not found in lexicon. Skipping...
offender not found in lexicon. Skipping...
yeager not found in lexicon. Skipping...
suspect not found in lexicon. Skipping...
tendencies not found in lexicon. Skipping...
offender not found in lexicon. Skipping...
55 not found in lexicon. Skipping...
epidemic not found in lexicon. Skipping...
fastest-growing not found in lexicon. Skipping...
55 not found in lexicon. Skipping...
epidemic not found in lexicon. Skipping...
fastest-growing not found in lexicon. Skipping...
sammy not found in lexicon. Skipping...
sosa not found in lexicon. Skipping...
eight not found in lexicon. Skipping...
baseball not found in lexicon. Skipping...
corked not found in lexicon. Skipping...
sammy not found in lexicon. Skipping...
sosa not found in lexicon. Skipping...
eight not found in lexicon. Skipping...
baseball not found in lexicon. Skipping...
corked not found in lexicon. Skip

616.5 not found in lexicon. Skipping...
610.6 not found in lexicon. Skipping...
tad not found in lexicon. Skipping...
610.6 not found in lexicon. Skipping...
616.5 not found in lexicon. Skipping...
8.79 not found in lexicon. Skipping...
0.96 not found in lexicon. Skipping...
929.06 not found in lexicon. Skipping...
1,516 not found in lexicon. Skipping...
deutsch not found in lexicon. Skipping...
d-fla. not found in lexicon. Skipping...
deutsch not found in lexicon. Skipping...
d-fla. not found in lexicon. Skipping...
iowa not found in lexicon. Skipping...
athletic not found in lexicon. Skipping...
van not found in lexicon. Skipping...
velde not found in lexicon. Skipping...
iowa not found in lexicon. Skipping...
athletic not found in lexicon. Skipping...
van not found in lexicon. Skipping...
velde not found in lexicon. Skipping...
lebo not found in lexicon. Skipping...
probable not found in lexicon. Skipping...
gta not found in lexicon. Skipping...
globally not found in lexicon. Skippi

teacher not found in lexicon. Skipping...
master not found in lexicon. Skipping...
kelley not found in lexicon. Skipping...
salary not found in lexicon. Skipping...
65,000 not found in lexicon. Skipping...
30th not found in lexicon. Skipping...
teacher not found in lexicon. Skipping...
master not found in lexicon. Skipping...
kelley not found in lexicon. Skipping...
30th not found in lexicon. Skipping...
teaching not found in lexicon. Skipping...
65,000 not found in lexicon. Skipping...
vienna not found in lexicon. Skipping...
iaea not found in lexicon. Skipping...
elbaradei not found in lexicon. Skipping...
vienna not found in lexicon. Skipping...
atomic not found in lexicon. Skipping...
elbaradei not found in lexicon. Skipping...
epidemic not found in lexicon. Skipping...
mainland not found in lexicon. Skipping...
guangdong not found in lexicon. Skipping...
truthfully not found in lexicon. Skipping...
prc not found in lexicon. Skipping...
epidemic not found in lexicon. Skipping...
gu

malyasian not found in lexicon. Skipping...
diplomat not found in lexicon. Skipping...
met not found in lexicon. Skipping...
suu not found in lexicon. Skipping...
kyi not found in lexicon. Skipping...
lakeside not found in lexicon. Skipping...
yangon not found in lexicon. Skipping...
razali not found in lexicon. Skipping...
ismail not found in lexicon. Skipping...
met not found in lexicon. Skipping...
90 not found in lexicon. Skipping...
suu not found in lexicon. Skipping...
kyi not found in lexicon. Skipping...
prize not found in lexicon. Skipping...
lakeside not found in lexicon. Skipping...
flamboyant not found in lexicon. Skipping...
entrepreneur not found in lexicon. Skipping...
flagged not found in lexicon. Skipping...
tourism not found in lexicon. Skipping...
hockey not found in lexicon. Skipping...
speaking not found in lexicon. Skipping...
tourism not found in lexicon. Skipping...
hockey not found in lexicon. Skipping...
greenspan not found in lexicon. Skipping...
embarking no

scrambling not found in lexicon. Skipping...
vast not found in lexicon. Skipping...
countryside not found in lexicon. Skipping...
live not found in lexicon. Skipping...
scrambling not found in lexicon. Skipping...
countryside not found in lexicon. Skipping...
live not found in lexicon. Skipping...
environment not found in lexicon. Skipping...
88 not found in lexicon. Skipping...
abortions not found in lexicon. Skipping...
girls not found in lexicon. Skipping...
younger not found in lexicon. Skipping...
environment not found in lexicon. Skipping...
11,844 not found in lexicon. Skipping...
abortions not found in lexicon. Skipping...
tightened not found in lexicon. Skipping...
spotty not found in lexicon. Skipping...
labored not found in lexicon. Skipping...
feet not found in lexicon. Skipping...
outage not found in lexicon. Skipping...
feather-light not found in lexicon. Skipping...
regrouped not found in lexicon. Skipping...
biggest-ever not found in lexicon. Skipping...
outage not foun

payroll not found in lexicon. Skipping...
accurate not found in lexicon. Skipping...
picture not found in lexicon. Skipping...
larger not found in lexicon. Skipping...
sample not found in lexicon. Skipping...
payroll not found in lexicon. Skipping...
accurate not found in lexicon. Skipping...
picture not found in lexicon. Skipping...
larger not found in lexicon. Skipping...
sample not found in lexicon. Skipping...
alive not found in lexicon. Skipping...
husband not found in lexicon. Skipping...
artificially not found in lexicon. Skipping...
schiavo not found in lexicon. Skipping...
alive not found in lexicon. Skipping...
artificially not found in lexicon. Skipping...
inter-radio not found in lexicon. Skipping...
qualify not found in lexicon. Skipping...
epidemic not found in lexicon. Skipping...
qualify not found in lexicon. Skipping...
epidemic not found in lexicon. Skipping...
jean-francois not found in lexicon. Skipping...
mattei not found in lexicon. Skipping...
inter not found in 

pared not found in lexicon. Skipping...
compensate not found in lexicon. Skipping...
sluggish not found in lexicon. Skipping...
sluggish not found in lexicon. Skipping...
pared not found in lexicon. Skipping...
drafted not found in lexicon. Skipping...
chris not found in lexicon. Skipping...
lubanski not found in lexicon. Skipping...
kennedy-kenrick not found in lexicon. Skipping...
royals not found in lexicon. Skipping...
chose not found in lexicon. Skipping...
chris not found in lexicon. Skipping...
lubanski not found in lexicon. Skipping...
.528 not found in lexicon. Skipping...
sterling not found in lexicon. Skipping...
1.5875 not found in lexicon. Skipping...
gbp= not found in lexicon. Skipping...
115.97 not found in lexicon. Skipping...
41.61 not found in lexicon. Skipping...
0.44 not found in lexicon. Skipping...
9,415.82 not found in lexicon. Skipping...
41.61 not found in lexicon. Skipping...
beginning not found in lexicon. Skipping...
crosby not found in lexicon. Skipping...


chips not found in lexicon. Skipping...
drained not found in lexicon. Skipping...
optimism not found in lexicon. Skipping...
driven not found in lexicon. Skipping...
drained not found in lexicon. Skipping...
optimism not found in lexicon. Skipping...
driven not found in lexicon. Skipping...
5.91 not found in lexicon. Skipping...
0.29 not found in lexicon. Skipping...
2,053.27 not found in lexicon. Skipping...
technology-focused not found in lexicon. Skipping...
0.30 not found in lexicon. Skipping...
2,053 not found in lexicon. Skipping...
erasing not found in lexicon. Skipping...
losses not found in lexicon. Skipping...
tokyo not found in lexicon. Skipping...
jp:8306 not found in lexicon. Skipping...
3,000 not found in lexicon. Skipping...
0.65 not found in lexicon. Skipping...
456,000 not found in lexicon. Skipping...
sumitomo not found in lexicon. Skipping...
mitsui not found in lexicon. Skipping...
jp:8316 not found in lexicon. Skipping...
198,000 not found in lexicon. Skipping...
k

yucaipa not found in lexicon. Skipping...
dominick not found in lexicon. Skipping...
yucaipa not found in lexicon. Skipping...
bought not found in lexicon. Skipping...
dominick not found in lexicon. Skipping...
693 not found in lexicon. Skipping...
1.8 not found in lexicon. Skipping...
tenth not found in lexicon. Skipping...
6.1 not found in lexicon. Skipping...
1994 not found in lexicon. Skipping...
ticked not found in lexicon. Skipping...
6.1 not found in lexicon. Skipping...
delainey not found in lexicon. Skipping...
dave not found in lexicon. Skipping...
delainey not found in lexicon. Skipping...
villafranca not found in lexicon. Skipping...
austin not found in lexicon. Skipping...
ratcliffe not found in lexicon. Skipping...
laredo not found in lexicon. Skipping...
pete not found in lexicon. Skipping...
slover not found in lexicon. Skipping...
laredo not found in lexicon. Skipping...
gromer not found in lexicon. Skipping...
jeffers not found in lexicon. Skipping...
raise not found 

[[0, 0.42105263157894735, 0.0001332665010484381, 0.4111217451482082],
 [1, 0.5714285714285714, 0.8164963796577869, 0.7168117414430619],
 [2, 0.2857142857142857, 0.707106779665172, 0.34464214103805474],
 [3, 0.6, 9.042867432144338e-05, 0.4112070550676187],
 [4, 0.25, 1.0, 1.0]]

In [41]:
# make sure our lengths match up
print(len(scores_dev))
print(len(dt_dev_predictions))
print(len(rf_dev_predictions))

1209
1209
1209


In [42]:
def calc_accuracy(predictions, scores):
    correct = 0
    for i in range(0, len(predictions)): 
        if predictions[i] == scores[i]:
            correct += 1
    return correct / len(predictions) 



    
print('DECISION TREE: ', calc_accuracy(dt_dev_predictions, scores_dev))
print('RANDOM FOREST: ', calc_accuracy(rf_dev_predictions, scores_dev))

DECISION TREE:  0.2109181141439206
RANDOM FOREST:  0.28287841191067
