# Imports

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from collections import Counter
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# XGBOOST
import xgboost as xgb
# from xgboost import XGBClassifier

# SPACY IMPORT
import spacy
nlp = spacy.load("en_core_web_lg")

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
from xgboost import XGBClassifier

# Read Data and PreProcess

In [3]:
# same readData from STS.py
def readData(fileName):

    s1 = []
    s2 = []
    score = []
    file = open(fileName, encoding="utf8")
    text = file.readline()
    text = file.read()
    
    # loop to extract a set of two sentences
    for sentence in text.split('\n'):

        # creating two separate lists of the sentences
        # '.rstrip('.') only removes the last period in the sentence
        
        s1.insert(len(s1), (sentence.split('\t')[1].lower()).rstrip('.'))
        s2.insert(len(s1), (sentence.split('\t')[2].lower()).rstrip('.'))
        
        # inserting the score as a separate lists
        score.insert(len(s1), (sentence.split('\t')[3]))

    # print(s1)
    return s1, s2, score


def preprocess(fileName):

    s1, s2, scores = readData(fileName)
    s1_toks = []
    s2_toks = []

    # tokenizing and tagging
    s1_tags = []
    s2_tags = []

    for sentence in s1:
        tokens = nltk.word_tokenize(sentence)
        s1_toks.insert(len(s1_toks), tokens)
        s1_tags.insert(
            len(s1_tags), nltk.pos_tag(tokens))

    for sentence in s2:
        tokens = nltk.word_tokenize(sentence)
        s2_toks.insert(len(s2_toks), tokens)
        s2_tags.insert(
            len(s2_tags), nltk.pos_tag(tokens))
    
    # Remove the unnecessary tuple and keep just the tags
    for i, tag_list in enumerate(s1_tags):
        s1_tags[i] = [tup[1] for tup in tag_list]
    for i, tag_list in enumerate(s2_tags):
        s2_tags[i] = [tup[1] for tup in tag_list]

    # lemmatizing
    s1_lemmas = []
    s2_lemmas = []
    lemmatizer = WordNetLemmatizer()
    for sentence in s1_toks:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        s1_lemmas.insert(
            len(s1_lemmas), sentence_components)

    for sentence in s2_toks:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        s2_lemmas.insert(
            len(s2_lemmas), sentence_components)

        
    # Zipping it all together into one object for each word
    s1_word_lists = []
    s2_word_lists = []
    
    for tok_list, lem_list, tag_list in zip(s1_toks, s1_lemmas, s1_tags):
        sentence_words = []
        for tok, lem, tag in zip(tok_list, lem_list, tag_list):
            word = {}
            word['tok'] = tok
            word['lem'] = lem
            word['tag'] = tag
            sentence_words.append(word)
        s1_word_lists.append(sentence_words) 
        
    for tok_list, lem_list, tag_list in zip(s2_toks, s2_lemmas, s2_tags):
        sentence_words = []
        for tok, lem, tag in zip(tok_list, lem_list, tag_list):
            word = {}
            word['tok'] = tok
            word['lem'] = lem
            word['tag'] = tag
            sentence_words.append(word)
        s2_word_lists.append(sentence_words)  
              
    
    # Create a corpus object to represent our corpus
    corpus = {}
    corpus["s1"] = {}
    corpus["s2"] = {}
    corpus['scores'] = [int(i) for i in scores]
    
    corpus["s1"]["sentences"] = s1
    corpus["s2"]["sentences"] = s2
    
    corpus["s1"]["tokens"] = s1_toks
    corpus["s2"]["tokens"] = s2_toks
    
    corpus["s1"]["lemmas"] = s1_lemmas
    corpus["s2"]["lemmas"] = s2_lemmas
    
    corpus["s1"]["tags"] = s1_tags
    corpus["s2"]["tags"] = s2_tags
    
    corpus["s1"]["words"] = s1_word_lists
    corpus["s2"]["words"] = s2_word_lists
    
    return corpus


In [4]:
train_data = preprocess("./data/train-set.txt")
#train_data = preprocess("./data/time-set.txt") # only 100 sentences

In [5]:
train_data['s2']['words'][0][0:5]

[{'tok': 'but', 'lem': 'but', 'tag': 'CC'},
 {'tok': 'other', 'lem': 'other', 'tag': 'JJ'},
 {'tok': 'sources', 'lem': 'source', 'tag': 'NNS'},
 {'tok': 'close', 'lem': 'close', 'tag': 'RB'},
 {'tok': 'to', 'lem': 'to', 'tag': 'TO'}]

### Stop Words

In [6]:
stop_words = set(stopwords.words('english'))
tokenized_sentence_list = train_data['s1']['tokens']+train_data['s2']['tokens']
words_filtered = []

# print(words)

# looking through I've noticed there are a number of stop-words that can be added to the set
stop_words.add(',')
stop_words.add('``')
stop_words.add("n't")

for tsl in tokenized_sentence_list:
    for w in tsl:
        if w not in stop_words and w not in words_filtered:
            words_filtered.append(w)

# Data Inspection

### Frequency Distribution

In [7]:
def frequency_distribution(corpus):
    s1_toks = corpus['s1']['tokens']
    s2_toks = corpus['s2']['tokens']    
    freq_dist = FreqDist()
    for i in range(len(s1_toks)):
        for token in (s1_toks[i] + s2_toks[i]):
            freq_dist[token.lower()] += 1
    return freq_dist

In [8]:
freq_dist = frequency_distribution(train_data)

print(freq_dist.most_common(40))

[('the', 5169), (',', 3690), ('of', 2497), ('to', 2133), ('and', 1716), ('a', 1615), ('in', 1573), ('is', 891), ('that', 831), ('on', 820), ('for', 756), ('it', 587), ('this', 579), ('we', 531), ('with', 464), ('be', 459), ('by', 443), ('i', 425), ('which', 403), ('have', 384), ('not', 366), ('at', 343), ('as', 334), ('are', 333), ('has', 319), ('said', 316), ('was', 304), ('european', 287), ("'s", 280), ('from', 261), ('``', 252), ("''", 242), ('will', 233), ('.', 229), ('also', 223), ('its', 194), ('but', 193), ('would', 191), ('all', 188), ('percent', 187)]


### Score Distribution

# TODO
this data is super imbalanced. we likely need to balance it in preprocessing. `https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html`

In [9]:
score_list = [0,0,0,0,0,0]
for s in train_data['scores']:
    score_list[int(s)] += 1
for i in range(0, len(score_list)):
    print("% 1d: % 4d % 6.2f per" %(i, score_list[i], 100*score_list[i]/len(train_data['scores']))) 

 0:    8   0.54 per
 1:   37   2.49 per
 2:   95   6.40 per
 3:  310  20.89 per
 4:  616  41.51 per
 5:  418  28.17 per


## Feature Engineering

This section includes all the code/functions to create features.

### Helper Functions

In [10]:
def remove_duplicate_tokens(token_list):
    blank_list = []
    for w in token_list:
        if w not in blank_list:
            blank_list.append(w)
    return blank_list

In [11]:
def remove_stopwords(token_list):
    blank_list = []
    for w in token_list:
        if w not in stop_words:
            blank_list.append(w)
    return blank_list

### Cosine Similarity (TF-IDF)

This is the same as the spacy similarity. This one is probably less accurate though as I don't believe it's trained from the GloVe w2v model.

In [12]:
def calc_cos_sim(s1, s2):

    # remove the stopwords, transform into TF-IDF matrix, then
    tfidf_matrix = TfidfVectorizer(
        stop_words="english").fit_transform([s1, s2])
    
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # print(tfidf_matrix.toarray())

    cos_sim = cos_sim_matrix[0][1]

    return cos_sim

In [13]:
calc_cos_sim('I like some apples', 'I like the pears')

0.33609692727625756

### Spacy Similarity

In [14]:
def calc_spacy_sim(s1, s2):
    s2 = nlp(s2)
    s1 = nlp(s1)
    return s1.similarity(s2)


In [15]:
calc_spacy_sim('I like some apples', 'I like the pears')

0.9361166303666173

### Smooth Inverse Frequency (SIF)

In [16]:
def calc_sif_sim(s1, s2, a = .001):
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform([s1, s2])
    X_arr = X.toarray()
    sif_matrix = []
    for i in range(0, len(X_arr)):
        sif_arr = []
        for j in range(0, len(X_arr[i])):
            word = vectorizer.get_feature_names()[j]
            w = a / (a + freq_dist[word])
            v = X_arr[i][j]
            sif_arr.append(v*w)
        sif_matrix.append(sif_arr)
    sif_cos_sim_matrix = cosine_similarity(sif_matrix, sif_matrix)
    sif_cos_sim = sif_cos_sim_matrix[0][1]
    return sif_cos_sim

In [17]:
calc_sif_sim('I like some apples', 'I like the pears')

1.4515545128534995e-10

### Simple Overlap

Unique words that are in both sentences divided by the total number of words in both sentences. Does not include stop words.

In [18]:
def calc_basic_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    overlap = 0
    encountered_indexes = []
    for word in (s1_tokens+s2_tokens):
        try:
            word_index = words_filtered.index(word)
            if word_index in encountered_indexes: # we know we have found an overlap
                overlap += 1
            encountered_indexes.append(word_index)
        except ValueError:
            # print(word + ' not found in lexicon. Skipping...')
            continue

    avg_sentence_len = len(s1_tokens+s2_tokens) / 2
    
    overlap_normlalized = overlap / avg_sentence_len
    return overlap, overlap_normlalized

### Synset Overlap

# TODO * * * *

We may be able to incorporate POS and dependency parsing here as right now i'm just taking the first synset.

In [19]:
def calc_synset_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
#     print(s2_tokens)
#     print(s1_tokens)

    s1_spread = []
    s2_spread = []
    
    for word in s1_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s1_spread:
                    s1_spread.append(syn_word)

    for word in s2_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s2_spread:
                    s2_spread.append(syn_word)         
    
    return calc_basic_overlap(s1_spread, s2_spread)
    
calc_synset_overlap(train_data['s1']['tokens'][0], train_data['s2']['tokens'][0])

(46, 0.23173803526448364)

In [20]:
def remove_duplicate_words(word_list):
    blank_list = []
    tokens_seen = []
    for w in word_list:
        if w['lem'] not in tokens_seen:
            blank_list.append(w)
            tokens_seen.append(w['lem']) # mark that we've seen this base word
    return blank_list

def calc_synset_overlap_by_pos(s1_words, s2_words):
    s1_spread = []
    s2_spread = []
    
    for word in s1_words:
        if word['lem'] in stop_words:
            continue;
        print(word['tag'])
        for synset in wn.synsets(word['lem']):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s1_spread:
                    s1_spread.append(syn_word)

    for word in s2_words:
        if word['lem'] in stop_words:
            continue;
        print(word['tag'])
        for synset in wn.synsets(word['lem']):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s2_spread:
                    s2_spread.append(syn_word)         
    
    return calc_basic_overlap(s1_spread, s2_spread)
    
calc_synset_overlap_by_pos(train_data['s1']['words'][0], train_data['s2']['words'][0])
# train_data['s2']['words'][0]
# wn.synsets('dog')
# remove_duplicate_words(train_data['s1']['words'][0])

NNS
RB
NN
VBD
NN
VBD
VBG
NN
JJ
NNS
VBD
VB
NNS
JJ
JJ
NNS
VBP
NNS
RB
NN
VBD
NN
VBD
VBG
NN
JJ
NNS
JJ
NN
CD


(46, 0.22885572139303484)

## Dependency Parsing Features

In [21]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [22]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [23]:
# Similarity of roots of noun phrases in a sentence

def get_noun_chunk_roots(sentence):
    noun_chunk_roots = []
    doc = nlp(sentence)
    for chunk in doc.noun_chunks:
        noun_chunk_roots.append(chunk.root.head.text)
    return noun_chunk_roots

t1 = train_data['s1']['sentences'][0]
t2 = train_data['s2']['sentences'][0]

print(get_noun_chunk_roots(t1))
print(get_noun_chunk_roots(t2))

def calc_noun_chunk_root_overlap(s1, s2):
    s1_roots = get_noun_chunk_roots(s1)
    s2_roots = get_noun_chunk_roots(s2)    
    
    overlap = 0
    roots_seen = []
    for r in s1_roots + s2_roots:
        if r in roots_seen:
            overlap += 1
        else:
            roots_seen.append(r)
    return (overlap*2)/ len(s1_roots+s2_roots)

print(calc_noun_chunk_root_overlap(t1, t2))

['said', 'to', 'keeping', 'keeping', 'to', 'see', 'in']
['said', 'to', 'keeping', 'keeping', 'for', 'in']
1.0769230769230769


## Rule Based Morphology

## Shruti Features

In [37]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

In [38]:
def postag_to_synsettag(tag):
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None

In [39]:
def tag_to_synset(word, tag):
    wn_tag = postag_to_synsettag(tag)
    if wn_tag is None:
        #print(1)
        return None
    else:
        try:
            return wn.synsets(word, wn_tag)[0]
        except:
            #print(3)
            return None

In [40]:
def get_synsets(sentence1, sentence2):
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tag_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tag_to_synset(*tagged_word) for tagged_word in sentence2]

    # print(synsets1)
    # print(synsets2)
    
    # Filter out the Nones
    synsets1 = [synset1 for synset1 in synsets1 if synset1]
    synsets2 = [synset2 for synset2 in synsets2 if synset2]
    
    # print(synsets1)
    # print(synsets2)
    
    return synsets1, synsets2

get_synsets("Dolphins are swimming mammals.", "Dolphins can swim.")

([Synset('dolphinfish.n.02'),
  Synset('be.v.01'),
  Synset('swim.v.01'),
  Synset('mammal.n.01')],
 [Synset('dolphinfish.n.02'), Synset('swim.v.01')])

In [44]:
def sentence_path_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            if synset.path_similarity(synset2) is not None and synset.path_similarity(synset2) > best:
                try:
                    best = synset.path_similarity(synset2)
                except TypeError:
                    continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        return 0.0
    return score

In [45]:
def sentence_wup_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            if synset.wup_similarity(synset2) is not None and synset.wup_similarity(synset2) > best:
                try:
                    best = synset.wup_similarity(synset2)
                except TypeError:
                    continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        return 0.0
    return score

In [46]:
def sentence_lin_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            try:
                if synset.lin_similarity(synset2, semcor_ic) is not None and synset.lin_similarity(synset2, semcor_ic) > best:
                    try:
                        best = synset.lin_similarity(synset2, semcor_ic)
                    except TypeError:
                        continue
            except:
                continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        return 0.0
    return score

In [47]:
def sentence_res_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')

    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            try:
                if synset.res_similarity(synset2, semcor_ic) is not None and synset.res_similarity(synset2, semcor_ic) > best:
                    try:
                        best = synset.res_similarity(synset2, semcor_ic)
                    except TypeError:
                        continue
            except:
                continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        return 0.0
    return score

In [48]:
def sentence_lch_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            try:
                if synset.lch_similarity(synset2) is not None and synset.lch_similarity(synset2) > best:
                    try:
                        best = synset.lch_similarity(synset2)
                    except TypeError:
                        continue
            except:
                continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        return 0.0
#     print(score)
    return score

In [49]:
def sentence_jcn_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            try:
                if synset.jcn_similarity(synset2, semcor_ic) is not None and synset.jcn_similarity(synset2, semcor_ic) > best:
                    try:
                        best = synset.jcn_similarity(synset2, semcor_ic)
                    except TypeError:
                        continue
            except:
                continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        return 0.0
    return score

In [50]:
def symmetric_sentence_path_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_path_similarity(sentence1, sentence2) + sentence_path_similarity(sentence2, sentence1)) / 2 

def symmetric_sentence_wup_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_wup_similarity(sentence1, sentence2) + sentence_wup_similarity(sentence2, sentence1)) / 2 

def symmetric_sentence_res_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_res_similarity(sentence1, sentence2) + sentence_res_similarity(sentence2, sentence1)) / 2 

def symmetric_sentence_lch_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_lch_similarity(sentence1, sentence2) + sentence_lch_similarity(sentence2, sentence1)) / 2 

def symmetric_sentence_lin_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_lin_similarity(sentence1, sentence2) + sentence_lin_similarity(sentence2, sentence1)) / 2 

def symmetric_sentence_jcn_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_jcn_similarity(sentence1, sentence2) + sentence_jcn_similarity(sentence2, sentence1)) / 2 

 
focus_sentence = "Dolphins are swimming mammals."
sentence = "Dolphins can swim."
 

print("Path Similarity: ")
print(focus_sentence, sentence, symmetric_sentence_path_similarity(focus_sentence, sentence))
print(sentence, focus_sentence, symmetric_sentence_path_similarity(sentence, focus_sentence))
print("LCH Similarity: ")
print(focus_sentence, sentence, symmetric_sentence_lch_similarity(focus_sentence, sentence))
print(sentence, focus_sentence, symmetric_sentence_lch_similarity(sentence, focus_sentence))
print("LIN Similarity: ")
print(focus_sentence, sentence, symmetric_sentence_lin_similarity(focus_sentence, sentence))
print(sentence, focus_sentence, symmetric_sentence_lin_similarity(sentence, focus_sentence))
print("JCN Similarity: ")
print(focus_sentence, sentence, symmetric_sentence_jcn_similarity(focus_sentence, sentence))
print(sentence, focus_sentence, symmetric_sentence_jcn_similarity(sentence, focus_sentence))
print("RES Similarity: ")
print(focus_sentence, sentence, symmetric_sentence_res_similarity(focus_sentence, sentence))
print(sentence, focus_sentence, symmetric_sentence_res_similarity(sentence, focus_sentence))
print("WUP Similarity: ")
print(focus_sentence, sentence, symmetric_sentence_wup_similarity(focus_sentence, sentence))
print(sentence, focus_sentence, symmetric_sentence_wup_similarity(sentence, focus_sentence))
print()

Path Similarity: 
Dolphins are swimming mammals. Dolphins can swim. 1.0
Dolphins can swim. Dolphins are swimming mammals. 1.0
LCH Similarity: 
3.6375861597263857
3.6375861597263857
Dolphins are swimming mammals. Dolphins can swim. 3.6375861597263857
3.6375861597263857
3.6375861597263857
Dolphins can swim. Dolphins are swimming mammals. 3.6375861597263857
LIN Similarity: 
Dolphins are swimming mammals. Dolphins can swim. 1.0
Dolphins can swim. Dolphins are swimming mammals. 1.0
JCN Similarity: 
Dolphins are swimming mammals. Dolphins can swim. 1e+300
Dolphins can swim. Dolphins are swimming mammals. 1e+300
RES Similarity: 
Dolphins are swimming mammals. Dolphins can swim. 1e+300
Dolphins can swim. Dolphins are swimming mammals. 1e+300
WUP Similarity: 
Dolphins are swimming mammals. Dolphins can swim. 1.0
Dolphins can swim. Dolphins are swimming mammals. 1.0



# Pipeline

In this section we run the data through the pipeline to get it into the form necessary to create our models.

In [52]:
def pipeline(corpus):
    # TODO add a check to ensure the lengths of these arrays are the same
    # or add the basic processing to pipeline
    
    s1_array = corpus['s1']['sentences']
    s2_array = corpus['s2']['sentences']
    s1_tokens = corpus['s1']['tokens']
    s2_tokens = corpus['s2']['tokens']
    s1_lemmas = corpus['s1']['lemmas']
    s2_lemmas = corpus['s2']['lemmas']
    
    data = []
    for i in range(0, len(s1_array)):
        cos_sim = calc_cos_sim(s1_array[i], s2_array[i])
        sif_sim = calc_sif_sim(s1_array[i], s2_array[i])
        w_overlap, w_norm_overlap = calc_basic_overlap(s1_tokens[i], s2_tokens[i])
        l_overlap, l_norm_overlap = calc_basic_overlap(s1_lemmas[i], s2_lemmas[i])
        spacy_sim = calc_spacy_sim(s1_array[i], s2_array[i])
        syn_overlap, normalized_syn_overlap = calc_synset_overlap(s1_tokens[i], s2_tokens[i])
        path_sim = symmetric_sentence_path_similarity(s1_array[i], s2_array[i]) 
        wup_sim = symmetric_sentence_wup_similarity(s1_array[i], s2_array[i])
        # jcn_sim = symmetric_sentence_jcn_similarity(s1_array[i], s2_array[i])
        lch_sim = symmetric_sentence_lch_similarity(s1_array[i], s2_array[i])
        lin_sim = symmetric_sentence_lin_similarity(s1_array[i], s2_array[i])
        # res_sim = symmetric_sentence_res_similarity(s1_array[i], s2_array[i])
        # noun_chunk_root_overlap = calc_noun_chunk_root_overlap(s1_array[i], s2_array[i]) # made things worse
        data.append(
            [w_norm_overlap, 
             l_norm_overlap, 
             sif_sim, cos_sim, 
             spacy_sim, 
             syn_overlap, 
             normalized_syn_overlap,
             path_sim,
             wup_sim,
             # jcn_sim,
             lch_sim,
             lin_sim,
             # res_sim
            ])
    return data

In [53]:
train_input = pipeline(train_data)


3.44613425658289
3.3577718397474317
3.6375861597263857
3.6375861597263848
3.637586159726385
2.899906810710964
3.0313217997719875
3.209634846817398
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.6375861597263848
3.190409092946193
3.11355722379231
3.6375861597263848
3.33895958152171
3.3289878381149105
3.6375861597263848
3.6375861597263848
3.6375861597263857
3.3889702179590087
3.637586159726384
3.6375861597263843
3.637586159726384
3.6375861597263843
3.6375861597263843
3.3577718397474317
3.4395441081378286
3.6375861597263857
3.6375861597263857
3.314891398844219
3.4162172137318585
2.78899026648061
2.9100689277811087
3.2738275437537467
3.233409919756787
3.5743378894422353
3.0489914987847877
3.6375861597263857
3.6375861597263857
3.4749477504242843
2.66634412069143
2.4415426993048936
3.3060982373698846
2.8440001169965883
3.6375861597263857
2.904743486165947
2.8040417687174424
3.485790311044424
3.485790311044424
3.334453979749186
3.550011631640638
3.6375861597263848
3.63758615972638

3.586987543499064
3.586987543499064
3.5833733566256853
3.561688235385405
3.447841348873934
3.1220316950717857
3.2731949150638338
3.6375861597263857
3.6375861597263857
3.1828878897605875
3.5340889901705026
3.561688235385404
3.319182491403645
3.149078892711253
3.637586159726385
3.6375861597263857
3.6375861597263857
3.637586159726385
2.8832475376065374
2.8349570766242063
3.6083946503644695
3.1977432714571754
3.6375861597263857
2.8985352045398787
3.637586159726384
3.637586159726384
3.6375861597263857
3.6375861597263857
2.8620124348443707
2.6814092091376343
2.6468340196861324
2.9668107068191123
3.6375861597263843
3.637586159726384
3.337042893896928
3.3367027091765973
3.6375861597263843
3.6375861597263843
2.661726301319352
2.0792723843733407
3.2738275437537467
3.1913702494516345
3.3097286712750864
3.3688403578403836
3.207022664417795
3.2429029556935105
2.859891149884099
3.129365666575979
3.637586159726384
3.637586159726384
3.258096538021483
3.2580965380214826
3.6375861597263843
3.63758615972

3.306896508842168
3.0779575197684794
3.6375861597263848
3.6375861597263843
3.5110896191580845
2.380240849355566
3.6375861597263857
3.6375861597263848
3.2738275437537467
3.233409919756787
3.406537099539737
3.6375861597263857
2.90823481759356
2.904743486165947
2.6070940191515755
3.149078892711253
3.6375861597263843
3.637586159726385
3.637586159726384
3.637586159726384
3.385331305037128
3.186267091089979
3.637586159726385
3.043460004960718
3.6375861597263843
3.082116536789271
2.461847765286366
2.9067735782836146
3.5340889901705026
3.302829844607112
3.5427137543001597
3.5291605535249846
3.1828878897605875
3.6375861597263848
2.6488185970918376
3.0562362170363313
3.590149957013273
3.595420646203619
3.6375861597263843
3.637586159726385
2.905177967280979
3.085384681995719
3.2738275437537467
3.2035917255102633
3.3697811076309656
3.2723974523235415
3.5427137543001597
3.5532551326808512
3.4604642294960737
3.6375861597263848
3.6375861597263857
2.3745841153715443
3.2738275437537467
3.47184219854813

3.6375861597263857
3.306896508842168
3.4098923867034436
3.1372280176504335
3.306896508842168
3.306896508842168
2.488378732279476
2.6934061698083394
3.306896508842168
3.353522613283217
3.3777585768887852
3.3577718397474317
2.702901626460047
3.6375861597263843
3.3777585768887852
3.395080415744625
3.6375861597263848
3.6375861597263857
2.963016473501983
2.6807534170816765
3.6375861597263843
3.6375861597263843
2.904743486165947
2.90823481759356
3.1600182122496783
3.01838898236708
2.907504334618816
3.011535479967454
3.6375861597263848
3.6375861597263843
2.935723068452973
3.096072661313309
3.6375861597263843
3.2723974523235415
2.847628550423219
2.8297597137588375
3.6375861597263843
3.6375861597263843
2.7894963997594178
3.6375861597263857
3.1770691411275767
3.117930994051188
3.117930994051188
3.6375861597263857
3.4478413488739337
3.4305918206146204
3.6375861597263848
3.353453654849384
3.4329864260483203
3.3672623352932347
3.5685880466891304
3.5291605535249846
2.9051779672809794
2.6959184837251

3.5189956529436017
3.465090877133247
3.486020069737784
3.637586159726384
3.637586159726383
3.6375861597263834
3.3577718397474317
3.306896508842168
3.4351712210141043
3.3862148426294842
3.3688403578403836
3.275962971701112
3.373171395471317
3.410237024743485
3.233409919756787
3.233409919756787
3.432753512457404
3.285929053859798
3.2113874024108844
3.275962971701112
3.4602139304816224
3.559791322338331
3.4995899336518748
3.5340889901705026
3.3777585768887852
3.3577718397474317
3.637586159726384
3.6375861597263843
3.5110896191580845
3.5110896191580845
3.5743378894422335
3.5743378894422335
3.156192838066559
3.2927066070619344
3.637586159726384
3.637586159726384
1.55814461804655
1.5581446180465495
3.6375861597263843
3.6375861597263843
3.5110896191580845
3.5110896191580845
3.427901611196359
3.2853128968666168
3.6375861597263843
3.637586159726385
3.6375861597263843
3.6375861597263848
3.6375861597263857
3.6375861597263857
3.626086474220174
3.6264247002644745
2.5555110437613755
2.93113882331423

3.4447397374864406
3.454395934595709
3.6375861597263843
3.6375861597263843
3.618611678641139
3.514639601557147
3.6375861597263848
3.6375861597263848
3.6375861597263843
3.3777585768887852
3.379159316644259
3.5954206462036162
3.637586159726384
3.637586159726384
3.085966500385107
3.5110896191580845
3.520820122278722
3.5340889901705026
3.3259772215808323
3.4405059050099833
2.6807534170816765
2.999697664629913
3.637586159726384
3.637586159726384
2.4110851438611802
2.571961438298158
3.5743378894422335
3.5706174029549307
3.12143561682935
3.2580965380214826
3.2113874024108844
3.275962971701112
3.3785266308179396
3.391907752239768
3.233409919756787
3.233409919756787
3.5340889901705026
3.491628612916806
3.3777585768887852
3.3577718397474317
3.448112787258871
3.4722413342842753
3.4813257272596596
3.4655334618209457
3.5706174029549307
3.5901499570132716
3.6375861597263843
3.637586159726384
3.4643677711679843
3.559683574107773
3.3802911744924593
2.8051958383909437
3.637586159726384
3.63758615972638

3.6375861597263857
3.6375861597263848
3.5110896191580845
3.4478413488739337
2.7281896197947892
2.7281896197947892
3.5340889901705026
3.407119975326952
3.373171395471317
3.44613425658289
3.6375861597263843
3.395080415744625
3.637586159726384
3.6375861597263843
3.3933325262188183
3.3933325262188183
3.6245003107020763
3.6240329589512084
3.637586159726384
3.6375861597263848
3.233409919756787
3.2738275437537467
3.637586159726384
3.6375861597263848
3.3577718397474317
3.3577718397474317
2.6929769741658465
2.80331309129817
3.6375861597263848
3.426428743958812
3.4332455941929756
3.447841348873934
3.4332455941929756
3.447841348873934
3.4769031284691323
3.4462968368010847
3.4812941126745667
3.4009479146351116
3.0313217997719875
2.9762068579579517
3.4354980397415846
3.4557068517400644
3.6375861597263843
3.637586159726384
3.637586159726384
3.637586159726384
2.54631031180847
2.814453691185958
3.637586159726385
3.6375861597263843
3.5833733566256853
3.253821977560712
3.6375861597263857
3.6375861597263

In [54]:
print(train_input[0:5])
train_data['scores'][0:5]

[[0.6428571428571429, 0.6, 0.4043188683415115, 0.5949218057093537, 0.9699586985720816, 46, 0.23173803526448364, 1.0, 1.0, 3.4019530481651605, 0.9352226720647774], [0.631578947368421, 0.6, 0.37040524322972224, 0.474330706497194, 0.9316565540040632, 24, 0.22018348623853212, 1.0, 1.0, 3.637586159726385, 1.0], [0.5, 0.5, 0.1358693286767868, 0.392181175971253, 0.9247478261787359, 32, 0.2098360655737705, 0.84375, 0.9142857142857143, 3.2687464852186743, 0.8059226629199424], [0.7333333333333333, 0.6666666666666666, 0.6935512636502701, 0.668348418668298, 0.9677497361187998, 28, 0.20363636363636364, 1.0, 1.0, 3.1204783232946927, 0.857843137254902], [0.24, 0.24, 4.979960298599938e-10, 0.12170566815950139, 0.8618764553778161, 22, 0.21674876847290642, 1.0, 1.0, 3.6375861597263857, 1.0]]


[4, 4, 3, 3, 2]

# Models

In this section we fit our feature set to a model.

### Decision Tree

In [55]:
dt_classifier = DecisionTreeClassifier(random_state=14, max_depth=8)
dt_classifier.fit(train_input,train_data['scores'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=14,
            splitter='best')

In [56]:
print(f'Nodes: {dt_classifier.tree_.node_count}')
print(f'Max Depth: {dt_classifier.tree_.max_depth}')

training_scores = train_data['scores']
print(f'Accuracy: {dt_classifier.score(train_input, training_scores)}')

Nodes: 273
Max Depth: 8
Accuracy: 0.7284366576819407


### Random Forest

In [57]:
rf_classifier = RandomForestClassifier(random_state=14, n_estimators=100)
rf_classifier.fit(train_input, train_data['scores'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=14, verbose=0, warm_start=False)

### XGBoost

In [58]:
# xgboost_model = XGBClassifier(booster='gbtree', 
#                        n_estimators=2000,
#                        n_jobs=4,
#                        learning_rate=.05,
#                        max_depth=10,
#                        random_state=42,
#                        gamma=.05,
#                        #early_stopping_rounds = 5
#                              )

In [59]:
xgboost_model = XGBClassifier(n_jobs=8, 
                              n_estimators=2000, 
                              #max_depth=6,
                              #gamma=.05, 
                              random_state=42)
xgboost_model.fit(np.asarray(train_input), np.asarray(train_data['scores']))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=8, nthread=None,
       objective='multi:softprob', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

# Testing

In [60]:
dev_data = preprocess("./data/dev-set.txt")
dev_input = pipeline(dev_data)

dt_dev_predictions = dt_classifier.predict(dev_input)
rf_dev_predictions = rf_classifier.predict(dev_input)
xgb_dev_predictions = xgboost_model.predict(np.asarray(dev_input))


3.6375861597263848
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.372215940858112
3.0531313377856697
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.5110896191580845
3.5427137543001597
3.0395644295156394
2.617085262257232
3.6375861597263857
3.6375861597263857
3.6375861597263848
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.5110896191580845
2.6827874025620844
2.389148504726084
3.366522144222883
3.6375861597263857
3.6375861597263857
3.6375861597263848
3.637586159726385
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.144885522935381
2.816418431741378
2.8341710034401277
3.0905163272582143
3.637586159726385
3.6375861597263848
3.6375861597263857
3.6375861597263857
1.949021350169756
2.1660995073723903
2.389148504726084
2.8415769889195364
3.6375861597263857
3.6375861597263857
3.215271328191239
2.8205671647594954
3.372215940858112
2.8525607751829143
1.9548579228128893
2.715080448351235
3.233409919756787
3.18288

3.637586159726385
3.6375861597263848
1.949021350169756
2.1660995073723903
3.6375861597263857
3.6375861597263857
2.8829494218963454
2.46723410065111
0.6198454434640076
0.24793817738560303
3.372215940858112
3.094228776707353
3.6375861597263857
3.6375861597263857
3.6375861597263848
3.6375861597263857
3.3853313050371296
2.8831789926518225
1.949021350169756
1.4762381027059137
3.637586159726385
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.117930994051188
3.0832979433365058
3.215271328191239
2.726126748272907
2.7005443035793717
3.0455613536770736
3.447841348873934
3.4689241056353177
3.372215940858112
3.1725062055743423
3.6375861597263857
3.6375861597263857
3.6375861597263848
3.6375861597263857
3.144885522935381
2.740709282965783
3.6375861597263857
3.6375861597263857
2.389148504726084
3.366522144222883
2.4288845145118403
3.341965777651783
3.6375861597263857
3.6375861597263857
2.4435724035161117
3.258096538021482
3.6375861597263857
3.6375861597263857
3.233409919756787
3.1179309940

3.5110896191580845
3.5291605535249846
3.215271328191239
2.93728416404048
1.949021350169756
2.1089115752941625
3.637586159726385
3.471842198548135
2.904743486165947
2.90823481759356
2.9762068579579517
2.7281896197947892
2.963546814259957
2.8337984891741868
3.6375861597263857
2.6038302401950655
3.637586159726384
3.6375861597263843
3.6375861597263857
3.282599495470957
3.502054151974633
3.5833733566256845
3.271382063503682
3.6375861597263848
2.784971481376486
2.920670747188788
3.6375861597263857
3.0783568867355102
2.8211854417510627
3.3792084074196946
2.186847952258998
3.197743271457176
3.6375861597263857
3.6375861597263857
3.6375861597263857
2.8985352045398787
3.0379920254114494
3.6375861597263848
3.6375861597263843
3.6375861597263843
3.4749477504242843
3.5532551326808512
2.6706888402650786
2.5237290269347805
3.117930994051188
3.1561784614036967
3.117930994051188
3.485790311044424
3.637586159726384
3.637586159726384
3.637586159726384
3.6375861597263843
3.6375861597263843
3.637586159726384

3.637586159726385
3.2046612565629533
2.90823481759356
2.7826030405725404
3.487900763645207
3.6375861597263848
3.6375861597263843
3.6375861597263848
3.4033394143590114
3.1111192803118453
3.5682714416703916
3.538565133932108
2.825267641203088
2.734811620012198
3.6375861597263857
3.117930994051188
3.6375861597263843
3.6375861597263857
3.085384681995719
3.6375861597263857
3.6375861597263857
3.6375861597263857
2.44493467662206
2.1755438953975097
3.637586159726384
3.6375861597263848
3.6375861597263848
3.385331305037129
2.249018846922999
3.561688235385405
3.6375861597263857
3.6375861597263848
2.8040417687174424
2.904743486165947
3.3577718397474317
3.3577718397474317
3.6375861597263857
3.637586159726384
3.637586159726385
2.8722745585370872
3.0095053878497864
2.955292584749086
2.443361307826785
2.9770847330328833
3.637586159726384
3.6375861597263843
3.6375861597263848
3.6375861597263848
3.400405146160821
3.491628612916806
3.01838898236708
3.495277551587047
3.6375861597263848
3.637586159726385
3

3.6375861597263857
3.6375861597263857
3.6375861597263848
3.6375861597263843
3.590149957013273
3.6375861597263843
3.4098923867034436
2.782425419514228
2.991439342091014
2.5126337421438194
2.4862936132293627
3.6375861597263857
3.2446391446874756
3.637586159726384
3.259675630959579
3.5110896191580845
3.2738275437537467
3.530290298897445
3.175488039353089
3.117725774306427
2.645673705592298
3.1354516870474747
3.112883752510337
2.964607161857524
3.6375861597263843
3.6375861597263843
2.611606416741771
3.6375861597263857
3.6375861597263843
3.6375861597263843
3.637586159726384
3.3777585768887852
3.6375861597263843
3.538565133932107
3.60596202458431
3.595420646203619
3.233409919756787
3.0098077090588946
2.7281896197947892
3.6375861597263857
3.595420646203619
3.465090877133247
3.3035478380443917
3.5532551326808512
3.3505975673438244
3.5792031410025538
3.381091223980232
3.6375861597263848
3.0590880517571026
2.1661887895749405
3.6375861597263857
3.6375861597263857
3.5340889901705026
3.550011631640

3.6375861597263843
3.6375861597263843
2.538973871058276
2.9100689277811087
3.6375861597263857
3.6375861597263843
3.6375861597263857
3.6375861597263857
3.6375861597263857
2.538265128115992
3.6375861597263843
3.5002596236428705
3.333994462362463
3.3213448083056325
2.4951404090411216
3.034046942563598
3.6375861597263848
3.637586159726384
3.3269728285028854
2.4275267277284285
3.148703961307074
3.0637181909504876
3.5996371975558956
3.637586159726385
3.117930994051188
2.214887476862743
3.5685880466891304
2.9782071616791446
3.2738275437537467
3.538565133932107
3.6375861597263848
3.6375861597263848
3.2379369475933237
1.9377260450350657
3.253821977560712
3.6375861597263857
3.595420646203619
3.485790311044424
3.6375861597263848
3.3565263252125055
3.6375861597263857
2.8092839431303855
3.088280015392331
2.8447299162476707
2.97841878652552
3.6375861597263857
3.6375861597263857
3.6375861597263857
3.1828878897605875
2.9380182305197717
3.6030871032077583
3.0870105968278225
3.637586159726385
3.32905383

In [61]:
# make sure our lengths match up
print(len(dev_data['scores']))
print(len(dt_dev_predictions))
print(len(rf_dev_predictions))
print(len(xgb_dev_predictions))

1209
1209
1209
1209


In [62]:
from sklearn.metrics import f1_score

def get_metrics(name, predictions, scores):
    correct = 0
    total_error = 0
    for i in range(0, len(predictions)): 
        if predictions[i] == scores[i]:
            correct += 1
        total_error += abs(int(scores[i]) - int(predictions[i]))
    acc = correct / len(predictions) 
    avg_err = total_error / len(predictions)
    
    f1 = f1_score(scores, predictions, average='weighted')
    return name, acc, avg_err, f1


# Ensure all our arrays are full of ints for metrics.
dev_data['scores'] = [int(i) for i in dev_data['scores']] 
rf_dev_predictions = [int(i) for i in rf_dev_predictions] 
xgb_dev_predictions = [int(i) for i in xgb_dev_predictions] 
dt_dev_predictions = [int(i) for i in dt_dev_predictions] 



xgb_metrics = get_metrics('XGBoost', xgb_dev_predictions, dev_data['scores'])
dt_metrics  = get_metrics('Decision Tree', dt_dev_predictions, dev_data['scores'])
rf_metrics = get_metrics('Random Forest', rf_dev_predictions, dev_data['scores'])

df = pd.DataFrame(
    [dt_metrics, rf_metrics, xgb_metrics], 
    columns = ['Model', 'Accuracy', 'Avg Error', 'F-Score']) 
df

  'recall', 'true', average, warn_for)


Unnamed: 0,Model,Accuracy,Avg Error,F-Score
0,Decision Tree,0.331679,1.030604,0.343136
1,Random Forest,0.37469,0.870141,0.383448
2,XGBoost,0.396195,0.860215,0.405101
