# Imports

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from collections import Counter
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# XGBOOST
import xgboost as xgb
# from xgboost import XGBClassifier

# SPACY IMPORT
import spacy
nlp = spacy.load("en_core_web_lg")

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
from xgboost import XGBClassifier

# Read Data and PreProcess

In [3]:
# same readData from STS.py
def readData(fileName):

    s1 = []
    s2 = []
    score = []
    file = open(fileName, encoding="utf8")
    text = file.readline()
    text = file.read()
    
    # loop to extract a set of two sentences
    for sentence in text.split('\n'):

        # creating two separate lists of the sentences
        # '.rstrip('.') only removes the last period in the sentence
        
        s1.insert(len(s1), (sentence.split('\t')[1].lower()).rstrip('.'))
        s2.insert(len(s1), (sentence.split('\t')[2].lower()).rstrip('.'))
        
        # inserting the score as a separate lists
        score.insert(len(s1), (sentence.split('\t')[3]))

    # print(s1)
    return s1, s2, score


def preprocess(fileName):

    s1, s2, scores = readData(fileName)
    s1_toks = []
    s2_toks = []

    # tokenizing and tagging
    s1_tags = []
    s2_tags = []

    for sentence in s1:
        tokens = nltk.word_tokenize(sentence)
        s1_toks.insert(len(s1_toks), tokens)
        s1_tags.insert(
            len(s1_tags), nltk.pos_tag(tokens))

    for sentence in s2:
        tokens = nltk.word_tokenize(sentence)
        s2_toks.insert(len(s2_toks), tokens)
        s2_tags.insert(
            len(s2_tags), nltk.pos_tag(tokens))
    
    # Remove the unnecessary tuple and keep just the tags
    for i, tag_list in enumerate(s1_tags):
        s1_tags[i] = [tup[1] for tup in tag_list]
    for i, tag_list in enumerate(s2_tags):
        s2_tags[i] = [tup[1] for tup in tag_list]

    # lemmatizing
    s1_lemmas = []
    s2_lemmas = []
    lemmatizer = WordNetLemmatizer()
    for sentence in s1_toks:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        s1_lemmas.insert(
            len(s1_lemmas), sentence_components)

    for sentence in s2_toks:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        s2_lemmas.insert(
            len(s2_lemmas), sentence_components)

        
    # Zipping it all together into one object for each word
    s1_word_lists = []
    s2_word_lists = []
    
    for tok_list, lem_list, tag_list in zip(s1_toks, s1_lemmas, s1_tags):
        sentence_words = []
        for tok, lem, tag in zip(tok_list, lem_list, tag_list):
            word = {}
            word['tok'] = tok
            word['lem'] = lem
            word['tag'] = tag
            sentence_words.append(word)
        s1_word_lists.append(sentence_words) 
        
    for tok_list, lem_list, tag_list in zip(s2_toks, s2_lemmas, s2_tags):
        sentence_words = []
        for tok, lem, tag in zip(tok_list, lem_list, tag_list):
            word = {}
            word['tok'] = tok
            word['lem'] = lem
            word['tag'] = tag
            sentence_words.append(word)
        s2_word_lists.append(sentence_words)  
              
    
    # Create a corpus object to represent our corpus
    corpus = {}
    corpus["s1"] = {}
    corpus["s2"] = {}
    corpus['scores'] = [int(i) for i in scores]
    
    corpus["s1"]["sentences"] = s1
    corpus["s2"]["sentences"] = s2
    
    corpus["s1"]["tokens"] = s1_toks
    corpus["s2"]["tokens"] = s2_toks
    
    corpus["s1"]["lemmas"] = s1_lemmas
    corpus["s2"]["lemmas"] = s2_lemmas
    
    corpus["s1"]["tags"] = s1_tags
    corpus["s2"]["tags"] = s2_tags
    
    corpus["s1"]["words"] = s1_word_lists
    corpus["s2"]["words"] = s2_word_lists
    
    return corpus


In [7]:
train_data = preprocess("./data/train-set.txt")

In [8]:
train_data['s2']['words'][0][0:5]

[{'tok': 'but', 'lem': 'but', 'tag': 'CC'},
 {'tok': 'other', 'lem': 'other', 'tag': 'JJ'},
 {'tok': 'sources', 'lem': 'source', 'tag': 'NNS'},
 {'tok': 'close', 'lem': 'close', 'tag': 'RB'},
 {'tok': 'to', 'lem': 'to', 'tag': 'TO'}]

### Stop Words

In [25]:
stop_words = set(stopwords.words('english'))
tokenized_sentence_list = train_data['s1']['tokens']+train_data['s2']['tokens']
words_filtered = []

# print(words)

# looking through I've noticed there are a number of stop-words that can be added to the set
stop_words.add(',')
stop_words.add('``')
stop_words.add("n't")

for tsl in tokenized_sentence_list:
    for w in tsl:
        if w not in stop_words and w not in words_filtered:
            words_filtered.append(w)

# Data Inspection

### Frequency Distribution

In [26]:
def frequency_distribution(corpus):
    s1_toks = corpus['s1']['tokens']
    s2_toks = corpus['s2']['tokens']    
    freq_dist = FreqDist()
    for i in range(len(s1_toks)):
        for token in (s1_toks[i] + s2_toks[i]):
            freq_dist[token.lower()] += 1
    return freq_dist

In [27]:
freq_dist = frequency_distribution(train_data)

print(freq_dist.most_common(40))

[('the', 5169), (',', 3690), ('of', 2497), ('to', 2133), ('and', 1716), ('a', 1615), ('in', 1573), ('is', 891), ('that', 831), ('on', 820), ('for', 756), ('it', 587), ('this', 579), ('we', 531), ('with', 464), ('be', 459), ('by', 443), ('i', 425), ('which', 403), ('have', 384), ('not', 366), ('at', 343), ('as', 334), ('are', 333), ('has', 319), ('said', 316), ('was', 304), ('european', 287), ("'s", 280), ('from', 261), ('``', 252), ("''", 242), ('will', 233), ('.', 229), ('also', 223), ('its', 194), ('but', 193), ('would', 191), ('all', 188), ('percent', 187)]


### Score Distribution

# TODO
this data is super imbalanced. we likely need to balance it in preprocessing. `https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html`

In [28]:
score_list = [0,0,0,0,0,0]
for s in train_data['scores']:
    score_list[int(s)] += 1
for i in range(0, len(score_list)):
    print("% 1d: % 4d % 6.2f per" %(i, score_list[i], 100*score_list[i]/len(train_data['scores']))) 

 0:    8   0.54 per
 1:   37   2.49 per
 2:   95   6.40 per
 3:  310  20.89 per
 4:  616  41.51 per
 5:  418  28.17 per


## Feature Engineering

This section includes all the code/functions to create features.

### Helper Functions

In [29]:
def remove_duplicate_tokens(token_list):
    blank_list = []
    for w in token_list:
        if w not in blank_list:
            blank_list.append(w)
    return blank_list

In [30]:
def remove_stopwords(token_list):
    blank_list = []
    for w in token_list:
        if w not in stop_words:
            blank_list.append(w)
    return blank_list

### Cosine Similarity (TF-IDF)

This is the same as the spacy similarity. This one is probably less accurate though as I don't believe it's trained from the GloVe w2v model.

In [31]:
def calc_cosine_similarity(s1, s2):

    # remove the stopwords, transform into TF-IDF matrix, then
    tfidf_matrix = TfidfVectorizer(
        stop_words="english").fit_transform([s1, s2])
    
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # print(tfidf_matrix.toarray())

    cos_sim = cos_sim_matrix[0][1]

    return cos_sim

In [32]:
calc_cos_sim('I like some apples', 'I like the pears')

0.33609692727625756

### Spacy Similarity

In [36]:
def calc_spacy_sim(s1, s2):
    s2 = nlp(s2)
    s1 = nlp(s1)
    return s1.similarity(s2)


In [37]:
calc_spacy_sim('I like some apples', 'I like the pears')

0.9361166303666173

### Smooth Inverse Frequency (SIF)

In [38]:
def calc_sif_sim(s1, s2, a = .001):
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform([s1, s2])
    X_arr = X.toarray()
    sif_matrix = []
    for i in range(0, len(X_arr)):
        sif_arr = []
        for j in range(0, len(X_arr[i])):
            word = vectorizer.get_feature_names()[j]
            w = a / (a + freq_dist[word])
            v = X_arr[i][j]
            sif_arr.append(v*w)
        sif_matrix.append(sif_arr)
    sif_cos_sim_matrix = cosine_similarity(sif_matrix, sif_matrix)
    sif_cos_sim = sif_cos_sim_matrix[0][1]
    return sif_cos_sim

In [39]:
calc_sif_sim('I like some apples', 'I like the pears')

1.4515545128534995e-10

### Simple Overlap

Unique words that are in both sentences divided by the total number of words in both sentences. Does not include stop words.

In [40]:
def calc_basic_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
    overlap = 0
    encountered_indexes = []
    for word in (s1_tokens+s2_tokens):
        try:
            word_index = words_filtered.index(word)
            if word_index in encountered_indexes: # we know we have found an overlap
                overlap += 1
            encountered_indexes.append(word_index)
        except ValueError:
            # print(word + ' not found in lexicon. Skipping...')
            continue

    avg_sentence_len = len(s1_tokens+s2_tokens) / 2
    
    overlap_normlalized = overlap / avg_sentence_len
    return overlap, overlap_normlalized

### Synset Overlap

# TODO

We may be able to incorporate POS and dependency parsing here as right now i'm just taking the first synset.

In [48]:
def calc_synset_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
#     print(s2_tokens)
#     print(s1_tokens)

    s1_spread = []
    s2_spread = []
    
    for word in s1_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s1_spread:
                    s1_spread.append(syn_word)

    for word in s2_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s2_spread:
                    s2_spread.append(syn_word)         
    
    return calc_basic_overlap(s1_spread, s2_spread)
    
calc_synset_overlap(train_data['s1']['tokens'][0], train_data['s2']['tokens'][0])

(46, 0.23173803526448364)

# Pipeline

In this section we run the data through the pipeline to get it into the form necessary to create our models.

In [132]:
def pipeline(corpus):
    # TODO add a check to ensure the lengths of these arrays are the same
    # or add the basic processing to pipeline
    
    s1_array = corpus['s1']['sentences']
    s2_array = corpus['s2']['sentences']
    s1_tokens = corpus['s1']['tokens']
    s2_tokens = corpus['s2']['tokens']
    s1_lemmas = corpus['s1']['lemmas']
    s2_lemmas = corpus['s2']['lemmas']
    
    data = []
    for i in range(0, len(s1_array)):
        cos_sim = calc_cosine_similarity(s1_array[i], s2_array[i])
        sif_sim = calc_sif_similarity(s1_array[i], s2_array[i])
        w_overlap, w_norm_overlap = calc_basic_overlap(s1_tokens[i], s2_tokens[i])
        l_overlap, l_norm_overlap = calc_basic_overlap(s1_lemmas[i], s2_lemmas[i])
        spacy_sim = calc_spacy_sim(s1_array[i], s2_array[i])
        syn_overlap, normalized_syn_overlap = calc_synset_overlap(s1_tokens[i], s2_tokens[i])
        data.append([w_norm_overlap, l_norm_overlap, sif_sim, cos_sim, spacy_sim, syn_overlap, normalized_syn_overlap])
    return data

In [133]:
train_input = pipeline(train_data)
print(data[0:5])

[[0, 0.6428571428571429, 0.6, 0.4043188683415115, 0.5949218057093537, 0.9699586985720816, 46, 0.23173803526448364], [1, 0.631578947368421, 0.6, 0.37040524322972224, 0.474330706497194, 0.9316565540040632, 24, 0.22018348623853212], [2, 0.5, 0.5, 0.1358693286767868, 0.392181175971253, 0.9247478261787359, 32, 0.2098360655737705], [3, 0.7333333333333333, 0.6666666666666666, 0.6935512636502701, 0.668348418668298, 0.9677497361187998, 28, 0.20363636363636364], [4, 0.24, 0.24, 4.979960298599938e-10, 0.12170566815950139, 0.8618764553778161, 22, 0.21674876847290642]]


In [134]:
train_data['scores'][0:5]

[4, 4, 3, 3, 2]

## Models

In this section we fit our feature set to a model.

### Decision Tree

In [135]:
dt_classifier = DecisionTreeClassifier(random_state=14, max_depth=8)
dt_classifier.fit(train_input,train_data['scores'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=14,
            splitter='best')

In [136]:
print(f'Nodes: {dt_classifier.tree_.node_count}')
print(f'Max Depth: {dt_classifier.tree_.max_depth}')

training_scores = train_data['scores']
print(f'Accuracy: {dt_classifier.score(train_input, training_scores)}')

Nodes: 265
Max Depth: 8
Accuracy: 0.6967654986522911


### Random Forest

In [137]:
rf_classifier = RandomForestClassifier(random_state=14, n_estimators=100)
rf_classifier.fit(train_input, train_data['scores'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=14, verbose=0, warm_start=False)

### XGBoost

In [138]:
xgboost_model = XGBClassifier(booster='gbtree', 
                       n_estimators=1000,
                       n_jobs=4,
                       learning_rate=.05,
                       max_depth=7,
                       random_state=42,
                       #gamma=.05,
                       early_stopping_rounds = 5)

In [139]:
xgboost_model = XGBClassifier(n_jobs=4, n_estimators=2000, gamma=.05, random_state=42)
xgboost_model.fit(np.asarray(train_input), np.asarray(train_data['scores']))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0.05,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=2000, n_jobs=4,
       nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

# Testing

In [140]:
dev_data = preprocess("./data/dev-set.txt")
dev_input = pipeline(dev_data)

dt_dev_predictions = dt_classifier.predict(dev_input)
rf_dev_predictions = rf_classifier.predict(dev_input)
xgb_dev_predictions = xgboost_model.predict(np.asarray(dev_input))


In [141]:
# make sure our lengths match up
print(len(dev_data['scores']))
print(len(dt_dev_predictions))
print(len(rf_dev_predictions))
print(len(xgb_dev_predictions))

1209
1209
1209
1209


In [142]:
from sklearn.metrics import f1_score

def get_metrics(name, predictions, scores):
    correct = 0
    total_error = 0
    for i in range(0, len(predictions)): 
        if predictions[i] == scores[i]:
            correct += 1
        total_error += abs(int(scores[i]) - int(predictions[i]))
    acc = correct / len(predictions) 
    avg_err = total_error / len(predictions)
    
    f1 = f1_score(scores, predictions, average='weighted')
    return name, acc, avg_err, f1


# Ensure all our arrays are full of ints for metrics.
dev_data['scores'] = [int(i) for i in dev_data['scores']] 
rf_dev_predictions = [int(i) for i in rf_dev_predictions] 
xgb_dev_predictions = [int(i) for i in xgb_dev_predictions] 
dt_dev_predictions = [int(i) for i in dt_dev_predictions] 



xgb_metrics = get_metrics('XGBoost', xgb_dev_predictions, dev_data['scores'])
dt_metrics  = get_metrics('Decision Tree', dt_dev_predictions, dev_data['scores'])
rf_metrics = get_metrics('Random Forest', rf_dev_predictions, dev_data['scores'])

df = pd.DataFrame(
    [dt_metrics, rf_metrics, xgb_metrics], 
    columns = ['Model', 'Accuracy', 'Avg Error', 'F-Score']) 
df

Unnamed: 0,Model,Accuracy,Avg Error,F-Score
0,Decision Tree,0.318445,1.040529,0.327107
1,Random Forest,0.38627,0.878412,0.393129
2,XGBoost,0.394541,0.897436,0.403824
