In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from collections import Counter
import math

In [21]:
# same readData from STS.py
def readData(fileName):
    """Read a tab-separated sentence-pair file into three parallel lists.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line is split on tabs: column 1 is the first
    sentence, column 2 the second sentence and column 3 the score
    (column 0 is not used).

    Args:
        fileName: path to a UTF-8 encoded, tab-separated file.

    Returns:
        A tuple ``(first_sentence, second_sentence, score)`` of equal-length
        lists. Sentences are lower-cased with trailing periods stripped;
        scores are returned as the raw strings found in the file.

    Raises:
        IndexError: if a non-blank data line has fewer than four columns.
    """
    first_sentence = []
    second_sentence = []
    score = []

    # `with` guarantees the handle is closed even if parsing fails.
    with open(fileName, encoding="utf8") as file:
        file.readline()  # skip the header row
        text = file.read()

    # loop to extract a set of two sentences
    for line in text.split('\n'):
        # A trailing newline (or a stray blank line) yields an empty record;
        # skip it instead of failing on the column lookups below.
        if not line.strip():
            continue
        columns = line.split('\t')
        # rstrip('.') removes every trailing period, not just a single one.
        first_sentence.append(columns[1].lower().rstrip('.'))
        second_sentence.append(columns[2].lower().rstrip('.'))
        # the score is kept as a string, exactly as it appears in the file
        score.append(columns[3])

    return first_sentence, second_sentence, score


def preprocess(fileName):
    """Load a sentence-pair file and tokenize both sentence columns.

    Args:
        fileName: path passed straight through to ``readData``.

    Returns:
        A 5-tuple ``(first_sentence, second_sentence, score,
        first_sentence_tokens, second_sentence_tokens)``. The first three
        items are exactly what ``readData`` returns; the last two hold, for
        each sentence, the token list produced by ``nltk.word_tokenize``.
    """
    first_sentence, second_sentence, score = readData(fileName)

    # Only the tokens are part of the return value, so no POS tagging or
    # lemmatization is done here; add them (and extend the return value)
    # once a feature actually consumes them.
    first_sentence_tokens = [
        nltk.word_tokenize(sentence) for sentence in first_sentence]
    second_sentence_tokens = [
        nltk.word_tokenize(sentence) for sentence in second_sentence]

    return first_sentence, second_sentence, score, first_sentence_tokens, second_sentence_tokens


In [7]:
# Load the training split: lower-cased sentence pairs, their scores, and the
# word-tokenized form of each sentence.
s1_arr_train, s2_arr_train, scores_train, s1_tokens_train, s2_tokens_train = preprocess("./data/train-set.txt")

## Feature Engineering

This section includes all the code/functions to create features.

### Cosine Similarity (Baseline)

In [8]:
def calc_cosine_similarity(sentence1, sentence2):
    """Return the TF-IDF cosine similarity between two sentences.

    A ``TfidfVectorizer`` with English stop words removed is fitted on just
    these two sentences, and the cosine similarity between the two resulting
    rows is returned.

    Args:
        sentence1: first sentence, as a string.
        sentence2: second sentence, as a string.

    Returns:
        The cosine similarity of the two TF-IDF vectors, or ``0.0`` when no
        term is left in either sentence after stop-word removal (there is
        nothing to compare in that case).
    """
    try:
        # remove the stopwords and transform both sentences into one TF-IDF matrix
        tfidf_matrix = TfidfVectorizer(
            stop_words="english").fit_transform([sentence1, sentence2])
    except ValueError as exc:
        # scikit-learn reports an "empty vocabulary" ValueError when every
        # token in both sentences is a stop word; any other ValueError is a
        # genuine problem and is re-raised.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0

    # Pairwise similarities of the two rows; entry [0][1] is sentence1 vs sentence2.
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    return cos_sim_matrix[0][1]

## Pipeline

In this section we run the data through the pipeline to get it into the form necessary to create our models.

In [9]:
def pipeline(s1_array, s2_array, include_index=True):
    """Build one feature row per sentence pair.

    Args:
        s1_array: sequence of first sentences.
        s2_array: sequence of second sentences, aligned with ``s1_array``.
        include_index: when True (the default, which keeps the existing row
            layout) each row is ``[i, cos_sim]``; when False each row is
            ``[cos_sim]`` only.

    Returns:
        A list with one row per pair, in input order, where ``i`` is the
        pair's position and ``cos_sim`` is the value returned by
        ``calc_cosine_similarity`` for that pair.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(s1_array) != len(s2_array):
        raise ValueError(
            "s1_array and s2_array must have the same length "
            f"(got {len(s1_array)} and {len(s2_array)})")

    data = []
    for i, (sentence1, sentence2) in enumerate(zip(s1_array, s2_array)):
        cos_sim = calc_cosine_similarity(sentence1, sentence2)
        # NOTE(review): with include_index=True the row position is handed to
        # the model as a feature even though it says nothing about the pair;
        # pass include_index=False to build rows without it.
        data.append([i, cos_sim] if include_index else [cos_sim])
    return data

In [10]:
# Build the training feature rows (each row is [pair index, cosine similarity])
# and show the first five as a quick sanity check.
data = pipeline(s1_arr_train, s2_arr_train)
print(data[0:5])

[[0, 0.5949218057093537], [1, 0.474330706497194], [2, 0.392181175971253], [3, 0.668348418668298], [4, 0.12170566815950139]]


In [11]:
# Peek at the first five training scores; they are still the strings read
# from the file, not numbers.
scores_train[0:5]

['4', '4', '3', '3', '2']

## Models

In this section we fit a model to our feature set.

### Decision Tree

In [12]:
# Fit a decision tree to the training rows, using the score strings as class labels.
# random_state is pinned so repeated runs build the same tree.
# NOTE(review): every row of `data` carries its row index as the first feature
# alongside the cosine similarity — confirm the index is meant to be a model input.
dt_classifier = DecisionTreeClassifier(random_state=14)
dt_classifier.fit(data,scores_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=14,
            splitter='best')

In [14]:
# Size of the fitted tree, plus its accuracy on the same rows it was fitted on
# (i.e. training accuracy, not a held-out estimate).
print(f'Nodes: {dt_classifier.tree_.node_count}')
print(f'Max Depth: {dt_classifier.tree_.max_depth}')
print(f'Accuracy: {dt_classifier.score(data, scores_train)}')

Nodes: 1399
Max Depth: 29
Accuracy: 1.0


# Testing

In [29]:
# Run the dev split through the same preprocessing and feature pipeline as the
# training data, then predict a score label for every dev pair with the fitted tree.
s1_arr_dev, s2_arr_dev, scores_dev, s1_tokens_dev, s2_tokens_dev = preprocess("./data/dev-set.txt")
dev_data = pipeline(s1_arr_dev, s2_arr_dev)
dev_predictions = dt_classifier.predict(dev_data)

In [31]:
# Sanity check: there should be exactly one prediction per dev-set score,
# so the two printed lengths must be equal.
print(len(scores_dev))
print(len(dev_predictions))

1209
1209


In [None]:
# Dev-set accuracy: the fraction of pairs whose predicted label exactly matches
# the score read from the file (both sides are score strings).
# The counter is initialised once, before the loop, so matches accumulate.
correct = 0
for i in range(len(dev_predictions)):
    if dev_predictions[i] == scores_dev[i]:
        correct += 1

accuracy = correct / len(dev_predictions)
print(f'Dev Accuracy: {accuracy}')