# CS 6320 Natural Language Processing
## Shruti Agrawal & Pat Dayton

This notebook demos our code for Tasks 1 & 2 of the project.

In [184]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag


import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


# SPACY IMPORT
import spacy
nlp = spacy.load("en_core_web_lg")

# CoreNLP setup
core_nlp_url = 'http://localhost:9000'

## Start the Stanford CoreNLP Server
In another console run the script below in order to start the Stanford CoreNLP Server on port 9000. We will hit this API in Part 2.

In [165]:
#java -mx4g -cp "./corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# Task 1: Parse the Corpus
First read in the corpus and do basic parsing to split out the first sentence, second sentence, and score for each line.

In [193]:
def readData(fileName, test=False):

    s1 = []
    s2 = []
    score = []
    file = open(fileName, encoding="utf8")
    text = file.readline()
    text = file.read()
    
    # loop to extract a set of two sentences
    for sentence in text.split('\n'):

        # creating two separate lists of the sentences
        # '.rstrip('.') only removes the last period in the sentence
        
        s1.insert(len(s1), (sentence.split('\t')[1].lower()).rstrip('.'))
        s2.insert(len(s1), (sentence.split('\t')[2].lower()).rstrip('.'))
        
        # inserting the score as a separate lists
        if (not test):
            score.insert(len(s1), (sentence.split('\t')[3]))

    # print(s1)
    if test:
        return s1, s2
    else:
        return s1, s2, score

In [194]:
# same readData from STS.py
def preprocess(fileName, test=False):

    if (test):
        s1, s2 = readData(fileName, test)
    else:
        s1, s2, scores = readData(fileName, test)

    s1_toks = []
    s2_toks = []

    # tokenizing and tagging
    s1_tags = []
    s2_tags = []

    for sentence in s1:
        tokens = nltk.word_tokenize(sentence)
        s1_toks.insert(len(s1_toks), tokens)
        s1_tags.insert(
            len(s1_tags), nltk.pos_tag(tokens))

    for sentence in s2:
        tokens = nltk.word_tokenize(sentence)
        s2_toks.insert(len(s2_toks), tokens)
        s2_tags.insert(
            len(s2_tags), nltk.pos_tag(tokens))
    
    # Remove the unnecessary tuple and keep just the tags
    for i, tag_list in enumerate(s1_tags):
        s1_tags[i] = [tup[1] for tup in tag_list]
    for i, tag_list in enumerate(s2_tags):
        s2_tags[i] = [tup[1] for tup in tag_list]

    # lemmatizing
    s1_lemmas = []
    s2_lemmas = []
    s1_ls = []
    s2_ls = []
    lemmatizer = WordNetLemmatizer()
    for sentence in s1_toks:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        s1_lemmas.insert(
            len(s1_lemmas), sentence_components)
        s1_ls.insert(len(s1_ls), ' '.join(word for word in sentence_components))
    
    for sentence in s2_toks:
        sentence_components = []
        for token in sentence:
            lemmas = lemmatizer.lemmatize(token)
            sentence_components.insert(len(sentence_components), lemmas)
        s2_lemmas.insert(
            len(s2_lemmas), sentence_components)
        s2_ls.insert(len(s2_ls), ' '.join(word for word in sentence_components))


        
    # Zipping it all together into one object for each word
    s1_word_lists = []
    s2_word_lists = []
    
    for tok_list, lem_list, tag_list in zip(s1_toks, s1_lemmas, s1_tags):
        sentence_words = []
        for tok, lem, tag in zip(tok_list, lem_list, tag_list):
            word = {}
            word['tok'] = tok
            word['lem'] = lem
            word['tag'] = tag
            sentence_words.append(word)
        s1_word_lists.append(sentence_words) 
        
    for tok_list, lem_list, tag_list in zip(s2_toks, s2_lemmas, s2_tags):
        sentence_words = []
        for tok, lem, tag in zip(tok_list, lem_list, tag_list):
            word = {}
            word['tok'] = tok
            word['lem'] = lem
            word['tag'] = tag
            sentence_words.append(word)
        s2_word_lists.append(sentence_words)  
    
    
    
    # Create a corpus object to represent our corpus
    corpus = {}
    corpus["s1"] = {}
    corpus["s2"] = {}
    if (not test):
        corpus['scores'] = [int(i) for i in scores]
    
    corpus["s1"]["sentences"] = s1
    corpus["s2"]["sentences"] = s2
    
    corpus["s1"]["tokens"] = s1_toks
    corpus["s2"]["tokens"] = s2_toks
    
    corpus["s1"]["lemmas"] = s1_lemmas
    corpus["s2"]["lemmas"] = s2_lemmas
    
    corpus["s1"]["tags"] = s1_tags
    corpus["s2"]["tags"] = s2_tags
    
    corpus["s1"]["words"] = s1_word_lists
    corpus["s2"]["words"] = s2_word_lists

    corpus["s1"]["ls"] = s1_ls
    corpus["s2"]["ls"] = s2_ls
    
    return corpus

In [168]:
train_data = preprocess("./data/train-set.txt")

the fine are part of failed republican effort to force or entice the democrat to return


In [169]:
print(len(train_data["s1"]['sentences']))
print(len(train_data["s2"]['sentences']))

1484
1484


In [170]:
r=286

tkns1 = train_data["s1"]['tokens'][r]
lems1 = train_data["s1"]['lemmas'][r]
tags1 = train_data["s1"]['tags'][r]
tkns2 = train_data["s2"]['tokens'][r]
lems2 = train_data["s2"]['lemmas'][r]
tags2 = train_data["s2"]['tags'][r]

data1 = []
data2 = []

for i in range(0, len(tkns1)):
    data1.append([tkns1[i], lems1[i], tags1[i]])
    
for i in range(0, len(tkns2)):
    data2.append([tkns2[i], lems2[i], tags2[i]])
    
df1 = pd.DataFrame(
    data1, 
    columns = ['Tokens', 'Lemmas', 'Tags']) 

df2 = pd.DataFrame(
    data2, 
    columns = ['Tokens', 'Lemmas', 'Tags']) 


print('ROW {} FROM TEST DATA\n'.format(r))
print('Sentence 1\n')
print('Raw: ', train_data["s1"]['sentences'][r])
display(df1)
print('Sentence 2\n')
print('Raw: ', train_data["s2"]['sentences'][r])
display(df2)
print('Score: ', train_data["scores"][r])

ROW 286 FROM TEST DATA

Sentence 1

Raw:  gemstar's shares gathered up 2.6 percent, adding 14 cents to $5.49 at the close
Sentence 2

Raw:  gemstar shares moved higher on the news, closing up 2.6 percent at $5.49 on nasdaq
Score:  4


Unnamed: 0,Tokens,Lemmas,Tags
0,gemstar,gemstar,NN
1,'s,'s,POS
2,shares,share,NNS
3,gathered,gathered,VBD
4,up,up,RP
5,2.6,2.6,CD
6,percent,percent,NN
7,",",",",","
8,adding,adding,VBG
9,14,14,CD


Unnamed: 0,Tokens,Lemmas,Tags
0,gemstar,gemstar,NN
1,shares,share,NNS
2,moved,moved,VBD
3,higher,higher,RBR
4,on,on,IN
5,the,the,DT
6,news,news,NN
7,",",",",","
8,closing,closing,VBG
9,up,up,RP


## Dependency Parsing

In [171]:
# dependency parsing
print("\nDependency Parsing Sentence 1\n")
dependency_parser = CoreNLPDependencyParser(url=core_nlp_url)
parse, = dependency_parser.raw_parse(train_data["s1"]['sentences'][r])
print(parse.to_conll(4))

print("\nDependency Parsing Sentence 2\n")
dependency_parser = CoreNLPDependencyParser(url=core_nlp_url)
parse, = dependency_parser.raw_parse(train_data["s2"]['sentences'][r])
print(parse.to_conll(4))


Dependency Parsing Sentence 1

gemstar	NN	3	nmod:poss
's	POS	1	case
shares	NNS	4	nsubj
gathered	VBD	0	ROOT
up	RP	4	compound:prt
2.6	CD	7	nummod
percent	NN	4	dobj
,	,	4	punct
adding	VBG	4	advcl
14	CD	11	nummod
cents	NNS	9	dobj
to	TO	14	case
$	$	14	dep
5.49	CD	9	nmod
at	IN	17	case
the	DT	17	det
close	NN	9	nmod


Dependency Parsing Sentence 2

gemstar	JJ	2	amod
shares	NNS	3	nsubj
moved	VBD	0	ROOT
higher	RBR	3	advmod
on	IN	7	case
the	DT	7	det
news	NN	3	nmod
,	,	3	punct
closing	VBG	3	advcl
up	RP	9	compound:prt
2.6	CD	12	nummod
percent	NN	9	dobj
at	IN	15	case
$	$	15	dep
5.49	CD	9	nmod
on	IN	17	case
nasdaq	NN	9	nmod



## Syntactic Parsing
https://www.nltk.org/api/nltk.parse.html

In [172]:
# syntactic parsing
print("\nFull syntactic parse tree for sentence 1: ")
syntactic_parser = CoreNLPParser(url=core_nlp_url)
s1_tree = next(syntactic_parser.raw_parse(train_data["s1"]['sentences'][r]))
s1_tree.pretty_print()

# type(s1_tree)
# s1_parse_tree_file = open("./output/s1_parse_tree.txt", "w") 
# s1_parse_tree_file.write(str(s1_tree))
# s1_parse_tree_file.close()

f = open("./output/s1_parse_tree.txt", "w", encoding="utf-8")
s1_tree.pretty_print(stream=f)
f.close()


Full syntactic parse tree for sentence 1: 
                                                         ROOT                                                  
                                                          |                                                     
                                                          S                                                    
              ____________________________________________|_____                                                
             |                                                  VP                                             
             |             _____________________________________|_________________                              
             |            |      |       |           |                            S                            
             |            |      |       |           |                            |                             
             |            |      |       |           |  

In [173]:
# syntactic parsing
print("\nFull syntactic parse tree for sentence 1: ")
syntactic_parser = CoreNLPParser(url=core_nlp_url)
s1_tree = next(syntactic_parser.raw_parse(train_data["s2"]['sentences'][r]))
s1_tree.pretty_print()

f = open("./output/s2_parse_tree.txt", "w", encoding="utf-8")
s1_tree.pretty_print(stream=f)
f.close()


Full syntactic parse tree for sentence 1: 
                                                       ROOT                                                         
                                                        |                                                            
                                                        S                                                           
          ______________________________________________|_____                                                       
         |                                                    VP                                                    
         |            ________________________________________|___________________                                   
         |           |     |         |            |                               S                                 
         |           |     |         |            |                               |                                  
         |      

## Wordnet Analysis

In [174]:
for tk, tg in zip(train_data["s2"]['tokens'][r], train_data["s1"]['tags'][r]):
    
    print('\n***********************************************************************************')
    print(tk, tg)
    synonyms = []
    hypernyms = []
    hyponyms = []
    substance_meronyms = []
    part_meronyms = []
    holonyms = []

    for syn in wn.synsets(tk):
        # Synonyms
        for l in syn.lemmas():
            if l.name() not in synonyms:
                synonyms.append(l.name())

        # Hypernyms
        for hpr in syn.hypernyms():
            for l in hpr.lemmas():
                if l.name() not in hypernyms:
                    hypernyms.append(l.name())

        # Hyponyms
        for hpo in syn.hyponyms():
            for l in hpo.lemmas():
                if l.name() not in hyponyms:
                    hyponyms.append(l.name())

        # Substance Meronyms
        for mrn in syn.substance_meronyms():
            for l in mrn.lemmas():
                if l.name() not in substance_meronyms:
                    substance_meronyms.append(l.name())

        # Part Meronyms
        for mrn in syn.part_meronyms():
            for l in mrn.lemmas():
                if l.name() not in part_meronyms:
                    part_meronyms.append(l.name())

        # Holonyms
        for hol in syn.member_holonyms():
            for l in hol.lemmas():
                if l.name() not in holonyms:
                    holonyms.append(l.name())

    print('\nSynonyms: ', synonyms)
    print('\nHypernyms: ', hypernyms)
    print('\nHyponyms: ', hyponyms)
    print('\nMeronyms (substance): ', substance_meronyms)
    print('\nMeronyms (part): ', part_meronyms)
    print('\nHolonyms:', holonyms)


***********************************************************************************
gemstar NN

Synonyms:  []

Hypernyms:  []

Hyponyms:  []

Meronyms (substance):  []

Meronyms (part):  []

Holonyms: []

***********************************************************************************
shares POS

Synonyms:  ['share', 'portion', 'part', 'percentage', 'parcel', 'contribution', 'plowshare', 'ploughshare', 'partake', 'partake_in', 'divvy_up', 'portion_out', 'apportion', 'deal']

Hypernyms:  ['assets', 'stock_certificate', 'stock', 'allotment', 'apportionment', 'apportioning', 'allocation', 'parceling', 'parcelling', 'assignation', 'attempt', 'effort', 'endeavor', 'endeavour', 'try', 'wedge', 'overlap', 'use', 'utilize', 'utilise', 'apply', 'employ', 'get', 'acquire', 'distribute', 'give_out', 'hand_out', 'pass_out', 'communicate', 'intercommunicate']

Hyponyms:  ['allotment', 'allocation', 'allowance', 'cut', 'dispensation', 'dole', 'interest', 'stake', 'profit_sharing', 'ration', 'sli

# Task 3

In our model we used 11 features for each pair of sentences to build our Machine Learning Model:
- Cosine Similarity
- Spacy (Cosine) Similarity 
- SIF Similarity
- Word Overlap
- Normalized Word Overlap
- Lemma Overlap
- Normalized Lemma Overlap
- Synset Overlap
- Normalized Synset Overlap
- Path Similarity 
- Named Entity Overlap

In the following cells we show some examples of these in use.


In [175]:
# Test sentences for the following demonstrations.

# Similar Sentences
s1 = 'I enjoy eating apples.'
s1_tok = ['I', 'enjoy', 'eating', 'apples']
s2 = 'I like munching red apples'
s2_tok = ['I', 'like', 'munching', 'red', 'apples']

# Dissimilar Sentences
s3 = 'My final exam was very difficult.'
s3_tok = ['My', 'final', 'exam', 'was', 'very', 'difficult']
s4 = 'Your mother smelled of elderberries.'
s4_tok = ['Your', 'mother', 'smelled', 'of', 'elderberries']

## Cosine Similarity
Cosine of embedding vectors in 3D Space. 0-1 value.

In [176]:
def calc_cosine_similarity(s1, s2):

    # remove the stopwords, transform into TF-IDF matrix, then
    tfidf_matrix = TfidfVectorizer(
        stop_words="english").fit_transform([s1, s2])
    
    cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # print(tfidf_matrix.toarray())

    cos_sim = cos_sim_matrix[0][1]

    return cos_sim

# Close Example
print('{:15} {:<10.3}'.format('similar:', calc_cosine_similarity(s1, s2)) )

# Different Example
print('{:15} {:<10.3}'.format('dissimilar:', calc_cosine_similarity(s3, s4)) )

# Same Example
print('{:15} {:<10.3}'.format('same:', calc_cosine_similarity(s3, s3)) )

similar:        0.171     
dissimilar:     0.0       
same:           1.0       


## Spacy Cosine Similarity
Cosine similarity calculated with the Spacy embeddings (large file).

In [177]:
def calc_spacy_sim(s1, s2):
    s2 = nlp(s2)
    s1 = nlp(s1)
    return s1.similarity(s2)

# Close Example
print('{:15} {:<10.3}'.format('similar:', calc_spacy_sim(s1, s2)) )

# Different Example
print('{:15} {:<10.3}'.format('dissimilar:', calc_spacy_sim(s3, s4)) )

# Same Example
print('{:15} {:<10.3}'.format('same:', calc_spacy_sim(s3, s3)) )    

similar:        0.853     
dissimilar:     0.637     
same:           1.0       


## Smooth Inverse Frequency (SIF) Similarity
Smooth Inverse Frequency is a weighted average of word vectors.

In [178]:
def frequency_distribution(corpus):
    s1_toks = corpus['s1']['tokens']
    s2_toks = corpus['s2']['tokens']    
    freq_dist = FreqDist()
    for i in range(len(s1_toks)):
        for token in (s1_toks[i] + s2_toks[i]):
            freq_dist[token.lower()] += 1
    return freq_dist

freq_dist = frequency_distribution(train_data)


def calc_sif_similarity(s1, s2, a = .001):
    vectorizer = CountVectorizer(stop_words="english")
    X = vectorizer.fit_transform([s1, s2])
    X_arr = X.toarray()
    sif_matrix = []
    for i in range(0, len(X_arr)):
        sif_arr = []
        for j in range(0, len(X_arr[i])):
            word = vectorizer.get_feature_names()[j]
            w = a / (a + freq_dist[word])
            v = X_arr[i][j]
            sif_arr.append(v*w)
        sif_matrix.append(sif_arr)
    sif_cos_sim_matrix = cosine_similarity(sif_matrix, sif_matrix)
    sif_cos_sim = sif_cos_sim_matrix[0][1]
    return sif_cos_sim

# Close Example
print('{:15} {:<10.3}'.format('similar:', calc_sif_similarity(s1, s2)) )

# Different Example
print('{:15} {:<10.3}'.format('dissimilar:', calc_sif_similarity(s3, s4)) )

# Same Example
print('{:15} {:<10.3}'.format('same:', calc_sif_similarity(s3, s3)) ) 


similar:        0.707     
dissimilar:     0.0       
same:           1.0       


## Simple Word Overlap (Raw and Normalized)
How many words do the two sentences have in common? This doesn't count stopwords or duplicates.

## Simple Lemma Overlap (Raw and Normalized)
How many lemmas do the two sentences have in common? This doesn't count stopwords or duplicates.

This is the same function as simple word overlap, except that it takes in lemmas vs tokens and thus would likely have more overlap.

*Note* This method takes tokenized sentences so I've hard coded those in.

In [179]:
stop_words = set(stopwords.words('english'))
tokenized_sentence_list = train_data['s1']['tokens']+train_data['s2']['tokens']
words_filtered = []

# print(words)

# looking through I've noticed there are a number of stop-words that can be added to the set
stop_words.add(',')
stop_words.add('``')
stop_words.add("n't")

for tsl in tokenized_sentence_list:
    for w in tsl:
        if w not in stop_words and w not in words_filtered:
            words_filtered.append(w)

def remove_duplicate_tokens(token_list):
    blank_list = []
    for w in token_list:
        if w not in blank_list:
            blank_list.append(w)
    return blank_list

def remove_stopwords(token_list):
    blank_list = []
    for w in token_list:
        if w not in stop_words:
            blank_list.append(w)
    return blank_list

def calc_basic_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
        
    overlap = 0
    encountered_words = []
    for word in (s1_tokens+s2_tokens):
        try:
            if word in encountered_words: # we know we have found an overlap
                overlap += 1
            encountered_words.append(word)
        except ValueError:
            # print(word + ' not found in lexicon. Skipping...')
            continue

    avg_sentence_len = len(s1_tokens+s2_tokens) / 2
    
    overlap_normlalized = overlap / avg_sentence_len
    return overlap, overlap_normlalized

s1s2_word_raw, s1s2_word_norm = calc_basic_overlap(s1_tok, s2_tok)
s3s4_word_raw, s3s4_word_norm = calc_basic_overlap(s3_tok, s4_tok)
s3s3_word_raw, s3s3_word_norm = calc_basic_overlap(s3_tok, s3_tok)

# Close Example
print('{:15} {:<5} {:<10.3}'.format('similar:',  s1s2_word_raw, s1s2_word_norm))

# Different Example
print('{:15} {:<5} {:<10.3}'.format('dessimilar:',  s3s4_word_raw, s3s4_word_norm))

# Same Example
print('{:15} {:<5} {:<10.3}'.format('same:',  s3s3_word_raw, s3s3_word_norm))

similar:        2     0.444     
dessimilar:     0     0.0       
same:           4     1.0       


## Synset Overlap


*NOTE* This feature also ingests tokens.

In [180]:
def calc_synset_overlap(s1_tokens, s2_tokens):
    s1_tokens = remove_stopwords(s1_tokens)
    s1_tokens = remove_duplicate_tokens(s1_tokens)

    s2_tokens = remove_stopwords(s2_tokens)
    s2_tokens = remove_duplicate_tokens(s2_tokens)
    
#     print(s2_tokens)
#     print(s1_tokens)

    s1_spread = []
    s2_spread = []
    
    for word in s1_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s1_spread:
                    s1_spread.append(syn_word)

    for word in s2_tokens:
        for synset in wn.synsets(word):
            for i in range(0, len(synset.lemmas())):
                syn_word = synset.lemmas()[i].name()
                if syn_word not in s2_spread:
                    s2_spread.append(syn_word)         
    
    return calc_basic_overlap(s1_spread, s2_spread)
    
s1s2_syn_raw, s1s2_syn_norm = calc_synset_overlap(s1_tok, s2_tok)
s3s4_syn_raw, s3s4_syn_norm = calc_synset_overlap(s3_tok, s4_tok)
s3s3_syn_raw, s3s3_syn_norm = calc_synset_overlap(s3_tok, s3_tok)

# Close Example
print('{:15} {:<5} {:<10.3}'.format('similar:',  s1s2_syn_raw, s1s2_syn_norm))

# Different Example
print('{:15} {:<5} {:<10.3}'.format('dessimilar:',  s3s4_syn_raw, s3s4_syn_norm))

# Same Example
print('{:15} {:<5} {:<10.3}'.format('same:',  s3s3_syn_raw, s3s3_syn_norm))

similar:        13    0.306     
dessimilar:     0     0.0       
same:           13    1.0       


## Path Similarity


In [182]:
def get_synsets(sentence1, sentence2):
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tag_to_synset(sentence1, *tagged_word) for tagged_word in sentence1]
    synsets2 = [tag_to_synset(sentence2, *tagged_word) for tagged_word in sentence2]
    
    # Filter out the Nones
    synsets1 = [syn for syn in synsets1 if syn]
    synsets2 = [syn for syn in synsets2 if syn]

    return synsets1, synsets2

def postag_to_synsettag(tag):
    if tag.startswith('N'):
        return 'n'
    if tag.startswith('V') or tag == "MD":
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return

def tag_to_synset(sent, word, tag):
    wn_tag = postag_to_synsettag(tag)
    if wn_tag is None:
        return None
    else:
        try:
            return lesk(sent, word, wn_tag)
        except:
            return None


def sentence_path_similarity(sentence1, sentence2):
    synsets1, synsets2 = get_synsets(sentence1, sentence2)
    score, count = 0.0, 0
    best = 0.0
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        for synset2 in synsets2:
            if synset.path_similarity(synset2) is not None and synset.path_similarity(synset2) > best:
                try:
                    best = synset.path_similarity(synset2)
                except TypeError:
                    continue
        # Check that the similarity could have been computed
        if best is not None:
            score += best
            count += 1
    # Average the values
    try:
        score /= count
    except:
        return 0.0
    return score


def symmetric_sentence_path_similarity(sentence1, sentence2):
    """ compute the symmetric sentence similarity using Wordnet """
    return (sentence_path_similarity(sentence1, sentence2) + sentence_path_similarity(sentence2, sentence1)) / 2     

# Close Example
print('{:15} {:<10.3}'.format('similar:', symmetric_sentence_path_similarity(s1, s2)) )

# Different Example
print('{:15} {:<10.3}'.format('dissimilar:', symmetric_sentence_path_similarity(s3, s4)) )

# Same Example
print('{:15} {:<10.3}'.format('same:', symmetric_sentence_path_similarity(s3, s3)) ) 

similar:        0.0       
dissimilar:     0.0       
same:           0.0       


## Named Entity Overlap

In [183]:
def named_entity_overlap(s1, s2):    
  
    sentence_nlp1= nlp(s1)
    ner1= [(word.text, word.ent_type_) for word in sentence_nlp1 if word.ent_type_]
    
    sentence_nlp2=nlp(s2)
    ner2 = [(word.text, word.ent_type_) for word in sentence_nlp2 if word.ent_type_]

    overlap = []

    
    da = {k:v for k,v in ner1}
    db = {k:v for k,v in ner2}
    total_length = len(set(ner1+ner2))
    temp = []
    for a in da.keys():
        for b in db.keys():
            if a==b:
                temp.insert(len(temp), a)
    if total_length != 0:
        overlap =len(temp)/total_length
    else:
        overlap = 0.0

    return overlap

# Close Example
print('{:15} {:<10.3}'.format('similar:', named_entity_overlap(s1, s2)) )

# Different Example
print('{:15} {:<10.3}'.format('dissimilar:', named_entity_overlap(s3, s4)) )

# Same Example
print('{:15} {:<10.3}'.format('same:', named_entity_overlap(s3, s3)) ) 

similar:        0.0       
dissimilar:     0.0       
same:           0.0       


# Task 4

- Load in model with Pickle Dump
- Preprocess test data
- Run it through the model
- Output a file of our models guesses.

In [200]:
pkl_filename = "./model/random_forest.pkl"
test_data = preprocess("./data/test-set.txt", True)

In [201]:
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)



In [202]:
def pipeline(corpus):
   
    s1_array = corpus['s1']['sentences']
    s2_array = corpus['s2']['sentences']
    s1_tokens = corpus['s1']['tokens']
    s2_tokens = corpus['s2']['tokens']
    s1_lemmas = corpus['s1']['lemmas']
    s2_lemmas = corpus['s2']['lemmas']
    s1_ls = corpus['s2']['ls']
    s2_ls = corpus['s2']['ls']
    
    data = []
    for i in range(0, len(s1_array)):
        cos_sim = calc_cosine_similarity(s1_array[i], s2_array[i])
        sif_sim = calc_sif_similarity(s1_array[i], s2_array[i])
        w_overlap, w_norm_overlap = calc_basic_overlap(s1_tokens[i], s2_tokens[i])
        l_overlap, l_norm_overlap = calc_basic_overlap(s1_lemmas[i], s2_lemmas[i])
        spacy_sim = calc_spacy_sim(s1_array[i], s2_array[i])
        syn_overlap, normalized_syn_overlap = calc_synset_overlap(s1_tokens[i], s2_tokens[i])
        path_similarity = symmetric_sentence_path_similarity(s1_ls[i], s2_ls[i]) 
        ne_overlap = named_entity_overlap(s1_array[i], s2_array[i])
        data.insert(len(data),[len(s1_tokens), len(s2_tokens), w_norm_overlap, l_norm_overlap, spacy_sim, sif_sim, cos_sim, syn_overlap, normalized_syn_overlap, path_similarity, ne_overlap, len(s1_tokens[i]), len(s2_tokens[i])])

    return data
test_input = pipeline(test_data)

In [203]:
pickle_model.predict(test_input)

array([4, 1, 4, 4, 1, 4, 3, 3, 4, 4, 4, 5, 5, 2, 4, 4, 3, 3, 3, 3, 4, 4,
       4, 4, 4, 4, 4, 2, 3, 4, 4, 4, 3, 3, 4, 5, 4, 4, 3, 3, 3, 5, 4, 4,
       4, 4, 2, 4, 3, 3, 4, 4, 4, 3, 4, 3, 4, 3, 2, 1, 5, 3, 3, 4, 4, 2,
       4, 3, 2, 4, 4, 4, 1, 2, 4, 3, 2, 3, 4, 4, 4, 4, 5, 2, 5, 4, 4, 4,
       3, 3, 3, 4, 4, 3, 3, 5, 3, 5, 2, 4, 4, 3, 4, 4, 4, 5, 5, 3, 5, 3,
       3, 3, 3, 3, 4, 1, 3, 1, 4, 4, 4, 4, 3, 4, 3, 4, 5, 4, 5, 4, 4, 3,
       3, 3, 3, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 5, 4, 4, 3, 4, 4, 4, 3, 3,
       3, 3, 5, 4, 4, 4, 4, 2, 4, 5, 3, 4, 3, 3, 4, 4, 5, 4, 5, 3, 2, 2,
       3, 3, 4, 1, 5, 4, 3, 4, 3, 2, 4, 3, 4, 4, 5, 4, 4, 3, 3, 3, 4, 4,
       1, 2, 5, 3, 3, 3, 4, 4, 5, 4, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 1,
       3, 3, 4, 4, 3, 1, 4, 4, 4, 4, 4, 3, 4, 4, 1, 5, 3, 4, 4, 3, 4, 3,
       2, 4, 5, 3, 1, 4, 5, 4, 3, 4, 4, 4, 4, 2, 4, 1, 2, 4, 3, 4, 3, 4,
       4, 4, 5, 3, 2, 2, 3, 4, 5, 5, 3, 4, 4, 4, 2, 4, 4, 3, 3, 3, 4, 3,
       4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 4, 4, 3, 5, 4,