# CS 6320 Natural Language Processing
## Shruti Agrawal & Pat Dayton

This notebook demos our code for Tasks 1 & 2 of the project.

In [34]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from nltk.corpus import wordnet as wn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# SPACY IMPORT
import spacy
nlp = spacy.load("en_core_web_lg")

# CoreNLP setup
core_nlp_url = 'http://localhost:9000'

## Start the Stanford CoreNLP Server
In another console, run the command shown below (without the leading `#`) to start the Stanford CoreNLP Server on port 9000. We will query this API in Part 2.

In [35]:
#java -mx4g -cp "./corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# Task 1: Parse the Corpus
First read in the corpus and do basic parsing to split out the first sentence, second sentence, and score for each line.

In [36]:
def readData(fileName):
    """Read a tab-separated sentence-pair file and split it into columns.

    The first line of the file is treated as a header and discarded. Every
    remaining non-blank line must contain at least four tab-separated fields;
    fields 1, 2 and 3 (0-based) are taken as sentence 1, sentence 2 and score.

    Args:
        fileName: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(s1, s2, score)`` of three parallel lists. Sentences are
        lower-cased with trailing periods stripped; scores are returned as
        the raw strings read from the file.

    Raises:
        IndexError: If a non-blank line has fewer than four tab-separated
            fields.
    """
    s1 = []
    s2 = []
    score = []

    # The context manager closes the file even if reading fails part-way.
    with open(fileName, encoding="utf8") as file:
        file.readline()  # discard the header row
        text = file.read()

    # loop to extract a set of two sentences
    for line in text.split('\n'):
        # A trailing newline (or a stray blank line) produces an empty record;
        # skip it instead of failing on the missing columns.
        if not line.strip():
            continue

        fields = line.split('\t')

        # creating two separate lists of the sentences
        # note: rstrip('.') removes every trailing period, not only the last
        s1.append(fields[1].lower().rstrip('.'))
        s2.append(fields[2].lower().rstrip('.'))

        # collecting the score in a separate list
        score.append(fields[3])

    return s1, s2, score

In [37]:
def preprocess(fileName):
    """Load a sentence-pair file and annotate both sentence columns.

    Every sentence returned by ``readData`` is tokenized with
    ``nltk.word_tokenize``, POS-tagged with ``nltk.pos_tag`` (only the tag
    strings are kept) and lemmatized token-by-token with ``WordNetLemmatizer``.

    Returns:
        A dict with three keys:
          * ``"s1"`` / ``"s2"``: dicts holding parallel per-sentence lists under
            ``"sentences"``, ``"tokens"``, ``"lemmas"``, ``"tags"`` and
            ``"words"`` (the latter is a list of ``{'tok', 'lem', 'tag'}``
            dicts, one per token).
          * ``'scores'``: the score column converted to ``int``.
    """
    first_sentences, second_sentences, raw_scores = readData(fileName)
    lemmatizer = WordNetLemmatizer()

    def annotate(sentences):
        """Build the token/lemma/tag/word views for one sentence column."""
        token_lists = [nltk.word_tokenize(sentence) for sentence in sentences]

        # pos_tag yields (token, tag) pairs; keep just the tag
        tag_lists = [
            [pair[1] for pair in nltk.pos_tag(tokens)]
            for tokens in token_lists
        ]

        lemma_lists = [
            [lemmatizer.lemmatize(token) for token in tokens]
            for tokens in token_lists
        ]

        # One record per word, bundling its token, lemma and tag
        word_lists = [
            [
                {'tok': tok, 'lem': lem, 'tag': tag}
                for tok, lem, tag in zip(tokens, lemmas, tags)
            ]
            for tokens, lemmas, tags in zip(token_lists, lemma_lists, tag_lists)
        ]

        return {
            "sentences": sentences,
            "tokens": token_lists,
            "lemmas": lemma_lists,
            "tags": tag_lists,
            "words": word_lists,
        }

    s1_annotations = annotate(first_sentences)
    s2_annotations = annotate(second_sentences)

    return {
        "s1": s1_annotations,
        "s2": s2_annotations,
        'scores': [int(value) for value in raw_scores],
    }

### Task 1 Example Output

In [38]:
# Run the Task 1 pipeline on the training file; the path is relative to the
# notebook's working directory.
train_data = preprocess("./data/train-set.txt")

In [39]:
# Print the number of sentences in each column of the corpus
for column in ("s1", "s2"):
    print(len(train_data[column]['sentences']))

1484
1484


In [40]:
# Index of the training pair to inspect
r = 286

tkns1 = train_data["s1"]['tokens'][r]
lems1 = train_data["s1"]['lemmas'][r]
tags1 = train_data["s1"]['tags'][r]
tkns2 = train_data["s2"]['tokens'][r]
lems2 = train_data["s2"]['lemmas'][r]
tags2 = train_data["s2"]['tags'][r]

# One [token, lemma, tag] row per word of each sentence
data1 = [list(row) for row in zip(tkns1, lems1, tags1)]
data2 = [list(row) for row in zip(tkns2, lems2, tags2)]

df1 = pd.DataFrame(
    data1,
    columns=['Tokens', 'Lemmas', 'Tags'])

df2 = pd.DataFrame(
    data2,
    columns=['Tokens', 'Lemmas', 'Tags'])

# The row is taken from train_data, so label it as training data
print('ROW {} FROM TRAINING DATA\n'.format(r))
print('Sentence 1\n')
print('Raw: ', train_data["s1"]['sentences'][r])
display(df1)
print('Sentence 2\n')
print('Raw: ', train_data["s2"]['sentences'][r])
display(df2)
print('Score: ', train_data["scores"][r])

ROW 286 FROM TEST DATA

Sentence 1

Raw:  gemstar's shares gathered up 2.6 percent, adding 14 cents to $5.49 at the close
Sentence 2

Raw:  gemstar shares moved higher on the news, closing up 2.6 percent at $5.49 on nasdaq
Score:  4


Unnamed: 0,Tokens,Lemmas,Tags
0,gemstar,gemstar,NN
1,'s,'s,POS
2,shares,share,NNS
3,gathered,gathered,VBD
4,up,up,RP
5,2.6,2.6,CD
6,percent,percent,NN
7,",",",",","
8,adding,adding,VBG
9,14,14,CD


Unnamed: 0,Tokens,Lemmas,Tags
0,gemstar,gemstar,NN
1,shares,share,NNS
2,moved,moved,VBD
3,higher,higher,RBR
4,on,on,IN
5,the,the,DT
6,news,news,NN
7,",",",",","
8,closing,closing,VBG
9,up,up,RP


## Dependency Parsing

In [41]:
# dependency parsing
# A single parser client is used for both sentences of the selected pair
dependency_parser = CoreNLPDependencyParser(url=core_nlp_url)

for heading, column in (
    ("\nDependency Parsing Sentence 1\n", "s1"),
    ("\nDependency Parsing Sentence 2\n", "s2"),
):
    print(heading)
    # raw_parse is unpacked as a one-element sequence: one parse per sentence
    parse, = dependency_parser.raw_parse(train_data[column]['sentences'][r])
    print(parse.to_conll(4))


Dependency Parsing Sentence 1



ConnectionError: HTTPConnectionPool(host='localhost', port=9000): Max retries exceeded with url: /?properties=%7B%22outputFormat%22%3A+%22json%22%2C+%22annotators%22%3A+%22tokenize%2Cpos%2Clemma%2Cssplit%2Cdepparse%22%2C+%22ssplit.eolonly%22%3A+%22true%22%2C+%22tokenize.whitespace%22%3A+%22false%22%7D (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1a4dede080>: Failed to establish a new connection: [Errno 61] Connection refused'))

## Syntactic Parsing
https://www.nltk.org/api/nltk.parse.html

In [0]:
# syntactic parsing
print("\nFull syntactic parse tree for sentence 1: ")
syntactic_parser = CoreNLPParser(url=core_nlp_url)
s1_tree = next(syntactic_parser.raw_parse(train_data["s1"]['sentences'][r]))
s1_tree.pretty_print()

# Save the rendered tree. The context manager closes the file even if
# pretty_print raises while writing. open() does not create directories,
# so ./output must already exist.
with open("./output/s1_parse_tree.txt", "w", encoding="utf-8") as f:
    s1_tree.pretty_print(stream=f)

In [0]:
# syntactic parsing
print("\nFull syntactic parse tree for sentence 2: ")
syntactic_parser = CoreNLPParser(url=core_nlp_url)
# Stored under its own name so the sentence 1 tree is not overwritten
s2_tree = next(syntactic_parser.raw_parse(train_data["s2"]['sentences'][r]))
s2_tree.pretty_print()

# Save the rendered tree. The context manager closes the file even if
# pretty_print raises while writing. open() does not create directories,
# so ./output must already exist.
with open("./output/s2_parse_tree.txt", "w", encoding="utf-8") as f:
    s2_tree.pretty_print(stream=f)

## Wordnet Analysis

In [0]:
def _unique_lemma_names(synsets):
    """Return the lemma names of the given synsets, de-duplicated in first-seen order."""
    names = []
    for synset in synsets:
        for lemma in synset.lemmas():
            if lemma.name() not in names:
                names.append(lemma.name())
    return names


# Print the WordNet relations of every token in sentence 2 of the selected pair.
# Tokens and tags are both read from sentence 2 so that each pair lines up.
for tk, tg in zip(train_data["s2"]['tokens'][r], train_data["s2"]['tags'][r]):

    print('\n***********************************************************************************')
    print(tk, tg)

    # All WordNet senses of the token, regardless of its POS tag
    senses = wn.synsets(tk)

    synonyms = _unique_lemma_names(senses)
    hypernyms = _unique_lemma_names(
        hpr for syn in senses for hpr in syn.hypernyms())
    hyponyms = _unique_lemma_names(
        hpo for syn in senses for hpo in syn.hyponyms())
    substance_meronyms = _unique_lemma_names(
        mrn for syn in senses for mrn in syn.substance_meronyms())
    part_meronyms = _unique_lemma_names(
        mrn for syn in senses for mrn in syn.part_meronyms())
    # Only member holonyms are collected here
    holonyms = _unique_lemma_names(
        hol for syn in senses for hol in syn.member_holonyms())

    print('\nSynonyms: ', synonyms)
    print('\nHypernyms: ', hypernyms)
    print('\nHyponyms: ', hyponyms)
    print('\nMeronyms (substance): ', substance_meronyms)
    print('\nMeronyms (part): ', part_meronyms)
    print('\nHolonyms:', holonyms)

# Task 3

In our model we used 12 features for each pair of sentences to build our Machine Learning Model:
- Cosine Similarity
- Spacy (Cosine) Similarity 
- SIF Similarity
- Word Overlap
- Normalized Word Overlap
- Lemma Overlap
- Normalized Lemma Overlap
- Synset Overlap
- Normalized Synset Overlap
- Path Similarity 
- Named Entity Overlap
- Verb Overlap

In the following cells we show some examples of these in use.


In [None]:
# Test sentences for the following demonstrations.
# These are plain strings bound to the notebook-level names s1..s4.

# Similar Sentences
s1 = 'I enjoy eating apples.'
s2 = 'I like munching red apples'

# Dissimilar Sentences
s3 = 'My final exam was very difficult.'
s4 = 'Your mother smelled of elderberries.'

## Cosine Similarity
Cosine similarity between the TF-IDF vectors of the two sentences, computed after removing English stop words. The value lies between 0 and 1.

In [45]:
def calc_cosine_similarity(s1, s2):
    """Return the cosine similarity of the TF-IDF vectors of two sentences.

    The vectorizer is fitted on just these two sentences, with English stop
    words removed.
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    weighted_terms = vectorizer.fit_transform([s1, s2])

    # Row 0 is s1 and row 1 is s2, so entry [0][1] compares the two sentences
    pairwise = cosine_similarity(weighted_terms, weighted_terms)
    return pairwise[0][1]

# Close, different, and identical sentence pairs, in that order
for label, first, second in (
    ('similar:', s1, s2),
    ('dissimilar:', s3, s4),
    ('same:', s3, s3),
):
    print('{:15} {:<10.3}'.format(label, calc_cosine_similarity(first, second)) )

similar:        0.171     
dissimilar:     0.0       
same:           1.0       


## Spacy Cosine Similarity
Cosine similarity calculated with the spaCy embeddings from the large English model (`en_core_web_lg`).

In [None]:
def calc_spacy_sim(s1, s2):
    """Return the spaCy similarity between two sentences.

    Both strings are run through the notebook-level ``nlp`` pipeline and the
    resulting documents are compared with ``similarity``.
    """
    second_doc = nlp(s2)
    first_doc = nlp(s1)
    return first_doc.similarity(second_doc)

# Task 4: Use Pickle Dump here to Run the Model for given sentences or text file outputting in the correct order