# Additional Practice

You should complete this notebook after watching Lecture 3. You can use page 5-6 (begining with the top of page 5) of [N-gram Language Models (Jurafsky and Martin)](https://github.com/ychennay/dso-560-nlp-and-text-analytics/blob/master/week3/N-Gram%20Language%20Models%20(Jurafsky%20and%20Martin).pdf) for additional help completing the following exercises. Show your work for all exercises to receive credit.

## Transition Matrix for Bigrams

In [43]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import word_tokenize
import pandas as pd

lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word.lower())
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word.lower(), tag))
    return " ".join(lemmatized_sentence)

In [44]:
corpus = ["He eats lunch at home",
         "She wants to eat dinner at home",
         "He eats lunch at work",
         "She wants to go home",
         "He wants lunch"]

# Including START and END in every sentence of the corpus
for i in range(len(corpus)):
    corpus[i] = 'START ' + corpus[i] + ' END'

In [45]:
# Lemmatizing sentence
for i in range(len(corpus)):
    corpus[i] = lemmatize_sentence(corpus[i])

In [46]:
# Creating term matrix
tokens = []
for sentence in corpus:
        words = word_tokenize(sentence)
        for word in words:
            tokens.append(word)

df = pd.DataFrame(index = list(set(tokens)),columns = list(set(tokens)))
df.fillna(0,inplace = True)
cols = list(df.columns) # Make a list of all of the columns in the df
cols.pop(cols.index('end')) #Remove 'end' from list
df = df[cols+['end']]
df

Unnamed: 0,start,dinner,to,go,lunch,at,work,home,he,she,eat,want,end
start,0,0,0,0,0,0,0,0,0,0,0,0,0
dinner,0,0,0,0,0,0,0,0,0,0,0,0,0
to,0,0,0,0,0,0,0,0,0,0,0,0,0
end,0,0,0,0,0,0,0,0,0,0,0,0,0
go,0,0,0,0,0,0,0,0,0,0,0,0,0
lunch,0,0,0,0,0,0,0,0,0,0,0,0,0
at,0,0,0,0,0,0,0,0,0,0,0,0,0
work,0,0,0,0,0,0,0,0,0,0,0,0,0
home,0,0,0,0,0,0,0,0,0,0,0,0,0
he,0,0,0,0,0,0,0,0,0,0,0,0,0


### Calculate the Transition Frequency Matrix
You may lemmatize the original documents (ie., `eats` $\rightarrow$ `eat`, `wants` $\rightarrow$ `want`) and ignore case.

**Hint:** a transition frequency matrix is of shape $V$ x $V$, where $V$ is the number of unique words in the vocabulary of the corpus.

In [47]:
def readData():
    data = corpus
    dat=[]

    for sentence in corpus:
        words = word_tokenize(sentence)
        for word in words:
            dat.append(word)
    print(dat)
    return dat



def createBigram(data,df):
   listOfBigrams = []
   bigramCounts = {}
   unigramCounts = {}
   for i in range(len(data)-1):
      if i < len(data) - 1 and data[i+1].islower():

         listOfBigrams.append((data[i], data[i + 1]))

         if (data[i], data[i+1]) in bigramCounts:
            bigramCounts[(data[i], data[i + 1])] += 1
            df.loc[data[i], data[i + 1]] += 1
         else:
            bigramCounts[(data[i], data[i + 1])] = 1
            df.loc[data[i], data[i + 1]] = 1

      if data[i] in unigramCounts:
         unigramCounts[data[i]] += 1
      else:
         unigramCounts[data[i]] = 1
   return listOfBigrams, unigramCounts, bigramCounts


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        word2 = bigram[1]
        listOfProb[bigram] = (bigramCounts.get(bigram))/(unigramCounts.get(word1))
    return listOfProb

data = readData()
listOfBigrams, unigramCounts, bigramCounts = createBigram(data, df)

print("\n All the possible Bigrams are ")
print(listOfBigrams)

print("\n Bigrams along with their frequency (transition frequency matrix)")
print(bigramCounts)

print("\n Unigrams along with their frequency ")
print(unigramCounts)

bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

print("\n Bigrams along with their probability (transition frequency matrix)")
print(bigramProb)
inputList="This is my cat"
splt=inputList.split()
outputProb1 = 1
bilist=[]
bigrm=[]

for i in range(len(splt) - 1):
    if i < len(splt) - 1:

        bilist.append((splt[i], splt[i + 1]))

print("\n The bigrams in given sentence are ")
print(bilist)
for i in range(len(bilist)):
    if bilist[i] in bigramProb:

        outputProb1 *= bigramProb[bilist[i]]
    else:

        outputProb1 *= 0
print('\n' + 'Probablility of sentence \"This is my cat\" = ' + str(outputProb1))

['start', 'he', 'eat', 'lunch', 'at', 'home', 'end', 'start', 'she', 'want', 'to', 'eat', 'dinner', 'at', 'home', 'end', 'start', 'he', 'eat', 'lunch', 'at', 'work', 'end', 'start', 'she', 'want', 'to', 'go', 'home', 'end', 'start', 'he', 'want', 'lunch', 'end']

 All the possible Bigrams are 
[('start', 'he'), ('he', 'eat'), ('eat', 'lunch'), ('lunch', 'at'), ('at', 'home'), ('home', 'end'), ('end', 'start'), ('start', 'she'), ('she', 'want'), ('want', 'to'), ('to', 'eat'), ('eat', 'dinner'), ('dinner', 'at'), ('at', 'home'), ('home', 'end'), ('end', 'start'), ('start', 'he'), ('he', 'eat'), ('eat', 'lunch'), ('lunch', 'at'), ('at', 'work'), ('work', 'end'), ('end', 'start'), ('start', 'she'), ('she', 'want'), ('want', 'to'), ('to', 'go'), ('go', 'home'), ('home', 'end'), ('end', 'start'), ('start', 'he'), ('he', 'want'), ('want', 'lunch'), ('lunch', 'end')]

 Bigrams along with their frequency (transition frequency matrix)
{('start', 'he'): 3, ('he', 'eat'): 2, ('eat', 'lunch'): 2, (

In [48]:
# transition frequency matrix
df = df.drop('start',axis = 1).drop('end',axis = 0)
df

Unnamed: 0,dinner,to,go,lunch,at,work,home,he,she,eat,want,end
start,0,0,0,0,0,0,0,3,2,0,0,0
dinner,0,0,0,0,1,0,0,0,0,0,0,0
to,0,0,1,0,0,0,0,0,0,1,0,0
go,0,0,0,0,0,0,1,0,0,0,0,0
lunch,0,0,0,0,2,0,0,0,0,0,0,1
at,0,0,0,0,0,1,2,0,0,0,0,0
work,0,0,0,0,0,0,0,0,0,0,0,1
home,0,0,0,0,0,0,0,0,0,0,0,3
he,0,0,0,0,0,0,0,0,0,2,1,0
she,0,0,0,0,0,0,0,0,0,0,2,0


### Convert the Transition Frequency Matrix into a Transition Matrix

**Hint:** the rows in the transition matrix should sum up to 1.

In [49]:
# transition matrix
df = df.div(df.sum(axis=1), axis=0)
df

Unnamed: 0,dinner,to,go,lunch,at,work,home,he,she,eat,want,end
start,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.4,0.0,0.0,0.0
dinner,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
to,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
go,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
lunch,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
at,0.0,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0,0.0,0.0,0.0,0.0
work,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
he,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.333333,0.0
she,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Calculate the Probability of the Following Documents

Assume a bi-gram language model. 

Hint: a valid probability is between 0 and 1. 

### `He wants to go home`
**Hint**: Calculate first the probability of `He` $\rightarrow$ `wants`.


In [50]:
# probability of He wants to go home

inputList = "He wants to go home"
inputList = lemmatize_sentence(inputList)
splt=inputList.split()
outputProb1 = 1
bilist=[]
bigrm=[]

for i in range(len(splt) - 1):
    if i < len(splt) - 1:

        bilist.append((splt[i], splt[i + 1]))

print("\n The bigrams in given sentence are ")
print(bilist)
for i in range(len(bilist)):
    if bilist[i] in bigramProb:

        outputProb1 *= bigramProb[bilist[i]]
    else:

        outputProb1 *= 0
print('\n' + 'Probablility of sentence \"He wants to go home\" = ' + str(round(outputProb1*100,2)) + '%')




 The bigrams in given sentence are 
[('he', 'want'), ('want', 'to'), ('to', 'go'), ('go', 'home')]

Probablility of sentence "He wants to go home" = 11.11%


### `At to go work`

**Hint**: Calculate first the probability of `At` $\rightarrow$ `to`.

In [51]:
# probability of At to go work

inputList="At to go work"
inputList = lemmatize_sentence(inputList)
splt=inputList.split()
outputProb1 = 1
bilist=[]
bigrm=[]

for i in range(len(splt) - 1):
    if i < len(splt) - 1:

        bilist.append((splt[i], splt[i + 1]))

print("\n The bigrams in given sentence are ")
print(bilist)
for i in range(len(bilist)):
    if bilist[i] in bigramProb:

        outputProb1 *= bigramProb[bilist[i]]
    else:

        outputProb1 *= 0
print('\n' + 'Probablility of sentence \"At to go work\" = ' + str(round(outputProb1*100,2)) + '%')





 The bigrams in given sentence are 
[('at', 'to'), ('to', 'go'), ('go', 'work')]

Probablility of sentence "At to go work" = 0.0%


## Cosine Similarity and Euclidean Distance

Calculate the **cosine similarity** and **Euclidean distance** of the following pairs of sentences. Do not use any 3rd party libraries (besides Numpy) to perform your calculation and show your work.

A. *He wants candy.*

B. *He wants soup.*

In [139]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# take two very similar sentences, should have high similarity
# edit these sentences to become less similar, and the similarity score should decrease
data_corpus = ["He wants candy.", 
               "He wants soup."]

X = vectorizer.fit_transform(data_corpus) 
X = X.toarray()
print(vectorizer.get_feature_names())
X

['candy', 'he', 'soup', 'wants']


array([[1, 1, 0, 1],
       [0, 1, 1, 1]], dtype=int64)

In [140]:
# cosine similarity
def cosine_similarity(A, B):
    numerator = dot(A, B)
    denominator = norm(A) * norm(B)
    return numerator / denominator

def cosine_distance(A,B):
    return 1 - cosine_similarity

print(round(cosine_similarity(X[0], X[1]),2))

0.67


In [141]:
# Euclidean distance
def euclidean_distance(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.linalg.norm(x-y)

print(round(euclidean_distance(X[0], X[1]),2))

1.41


A. *He wants candy and sweets, but she wants soup.*

B. *He wants soup.*

In [142]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# take two very similar sentences, should have high similarity
# edit these sentences to become less similar, and the similarity score should decrease
data_corpus = ["He wants candy and sweets, but she wants soup.", 
               "He wants soup."]

X = vectorizer.fit_transform(data_corpus) 
X = X.toarray()
print(vectorizer.get_feature_names())

['and', 'but', 'candy', 'he', 'she', 'soup', 'sweets', 'wants']


In [143]:
# cosine similarity
def cosine_similarity(A, B):
    numerator = dot(A, B)
    denominator = norm(A) * norm(B)
    return numerator / denominator

def cosine_distance(A,B):
    return 1 - cosine_similarity

print(round(cosine_similarity(X[0], X[1]),2))

0.7


In [144]:
# Euclidean distance
def euclidean_distance(x,y):
    x = np.array(x)
    y = np.array(y)
    return np.linalg.norm(x-y)

print(round(euclidean_distance(X[0], X[1]),2))

2.45


### Explain why cosine similarity is usually preferred over Euclidean distance when working within natural language processing projects.

Please use your own words and provide a concrete example.

### your answer
Cosine similarity utilizes the vector representation of a token where as Euclidean distance relies solely on the counts of individual tokens. Cosine similatiry scores similarity of two documents based on the angle between the vector representations by dividing the dot product of the two vectors with the norm of individual vector. Thus it normalizes for the length of the different documents.

For example,

document 1: he loves <br>
document 2: he loves he loves

For the following documents, the cosine similarity would be exactly 1 whereas the euclidean distance would be square root of 2 since it depends simply on the count of tokens.



## Term Frequency - Inverse Document Frequency

Calculate the TF-IDF vectors for the three documents below.

You may use the following equations for term frequency:

$$
n(t,d)
$$
Where $n(t,d)$ is the number of times the term $t$ appears in document $d$.

For inverse document frequency, you may use the following equation:

$$
\frac{N}{1 + n(t)}
$$
Where $N$ is the total number of documents and $n(t)$ is the number of documents the term $t$ appears in.

A. `blue jeans ripped`

B. `blue navy shoes navy`

C. `gym shoes`

In [145]:
# your answer

# creating term frequencies
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# edit these sentences to become less similar, and the similarity score should decrease
data_corpus = ["blue jeans ripped", 
               "blue navy shoes navy",
               "gym shoes",]

X = vectorizer.fit_transform(data_corpus) 
X = X.toarray()
print(vectorizer.get_feature_names())

df = pd.DataFrame(columns = vectorizer.get_feature_names(),
                  data = X)
df.head()

['blue', 'gym', 'jeans', 'navy', 'ripped', 'shoes']


Unnamed: 0,blue,gym,jeans,navy,ripped,shoes
0,1,0,1,0,1,0
1,1,0,0,2,0,1
2,0,1,0,0,0,1


In [146]:
# calculating doc frequency and idf
df.loc['df(t)',:] = np.count_nonzero(df, axis=0)
for i in df.columns:
    df.loc['idf',i] = 3/(1 + df.loc['df(t)',i])
    
df

Unnamed: 0,blue,gym,jeans,navy,ripped,shoes
0,1.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,2.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,1.0
df(t),2.0,1.0,1.0,1.0,1.0,2.0
idf,1.0,1.5,1.5,1.5,1.5,1.0


In [147]:
# calculating final tf.idf vectors!
for i in range(3):
    df.loc[f'd{i+1}',:] = df.loc[i,:]*df.loc['idf',:]

tf_idf = df.loc[['d1','d2','d3'],:]
tf_idf

Unnamed: 0,blue,gym,jeans,navy,ripped,shoes
d1,1.0,0.0,1.5,0.0,1.5,0.0
d2,1.0,0.0,0.0,3.0,0.0,1.0
d3,0.0,1.5,0.0,0.0,0.0,1.0
