In [19]:
import numpy as np
import pandas as pd
import math
import nltk
from nltk.book import *
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import warnings
warnings.filterwarnings('ignore')

In [20]:
def pre_process_text(text):
    cleaned_tokens = []
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)

    for token, tag in pos_tag(text):

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
   
        token = lemmatizer.lemmatize(token, pos)

        if token not in string.punctuation and token.lower() not in stop_words and ((token.isalpha() and len(token)>1) or token.isdigit()):
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [21]:
text_1 = pre_process_text(" ".join(text1))
text_2 = pre_process_text(" ".join(text2))
text_3 = pre_process_text(" ".join(text3))
text_4 = pre_process_text(" ".join(text4))
text_5 = pre_process_text(" ".join(text5))
text_6 = pre_process_text(" ".join(text6))
text_7 = pre_process_text(" ".join(text7))
text_8 = pre_process_text(" ".join(text8))
text_9 = pre_process_text(" ".join(text9))

In [22]:
corpus = [text_1[:int(round(len(text_1)/300))],
          text_2[:int(round(len(text_2)/300))],
          text_3[:int(round(len(text_3)/300))],
          text_4[:int(round(len(text_4)/300))]] 

### Task 1-2

In [23]:
# PPMI

def ppmi(corpus, length=0, normalizing=False):
    fd = FreqDist(nltk.word_tokenize(' '.join([' '.join(corpus[i][j] for j in range(len(corpus[i]))) for i in range(len(corpus))])))
    fd_keys_lst = list(fd.keys())
    corp_lst = []
    for i in corpus:
        corp_lst = corp_lst+i
            
    if length > 0:
        
        w_dict = {w: np.array([1 if w in doc else 0 
                  for doc in 
                  [fd_keys_lst[i:i+length]
                  for i in range(len(corp_lst))]
                  +
                  [fd_keys_lst[j:j-length] 
                  for j in range(len(corp_lst), 0, -1)]]) 
                  for w in corp_lst}
    
    else: 
        w_dict = {w: np.array([1 if w in doc else 0 for doc in corpus]) for w in corp_lst}

    def n(w, c):
        return np.sum(w_dict[w]*w_dict[c])

    denominator = 0
    counter = 0

    for w in fd_keys_lst:
        denominator += counter
        counter = 0
        for c in fd_keys_lst:
            counter += n(w, c)

    def p_ij(w, c):
        return n(w, c)/denominator

    def p_i(w):
        return sum([n(w, c) for c in fd_keys_lst])/denominator
    
    def p_j(c):
        return sum([n(w, c) for w in fd_keys_lst])/denominator

    def pmi(w, c):
        return max(np.log2((p_ij(w,c))/(p_i(w)*p_j(c))), 0)
    
    term_term_matrix = np.zeros((len(fd_keys_lst), len(fd_keys_lst)))

    if normalizing == False:

        for i, w in enumerate(fd_keys_lst):
            for j, c in enumerate(fd_keys_lst):
                term_term_matrix[i, j] = round(pmi(w,c), 2)
    else:
        for i, w in enumerate(fd_keys_lst):
            for j, c in enumerate(fd_keys_lst):
                term_term_matrix[i, j] = round(1/(1+(np.exp(-(pmi(w,c))))), 2)

    return pd.DataFrame(term_term_matrix, columns=fd_keys_lst, index=fd_keys_lst)

#### PPMI (context = paragraph)

In [24]:
t_t_paragraph = ppmi(corpus)
t_t_paragraph

Unnamed: 0,moby,dick,herman,melville,1851,etymology,supplied,late,consumptive,usher,...,sentiment,less,either,bind,acknowledge,adore,invisible,conduct,affair,men
moby,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.19,0.76,0.76,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
dick,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.19,0.76,0.76,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
herman,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.19,0.76,0.76,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
melville,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.19,0.76,0.76,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1851,0.76,0.76,0.76,0.76,0.76,0.76,0.76,0.19,0.76,0.76,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
adore,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93
invisible,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93
conduct,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93
affair,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93,1.93


#### PPMI (context = sliding window length 2)

In [25]:
t_t_sliding_window = ppmi(corpus, 2)
t_t_sliding_window

Unnamed: 0,moby,dick,herman,melville,1851,etymology,supplied,late,consumptive,usher,...,sentiment,less,either,bind,acknowledge,adore,invisible,conduct,affair,men
moby,16.35,8.17,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00
dick,8.17,1.57,0.99,0.00,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00
herman,0.00,0.99,1.57,0.99,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00
melville,0.00,0.00,0.99,1.57,0.99,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00
1851,0.00,0.00,0.00,0.99,1.57,0.99,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
adore,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.99,1.57,0.99,0.00,0.00,0.00
invisible,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.99,1.57,0.99,0.00,0.00
conduct,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.99,1.57,0.99,0.00
affair,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.99,1.57,7.58


#### PPMI (context = sliding window length 10)

In [26]:
t_t_sliding_window = ppmi(corpus, 10)
t_t_sliding_window

Unnamed: 0,moby,dick,herman,melville,1851,etymology,supplied,late,consumptive,usher,...,sentiment,less,either,bind,acknowledge,adore,invisible,conduct,affair,men
moby,14.85,8.98,7.99,7.41,6.99,6.67,6.41,6.19,6.00,5.83,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
dick,8.98,4.68,3.70,3.11,2.70,2.38,2.12,1.90,1.70,1.54,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
herman,7.99,3.70,3.44,2.86,2.45,2.13,1.87,1.65,1.45,1.28,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
melville,7.41,3.11,2.86,2.77,2.36,2.03,1.77,1.55,1.36,1.19,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1851,6.99,2.70,2.45,2.36,2.31,1.98,1.72,1.50,1.31,1.14,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
adore,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.12,1.44,1.76,2.11,2.48,2.90,3.08,3.38,3.99,7.10
invisible,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.16,1.52,1.88,2.25,2.64,3.08,3.60,3.92,4.56,7.72
conduct,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.23,1.66,2.07,2.49,2.91,3.38,3.92,4.61,5.28,8.48
affair,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.43,2.01,2.52,3.00,3.48,3.99,4.56,5.28,6.35,9.58


### Task 3

In [27]:
t_t_sliding_window_normalized = ppmi(corpus, 10, True)
t_t_sliding_window_normalized

Unnamed: 0,moby,dick,herman,melville,1851,etymology,supplied,late,consumptive,usher,...,sentiment,less,either,bind,acknowledge,adore,invisible,conduct,affair,men
moby,1.0,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.5
dick,1.0,0.99,0.98,0.96,0.94,0.92,0.89,0.87,0.85,0.82,...,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.5
herman,1.0,0.98,0.97,0.95,0.92,0.89,0.87,0.84,0.81,0.78,...,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.5
melville,1.0,0.96,0.95,0.94,0.91,0.88,0.85,0.83,0.80,0.77,...,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.5
1851,1.0,0.94,0.92,0.91,0.91,0.88,0.85,0.82,0.79,0.76,...,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
adore,0.5,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,...,0.75,0.81,0.85,0.89,0.92,0.95,0.96,0.97,0.98,1.0
invisible,0.5,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,...,0.76,0.82,0.87,0.90,0.93,0.96,0.97,0.98,0.99,1.0
conduct,0.5,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,...,0.77,0.84,0.89,0.92,0.95,0.97,0.98,0.99,0.99,1.0
affair,0.5,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,0.50,...,0.81,0.88,0.93,0.95,0.97,0.98,0.99,0.99,1.00,1.0


### Task 4

In [28]:
t_t_sliding_window_normalized['asylum']['residence']

0.5

In [29]:
t_t_sliding_window_normalized['late']['old']

0.5

In [30]:
t_t_sliding_window_normalized['body']['man']

0.5