In [1]:
import nltk
from nltk.book import *
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import warnings
import numpy as np
import pandas as pd

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [17]:
warnings.filterwarnings('ignore')

In [18]:
def preprocess_text(text):
    cleaned_tokens = []
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(text)
    for token, tag in pos_tag(text):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        if token not in string.punctuation and token.lower() not in stop_words and ((token.isalpha() and len(token) > 1) or token.isdigit()):
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [19]:
def ppmi_matrix(corpus, window_length=0, normalize=False):
    fd = FreqDist(nltk.word_tokenize(' '.join([' '.join(corpus[i][j] for j in range(len(corpus[i]))) for i in range(len(corpus))])))
    fd_keys = list(fd.keys())
    corp_list = [word for sublist in corpus for word in sublist]
    if window_length > 0:
        w_dict = {w: np.array([1 if w in doc else 0 for doc in [fd_keys[i:i+window_length] for i in range(len(corp_list))] + [fd_keys[j:j-window_length] for j in range(len(corp_list), 0, -1)]]) for w in corp_list}
    else: 
        w_dict = {w: np.array([1 if w in doc else 0 for doc in corpus]) for w in corp_list}
        
    term_term_matrix = np.zeros((len(fd_keys), len(fd_keys)))
    if not normalize:
        for i, w in enumerate(fd_keys):
            for j, c in enumerate(fd_keys):
                term_term_matrix[i, j] = round(pmi(w,c), 2)
    else:
        for i, w in enumerate(fd_keys):
            for j, c in enumerate(fd_keys):
                term_term_matrix[i, j] = round(1/(1+(np.exp(-(pmi(w,c))))), 2) 
                
    def n(w, c):
        return np.sum(w_dict[w]*w_dict[c])
    denominator = 0
    counter = 0
    for w in fd_keys:
        denominator += counter
        counter = 0
        for c in fd_keys:
            counter += n(w, c)

    def p_ij(w, c):
        return n(w, c)/denominator

    def p_i(w):
        return sum([n(w, c) for c in fd_keys])/denominator
    
    def p_j(c):
        return sum([n(w, c) for w in fd_keys])/denominator

    def pmi(w, c):
        return max(np.log2((p_ij(w,c))/(p_i(w)*p_j(c))), 0)
    
    return pd.DataFrame(term_term_matrix, columns=fd_keys, index=fd_keys)

##### З цими текстами не так довго як з іншими бо на інших ppmi_paragraph грузив тільки годину, боюсь уявити, що було б з sliding window

In [20]:
text_2 = pre_process_text(" ".join(text2))
text_3 = pre_process_text(" ".join(text3))
text_7 = pre_process_text(" ".join(text7))
text_9 = pre_process_text(" ".join(text9))

In [21]:
corpus = [text_2[:int(round(len(text_2)/300))], text_7[:int(round(len(text_7)/300))], text_3[:int(round(len(text_3)/300))], text_9[:int(round(len(text_9)/300))]] 

## paragraph

In [24]:
ppmi_paragraph = ppmi_matrix(corpus)
print("PPMI with paragraph context:")
print(ppmi_paragraph)

PPMI with paragraph context:
             sense  sensibility  jane  austen  1811  chapter     1  family  \
sense         1.05         1.05  1.05    1.05  1.05     1.05  1.05    1.05   
sensibility   1.05         1.05  1.05    1.05  1.05     1.05  1.05    1.05   
jane          1.05         1.05  1.05    1.05  1.05     1.05  1.05    1.05   
austen        1.05         1.05  1.05    1.05  1.05     1.05  1.05    1.05   
1811          1.05         1.05  1.05    1.05  1.05     1.05  1.05    1.05   
...            ...          ...   ...     ...   ...      ...   ...     ...   
hymn          0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
children      0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
fort          0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
sand          0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
eve           0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   

             dashwood  long  ...  

 ## sliding window(2)

In [22]:
ppmi_sliding_window_2 = ppmi_matrix(corpus, 2)
print("\nPPMI with sliding window length 2:")
print(ppmi_sliding_window_2)



PPMI with sliding window length 2:
             sense  sensibility  jane  austen  1811  chapter    1  family  \
sense        14.78         7.38  0.00    0.00  0.00     0.00  0.0     0.0   
sensibility   7.38         1.57  0.98    0.00  0.00     0.00  0.0     0.0   
jane          0.00         0.98  1.57    0.98  0.00     0.00  0.0     0.0   
austen        0.00         0.00  0.98    1.57  0.98     0.00  0.0     0.0   
1811          0.00         0.00  0.00    0.98  1.57     0.98  0.0     0.0   
...            ...          ...   ...     ...   ...      ...  ...     ...   
hymn          0.00         0.00  0.00    0.00  0.00     0.00  0.0     0.0   
children      0.00         0.00  0.00    0.00  0.00     0.00  0.0     0.0   
fort          0.00         0.00  0.00    0.00  0.00     0.00  0.0     0.0   
sand          0.00         0.00  0.00    0.00  0.00     0.00  0.0     0.0   
eve           0.00         0.00  0.00    0.00  0.00     0.00  0.0     0.0   

             dashwood  long  ...  thus 

## sliding window (5)

In [23]:
ppmi_sliding_window_5 = ppmi_matrix(corpus, 5)
print("\nPPMI with sliding window length 5:")
print(ppmi_sliding_window_5)



PPMI with sliding window length 5:
             sense  sensibility  jane  austen  1811  chapter     1  family  \
sense        14.12         8.03  7.04    6.46  6.05     0.00  0.00    0.00   
sensibility   8.03         3.53  2.54    1.96  1.54     0.96  0.00    0.00   
jane          7.04         2.54  2.28    1.70  1.29     0.97  0.55    0.00   
austen        6.46         1.96  1.70    1.61  1.19     0.97  0.71    0.39   
1811          6.05         1.54  1.29    1.19  1.14     0.97  0.78    0.56   
...            ...          ...   ...     ...   ...      ...   ...     ...   
hymn          0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
children      0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
fort          0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
sand          0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   
eve           0.00         0.00  0.00    0.00  0.00     0.00  0.00    0.00   

             dashwood  long

## Normalized PPMI sliding window (5)

In [25]:
ppmi_sliding_window_5_normalized = ppmi_matrix(corpus, 5, True)
print("\nNormalized PPMI with sliding window length 5:")
print(ppmi_sliding_window_5_normalized)


Normalized PPMI with sliding window length 5:
             sense  sensibility  jane  austen  1811  chapter     1  family  \
sense          1.0         1.00  1.00    1.00  1.00     0.50  0.50    0.50   
sensibility    1.0         0.97  0.93    0.88  0.82     0.72  0.50    0.50   
jane           1.0         0.93  0.91    0.85  0.78     0.72  0.63    0.50   
austen         1.0         0.88  0.85    0.83  0.77     0.73  0.67    0.60   
1811           1.0         0.82  0.78    0.77  0.76     0.73  0.69    0.64   
...            ...          ...   ...     ...   ...      ...   ...     ...   
hymn           0.5         0.50  0.50    0.50  0.50     0.50  0.50    0.50   
children       0.5         0.50  0.50    0.50  0.50     0.50  0.50    0.50   
fort           0.5         0.50  0.50    0.50  0.50     0.50  0.50    0.50   
sand           0.5         0.50  0.50    0.50  0.50     0.50  0.50    0.50   
eve            0.5         0.50  0.50    0.50  0.50     0.50  0.50    0.50   

             das

## Example

In [26]:
t_t_sliding_window_normalized['fort']['sand']

0.96