In [1]:
import nltk
import numpy as np
from nltk.book import text1
from nltk.corpus import gutenberg

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
corpus = []
lemantizer = nltk.stem.WordNetLemmatizer()
for token in text1.tokens:
    corpus.append(lemantizer.lemmatize(lemantizer.lemmatize(token, "v"), "a").lower())

In [3]:
len(corpus)

260819

In [4]:
corpus_paragraphs_tmp = nltk.tokenize.blankline_tokenize(gutenberg.raw('melville-moby_dick.txt'))
corpus_paragraphs = []

for paragraph in corpus_paragraphs_tmp:
    paragraph_tokens = []
    for token in nltk.word_tokenize(paragraph):
        paragraph_tokens.append(lemantizer.lemmatize(lemantizer.lemmatize(token, "v"), "a").lower())
    
    corpus_paragraphs.append(paragraph_tokens)

len(corpus_paragraphs)

2793

In [5]:
def make_vocab(corpus):
    vocab = {}
    for i, token in enumerate(set(corpus), 1):
        vocab[token] = i

    return vocab

In [6]:
def make_f_matrix_with_window(corpus, vocab, window_len=10):
    fm = np.zeros((len(vocab) + 1, len(vocab) + 1))
    
    for token_id in range(len(corpus) - window_len):
        token = corpus[token_id + window_len // 2]
        
        for context in corpus[token_id: token_id + window_len]:
            fm[vocab[token], vocab[context]] += 1
    
    return fm

In [7]:
def make_f_matrix_same_paragraph(corpus, vocab):
    fm = np.zeros((len(vocab) + 1, len(vocab) + 1))
    
    for paragraph in corpus:        
        for token in paragraph:
            for context in paragraph:
                fm[vocab[token], vocab[context]] += 1
    
    return fm

In [8]:
def PMI(word1, word2, fm, vocab):
    return np.log(fm[vocab[word1], vocab[word2]] * fm.sum() / fm[vocab[word1]].sum() / fm[vocab[word2]].sum()) 

In [9]:
def PPMI(word1, word2, fm, vocab):
    return max(PMI(word1, word2, fm, vocab), 0)

In [10]:
corpus_tmp = []

for par in corpus_paragraphs:
    corpus_tmp.extend(par)

In [11]:
vocab_par = make_vocab(corpus_tmp)
len(vocab_par)

14997

In [12]:
fm_par = make_f_matrix_same_paragraph(corpus_paragraphs, vocab_par)

In [13]:
fm_par.sum()

50974600.0

In [14]:
fm_par[vocab_par["am"], vocab_par["i"]]

14.0

In [15]:
PMI("am", "i", fm_par, vocab_par)

0.8414838122787834

In [16]:
PPMI("am", "i", fm_par, vocab_par)

0.8414838122787834

In [10]:
vocab_win = make_vocab(corpus)
len(vocab_win)

13439

In [11]:
fm_win = make_f_matrix_with_window(corpus, vocab_win, 10)

In [12]:
fm_win.sum()

2608090.0

In [13]:
fm_win[vocab_win["white"], vocab_win["whale"]]

126.0

In [14]:
PMI("sperm", "whale", fm_win, vocab_win)

2.5999149980578924

In [15]:
PPMI("sperm", "whale", fm_win, vocab_win)

2.5999149980578924

In [16]:
text1.collocation_list()

[('Sperm', 'Whale'),
 ('Moby', 'Dick'),
 ('White', 'Whale'),
 ('old', 'man'),
 ('Captain', 'Ahab'),
 ('sperm', 'whale'),
 ('Right', 'Whale'),
 ('Captain', 'Peleg'),
 ('New', 'Bedford'),
 ('Cape', 'Horn'),
 ('cried', 'Ahab'),
 ('years', 'ago'),
 ('lower', 'jaw'),
 ('never', 'mind'),
 ('Father', 'Mapple'),
 ('cried', 'Stubb'),
 ('chief', 'mate'),
 ('white', 'whale'),
 ('ivory', 'leg'),
 ('one', 'hand')]

### Very rare words might will have high PMI values. How would you solve the problem? First I think about is clipping values between some limited numbers, same method is used to prevent gradient vanishing and explosion problems.

In [20]:
PPMI("good", "well", fm_par, vocab_par)

0.06569768907981557

In [21]:
PPMI("i", "am", fm_par, vocab_par)

0.8414838122787834

In [22]:
PPMI("i", "he", fm_par, vocab_par)

0.07592487342471296

In [25]:
PPMI("he", "is", fm_par, vocab_par)

0.14113469315833196

In [28]:
PPMI("beautiful", "gorgeous", fm_par, vocab_par)

  return np.log(fm[vocab[word1], vocab[word2]] * fm.sum() / fm[vocab[word1]].sum() / fm[vocab[word2]].sum())


0