### Collocations are expressions of multiple words which commonly co-occur


In [1]:
from nltk.book import *

import nltk

nltk.download('book')
import matplotlib as plt
import numpy as np
import pandas as pd
import plotly.express as px


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/fxr/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /home/fxr/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to /home/fxr/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to /home/fxr/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/fxr/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/fxr/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /home/fxr/nltk_data...
[nltk_data]    |   Package

In [2]:
md_bigrams = list(bigrams(text1))

threshold = 2
# Distribution of bi-grams
filtered_bigrams = [bigram for bigram in md_bigrams if len(bigram[0])>threshold and len(bigram[1])>threshold]
filtered_bigram_dist = FreqDist(filtered_bigrams)
# Distribution of words
filtered_words = [word for word in text1 if len(word)>threshold]
filtered_word_dist = FreqDist(filtered_words)

In [3]:
df = pd.DataFrame()
df['bi_gram'] = list(set(filtered_bigrams))
df['word_0'] = df['bi_gram'].apply(lambda x: x[0])
df['word_1'] = df['bi_gram'].apply(lambda x: x[1])
df['bi_gram_freq'] = df['bi_gram'].apply(lambda x: filtered_bigram_dist[x])
df['word_0_freq'] = df['word_0'].apply(lambda x: filtered_word_dist[x])
df['word_1_freq'] = df['word_1'].apply(lambda x: filtered_word_dist[x])
df

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq
0,"(this, appellative)",this,appellative,1,1280,2
1,"(any, Nantucketer)",any,Nantucketer,1,320,22
2,"(the, contraband)",the,contraband,1,13721,1
3,"(call, upon)",call,upon,1,52,538
4,"(and, twenty)",and,twenty,4,6024,33
...,...,...,...,...,...,...
67937,"(one, figure)",one,figure,1,889,19
67938,"(each, three)",each,three,1,127,237
67939,"(the, astonished)",the,astonished,1,13721,4
67940,"(Consider, all)",Consider,all,1,8,1462


### **Pointwise Mutual Information (PMI)** is a metric based on information theory used to find collocations

$$
PMI = \log\left(\frac{P(w_1, w_2)}{P(w_1)P(w_2)}\right)
$$


In [4]:
df['PMI'] = df[['bi_gram_freq', 'word_0_freq', 'word_1_freq']].apply(lambda x: np.log2(x.values[0] / (x.values[1] * x.values[2])), axis=1)
df

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq,PMI
0,"(this, appellative)",this,appellative,1,1280,2,-11.321928
1,"(any, Nantucketer)",any,Nantucketer,1,320,22,-12.781360
2,"(the, contraband)",the,contraband,1,13721,1,-13.744098
3,"(call, upon)",call,upon,1,52,538,-14.771902
4,"(and, twenty)",and,twenty,4,6024,33,-15.600900
...,...,...,...,...,...,...,...
67937,"(one, figure)",one,figure,1,889,19,-14.043967
67938,"(each, three)",each,three,1,127,237,-14.877428
67939,"(the, astonished)",the,astonished,1,13721,4,-15.744098
67940,"(Consider, all)",Consider,all,1,8,1462,-13.513728


In [5]:
df.sort_values(by='PMI', ascending=False)

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq,PMI
20973,"(CHIEF, MATES)",CHIEF,MATES,1,1,1,0.000000
43594,"(Wretched, entertainment)",Wretched,entertainment,1,1,1,0.000000
21730,"(Saint, Stylites)",Saint,Stylites,1,1,1,0.000000
42634,"(INTO, ASIA)",INTO,ASIA,1,1,1,0.000000
55635,"(Descartian, vortices)",Descartian,vortices,1,1,1,0.000000
...,...,...,...,...,...,...,...
3752,"(man, the)",man,the,1,508,13721,-22.732783
24750,"(some, the)",some,the,1,578,13721,-22.919024
24223,"(one, the)",one,the,1,889,13721,-23.540138
62594,"(the, not)",the,not,1,13721,1103,-23.851315


In [6]:
df['log(bi_gram_freq)'] = df['bi_gram_freq'].apply(lambda x: np.log2(x))
df

Unnamed: 0,bi_gram,word_0,word_1,bi_gram_freq,word_0_freq,word_1_freq,PMI,log(bi_gram_freq)
0,"(this, appellative)",this,appellative,1,1280,2,-11.321928,0.0
1,"(any, Nantucketer)",any,Nantucketer,1,320,22,-12.781360,0.0
2,"(the, contraband)",the,contraband,1,13721,1,-13.744098,0.0
3,"(call, upon)",call,upon,1,52,538,-14.771902,0.0
4,"(and, twenty)",and,twenty,4,6024,33,-15.600900,2.0
...,...,...,...,...,...,...,...,...
67937,"(one, figure)",one,figure,1,889,19,-14.043967,0.0
67938,"(each, three)",each,three,1,127,237,-14.877428,0.0
67939,"(the, astonished)",the,astonished,1,13721,4,-15.744098,0.0
67940,"(Consider, all)",Consider,all,1,8,1462,-13.513728,0.0


In [7]:
fig = px.scatter(x=df['PMI'].values , y=df['log(bi_gram_freq)'].values , color=df['PMI']+df['log(bi_gram_freq)'] , hover_name=df['bi_gram'].values , width=600 , height= 600, labels={'x': 'PMI', 'y': 'log(Bigram Frequency)'})
fig.show()

### Built-in measures with NLTK

In [3]:
from nltk.collocations import *
# PMI
bigram_measure = nltk.collocations.BigramAssocMeasures()
# Collocations
finder = BigramCollocationFinder.from_words(text1)

### Find collocations

In [4]:
# Filter possible collocations by frequency distance
finder.apply_freq_filter(20)
finder.nbest(bigram_measure.pmi, 10)

[('Moby', 'Dick'),
 ('Sperm', 'Whale'),
 ('White', 'Whale'),
 ('Right', 'Whale'),
 ('Captain', 'Peleg'),
 (',"', 'said'),
 ('never', 'mind'),
 ('!"', 'cried'),
 ('no', 'means'),
 ('each', 'other')]

### Spanish Example

In [9]:
nltk.download('cess_esp')
corpus = nltk.corpus.cess_esp.sents()
corpus

[nltk_data] Downloading package cess_esp to /home/fxr/nltk_data...
[nltk_data]   Package cess_esp is already up-to-date!


[['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',', 'jueves', ',', 'la', 'compra', 'del', '51_por_ciento', 'de', 'la', 'empresa', 'mexicana', 'Electricidad_Águila_de_Altamira', '-Fpa-', 'EAA', '-Fpt-', ',', 'creada', 'por', 'el', 'japonés', 'Mitsubishi_Corporation', 'para', 'poner_en_marcha', 'una', 'central', 'de', 'gas', 'de', '495', 'megavatios', '.'], ['Una', 'portavoz', 'de', 'EDF', 'explicó', 'a', 'EFE', 'que', 'el', 'proyecto', 'para', 'la', 'construcción', 'de', 'Altamira_2', ',', 'al', 'norte', 'de', 'Tampico', ',', 'prevé', 'la', 'utilización', 'de', 'gas', 'natural', 'como', 'combustible', 'principal', 'en', 'una', 'central', 'de', 'ciclo', 'combinado', 'que', 'debe', 'empezar', 'a', 'funcionar', 'en', 'mayo_del_2002', '.'], ...]

In [11]:
flatten_corpus = [w for l in corpus for w in l]
print(flatten_corpus[:10])

['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',']


In [12]:
finder = BigramCollocationFinder.from_documents(corpus)
finder.apply_freq_filter(10)
finder.nbest(bigram_measure.pmi, 10)

[('señora', 'Aguirre'),
 ('secretario', 'general'),
 ('elecciones', 'generales'),
 ('campaña', 'electoral'),
 ('quiere', 'decir'),
 ('Se', 'trata'),
 ('segunda', 'vuelta'),
 ('director', 'general'),
 ('primer', 'ministro'),
 ('primer', 'lugar')]

In [14]:
finder = BigramCollocationFinder.from_words(flatten_corpus)
finder.apply_freq_filter(10)
finder.nbest(bigram_measure.pmi, 10)

[('señora', 'Aguirre'),
 ('secretario', 'general'),
 ('elecciones', 'generales'),
 ('campaña', 'electoral'),
 ('quiere', 'decir'),
 ('Se', 'trata'),
 ('segunda', 'vuelta'),
 ('director', 'general'),
 ('primer', 'ministro'),
 ('primer', 'lugar')]

### Bibliography:
- [Language Processing and Python](https://www.nltk.org/book/ch01.html)
- [Foundations of Statistical Natural Language Processing](https://nlp.stanford.edu/fsnlp/)
- [nltk.collocations module](https://www.nltk.org/api/nltk.collocations.html)