In [1]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from nltk import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from gensim.models.phrases import Phrases, Phraser

In [2]:
import nltk

# Fetch the tokenizer model and the Reuters corpus (no-op if already present).
for pkg in ('punkt', 'reuters'):
    nltk.download(pkg)

from nltk.corpus import reuters


[nltk_data] Downloading package punkt to /home/ddellera/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to /home/ddellera/nltk_data...


In [3]:
# Load every Reuters article as lowercased raw text, in fileid order.
documents = [reuters.raw(fileid).lower() for fileid in reuters.fileids()]


In [4]:
# Display the first loaded document as a quick sanity check.
documents[0]



In [5]:
# Flatten the corpus into a single token stream, tokenizing one document at a time.
tokens = []
for doc in documents:
    tokens.extend(word_tokenize(doc))

# Show the first 15 tokens.
tokens[:15]

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u.s.-japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u.s.',
 'and',
 'japan']

In [6]:
# Bigram collocations ranked by PMI (NLTK).
# The previous comment claimed the filter cut-off was 5 while the code used 10;
# the values are now named so the comment and the code cannot drift apart.
MIN_BIGRAM_FREQ = 10  # bigrams occurring fewer times than this are discarded
TOP_N_BIGRAMS = 50    # number of highest-PMI bigrams to display

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
# Drop rare bigrams before ranking them.
finder.apply_freq_filter(MIN_BIGRAM_FREQ)
# Return the TOP_N_BIGRAMS bigrams with the highest PMI.
finder.nbest(bigram_measures.pmi, TOP_N_BIGRAMS)

[('het', 'comite'),
 ('hoare', 'govett'),
 ('lago', 'agrio'),
 ('dar', 'es'),
 ('es', 'salaam'),
 ('corpus', 'christi'),
 ('paz', 'estenssoro'),
 ('corazon', 'aquino'),
 ('ay', 'expd-e'),
 ('l.f.', 'rothschild'),
 ('lear', 'siegler'),
 ('ranks', 'hovis'),
 ('hajime', 'tamura'),
 ('poison', 'pill'),
 ('abu', 'dhabi'),
 ('kleinwort', 'benson'),
 ('ind', 'ttl-f'),
 ('rjr', 'nabisco'),
 ('gates', 'learjet'),
 ('pro', 'forma'),
 ('margaret', 'thatcher'),
 ('carter', 'hawley'),
 ('canary', 'islands'),
 ('bra', 'kanon'),
 ('mcdonnell', 'douglas'),
 ('lord', 'abbett'),
 ('puerto', 'rico'),
 ('phelps', 'dodge'),
 ('sao', 'paulo'),
 ('brace', 'jovanovich'),
 ('karl', 'otto'),
 ('marlin', 'fitzwater'),
 ('pizza', 'inn'),
 ('dean', 'witter'),
 ('buenos', 'aires'),
 ('costa', 'rica'),
 ('king', 'fahd'),
 ('del', 'este'),
 ('hernandez', 'grisanti'),
 ('pl', '480'),
 ('arturo', 'hernandez'),
 ('punta', 'del'),
 ('el', 'nino'),
 ('du', 'pont'),
 ('optional', 'origin'),
 ('drexel', 'burnham'),
 ('denis

# Gensim

In [7]:
# Join all documents, split the text into sentences, then tokenize each sentence.
# `documents` is already lowercased when it is built, so .lower() is a no-op here;
# it is kept so this cell does not depend on that.
sentences = list(map(word_tokenize, sent_tokenize("\n".join(documents).lower())))

In [15]:
# Keep only sentences with at least two tokens (a single token cannot form a bigram).
# NOTE(review): this cell rebinds `sentences`, and its saved execution count is out
# of sequence with its neighbours; re-run top-to-bottom to confirm the outputs.
sentences = list(filter(lambda toks: len(toks) > 1, sentences))

# Show the first tokenized sentence.
sentences[0]

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u.s.-japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u.s.',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 'asia',
 "'s",
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'far-reaching',
 'economic',
 'damage',
 ',',
 'businessmen',
 'and',
 'officials',
 'said',
 '.']

In [9]:
# Fit gensim's phrase (collocation) model on the tokenized sentences.
collocations = Phrases(
    sentences=sentences,
    min_count=10,
    threshold=0.5,   # minimum accepted score
    scoring='npmi',
)

In [10]:
# Wrap the fitted Phrases model in a Phraser; the resulting object is indexed
# with a token list to obtain the tokens with detected bigrams merged.
to_collocations = Phraser(collocations)

In [11]:
# Hand-written lowercase example text used to try out the phraser.
sent = 'new york is in united states of america. south africa and south america are in different continents'

In [12]:
# Tokenize the example and apply the phraser: detected bigrams come back joined
# with "_" (e.g. 'new_york'), other tokens are returned unchanged.
to_collocations[word_tokenize(sent)]

['new_york',
 'is',
 'in',
 'united_states',
 'of',
 'america',
 '.',
 'south_africa',
 'and',
 'south',
 'america',
 'are',
 'in',
 'different',
 'continents']

In [13]:
# Collect the (bigram, score) pairs exported by the phrase model into a DataFrame.
scored_bigrams = list(collocations.export_phrases(sentences))
df_collocations = pd.DataFrame(scored_bigrams, columns=["bigram", "score"])

# (rows, columns) of the resulting table.
df_collocations.shape

(114001, 2)

In [14]:
# Top 50 distinct (bigram, score) rows, highest score first.
(
    df_collocations
    .drop_duplicates()
    .sort_values("score", ascending=False)
    .head(50)
)

Unnamed: 0,bigram,score
33040,b'bra kanon',1.0
8009,b'het comite',1.0
39184,b'corpus christi',1.0
7599,b'lago agrio',1.0
1258,b'buenos aires',1.0
8,b'& lt',0.999734
170,b'lt ;',0.998407
5813,b'04/09/87 03/09/87',0.997678
22537,b'hoare govett',0.992659
8494,b'crazy eddie',0.992514
