In [1]:
import collections
import itertools
import math
import re

import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize

In [2]:
# Small sample paragraph used to exercise the Punkt sentence tokenizer: it mixes
# abbreviations ("Mr.", "S.") with a sentence that starts in lowercase.
# The lines must not carry "... " prefixes: those are interpreter continuation
# prompts, and inside a triple-quoted literal they would become part of the text.
text = '''
Punkt knows that the periods in Mr. Smith and Johann S. Bach
do not mark sentence boundaries.
And sometimes sentences
can start with non-capitalized words.
i is a good variable
name.
'''

In [3]:
# Load the English Punkt sentence tokenizer shipped as an NLTK data resource.
# NOTE(review): this reads a .pickle resource from the local nltk_data directory —
# confirm the 'punkt' data package is installed in the environment running the notebook.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [4]:
# Split the sample text into sentences; strip() first removes the leading and
# trailing newlines of the triple-quoted literal.
sents = sent_detector.tokenize(text.strip())
# Bare last expression so the notebook displays the resulting list of sentences.
sents

['Punkt knows that the periods in Mr. Smith and Johann S. Bach\ndo not mark sentence boundaries.',
 'And sometimes sentences\ncan start with non-capitalized words.',
 'i is a good variable\nname.']

In [5]:
# NOTE(review): disabled experiment, nothing in this cell executes.
# `corpora` and `models` are not imported in any visible cell (they look like gensim
# modules — confirm before re-enabling), and 'ap.txt' is parsed as raw <DOC>-tagged
# text in the following cell rather than loaded as a serialized dictionary/corpus.
#dictionary = corpora.Dictionary.load('ap.txt')
#corpus = corpora.MmCorpus('ap.txt')
#lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

In [6]:
def load_ap_documents(path):
    """Parse a ``<DOC>``-tagged corpus file into a ``{doc_id: body}`` dict.

    Each record is expected to contain a ``<DOCNO>`` identifier and a
    ``<TEXT>`` body; only the first ``TEXT`` section of a record is used.
    Parsing happens inside this function so that its temporaries stay local
    and do not overwrite notebook-level names such as ``text``.

    Args:
        path: Path of the corpus file to read.

    Returns:
        dict mapping the stripped document id to the stripped document text.

    Raises:
        IndexError: if a record lacks a ``DOCNO>`` or ``TEXT>`` marker.
    """
    with open(path, 'r') as corpus_file:
        raw = corpus_file.read()

    documents = {}
    # Whatever precedes the first <DOC> marker is not a record, hence [1:].
    for record in raw.split('<DOC>')[1:]:
        # Take what follows the opening tag and cut it at the next closing tag.
        doc_id = record.split('DOCNO>')[1].split('</')[0].strip()
        body = record.split('TEXT>')[1].split('</')[0].strip()
        documents[doc_id] = body
    return documents


dictionary = load_ap_documents('ap.txt')
dictionary

{'AP900813-0074': "The White House said today its naval barricade of Iraqi ships is taking hold as the United States and its allies squeeze Saddam Hussein economically so the Iraqis cannot ``maintain their war machine.'' The Bush administration said bluntly today that even food would be withheld from Iraq. ``You don't give them the essentials that enable them to carry on the war,'' White House spokesman Marlin Fitzwater said. In Saudi Arabia, where American troops are building a massive defense against any further Iraqi aggression, the acting chief of staff for U.S. troop operations told reporters, ``I think our position grows stronger with each day that goes by.'' As Maj. Gen. Don L. Kaufman spoke, the steady drone of planes brought more soldiers and hardware to the Saudi desert. Kaufman said planes were landing virtually every 10 minutes or so with their military cargo. On the economic front, Fitzwater said the sanctions endorsed by the United Nations are hurting Baghdad. ``The purpo

In [7]:
# Build one lowercase token stream for the whole corpus: every document body is
# joined with newlines, all '.' characters are removed, and the result is
# tokenized with NLTK's word_tokenize.
all_words = [
    token.lower()
    for token in word_tokenize('\n'.join(dictionary.values()).replace('.', ''))
]
all_words

['the',
 'white',
 'house',
 'said',
 'today',
 'its',
 'naval',
 'barricade',
 'of',
 'iraqi',
 'ships',
 'is',
 'taking',
 'hold',
 'as',
 'the',
 'united',
 'states',
 'and',
 'its',
 'allies',
 'squeeze',
 'saddam',
 'hussein',
 'economically',
 'so',
 'the',
 'iraqis',
 'can',
 'not',
 '``',
 'maintain',
 'their',
 'war',
 'machine',
 "''",
 'the',
 'bush',
 'administration',
 'said',
 'bluntly',
 'today',
 'that',
 'even',
 'food',
 'would',
 'be',
 'withheld',
 'from',
 'iraq',
 '``',
 'you',
 'do',
 "n't",
 'give',
 'them',
 'the',
 'essentials',
 'that',
 'enable',
 'them',
 'to',
 'carry',
 'on',
 'the',
 'war',
 ',',
 "''",
 'white',
 'house',
 'spokesman',
 'marlin',
 'fitzwater',
 'said',
 'in',
 'saudi',
 'arabia',
 ',',
 'where',
 'american',
 'troops',
 'are',
 'building',
 'a',
 'massive',
 'defense',
 'against',
 'any',
 'further',
 'iraqi',
 'aggression',
 ',',
 'the',
 'acting',
 'chief',
 'of',
 'staff',
 'for',
 'us',
 'troop',
 'operations',
 'told',
 'reporters'

In [8]:
# Pattern matching tokens made up solely of lowercase ASCII letters.
regexp = re.compile(r'^[a-z]+$')
# Disabled filter (alphabetic, non-stop-word tokens only). While it stays commented
# out, all_words is left untouched and the count below covers every token.
#all_words = [word for word in all_words if regexp.match(word) and word not in stopwords.words('english')]
len(all_words)

1006572

In [9]:
# Frequency table: token -> number of occurrences in the corpus-wide token stream.
words_by_count = collections.Counter(all_words)

In [10]:
print("Tamaño del vocabulario: {}".format(len(words_by_count)))

# Load the stop-word list once and keep it as a set: evaluating
# stopwords.words('english') inside the comprehension would rebuild the list
# for every vocabulary entry and scan it linearly each time.
english_stopwords = set(stopwords.words('english'))

# Keep (word, count) pairs for purely alphabetic, non-stop-word entries.
# `regexp` is the r'^[a-z]+$' pattern compiled in an earlier cell; any word it
# matches is already lowercase, so no extra lower() is needed before the
# stop-word lookup.
filtered = [(word, count) for word, count in words_by_count.items()
            if regexp.match(word) and word not in english_stopwords]

Tamaño del vocabulario: 41981


In [11]:
# Top 500 filtered words by frequency, as a list of (word, count) tuples.
# `filtered` holds unique (word, count) pairs, so it is passed to Counter as a
# mapping rather than unpacked into keyword arguments (which would treat corpus
# words as parameter names).
most_common = collections.Counter(dict(filtered)).most_common(500)

In [12]:
# Collect word-pair co-occurrence counts over the full token stream, using a
# 5-token window instead of strictly adjacent words.
# NOTE(review): all_words is the unfiltered stream (stop words and punctuation
# tokens included) built from all documents joined together — confirm that pairs
# spanning document boundaries are acceptable here.
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(all_words, window_size=5)

In [13]:
# Score every pair drawn from the words in `most_common` with NLTK's "mi_like"
# association measure, stored on a log2 scale. (`math` comes from the import cell.)
most_common_words = [word for word, _count in most_common]
scores = {}
# NOTE(review): combinations() yields each pair only once, in frequency-rank
# order, while the score is requested for the ordered pair (first, second) —
# confirm that pairs occurring mainly in the opposite order are not under-scored.
for first, second in itertools.combinations(most_common_words, 2):
    raw_score = bigram_finder.score_ngram(BigramAssocMeasures.mi_like, first, second)
    # A missing score (None) is mapped to -inf so the pair sorts after every scored one.
    scores[(first, second)] = math.log(raw_score, 2) if raw_score is not None else float('-inf')

In [14]:
# Rank all pairs from highest to lowest log2 score; pairs stored with -inf
# (no score available) end up at the bottom of the list.
sorted(scores.items(), key=lambda p: p[1], reverse=True)

[(('united', 'states'), 2.7551278160984856),
 (('new', 'york'), 1.6535991637970748),
 (('soviet', 'union'), -0.7298933215015819),
 (('south', 'africa'), -0.948522532564868),
 (('interest', 'rates'), -1.4963077251140278),
 (('news', 'conference'), -1.8485540274414642),
 (('last', 'week'), -1.956718582265691),
 (('told', 'reporters'), -1.9710934819032946),
 (('years', 'ago'), -2.161809198934046),
 (('west', 'german'), -2.2167864104483908),
 (('air', 'force'), -2.3618762941958744),
 (('news', 'agency'), -2.6961006725238628),
 (('stock', 'exchange'), -2.7058619380806177),
 (('west', 'germany'), -2.7480105798076004),
 (('last', 'month'), -3.191722262113112),
 (('east', 'germany'), -3.6021608883152436),
 (('eastern', 'europe'), -3.720898369091322),
 (('million', 'shares'), -4.004202833957604),
 (('chief', 'executive'), -4.0201460824503),
 (('president', 'bush'), -4.05708172664382),
 (('said', 'would'), -4.211349598225981),
 (('world', 'war'), -4.340771745806838),
 (('president', 'reagan'), -