In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

def preprocess(sent):
    def convert(word):
        # Verifica se é um número.
        try:
            _ = float(word)
            return '<num>'
        except:
            pass
        
        # Verifica se é uma palavra.
        if word.isalpha():
            lower = word.lower()
            return '<stop>' if lower in STOPWORDS else lower
        
        # Caso contrário, é pontuação ou estranho.
        return '<weird>'
    
    processed = [convert(word) for word in sent]
    forbidden_words = set(('<num>', '<stop>', '<weird>'))
    return [word for word in processed if word not in forbidden_words]

In [3]:
%%time
# from nltk.corpus import reuters
# sents = [preprocess(sent) for sent in reuters.sents()]

# from nltk.corpus import reuters
# from nltk.tokenize import word_tokenize
# docs = [word_tokenize(reuters.raw(fileid).strip()) for fileid in reuters.fileids()]
# sents = [preprocess(doc) for doc in docs]

# from nltk.corpus import brown
# sents = [preprocess(sent) for sent in brown.sents()]

from sklearn.datasets import fetch_20newsgroups
sents = [preprocess(item.strip().split()) for item in fetch_20newsgroups()['data']]

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Wall time: 1min 35s


# Doc2Vec

In [4]:
import gensim

In [5]:
with open('sentences.txt', 'w', encoding='utf8') as file:
    for sentence in sents:
        file.write(f'{" ".join([tok for tok in sentence if tok != "<num>"])}\n')

In [6]:
%%time
model_cbow = gensim.models.Doc2Vec(
    corpus_file='sentences.txt',
    vector_size=200,
    window=5,
    min_count=5,
    workers=12,
    epochs=100,
)

Wall time: 1min 1s


In [7]:
model_cbow.docvecs.vectors_docs

array([[-4.2916369e-01,  5.5152011e-01,  2.1634729e-01, ...,
         4.0536472e-01,  5.6265938e-01,  6.0627490e-01],
       [-4.3762356e-01, -1.1232828e+00, -1.4373672e+00, ...,
        -1.8785514e+00,  2.6089594e-01,  8.5045612e-01],
       [-1.3427912e+00,  1.1281815e+00,  2.0876613e-01, ...,
         1.5984105e+00, -2.1001346e+00,  5.1825309e-01],
       ...,
       [ 6.2170543e-04, -2.0785832e-03,  2.3196817e-03, ...,
        -2.1602854e-03,  2.0566414e-04,  1.9103849e-03],
       [ 1.7440557e-03, -7.1087194e-04,  1.6258265e-03, ...,
         1.1208635e-03,  9.2979521e-04,  1.8092123e-03],
       [-1.3417950e-03, -9.4703777e-04, -6.8338070e-04, ...,
        -2.3585355e-03, -1.4061083e-03,  1.2617821e-03]], dtype=float32)

In [9]:
from sklearn.cluster import MiniBatchKMeans
kmeans_cbow = MiniBatchKMeans(n_clusters=10, random_state=42)

In [10]:
%%time
result = kmeans_cbow.fit_transform(model_cbow.docvecs.vectors_docs)

Wall time: 131 ms


In [11]:
result.shape

(11314, 10)

In [12]:
import pandas as pd
df = pd.DataFrame(result)

In [13]:
num_sents = 10
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans_cbow.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(' '
              .join(sents[k])
              .replace('<weird>', '')
              .replace('  ', ' ')
              .replace('<num>', '#')[:300]
             )
        print('-' * 80)

################################################################################

Cluster 0: [4255]
computer science western canada usa dmorf could anyone direct ftp site find morphing package called downloaded file last new dos crashed hard drive lost find site got morphing packages dos thanks barry author wgt graphics toolkit vga version available mouse graphics primitives power data storage mul
--------------------------------------------------------------------------------
################################################################################

Cluster 1: [11302  4809  9789 10718  9384   852   671  8120  8200  5191]
code good san ca na trademark risc chip used thier wonder intergraph going infringement thier name probably keep quiet take lest get kneecaps good ten weeks friday good
--------------------------------------------------------------------------------
employment concentrate child exnet systems ltd public access uk article take logic far man invent need warmth man

In [17]:
pd.Series(kmeans_cbow.labels_).value_counts()

7    5171
6    4703
9     572
3     404
4     272
1     110
2      75
5       5
8       1
0       1
dtype: int64

# LDA

https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [18]:
import pickle
from gensim.corpora import Dictionary

In [19]:
%%time
dictionary = Dictionary(sents)

Wall time: 1.27 s


In [20]:
%%time
corpus = [dictionary.doc2bow(sent) for sent in sents]

Wall time: 709 ms


In [21]:
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [22]:
for k, item in enumerate(dictionary.items()):
    print(item)
    if k >= 20:
        break

(0, 'anyone')
(1, 'brought')
(2, 'bumper')
(3, 'called')
(4, 'car')
(5, 'college')
(6, 'could')
(7, 'doors')
(8, 'early')
(9, 'engine')
(10, 'enlighten')
(11, 'front')
(12, 'funky')
(13, 'il')
(14, 'info')
(15, 'late')
(16, 'lerxst')
(17, 'looked')
(18, 'looking')
(19, 'model')
(20, 'neighborhood')


In [23]:
from gensim.models.ldamulticore import LdaMulticore

In [24]:
%%time
NUM_TOPICS = 100
ldamodel = LdaMulticore(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=30)

Wall time: 5min 7s


In [25]:
ldamodel.save('model5.gensim')

In [26]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(31, '0.029*"hockey" + 0.020*"nhl" + 0.015*"team" + 0.010*"play" + 0.009*"players" + 0.009*"league" + 0.008*"teams" + 0.007*"city" + 0.007*"north" + 0.007*"europeans"')
(95, '0.019*"article" + 0.014*"would" + 0.009*"one" + 0.007*"get" + 0.006*"university" + 0.006*"think" + 0.005*"way" + 0.005*"government" + 0.005*"going" + 0.004*"dumb"')
(99, '0.009*"think" + 0.007*"article" + 0.007*"people" + 0.006*"get" + 0.006*"would" + 0.006*"good" + 0.006*"like" + 0.005*"something" + 0.005*"really" + 0.005*"university"')
(74, '0.011*"like" + 0.009*"one" + 0.009*"would" + 0.008*"know" + 0.007*"university" + 0.007*"anyone" + 0.006*"windows" + 0.006*"article" + 0.005*"heard" + 0.004*"much"')
(19, '0.016*"would" + 0.014*"oil" + 0.009*"nuclear" + 0.008*"water" + 0.008*"power" + 0.007*"like" + 0.007*"use" + 0.007*"get" + 0.005*"think" + 0.005*"know"')
(29, '0.009*"mac" + 0.008*"motif" + 0.007*"linux" + 0.007*"would" + 0.006*"article" + 0.006*"amiga" + 0.006*"port" + 0.006*"os" + 0.006*"one" + 0.006*"lik

In [27]:
dictionary = Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = LdaMulticore.load('model5.gensim')

In [28]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

ModuleNotFoundError: No module named 'pyLDAvis'