In [1]:
%matplotlib inline
import re

from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def preprocess(sent):
    def convert(word):
        # Verifica se é um número.
        try:
            _ = float(word)
            return '<num>'
        except:
            pass
        
        # Verifica se é uma palavra.
        if word.isalpha():
            return word.lower()
        
        # Caso contrário, é pontuação ou estranho.
        return '<weird>'
    
    return [convert(word) for word in sent]

In [2]:
# from nltk.corpus import reuters
# sents = [preprocess(sent) for sent in reuters.sents()]

from nltk.corpus import brown
sents = [preprocess(sent) for sent in brown.sents()]

# from sklearn.datasets import fetch_20newsgroups
# sents = [preprocess(item.strip().split()) for item in fetch_20newsgroups()['data']]

In [3]:
from nltk.probability import FreqDist

words = chain.from_iterable(sents)
fdist = FreqDist(words)

ser = pd.Series(fdist, name='fdist').sort_values(ascending=False)
ser

<weird>        172869
the             69971
of              36412
and             28853
to              26158
                ...  
oviform             1
saigon              1
superlunary         1
sublunary           1
stupefying          1
Name: fdist, Length: 40234, dtype: int64

In [4]:
num_stopwords = 20
stopwords = list(ser.iloc[:num_stopwords].index)
stopwords

['<weird>',
 'the',
 'of',
 'and',
 'to',
 'a',
 'in',
 'that',
 'is',
 'was',
 'he',
 'for',
 'it',
 'with',
 'as',
 'his',
 'on',
 '<num>',
 'be',
 'at']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

model = TfidfVectorizer(min_df=5, max_df=0.9, max_features=5000, stop_words=stopwords, sublinear_tf=False, analyzer=lambda x: x)

vecs = model.fit_transform(sents)

words = model.get_feature_names()

In [6]:
words[:30]

['<num>',
 'a',
 'abandoned',
 'abel',
 'ability',
 'able',
 'aboard',
 'about',
 'above',
 'abroad',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absorbed',
 'abstract',
 'academic',
 'academy',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'access',
 'accident',
 'accompanied',
 'accomplish',
 'accomplished',
 'accordance',
 'according']

In [7]:
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=10, n_init=10, random_state=42)

In [8]:
%%time
result = kmeans.fit_transform(vecs)

Wall time: 393 ms


In [9]:
result.shape

(57340, 10)

In [10]:
import pandas as pd
df = pd.DataFrame(result)

In [11]:
num_sents = 10
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(' '
              .join(sents[k])
              .replace('<weird>', '')
              .replace('  ', ' ')
             )
        print('-' * 80)

################################################################################

Cluster 0: [ 7258 23434 28352 28337 28325 28290 28283  7689 54144   654]
the def a tambourine 
--------------------------------------------------------------------------------
a knowledgeable celebrity
--------------------------------------------------------------------------------
a 
--------------------------------------------------------------------------------
 a 
--------------------------------------------------------------------------------
 a 
--------------------------------------------------------------------------------
 a 
--------------------------------------------------------------------------------
 a 
--------------------------------------------------------------------------------
 a horror 
--------------------------------------------------------------------------------
quint smothered a yawn 
--------------------------------------------------------------------------------
 a missionary 

In [12]:
pd.Series(kmeans.labels_).value_counts()

3    16206
2    10858
0     6665
1     4052
8     3878
9     3861
5     3786
7     3679
6     2968
4     1387
dtype: int64

In [13]:
import gensim

In [14]:
with open('sentences.txt', 'w', encoding='utf8') as file:
    for sentence in sents:
        file.write(f'{" ".join(sentence)}\n')

In [15]:
%%time
model_cbow = gensim.models.Word2Vec(
    corpus_file='sentences.txt',
    window=5,
    size=200,
    seed=42,
    iter=100,
    workers=12,
)

Wall time: 1min 12s


In [None]:
def cbow(model, sent):
    vec = np.zeros(model.wv.vector_size)
    for word in sent:
        if word in model:
            vec += model.wv.get_vector(word)
            
    norm = np.linalg.norm(vec)
    if norm > np.finfo(float).eps:
        vec /= norm
    return vec

In [None]:
vecs_cbow = [cbow(model_cbow, sent) for sent in sents]

In [None]:
kmeans_cbow = MiniBatchKMeans(n_clusters=20, random_state=42)

In [None]:
%%time
result = kmeans_cbow.fit_transform(vecs_cbow)

In [None]:
result.shape

In [None]:
import pandas as pd
df = pd.DataFrame(result)

In [None]:
num_sents = 10
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans_cbow.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(' '
              .join(sents[k])
              .replace('<weird>', '')
              .replace('  ', ' ')
              .replace('<num>', '#')
             )
        print('-' * 80)

In [None]:
pd.Series(kmeans_cbow.labels_).value_counts()