# Processing a large email inbox with NLTK and ScikitLearn

### 1. Decode an email message using base64

In [67]:
import base64

In [68]:
body = 'PGh0bWwgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGh0bWwiIHhtbDpsYW5nPSJwdC1i \
ciIKICAgICAgbGFuZz0icHQtYnIiPgoKICAKICA8aGVhZD4KICAgIDxtZXRhIGNvbnRlbnQ9InRl \
eHQvaHRtbDtjaGFyc2V0PXV0Zi04IiBodHRwLWVxdWl2PSJDb250ZW50LVR5cGUiPgogIDwvaGVh \
ZD4KCiAgPGJvZHk+CiAgICA8ZGl2IHN0eWxlPSJtYXJnaW46IDEwcHg7Ij4gICAgICAgICAgICAK \
ICAgICAgPGxhYmVsIHN0eWxlPSJmb250LXdlaWdodDogYm9sZDsiPk5vbWUgZG8gcmVtZXRlbnRl \
OjwvbGFiZWw+CiAgICAgIDxzcGFuPlZJUkdJTklBIENPTkRPUkkgQ0hPUVVFPC9zcGFuPiAgICAK \
ICAgIDwvZGl2PgogICAgPGRpdiBzdHlsZT0ibWFyZ2luOiAxMHB4OyI+ICAgICAgICAgICAgCiAg \
ICAgIDxsYWJlbCBzdHlsZT0iZm9udC13ZWlnaHQ6IGJvbGQ7Ij5FLW1haWwgZG8gcmVtZXRlbnRl \
OjwvbGFiZWw+CiAgICAgIDxzcGFuPm1pX2FuZ2VsX2VzcGVqb0Bob3RtYWlsLmNvbTwvc3Bhbj4g \
ICAgCiAgICA8L2Rpdj4KICAgIDxkaXYgc3R5bGU9Im1hcmdpbjogMTBweDsiPiAgICAgICAgICAg \
IAogICAgICA8bGFiZWwgc3R5bGU9ImZvbnQtd2VpZ2h0OiBib2xkOyI+QXNzdW50bzo8L2xhYmVs \
PgogICAgICA8c3Bhbj5FdSBxdWVybyBwYXJ0aWNpcGFyIGRlIE8gRXhhbWUgTmFjaW9uYWwgZGUg \
UmV2YWxpZGHDp8OjbyBkZSBEaXBsb21hcyBNw6lkaWNvcyAyMDE0PC9zcGFuPiAgICAKICAgIDwv \
ZGl2PgogICAgPGRpdiBzdHlsZT0ibWFyZ2luOiAxMHB4OyI+ICAgICAgICAgICAgCiAgICAgIDxs \
YWJlbCBzdHlsZT0iZm9udC13ZWlnaHQ6IGJvbGQ7Ij5UaXBvOjwvbGFiZWw+CiAgICAgIDxzcGFu \
Pk91dHJvPC9zcGFuPgogICAgPC9kaXY+ICAgICAKICAgIDxkaXYgc3R5bGU9Im1hcmdpbjogMTBw \
eDsiPiAgICAgICAgICAgIAogICAgICA8bGFiZWwgc3R5bGU9ImZvbnQtd2VpZ2h0OiBib2xkOyI+ \
ZGF0YSBkZSBlbnZpbzo8L2xhYmVsPgogICAgICA8c3Bhbj4yNC8wNi8yMDE0IDAxOjE5OjMzPC9z \
cGFuPiAgICAKICAgIDwvZGl2PgogICAgPGRpdiBzdHlsZT0ibWFyZ2luOiAxMHB4OyI+ICAgICAg \
ICAgICAgCiAgICAgIDxsYWJlbCBzdHlsZT0iZm9udC13ZWlnaHQ6IGJvbGQ7Ij5SZXNwb25kaWRh \
OjwvbGFiZWw+CiAgICAgIDxzcGFuPlNpbTwvc3Bhbj4KICAgIDwvZGl2PgogICAgPGRpdiBzdHls \
ZT0ibWFyZ2luOiAxMHB4OyI+ICAgICAgICAgICAgCiAgICAgIDxsYWJlbCBzdHlsZT0iZm9udC13 \
ZWlnaHQ6IGJvbGQ7Ij5NZW5zYWdlbSBFeHRlcm5hOjwvbGFiZWw+CiAgICAgIDxzcGFuPlNpbTwv \
c3Bhbj4KICAgIDwvZGl2PgogICAgPGRpdiBzdHlsZT0ibWFyZ2luOiAxMHB4OyI+ICAgICAgICAg \
ICAgCiAgICAgIDxsYWJlbCBzdHlsZT0iZm9udC13ZWlnaHQ6IGJvbGQ7Ij5NZW5zYWdlbTo8L2xh \
YmVsPjxiciAvPjxiciAvPgogICAgICA8ZGl2PkJvYSBub2l0ZSwgcG9yIGZhdm9yIGV1IHF1ZXJv \
IHBhcnRpY2lwYXIgZGUgTyBFeGFtZSBOYWNpb25hbCBkZSBSZXZhbGlkYcOnw6NvIGRlIERpcGxv \
bWFzIE3DqWRpY29zIEV4cGVkaWRvcyBwb3IgSW5zdGl0dWnDp8O1ZXMgZGUgRWR1Y2HDp8OjbyBT \
dXBlcmlvciBFc3RyYW5nZWlyYXMg4oCTIFJFVkFMSURBIDIwMTQsIG1haXMgbsOjbyBwb3NzbyBm \
YXplciBtaW5oYSBpbnNjcmnDp8OjbyBwb3IgcXVlIG5vIGFubyBwYXNzYWRvIHF1aXMgZmF6ZXIg \
bWluaGEgaW5zY3Jpw6fDo28gcGVybyBmb2kgdGFyZGUgZGVtYWlzIGUgbyBjYWRhc3RybyBxdWUg \
Zml6IGZvaSBjb20gdW0gZS1tYWlsIHF1ZSBwZXJkaSAocGVyZGkgYSBjb250YSllIG7Do28gbGVt \
YnJvIGEgc2VuaGEsIFBPUkZBVk9SIENPTU8gUE9TU08gRkFaRVIgUEFSQS1NRSBDQURBU1RSQVIg \
Tk9WQU1FTlRFIENPTSBPVVRSTyBFLU1BSUwsIFBPUiBGQVZPUiBNRSBBSlVERSwgRVUgUVVFUk8g \
TVVJVE8gTUVTTU8gUEFSVElDSVBBUiBEQSBQUk9WQS4gZXUgcGXDp28gc3VhIGFqdWRhIGUgZ3Jh \
dGEgYW50ZWNpcGFkYW1lbnRlIHBlbGEgc3VhIGFqdWRhIFBPUkZBVk9SLjxici8+T3UgcG9yIGZh \
dm9yIG1lIGRpZ2EgY29tbyBldSBwb3NzbyBmYWxhciBjb20gdm9jw6pzLjxici8+TWV1IGUtbWFp \
bCBhdHVhbCBtaV9hbmdlbF9lc3Blam9AaG90bWFpbC5jb208L2Rpdj4gICAgCiAgICA8L2Rpdj4K \
ICA8L2JvZHk+CjwvaHRtbD4K'
email_body = base64.b64decode(body)
print email_body

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pt-br"
      lang="pt-br">

  
  <head>
    <meta content="text/html;charset=utf-8" http-equiv="Content-Type">
  </head>

  <body>
    <div style="margin: 10px;">            
      <label style="font-weight: bold;">Nome do remetente:</label>
      <span>VIRGINIA CONDORI CHOQUE</span>    
    </div>
    <div style="margin: 10px;">            
      <label style="font-weight: bold;">E-mail do remetente:</label>
      <span>mi_angel_espejo@hotmail.com</span>    
    </div>
    <div style="margin: 10px;">            
      <label style="font-weight: bold;">Assunto:</label>
      <span>Eu quero participar de O Exame Nacional de Revalidação de Diplomas Médicos 2014</span>    
    </div>
    <div style="margin: 10px;">            
      <label style="font-weight: bold;">Tipo:</label>
      <span>Outro</span>
    </div>     
    <div style="margin: 10px;">            
      <label style="font-weight: bold;">data de envio:</label>
      <span

### 2. Parse and extract relevant features from email body

In [69]:
from pyquery import PyQuery
import unicodedata

In [70]:
def unicode_normalize(text):
    return unicodedata.normalize('NFKD', unicode(text)).encode('ascii','ignore')

In [71]:
def parse_email(email_body):
    email_parser = PyQuery(email_body)
    email_fields = [unicode_normalize(span.text) for span in email_parser('div>span')]
    
    feature_names = ['remetente', 'email', 'assunto', 'tipo', 'data_envio', 'respondida?', 'mensagem_externa?']
    features = dict(zip(feature_names, email_fields))
    
    features['mensagem'] = unicode_normalize(email_parser('div>div').text()).lower()
    return features

In [72]:
email_features = parse_email(email_body)
print email_features

{'tipo': 'Outro', 'assunto': 'Eu quero participar de O Exame Nacional de Revalidacao de Diplomas Medicos 2014', 'respondida?': 'Sim', 'data_envio': '24/06/2014 01:19:33', 'mensagem': 'boa noite, por favor eu quero participar de o exame nacional de revalidacao de diplomas medicos expedidos por instituicoes de educacao superior estrangeiras  revalida 2014, mais nao posso fazer minha inscricao por que no ano passado quis fazer minha inscricao pero foi tarde demais e o cadastro que fiz foi com um e-mail que perdi (perdi a conta)e nao lembro a senha, porfavor como posso fazer para-me cadastrar novamente com outro e-mail, por favor me ajude, eu quero muito mesmo participar da prova. eu peco sua ajuda e grata antecipadamente pela sua ajuda porfavor. ou por favor me diga como eu posso falar com voces. meu e-mail atual mi_angel_espejo@hotmail.com', 'mensagem_externa?': 'Sim', 'email': 'mi_angel_espejo@hotmail.com', 'remetente': 'VIRGINIA CONDORI CHOQUE'}


### 3. Open messages file

In [74]:
messages = []
input_file = open('emails/INBOX.Arquivo Remoto.mbox')
line = input_file.readline()
while line:
    if 'X-Proofpoint-Virus-Version' in line: 
        content = ''
        input_file.readline()
        line = input_file.readline()
        line = ''
        while line != '\n':
            line = input_file.readline()
            content += line
        messages.append(content)
    line = input_file.readline()

In [29]:
# IN PROGRESS, DO NOT RUN
import mailbox
import email
import json

MBOX = 'emails/INBOX.Arquivo Remoto.mbox'
mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

def objectify_message(msg):
    o_msg = dict([ (k, v) for (k,v) in msg.items() ])
    
    part = [p for p in msg.walk()][0]
    o_msg['contentType'] = part.get_content_type()
    o_msg['content'] = part.get_payload()
    
    return o_msg

messages = []
while 1:
    msg = mbox.next()
    if msg is None: break
    messages.append(objectify_message(msg))
    
print json.dumps(messages, indent=1)

[]


In [75]:
len(messages)

33941

In [76]:
messages[3]

'PGh0bWwgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGh0bWwiIHhtbDpsYW5nPSJwdC1i\nciIKICAgICAgbGFuZz0icHQtYnIiPgoKICAKICA8aGVhZD4KICAgIDxtZXRhIGNvbnRlbnQ9InRl\neHQvaHRtbDtjaGFyc2V0PXV0Zi04IiBodHRwLWVxdWl2PSJDb250ZW50LVR5cGUiPgogIDwvaGVh\nZD4KCiAgPGJvZHk+CiAgICA8ZGl2IHN0eWxlPSJtYXJnaW46IDEwcHg7Ij4gICAgICAgICAgICAK\nICAgICAgPGxhYmVsIHN0eWxlPSJmb250LXdlaWdodDogYm9sZDsiPk5vbWUgZG8gcmVtZXRlbnRl\nOjwvbGFiZWw+CiAgICAgIDxzcGFuPk7DumJpYSBNaWxhbmV6aTwvc3Bhbj4gICAgCiAgICA8L2Rp\ndj4KICAgIDxkaXYgc3R5bGU9Im1hcmdpbjogMTBweDsiPiAgICAgICAgICAgIAogICAgICA8bGFi\nZWwgc3R5bGU9ImZvbnQtd2VpZ2h0OiBib2xkOyI+RS1tYWlsIGRvIHJlbWV0ZW50ZTo8L2xhYmVs\nPgogICAgICA8c3Bhbj5udWJpYV9taWxhbmV6aUBob3RtYWlsLmNvbTwvc3Bhbj4gICAgCiAgICA8\nL2Rpdj4KICAgIDxkaXYgc3R5bGU9Im1hcmdpbjogMTBweDsiPiAgICAgICAgICAgIAogICAgICA8\nbGFiZWwgc3R5bGU9ImZvbnQtd2VpZ2h0OiBib2xkOyI+QXNzdW50bzo8L2xhYmVsPgogICAgICA8\nc3Bhbj5SZWRlIFDDumJsaWNhPC9zcGFuPiAgICAKICAgIDwvZGl2PgogICAgPGRpdiBzdHlsZT0i\nbWFyZ2luOiAxMHB4OyI+ICAgICAgICAgICAgCiAgICAgIDxsYWJlbCBzdHlsZT0

In [77]:
emails = []
for message in messages:
    try:
        decoded_message = base64.b64decode(message)
        emails.append(parse_email(decoded_message))
    except:
        pass


In [78]:
len(emails)

33299

In [79]:
emails[3]

{'assunto': 'Rede Publica',
 'data_envio': '30/07/2014 22:57:47',
 'email': 'nubia_milanezi@hotmail.com',
 'mensagem': 'ola, boa noite! escrevo como estudante, fiz todo o ensino na rede publica e com muito esforco tirei o melhor proveito. ainda existem excelente professores sim, aqueles que tem vontade de ensinar e sao por esses professores que me empenhei para ter uma boa pontuacao no enem e conseguir uma bolsa parcial para estudar em uma otima faculdade que estou hoje (esamc). atualmente estou finalizando meu curso em publicidade e propaganda e almejo um intercambio. trabalho desde os meus 13 anos,(como minha mae dizia nao era pelo valor mas sim pela responsabilidade e valorizacao no trabalho. e assim eu cresci com a minha mae e minha irma me ajudando. hoje venha em nome de todos os estudantes pedir uma ajuda, tenho esse sonho de estudar ingles em outro pais e me especializar na minha area, mas mesmo com meu salario inteiro e a ajuda da minha mae isso nao e possivel. com toda a confi

### 4.  TF-IDF and KMeans Clustering

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

import nltk
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import *

from collections import Counter

#### Stopwords

In [81]:
sorted(stopwords.words('portuguese'))

[u'a',
 u'ao',
 u'aos',
 u'aquela',
 u'aquelas',
 u'aquele',
 u'aqueles',
 u'aquilo',
 u'as',
 u'at\xe9',
 u'com',
 u'como',
 u'da',
 u'das',
 u'de',
 u'dela',
 u'delas',
 u'dele',
 u'deles',
 u'depois',
 u'do',
 u'dos',
 u'e',
 u'ela',
 u'elas',
 u'ele',
 u'eles',
 u'em',
 u'entre',
 u'era',
 u'eram',
 u'essa',
 u'essas',
 u'esse',
 u'esses',
 u'esta',
 u'estamos',
 u'estas',
 u'estava',
 u'estavam',
 u'este',
 u'esteja',
 u'estejam',
 u'estejamos',
 u'estes',
 u'esteve',
 u'estive',
 u'estivemos',
 u'estiver',
 u'estivera',
 u'estiveram',
 u'estiverem',
 u'estivermos',
 u'estivesse',
 u'estivessem',
 u'estiv\xe9ramos',
 u'estiv\xe9ssemos',
 u'estou',
 u'est\xe1',
 u'est\xe1vamos',
 u'est\xe3o',
 u'eu',
 u'foi',
 u'fomos',
 u'for',
 u'fora',
 u'foram',
 u'forem',
 u'formos',
 u'fosse',
 u'fossem',
 u'fui',
 u'f\xf4ramos',
 u'f\xf4ssemos',
 u'haja',
 u'hajam',
 u'hajamos',
 u'havemos',
 u'hei',
 u'houve',
 u'houvemos',
 u'houver',
 u'houvera',
 u'houveram',
 u'houverei',
 u'houverem',


In [82]:
stopwords_extended = ['nao', 'ja', 'pois', 'pra', 'sobre', 'deste', 'ta', 'bom', 'obrigado', 'obrigada']
stopwords_extended.extend(['gostaria', 'saber', 'fazer', 'boa', 'ate'])
stopwords_extended.extend(stopwords.words('portuguese'))

#### Tokenizing

In [83]:
def get_tokens(text):
    lowers = text.lower()
    no_punctuation = lowers.translate(None, string.punctuation)
    tokens = nltk.word_tokenize(no_punctuation)
    return tokens

tokens = get_tokens(emails[210]['mensagem'])
filtered = [w for w in tokens if not w in stopwords_extended]
count = Counter(filtered)
print count.most_common(100)

[('consigo', 2), ('suspender', 1), ('46669003811', 1), ('erro', 1), ('senha', 1), ('faculdade', 1), ('tirei', 1), ('logar', 1), ('vou', 1), ('exibe', 1), ('tranquei', 1), ('contrato', 1), ('cadastrado', 1), ('tentando', 1), ('suspencao', 1), ('novo', 1), ('posso', 1), ('aqui', 1), ('nova', 1), ('permanesse', 1), ('20140000416031', 1), ('todas', 1), ('dias', 1), ('pedir', 1), ('protocolo', 1), ('segui', 1), ('menssagem', 1), ('cadastrar', 1), ('conta', 1), ('ha', 1), ('ultimo', 1), ('atendimento', 1), ('semestre', 1), ('printscreen', 1), ('570501892', 1), ('anexar', 1), ('resolveu', 1), ('temporariamente', 1), ('fies', 1), ('0800', 1), ('orientacoes', 1), ('cpf', 1), ('email', 1), ('alguns', 1), ('orientado', 1)]


#### Stemming

In [84]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

stemmer = SnowballStemmer('portuguese')
stemmed = stem_tokens(filtered[:1000], stemmer)
count = Counter(stemmed)
print count.most_common(20)

[(u'cadastr', 2), (u'consig', 2), (u'nov', 2), (u'suspend', 1), (u'erro', 1), (u'tranqu', 1), (u'segu', 1), (u'atend', 1), (u'vou', 1), (u'contrat', 1), (u'cont', 1), (u'orient', 1), (u'46669003811', 1), (u'suspenca', 1), (u'exib', 1), (u'log', 1), (u'ultim', 1), (u'tir', 1), (u'ped', 1), (u'dias', 1)]


In [85]:
texto = ' '.join([d['mensagem'] for d in emails])
tokens = get_tokens(texto)
filtered = [w for w in tokens if not w in stopwords_extended]
count = Counter(filtered)
print count.most_common(20)

[('curso', 7504), ('dia', 6629), ('preciso', 4872), ('faculdade', 4785), ('inscricao', 4342), ('mec', 4272), ('fies', 3772), ('consigo', 3732), ('aguardo', 3721), ('faco', 3639), ('desde', 3620), ('site', 3549), ('tarde', 3477), ('favor', 3462), ('posso', 3081), ('ensino', 2885), ('fiz', 2813), ('resposta', 2750), ('escola', 2745), ('ano', 2693)]


#### Pre-processing messages

In [55]:
def preparar_mensagem(text):
    tokens = get_tokens(text)
    filtered = [w for w in tokens if not w in stopwords_extended]
    stemmer = SnowballStemmer('portuguese')
    stemmed = stem_tokens(filtered, stemmer)
    return ' '.join(stemmed) 

In [87]:
random.choice(emails)['mensagem']

'bom dia gostaria de saber como faco para me escrever para prova do encceja 2014'

In [56]:
import random

preparar_mensagem(random.choice(emails)['mensagem'])

u'dia alun unifacs possu credit educativofi motiv pesso afast durant quatr semestr consecut matricul instituica engenh civil ped mud adm sra pass informaco dev proced retorn academ possu relatori medic abaix segu dad alun rodrig brit souz matricul 150121079 cpf 04210136565 ps possivel marc reunia sra maior esclarec grat rodrig brit'

In [57]:
mensagens_limpas = [preparar_mensagem(d['mensagem']) for d in emails]

#### Running TF-IDF

In [58]:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 min_df=2, stop_words=stopwords_extended,
                                 use_idf=True)
X = vectorizer.fit_transform(mensagens_limpas)

#### Clustering

In [59]:
number_of_clusters = 10

km = KMeans(n_clusters=number_of_clusters, init='k-means++', max_iter=100, n_init=1,verbose=False)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=10, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

In [60]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print

Top terms per cluster:
Cluster 0:
 pag
 7458
 suely
 qto
 recolh
 part
 valor
 agor
 dev
 dess

Cluster 1:
 asim
 pess
 consig
 sisu
 escrev
 final
 ver
 urgenc
 ultim
 not

Cluster 2:
 univers
 matricul
 rematricul
 tranc
 curs
 tuiut
 period
 praz
 lucrat
 setim

Cluster 3:
 curs
 mec
 fac
 precis
 escol
 dia
 faculdad
 quer
 poss
 receb

Cluster 4:
 respald
 acerc
 transferenc
 vinh
 realidad
 tid
 veig
 zer
 comprovaca
 respost

Cluster 5:
 empreg
 vag
 domest
 cadastr
 precis
 procur
 fac
 trabalh
 sit
 cei

Cluster 6:
 2000
 enem
 fies
 oi
 duv
 entrar
 medi
 obter
 conclu
 ano

Cluster 7:
 entrar
 instrutor
 simec
 consig
 der
 antes
 soluca
 ness
 mand
 quant

Cluster 8:
 inscrica
 fies
 consig
 cadastr
 senh
 acess
 enem
 consegu
 sit
 cpf

Cluster 9:
 servic
 temp
 empres
 prest
 dia
 gui
 sit
 encontr
 solicit
 public



In [88]:
print order_centroids

[[7045  814 9011 ..., 6659 6658    0]
 [1598 7295 2776 ..., 6658 6657    0]
 [9598 6201 8174 ..., 6649 6648    0]
 ..., 
 [3955 5424 8772 ..., 6660 6659    0]
 [5367 4487 2776 ..., 5533 5535 4999]
 [8697 9190 3839 ..., 5970 5972    0]]


In [89]:
print km.predict(X)

[8 3 3 ..., 3 8 8]


In [61]:
Counter(km.predict(X))

Counter({0: 1,
         1: 2,
         2: 2,
         3: 24696,
         4: 3,
         5: 308,
         6: 2,
         7: 2,
         8: 7336,
         9: 947})

In [62]:
indices = [ind for ind, x in enumerate(km.predict(X)) if x == 4]

In [63]:
[emails[i]['assunto'] for i in indices]

['bolsa da univercidade', 'bolsa da univercidade', 'bolsa da univercidade']

In [64]:
# Analisando resultado
import numpy as np
from sklearn import metrics

labels = km.labels_
metrics.silhouette_score(X, labels)



-0.014773567204662734

#### DBSCAN

In [65]:
from sklearn import cluster

dbscan = cluster.DBSCAN(eps=.2)

In [66]:
X_dense=X.todense()
dbscan.fit(X_dense)

DBSCAN(algorithm='auto', eps=0.2, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=None)

#### Topics extraction with Non-Negative Matrix Factorization

In [113]:
from sklearn.decomposition import NMF

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words=stopwords_extended)
tfidf = vectorizer.fit_transform(mensagens_limpas)

nmf = NMF(n_components=10, random_state=1).fit(tfidf)
feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-20 - 1:-1]]))
    print()

Topic #0:
curs mec reconhec faculdad instituica tecnic graduaca pos distanc univers diplom sao oferec quer superior matricul ola credenc tard pronatec
()
Topic #1:
consig acess sit cont pagin encontr tent ajud ter result entrar dad consult onde aparec link informaco mud sistem ver
()
Topic #2:
inscrica enem pagament efetu fiz 2014 dia confirm bolet aparec realiz cpf consegu tent const pagu sit numer prov erro
()
Topic #3:
fies precis adit faculdad banc semestr consegu problem contrat financ ajud praz favor resolv cancel sistem urgent nom demand tent
()
Topic #4:
dia receb pod aguard dev desd poss ser trabalh inform respost agradec pag alun solicit fic mes proced tod algum
()
Topic #5:
fac numer quer via precis consegu obter benefici pis tir segund tard inss cpf carta perd consult empres nit noit
()
Topic #6:
senh email acess envi nov recuper esquec precis favor consegu login receb cpf solicit result lembr inval enem perd mand
()
Topic #7:
ensin medi certific conclusa conclu enem ano no

#### Other experiments