In [63]:
import numpy as np
import pandas as pd

In [64]:
df = pd.read_csv('messages.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2893 entries, 0 to 2892
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  2831 non-null   object
 1   message  2893 non-null   object
 2   label    2893 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 67.9+ KB


In [65]:
df[df['label'] == 0].head().merge(df[df['label'] == 1].head(), how='outer')

Unnamed: 0,subject,message,label
0,cable decsrambler now only $ 6 . 99 !,this is really cool ! premium channels and pay...,1
1,free,this is a multi-part message in mime format . ...,1
2,free stealth 3 . 0 bulk email software . . .,"just released . . . 30 , 000 , 000 email addre...",1
3,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
4,need more money ?,"hi , would you like to earn an extra $ 700 a w...",1
5,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
6,request book information,earlier this morning i was on the phone with a...,0
7,risk,a colleague and i are researching the differin...,0
8,the internet success toolbox,note : we do not wish to send e-mail to anyone...,1
9,,"lang classification grimes , joseph e . and ba...",0


In [66]:
df.notnull().sum()

subject    2831
message    2893
label      2893
dtype: int64

In [67]:
# %pip install unicode

In [68]:
import re

def remover_caracteres_especiais(texto, remover_acentos=False) -> str:
    if remover_acentos:
        from unidecode import unidecode
        texto = unidecode(texto)
    
    texto_limpo = re.sub(r'[^\w\s]', '', texto)
    return texto_limpo

import re

def remover_numeros(texto):
    texto_limpo = re.sub(r'\d', '', texto)
    return texto_limpo

In [69]:
def criar_vocabulario(comentarios):
    vocabulario = set()
    for comentario in comentarios:
        tokens = remover_numeros(remover_caracteres_especiais(comentario)).lower().split() # importante para case_sensitive
        vocabulario.update(tokens)
    return sorted(list(vocabulario))

def criar_vetores(comentarios, vocabulario):
    vetores = []
    for comentario in comentarios:
        tokens = remover_numeros(remover_caracteres_especiais(comentario)).lower().split()
        vetor = [1 if palavra in tokens else 0 for palavra in vocabulario]
        vetores.append(vetor)
    return np.array(vetores)

In [70]:
comentarios = df['message'].dropna().tolist()

vocabulario = criar_vocabulario(comentarios)

vetores_numericos = criar_vetores(comentarios, vocabulario)

In [71]:
df['Vocabulario'] = str(vocabulario)
df['Vetor'] = [list(vetor) for vetor in vetores_numericos]

In [72]:
df[['message', 'Vocabulario', 'Vetor']].head()

Unnamed: 0,message,Vocabulario,Vetor
0,content - length : 3386 apple-iss research cen...,"['_', 'a', 'aa', 'aaa', 'aaai', 'aaal', 'aaarg...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"lang classification grimes , joseph e . and ba...","['_', 'a', 'aa', 'aaa', 'aaai', 'aaal', 'aaarg...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,i am posting this inquiry for sergei atamas ( ...,"['_', 'a', 'aa', 'aaa', 'aaai', 'aaal', 'aaarg...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,a colleague and i are researching the differin...,"['_', 'a', 'aa', 'aaa', 'aaai', 'aaal', 'aaarg...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,earlier this morning i was on the phone with a...,"['_', 'a', 'aa', 'aaa', 'aaai', 'aaal', 'aaarg...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [76]:
print(df['Vocabulario'].size)
print(df['Vocabulario'].size == df['Vetor'].size)

2893
True


In [None]:
df['mean'] = np.mean(df['Vetor'], axis=0)

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X = df.drop(['Vocabulario', 'label', 'subject', 'message'], axis=1)[100:300]
y = df['label'][500:700]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [126]:
X_train

Unnamed: 0,Vetor
258,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
291,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
150,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
100,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
194,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
167,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
292,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
217,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
147,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [127]:
gnb = GaussianNB()
fit_model = gnb.fit(X_train, y_train)

ValueError: setting an array element with a sequence.