# Classificação de Texto

#### Importando as bibliotecas

In [10]:
#import sys
#import nltk
#import sklearn
import pandas as pd
import numpy as np

#### Carregar o dataset

5572 mensagens SMS rotuladas como spam ou ham (não spam). O dataset pode ser acessado no seguinte [link](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)

In [20]:
import pandas as pd
import numpy as np

# Load the SMS spam dataset from the CSV file, decoding it as latin-1
df = pd.read_csv('../data/spam.csv',encoding='latin-1')

In [22]:
# Check the dimensions (rows, columns) of the loaded dataset
df.shape

(5572, 5)

In [23]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [28]:
# Give the raw columns descriptive names and drop the extra 'Unnamed: N' columns
# that come with the CSV (they are not used anywhere in this notebook).
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone does not raise.
df = (
    df.rename(columns={'v1': 'label', 'v2': 'message'})
      .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
# Count how many messages fall into each class (ham vs. spam)
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [31]:
# Encode the text label as a binary integer target: ham -> 0, spam -> 1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Preview the frame to confirm the new column matches the text label
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [32]:
# Features are the raw message text; the target is the numeric label
x, y = df['message'], df['label_num']

# Print both shapes, one per line, to confirm they have the same length
print(x.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [34]:
# Split features and target into training and test partitions.
# No test_size is passed, so the library default is used; random_state=1 fixes
# the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# Report how many messages ended up in each partition, one shape per line
print(x_train.shape, x_test.shape, sep='\n')

(4179,)
(1393,)


#### Vetorizando os dados

In [35]:
# Import CountVectorizer, used here to turn the text messages into token counts
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

# Fit the vectorizer on the training messages and build the training document-term matrix
x_train_dtm = vect.fit_transform(x_train)
x_train_dtm

<4179x7496 sparse matrix of type '<class 'numpy.int64'>'
	with 55614 stored elements in Compressed Sparse Row format>

In [36]:
# Transform the test messages with the already-fitted vectorizer (transform only, no refit),
# so the test matrix has the same columns as the training matrix
x_test_dtm = vect.transform(x_test)
x_test_dtm

<1393x7496 sparse matrix of type '<class 'numpy.int64'>'
	with 17010 stored elements in Compressed Sparse Row format>

### Classificador Naive Bayes

In [37]:
# Import the multinomial Naive Bayes classifier and create an instance with its default settings.
# (A stray confusion-matrix call that had been pasted onto the instantiation line was removed:
# it made the cell a syntax error; the confusion matrix is computed further down, after prediction.)
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [38]:
# Train the model on the training document-term matrix; %time reports how long the fit takes
%time nb.fit(x_train_dtm,y_train)

CPU times: user 6.52 ms, sys: 8.14 ms, total: 14.7 ms
Wall time: 754 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
# Predict the class of each message in the test document-term matrix
y_pred_class = nb.predict(x_test_dtm)

In [40]:
# Compute the accuracy of the predicted classes against the true test labels
from sklearn import metrics
metrics.accuracy_score(y_test,y_pred_class)

0.9856424982053122

In [43]:
# Show the confusion matrix: rows are the true labels and columns the predicted labels,
# both ordered ham (0) then spam (1)
metrics.confusion_matrix(y_test,y_pred_class)

array([[1205,    8],
       [  12,  168]])

<strong>1205</strong> - não eram spam e não foram classificados como spam<br />
<strong>8</strong>    - não eram spam, mas foram classificados como spam<br />
<strong>12</strong>   - eram spam, mas não foram classificados como spam<br />
<strong>168</strong>  - eram spam e foram classificados como spam

In [45]:
# Show the false positives: ham messages (true label 0) that were predicted as spam (1),
# which is exactly where y_test < y_pred_class
x_test[y_test<y_pred_class]

325                      No calls..messages..missed calls
4598              Have you laid your airtel line to rest?
1289    Hey...Great deal...Farm tour 9am to 5pm $95/pa...
45                       No calls..messages..missed calls
573                                Waiting for your call.
3373                              Also andros ice etc etc
1081                    Can u get pic msgs to your phone?
494                      Are you free now?can i call now?
Name: message, dtype: object

In [46]:
# Show the false negatives: spam messages (true label 1) that were predicted as ham (0),
# which is exactly where y_test > y_pred_class
x_test[y_test > y_pred_class]

4674    Hi babe its Chloe, how r u? I was smashed on s...
3528    Xmas & New Years Eve tickets are now on sale f...
1499    SMS. ac JSco: Energy is high, but u may not kn...
3417    LIFE has never been this much fun and great un...
2773    How come it takes so little time for a child w...
5       FreeMsg Hey there darling it's been 3 week's n...
1457    CLAIRE here am havin borin time & am now alone...
2429    Guess who am I?This is the first time I create...
4067    TBS/PERSOLVO. been chasing us since Sept forå£...
3358    Sorry I missed your call let's talk when you h...
2821    ROMCAPspam Everyone around should be respondin...
2247    Back 2 work 2morro half term over! Can U C me ...
Name: message, dtype: object

#### Fim!