###Prof. Fernando Amaral https://www.eia.ai/

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from google.colab import files 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [6]:
mensagens = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)

In [7]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)

X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)



In [12]:
print(X_train)

[[108, 40, 9, 28, 1, 150, 6, 30, 718, 398], [70, 20, 209, 535, 98, 10, 5], [141, 1, 560, 70, 26, 51], [], [29, 3, 112, 8, 5, 189, 47, 44, 561, 11, 385], [1, 399, 843, 113, 50, 8, 285], [295, 341, 49, 599, 501, 18, 6, 719, 341, 74, 193, 21, 2, 562, 12, 193, 501, 26, 844, 21, 2, 562, 12, 193, 216, 720, 238], [52, 22, 1, 60, 8, 400, 29, 1, 61, 217, 1, 35, 61, 920, 136], [33, 668, 600, 55, 21, 102, 41, 386, 76, 249, 1, 121, 35, 46, 64], [52], [25, 12, 25, 218], [124, 103, 169, 3, 55, 315, 59, 42], [41, 30, 779, 36, 2, 50, 4, 416], [480, 54, 91, 721, 5, 64, 14], [29, 1, 65, 1, 780], [15, 273, 373, 43, 194, 56, 40], [86, 71, 16, 99, 52], [86, 8, 286, 71, 16, 3, 99], [601, 56, 70, 219, 34, 4, 316, 14, 64, 8, 19, 458, 845, 26, 92, 13, 56, 4, 57, 75, 2, 8, 4, 257, 401, 921, 22], [45, 239, 25, 2, 563, 43, 210, 64, 205, 8, 5, 173, 361, 781, 9, 325, 14, 3, 564, 26, 10, 19, 3, 402, 2, 563], [250, 77, 8, 387, 722], [3, 220, 782, 2, 536, 10, 42, 44], [17, 3, 55, 502, 503, 29, 25, 1, 35, 32, 15, 22], 

In [13]:
X_train = pad_sequences(X_train, padding="post", maxlen=500)
X_test = pad_sequences(X_test, padding="post", maxlen=500)

In [14]:
print(X_train)

[[108  40   9 ...   0   0   0]
 [ 70  20 209 ...   0   0   0]
 [141   1 560 ...   0   0   0]
 ...
 [103   1 331 ...   0   0   0]
 [110  30  40 ...   0   0   0]
 [776  74  16 ...   0   0   0]]


In [15]:
print(len(token.word_index))

7438


In [16]:
modelo = Sequential()
modelo.add(Embedding(input_dim=len(token.word_index), output_dim=50,input_length=500))
modelo.add(Flatten())

modelo.add(Dense(units=10,activation="relu"))
modelo.add(Dropout(0.1))
modelo.add(Dense(units=1,activation="sigmoid"))

In [17]:
modelo.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])
modelo.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           371900    
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 10)                250010    
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 621,921
Trainable params: 621,921
Non-trainable params: 0
_________________________________________________________________


In [18]:
modelo.fit(X_train, y_train,epochs=20,batch_size=10, verbose=True, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff7253c2490>

In [19]:
loss, accuracy = modelo.evaluate(X_test,y_test)
print("Loss: ", loss)
print("Acurácia: ", accuracy)

Loss:  0.009569073095917702
Acurácia:  0.9886363744735718


In [20]:
nova_previsao = modelo.predict(X_test)
print(nova_previsao)

[[3.2913049e-15]
 [3.4213838e-06]
 [2.0274712e-05]
 ...
 [1.0000000e+00]
 [5.1779631e-10]
 [1.0000000e+00]]


In [21]:
prev = (nova_previsao > 0.5)
print(prev)

[[False]
 [False]
 [False]
 ...
 [ True]
 [False]
 [ True]]


In [22]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1445    7]
 [  12  208]]
