In [1]:
#importando bibliotecas
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Upload do dataset
from google.colab import files
upload = files.upload()

In [None]:
dataset = pd.read_csv("spam.csv")
dataset.head()

In [None]:
#Obtendo os valores de categoria
dataset["Category"].unique()

In [None]:
#formato dos dados (Linhas e colunas)
dataset.shape

In [None]:
#distribuição de documentos por classe
dataset["Category"].value_counts()

In [8]:
#Separando os valores em documentos e classes
documentos = dataset["Message"]
classes = dataset["Category"]

In [9]:
#Extraindo as features dos textos - Matriz TF-IDF
vetorizador = TfidfVectorizer()
features = vetorizador.fit_transform(documentos)

In [10]:
#Formato da matriz de features
features.shape

(5572, 8709)

In [11]:
#Rótulos das colunas
print(vetorizador.get_feature_names_out()[1:10])

['000' '000pes' '008704050406' '0089' '0121' '01223585236' '01223585334'
 '0125698789' '02']


In [12]:
#dividindo os dados em treino e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(features, classes, test_size=0.3)

In [17]:
#Treinando o modelo
random_forest = RandomForestClassifier()
random_forest.fit(x_treino, y_treino)

In [18]:
#Realizando a predição das classes do conjunto de teste
previsao = random_forest.predict(x_teste)

In [19]:
#Visualizando as previsões
print(previsao)

['not-spam' 'not-spam' 'not-spam' ... 'not-spam' 'not-spam' 'not-spam']


In [20]:
#matriz de confusão
print(confusion_matrix(y_teste, previsao))

[[1464    0]
 [  33  175]]


In [21]:
#Acurácia do modelo
print(accuracy_score(y_teste, previsao))

0.9802631578947368


In [22]:
#Apresentando as métricas do modelo
print(metrics.classification_report(y_teste, previsao))

              precision    recall  f1-score   support

    not-spam       0.98      1.00      0.99      1464
        spam       1.00      0.84      0.91       208

    accuracy                           0.98      1672
   macro avg       0.99      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [27]:
#mensagem = ["To earn £2.50 of free call credit and details of great deals, reply OK to this\ text with your full name, house number and postcode"]
mensagem = ["Hello, I would like to invite you on a long journey. If you accept, \
send me your personal data"]
mensagem = vetorizador.transform(mensagem)
mensagem.shape

(1, 8709)

In [28]:
#Predição da classe
predicao = random_forest.predict(mensagem)
print(predicao)

['not-spam']
