In [3]:
# Importar bibliotecas
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Cargar el dataset
data = pd.read_csv("spam.csv", encoding="latin-1")

# Seleccionar solo las columnas necesarias
data = data.iloc[:, [0, 1]]  # Selecciona solo la primera y segunda columna
data.columns = ["label", "message"]  # Renombra las columnas


# Mapear las etiquetas a valores binarios
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

# Previsualizar los datos
print(data.head())

# Dividir en características (X) y etiquetas (y)
X = data['message']
y = data['label']

# Vectorizar los mensajes de texto
vectorizer = CountVectorizer(stop_words='english')
X_vectors = vectorizer.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.3, random_state=42)

# Entrenar el modelo Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# Hacer predicciones
y_pred = model.predict(X_test)

# Evaluar el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Probar con un mensaje nuevo
test_message = ["Congratulations! You've won a $1,000 Walmart gift card. Call now!"]
test_vector = vectorizer.transform(test_message)
prediction = model.predict(test_vector)
print("\n¿Es spam?:", "Sí" if prediction[0] == 1 else "No")


   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
Accuracy: 0.9772727272727273

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1453
           1       0.91      0.92      0.91       219

    accuracy                           0.98      1672
   macro avg       0.95      0.95      0.95      1672
weighted avg       0.98      0.98      0.98      1672


¿Es spam?: Sí
