In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("spam.csv")
print("Data shape:", df.shape)
df.head()

texts = df["text"].astype(str).tolist()
labels = df["target"].tolist()


label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=42, stratify=labels_encoded
)

print("Train size:", len(X_train_texts), "Test size:", len(X_test_texts))


Data shape: (5572, 2)
Train size: 4457 Test size: 1115


In [2]:
!pip install gensim




In [5]:
from gensim.models import Word2Vec
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize texts
def tokenize(text):
    return nltk.word_tokenize(text.lower())

tokenized_train = [tokenize(text) for text in X_train_texts]
tokenized_test = [tokenize(text) for text in X_test_texts]

# Train Word2Vec on your training texts
w2v_model = Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=1, workers=4)
print("Word2Vec vocab size:", len(w2v_model.wv))

# Function to average word vectors for each text
def text_to_vector(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(100)

X_train_vec = np.array([text_to_vector(tokens) for tokens in tokenized_train])
X_test_vec = np.array([text_to_vector(tokens) for tokens in tokenized_test])

print("Train vector shape:", X_train_vec.shape)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word2Vec vocab size: 8384
Train vector shape: (4457, 100)


In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)
print("Naive Bayes Results:\n", classification_report(y_test, y_pred_nb))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)
print("Random Forest Results:\n", classification_report(y_test, y_pred_rf))

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb.fit(X_train_vec, y_train)
y_pred_xgb = xgb.predict(X_test_vec)
print("XGBoost Results:\n", classification_report(y_test, y_pred_xgb))


Naive Bayes Results:
               precision    recall  f1-score   support

           0       0.95      0.68      0.79       966
           1       0.26      0.74      0.39       149

    accuracy                           0.69      1115
   macro avg       0.60      0.71      0.59      1115
weighted avg       0.85      0.69      0.74      1115

Random Forest Results:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       966
           1       0.83      0.57      0.68       149

    accuracy                           0.93      1115
   macro avg       0.89      0.78      0.82      1115
weighted avg       0.92      0.93      0.92      1115



Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       966
           1       0.87      0.70      0.77       149

    accuracy                           0.95      1115
   macro avg       0.91      0.84      0.87      1115
weighted avg       0.94      0.95      0.94      1115

