In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/SMSSpamCollection", sep='\t', header=None, names=["label", "message"])

# Show first few rows
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Step 1: Convert labels to binary
df["label_num"] = df["label"].map({"ham": 0, "spam": 1})

# Step 2: Check the balance
print(df["label_num"].value_counts())

# Optional: Check for missing values
print(df.isnull().sum())


label_num
0    4825
1     747
Name: count, dtype: int64
label        0
message      0
label_num    0
dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["message"], df["label_num"], test_size=0.2, random_state=42)

# Step 2: TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Check shape
X_train_vec.shape, X_test_vec.shape


((4457, 7441), (1115, 7441))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Train the model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Predict on test set
y_pred = model.predict(X_test_vec)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
