In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Load the labelled messages; the file is decoded explicitly as Windows-1252.
data = pd.read_csv(filepath_or_buffer="spam.csv", encoding="windows-1252")

Split label column and content column

In [12]:

# v2 carries the message text (features); v1 carries the ham/spam label (target).
X, y = data["v2"], data["v1"]

Encode the labels as integers: ham as 0 and spam as 1

In [13]:

# Derive the numeric target from the raw label column rather than from `y`
# itself, so re-running this cell does not map already-encoded values to NaN.
label_codes = {"ham": 0, "spam": 1}
y = data["v1"].map(label_codes)

# Series.map yields NaN for any value missing from the mapping; fail loudly here
# instead of handing NaN targets to the train/test split and the model.
if y.isna().any():
    unexpected_labels = data.loc[y.isna(), "v1"].unique().tolist()
    raise ValueError(
        f"Unexpected label(s) in column 'v1': {unexpected_labels}; "
        "expected only 'ham' or 'spam'."
    )


Split the data into training (80%) and test (20%) sets

In [14]:

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Fit a TF-IDF vectorizer on the training text and use it to convert both splits into numerical feature vectors

In [15]:
# Build a TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
# Learn the vocabulary and IDF weights from the training texts only, then encode them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Encode the test texts with the already-fitted vectorizer (no refitting), so nothing
# from the test set influences the learned features.
X_test_tfidf = tfidf_vectorizer.transform(X_test)


Initialize and train the Logistic Regression model

In [16]:

# Fit a logistic-regression classifier with scikit-learn's default settings
# on the TF-IDF features of the training messages.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

Predict on the test set

In [17]:
# Predicted class (0 = ham, 1 = spam) for every held-out message.
y_pred = model.predict(X=X_test_tfidf)

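Evaluate the model. Accuracy alone is misleading here because ham heavily outnumbers spam (965 vs. 150 test messages), so also check the per-class recall in the classification report below.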

In [18]:

# Share of test messages whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9524663677130045


In [19]:

# Per-class precision, recall, F1 and support as a preformatted text table.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



Create a DataFrame for the classification result

In [20]:
# Put each test message next to its true and predicted label (0 = ham, 1 = spam).
# The rows keep the index of X_test / y_test; y_pred is aligned by position.
result_df = pd.DataFrame(
    {
        "Text": X_test,
        "Actual Label": y_test,
        "Predicted Label": y_pred,
    }
)
display(result_df)

Unnamed: 0,Text,Actual Label,Predicted Label
3245,"Funny fact Nobody teaches volcanoes 2 erupt, t...",0,0
944,I sent my scores to sophas and i had to do sec...,0,0
1044,We know someone who you know that fancies you....,1,0
2484,Only if you promise your getting out as SOON a...,0,0
812,Congratulations ur awarded either å£500 of CD ...,1,1
...,...,...,...
4264,&lt;DECIMAL&gt; m but its not a common car he...,0,0
2439,Rightio. 11.48 it is then. Well arent we all u...,0,0
5556,Yes i have. So that's why u texted. Pshew...mi...,0,0
4205,"Get the door, I'm here",0,0


Export the classification result

In [21]:

# Write the results table to disk; index=False leaves the row index out of the file.
result_df.to_csv(path_or_buf="classification_result.csv", index=False)
