In [6]:
import pandas as pd
import os

print("Files in this session:", os.listdir())

df = pd.read_csv("spam.csv")
print("First few rows of the dataset:")
print(df.head())


Files in this session: ['.config', 'spam.csv', 'sample_data']
First few rows of the dataset:
  label                                               text
0   ham   Hey there! Are we still meeting for lunch today?
1  spam  Congratulations! You've won a $1000 gift card....
2   ham  Can you send me the notes from yesterday’s class?
3  spam  Get cheap meds now!!! No prescription needed. ...
4   ham  Don't forget to submit the assignment before 5...


In [7]:
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_text(txt):
    txt = txt.lower()
    txt = ''.join([c for c in txt if c not in string.punctuation])
    words = txt.split()
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

# Clean the text
df['clean_text'] = df['text'].apply(clean_text)

# Convert labels: ham = 0, spam = 1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,text,clean_text,label_num
0,ham,Hey there! Are we still meeting for lunch today?,hey still meeting lunch today,0
1,spam,Congratulations! You've won a $1000 gift card....,congratulations youve 1000 gift card click claim,1
2,ham,Can you send me the notes from yesterday’s class?,send notes yesterday’s class,0
3,spam,Get cheap meds now!!! No prescription needed. ...,get cheap meds prescription needed visit websi...,1
4,ham,Don't forget to submit the assignment before 5...,dont forget submit assignment 5 pm,0


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X = df['clean_text']
y = df['label_num']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

tfidf = TfidfVectorizer()
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

print("TF-IDF shape:", X_train_tf.shape)


TF-IDF shape: (6, 35)


In [11]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

model = LinearSVC()
model.fit(X_train_tf, y_train)

y_pred = model.predict(X_test_tf)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))


Classification Report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
def predict_message(msg):
    clean = clean_text(msg)
    vec = tfidf.transform([clean])
    result = model.predict(vec)[0]
    return "SPAM ⚠️" if result == 1 else "NOT SPAM ✅"

# Test it
print(predict_message(" Win free money now !"))
print(predict_message("Hi, let 's catchup later."))
print(predict_message("Reminder: class starts at 9AM , dont be late!"))


SPAM ⚠️
SPAM ⚠️
NOT SPAM ✅
