In [38]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

In [39]:
path = kagglehub.dataset_download('abhishek14398/sms-spam-collection')

Using Colab cache for faster access to the 'sms-spam-collection' dataset.


In [40]:
df = pd.read_csv(f'{path}/SMSSpamCollection.csv')
df.head()

Unnamed: 0,"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
0,ham\tOk lar... Joking wif u oni...
1,spam\tFree entry in 2 a wkly comp to win FA Cu...
2,ham\tU dun say so early hor... U c already the...
3,"ham\tNah I don't think he goes to usf, he live..."
4,spam\tFreeMsg Hey there darling it's been 3 we...


In [41]:
from sklearn.model_selection import train_test_split

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [43]:
df[['label','message']] = df['ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'].str.split('\t', expand=True)
df.drop('ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', axis=1, inplace=True)
df.head()

Unnamed: 0,label,message
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [44]:
df['label_number'] = df.label.map({'ham':0, 'spam':1})
df.head()

Unnamed: 0,label,message,label_number
0,ham,Ok lar... Joking wif u oni...,0
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
2,ham,U dun say so early hor... U c already then say...,0
3,ham,"Nah I don't think he goes to usf, he lives aro...",0
4,spam,FreeMsg Hey there darling it's been 3 week's n...,1


In [45]:
vactorizer = TfidfVectorizer(stop_words='english')
X = vactorizer.fit_transform(df.message)
y = df.label_number

In [46]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2,random_state=42)


In [47]:
model = MultinomialNB()
model.fit(train_x, train_y)

In [48]:
prediction = model.predict(test_x)
print(f"Accuracy: {accuracy_score(test_y, prediction)}")
print(f"\nClassification Report:\n{classification_report(test_y, prediction)}")

Accuracy: 0.968609865470852

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       953
           1       1.00      0.78      0.88       162

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [50]:
custom_sms = input("Enter a message: ")
custom_sms_vectorized = vactorizer.transform([custom_sms])
prediction = model.predict(custom_sms_vectorized)
ans = {0:'Not a Spam', 1:'Spam'}[prediction[0]]
print(f"The message is: {ans}")

Enter a message: This project is spam
The message is: Not a Spam
