In [None]:
import pandas as pd

# Source of the SMS data: a tab-separated file fetched over HTTPS.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header row.
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Preview the first rows to confirm the two columns parsed as expected.
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Encode the text labels as integers for the classifier: ham -> 0, spam -> 1.
label_codes = df['label'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value that is not a key of the dict. Fail here
# with a clear message instead of letting NaN targets reach the model later.
unmapped_labels = df.loc[label_codes.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected label values: {unmapped_labels.tolist()}")

df['label_num'] = label_codes



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the raw message text; targets are the numeric label column.
X, y = df['message'], df['label_num']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): no `stratify=` is passed, so the class ratio in each split is
# whatever the seeded shuffle happens to produce.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the TF-IDF vectorizer (max_features=2000) on the training messages only,
# then reuse the fitted vectorizer to encode the held-out messages.
vectorizer = TfidfVectorizer(max_features=2000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out messages and compare them to the true labels.
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(report)


Accuracy: 0.9829596412556054
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Smoke test: classify one hand-written message.
# The message is wrapped in a list because the vectorizer is given a
# collection of documents, not a single string.
msg = ["Congratulations! You won free tickets. Call now!"]
msg_vec = vectorizer.transform(msg)

msg_pred = model.predict(msg_vec)
print("Prediction:", msg_pred)


Prediction: [1]


In [None]:
# Classify a few more hand-written messages and print a readable verdict for each.
test_msgs = [
    "Hey,are we meeting today at 5?",
    "URGENT!Your account is suspended. Click this link now!"
]

test_vec = vectorizer.transform(test_msgs)
pred = model.predict(test_vec)

# Loop variables persist in the notebook namespace after the cell runs, so use
# names that do not collide with other notebook-level variables such as `msg`.
for test_msg, test_pred in zip(test_msgs, pred):
    print(f"Message: {test_msg}")
    print("Prediction:", "Spam" if test_pred == 1 else "Ham")
    print("------")


Message: Hey,are we meeting today at 5?
Prediction: Ham
------
Message: URGENT!Your account is suspended. Click this link now!
Prediction: Spam
------
