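#### Prerequisite Libraries

The code below imports `pandas`, `scikit-learn` (`sklearn`), and `nltk`; all three must be installed before running it.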

#### Code and Output


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string

# Location of the tab-separated SMS file; the two columns are named
# 'label' and 'message' when the file is loaded.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    names=['label', 'message'],
)

# Fetch the NLTK stop-word corpus, then keep the English entries in a set
# for the membership tests performed in preprocess_text.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
# Built once so each call strips punctuation in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text: str) -> str:
    """Normalize a message before it is vectorized.

    Removes every character found in ``string.punctuation``, lowercases the
    result, splits it on whitespace, and drops the tokens that appear in the
    module-level ``stop_words`` set.

    Args:
        text: The raw message string.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    # NOTE(review): punctuation is removed before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" -- confirm this is
    # the intended matching against the stop-word list.
    text = text.translate(_PUNCTUATION_TABLE).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every message in place before the data is split.
df['message'] = df['message'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# The vocabulary is fitted on the training messages only and then reused,
# unchanged, to encode the test messages.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
clf = MultinomialNB().fit(X_train_counts, y_train)

y_pred = clf.predict(X_test_counts)

# Compute both metrics first, then print them in the original order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


def predict_message(message, clf, vectorizer):
    """Return the predicted label for a single raw message.

    The message is cleaned with ``preprocess_text``, encoded with
    ``vectorizer.transform``, and passed to ``clf.predict``.

    Args:
        message: Raw message text.
        clf: Object whose ``predict`` method is called on the encoded message.
        vectorizer: Object whose ``transform`` method encodes the cleaned
            message; it receives a one-element list.

    Returns:
        The first element of the result of ``clf.predict``.
    """
    cleaned = preprocess_text(message)
    features = vectorizer.transform([cleaned])
    return clf.predict(features)[0]

# Classify one hand-written example message and report the result.
new_message = "This is a promotional mail"
predicted_label = predict_message(new_message, clf=clf, vectorizer=vectorizer)
print(f"\nPrediction for the message: '{new_message}' is '{predicted_label}'")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/syntaxintel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9856502242152466

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.97      0.92      0.94       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115


Prediction for the message: 'This is a promotional mail' is 'ham'
