In [14]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [15]:
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pawar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Load the raw SMS dataset from the working directory. The encoding is passed
# explicitly (presumably the file is not valid UTF-8 — TODO confirm).
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# Preview the first rows to inspect the column layout.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the text and label columns, under descriptive names.
# Renaming comes first because rename() ignores keys that are absent by
# default; that makes this cell safe to re-run. On a second pass the frame
# already has 'messages'/'label', whereas selecting 'v2'/'v1' first would
# raise KeyError.
df = (
    df.rename(columns={'v2': 'messages', 'v1': 'label'})
    [['messages', 'label']]
)
df.head()

Unnamed: 0,messages,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [5]:
# Count missing entries per column before any text processing
df.isna().sum()

messages    0
label       0
dtype: int64

In [6]:
# English stop words from the NLTK 'stopwords' corpus (downloaded in an
# earlier cell), materialised once as a set for the membership test that
# clean_text runs on every token.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalise a message into lower-case alphanumeric tokens without stop words.

    Args:
        text: The raw message. Missing values (None or float NaN, which is how
            pandas represents empty CSV cells) yield an empty string; any other
            non-string value is converted with str() first.
        stop_words: Optional collection of tokens to drop. When omitted, the
            module-level STOPWORDS set is used.

    Returns:
        The surviving tokens joined by single spaces ('' when nothing survives).
    """
    if stop_words is None:
        stop_words = STOPWORDS

    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Every non-alphanumeric character becomes a space. split() then collapses
    # runs of whitespace, so no separate whitespace-squeezing pass is needed.
    tokens = re.sub(r'[^0-9a-zA-Z]', ' ', text.lower()).split()
    return " ".join(token for token in tokens if token not in stop_words)

In [7]:
# Add a normalised version of every message next to the raw text
df = df.assign(clean_text=df['messages'].apply(clean_text))
df.head()

Unnamed: 0,messages,label,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah think goes usf lives around though


In [8]:
# Features are the cleaned messages; the target is the ham/spam label
X, y = df['clean_text'], df['label']

In [19]:
# Hold out a quarter of the data for testing; stratifying on the label keeps
# the ham/spam ratio the same in both splits, and the fixed seed makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Assemble (but do not yet fit) the model: token counts -> TF-IDF weighting
# -> logistic regression.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ]
)

In [20]:
# Train the model
pipeline.fit(x_train, y_train)

# Score the held-out split so there is a measured estimate of quality before
# the model is saved; without this x_test / y_test are never used.
print(classification_report(y_test, pipeline.predict(x_test)))

In [21]:
# Save the model to a file
# The whole pipeline object is serialised to the working directory;
# classify_sms later reloads it from this same path. joblib.dump returns the
# list of files it wrote, which is what the cell displays.
joblib.dump(pipeline, 'spam_classifier_model.pkl')

['spam_classifier_model.pkl']

In [22]:
# Function to classify new SMS
def classify_sms(sms, model=None, model_path='spam_classifier_model.pkl'):
    """Label a single SMS as 'Spam' or 'Not Spam'.

    Args:
        sms: Raw message text. It is run through clean_text before prediction,
            the same preprocessing applied to the training data.
        model: Optional fitted estimator exposing predict(). Pass the
            in-memory pipeline here to avoid re-reading the file on every call.
        model_path: File to load the model from when `model` is not supplied.

    Returns:
        'Spam' when the predicted label equals 'spam', otherwise 'Not Spam'.
    """
    if model is None:
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only point model_path at files you created or trust.
        model = joblib.load(model_path)

    # predict() takes a batch, so wrap the single message and unwrap the result.
    prediction = model.predict([clean_text(sms)])[0]

    return 'Spam' if prediction == 'spam' else 'Not Spam'

In [23]:
# Example: a prize-style message
new_sms = "Congratulations! You've won a free ticket to Bahamas. Text WIN to 12345."
print(f"Message: {new_sms}")
prediction = classify_sms(new_sms)
print(f"Prediction: {prediction}")

Message: Congratulations! You've won a free ticket to Bahamas. Text WIN to 12345.
Prediction: Spam


In [24]:
# Example: an ordinary personal message
new_sms = "Hey, don't forget we have a meeting tomorrow at 3 PM. Let me know if you need anything."
print(f"Message: {new_sms}")
prediction = classify_sms(new_sms)
print(f"Prediction: {prediction}")

Message: Hey, don't forget we have a meeting tomorrow at 3 PM. Let me know if you need anything.
Prediction: Not Spam
