In [None]:
#loading the dataset
import pandas as pd

df = pd.read_csv('/content/spam.csv', encoding='latin-1')

print(df.head())

print("\nColumns in dataset:", df.columns)


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Columns in dataset: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
# Keeping only necessary columns
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
#using spacy for preprocessing steps
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess_text_spacy(text):
    doc = nlp(text)

    cleaned_text = ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
    return cleaned_text

df['cleaned_message'] = df['message'].apply(preprocess_text_spacy)

df[['message', 'cleaned_message']].head()


Unnamed: 0,message,cleaned_message
0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win FA Cup final tkts 2...
3,U dun say so early hor... U c already then say...,u dun early hor u c
4,"Nah I don't think he goes to usf, he lives aro...",Nah think go usf live


In [5]:
#using tf-idf for vectorisation
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['cleaned_message'])

print("Shape of TF-IDF matrix:", X.shape)


Shape of TF-IDF matrix: (5572, 7582)


In [12]:
#using LR with gridSearch which gives us very good accuracy scores
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.3, random_state=42)

param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']}

grid_search = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Best Parameters: {'C': 10, 'solver': 'saga'}
Best Score: 0.9817948717948719
Accuracy: 0.9784688995215312

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.99      0.99      1453
        spam       0.95      0.88      0.91       219

    accuracy                           0.98      1672
   macro avg       0.97      0.94      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [13]:
# saving the model
import joblib
joblib.dump(model, 'spam_classifier_model.joblib')

['spam_classifier_model.joblib']

In [15]:
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']