In [73]:
import pandas as pd
import numpy as np
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, Perceptron, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


dataset source: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [16]:
data = pd.read_csv('spam', delimiter='\t', names=['label', 'message'])


In [17]:
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [18]:
data['elabel'] = data['label'].map({'ham': 0, 'spam': 1})

In [19]:
data

Unnamed: 0,label,message,elabel
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [20]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [26]:
def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

In [29]:
print(clean_text("hello, I am dilip. how are you !???"))

hello dilip


In [30]:
data['cleaned_msg'] = data['message'].apply(clean_text)

In [31]:
data.isnull().sum()

Unnamed: 0,0
label,0
message,0
elabel,0
cleaned_msg,0


In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   label        5572 non-null   object
 1   message      5572 non-null   object
 2   elabel       5572 non-null   int64 
 3   cleaned_msg  5572 non-null   object
dtypes: int64(1), object(3)
memory usage: 174.3+ KB


In [33]:
data

Unnamed: 0,label,message,elabel,cleaned_msg
0,ham,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah dont think goes usf lives around though
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,2nd time tried 2 contact u u £750 pound prize ...
5568,ham,Will ü b going to esplanade fr home?,0,ü b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,pity mood soany suggestions
5570,ham,The guy did some bitching but I acted like i'd...,0,guy bitching acted like id interested buying s...


In [34]:
x = data['cleaned_msg']
y = data['elabel']

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [36]:
vectorizer = TfidfVectorizer(max_features=5000)

In [37]:
vectorizer

In [38]:
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [39]:
x_test_tfidf

<1115x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 7849 stored elements in Compressed Sparse Row format>

In [40]:
x_train_df = pd.DataFrame.sparse.from_spmatrix(x_train_tfidf, columns=vectorizer.get_feature_names_out())
x_test_df = pd.DataFrame.sparse.from_spmatrix(x_test_tfidf,columns=vectorizer.get_feature_names_out())

In [41]:
x_train_df

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,020603,0207,02070836089,02072069400,02073162414,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,üll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
clf = LazyClassifier()

In [43]:
print("Shape of x_train_df:", x_train_df.shape)
print("Shape of x_test_df:", x_test_df.shape)

print("\nData type of shape:", y_train.shape)
print("\nData type of shape:", y_test.shape)

Shape of x_train_df: (4457, 5000)
Shape of x_test_df: (1115, 5000)

Data type of shape: (4457,)

Data type of shape: (1115,)


In [54]:
models_trained= clf.fit(x_train_df, x_test_df, y_train, y_test)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 32/32 [00:26<00:00,  1.19it/s]

(Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: [], Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: [])





In [None]:
print(models_trained)

Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: []


In [48]:
model_list = [
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('BaggingClassifier', BaggingClassifier()),
    ('BernoulliNB', BernoulliNB()),
    ('CalibratedClassifierCV', CalibratedClassifierCV()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('DummyClassifier', DummyClassifier()),
    ('ExtraTreeClassifier', ExtraTreeClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier()),
    ('GaussianNB', GaussianNB()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('LabelPropagation', LabelPropagation()),
    ('LabelSpreading', LabelSpreading()),
    ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
    ('LinearSVC', LinearSVC(max_iter=10000)),
    ('LogisticRegression', LogisticRegression(max_iter=1000)),
    ('NearestCentroid', NearestCentroid()),
    ('NuSVC', NuSVC(probability=True)),
    ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
    ('Perceptron', Perceptron()),
    ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('RidgeClassifier', RidgeClassifier()),
    ('RidgeClassifierCV', RidgeClassifierCV()),
    ('SGDClassifier', SGDClassifier(max_iter=10000, tol=1e-3)),
    ('SVC', SVC(probability=True)),
    ('XGBClassifier', XGBClassifier()),
    ('LGBMClassifier', LGBMClassifier())
]


In [53]:
model_results_manual_all = []
for model_name, model in model_list:
    start_time = time.time()
    try:
        if model_name == 'GaussianNB': # GaussianNB needs dense input
            model.fit(x_train_df.to_dense(), y_train)
            predictions = model.predict(x_test_df.to_dense())
        else:
            model.fit(x_train_df, y_train)
            predictions = model.predict(x_test_df)
        training_time = time.time() - start_time
        results = {
            'Model': model_name,
            'Accuracy': accuracy_score(y_test, predictions),
            'Balanced Accuracy': balanced_accuracy_score(y_test, predictions),
            'F1 Score': f1_score(y_test, predictions),
            'Time Taken': training_time
        }
        model_results_manual_all.append(results)
        print(f"{model_name} Trained and Evaluated (Manual)")
    except Exception as e:
        print(f"Error training {model_name}: {e}")
        model_results_manual_all.append({'Model': model_name, 'Accuracy': np.nan, 'Balanced Accuracy': np.nan, 'F1 Score': np.nan, 'Time Taken': np.nan, 'Error': str(e)}) # Record error

AdaBoostClassifier Trained and Evaluated (Manual)
BaggingClassifier Trained and Evaluated (Manual)
BernoulliNB Trained and Evaluated (Manual)
CalibratedClassifierCV Trained and Evaluated (Manual)
DecisionTreeClassifier Trained and Evaluated (Manual)
DummyClassifier Trained and Evaluated (Manual)
ExtraTreeClassifier Trained and Evaluated (Manual)
ExtraTreesClassifier Trained and Evaluated (Manual)
Error training GaussianNB: 'DataFrame' object has no attribute 'to_dense'
KNeighborsClassifier Trained and Evaluated (Manual)
LabelPropagation Trained and Evaluated (Manual)
LabelSpreading Trained and Evaluated (Manual)
Error training LinearDiscriminantAnalysis: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.
LinearSVC Trained and Evaluated (Manual)
LogisticRegression Trained and Evaluated (Manual)
NearestCentroid Trained and Evaluated (Manual)
Error training NuSVC: 'csr_matrix' object has no attribute 'var'
PassiveAggressiveClassif

In [74]:
models_df=pd.DataFrame(model_results_manual_all)

In [75]:
models_df.sort_values(by='Accuracy', ascending=False,inplace=True)

In [71]:
models_df

Unnamed: 0,index,Model,Accuracy,Balanced Accuracy,F1 Score,Time Taken,Error
0,23,SGDClassifier,0.99,0.96,0.96,0.22,
1,13,LinearSVC,0.99,0.96,0.95,0.22,
2,3,CalibratedClassifierCV,0.99,0.97,0.95,17.08,
3,17,PassiveAggressiveClassifier,0.99,0.96,0.95,0.22,
4,21,RidgeClassifier,0.99,0.95,0.94,0.24,
5,22,RidgeClassifierCV,0.98,0.95,0.94,44.89,
6,18,Perceptron,0.98,0.97,0.94,0.22,
7,2,BernoulliNB,0.98,0.93,0.92,0.21,
8,7,ExtraTreesClassifier,0.98,0.93,0.92,4.01,
9,20,RandomForestClassifier,0.98,0.92,0.92,1.82,


In [69]:
models_df.reset_index(inplace=True)

In [72]:
print('The best model is : ',models_df['Model'][0])

The best model is :  SGDClassifier


In [96]:
param_grid = {
    'loss': ['hinge',  'perceptron'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'learning_rate': ['optimal', 'adaptive'],
}

In [97]:
sgdGS = GridSearchCV(
    SGDClassifier(max_iter=1000, tol=1e-3),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1       )

In [98]:
sgdGS.fit(x_train_df, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [102]:
sgdGS.best_params_

{'learning_rate': 'optimal', 'loss': 'hinge', 'penalty': 'l2'}

In [103]:
bestModel = sgdGS.best_estimator_
test_predictions = bestModel.predict(x_test_df)
test_accuracy = accuracy_score(y_test, test_predictions)
test_balanced_accuracy = balanced_accuracy_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

In [104]:
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Balanced Accuracy: {test_balanced_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.9883
Test Balanced Accuracy: 0.9621
Test F1 Score: 0.9550


In [105]:
model=SGDClassifier(max_iter=1000, tol=1e-3)

In [106]:
model.fit(x_train_df, y_train)

In [107]:
model.score(x_test_df, y_test)

0.9883408071748879