# Importing Libraries

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import wordnet
import subprocess

# Make the NLTK resources used by this notebook available.
# WordNet is downloaded into /kaggle/working, extracted by hand, and that directory is
# then registered on NLTK's search path.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError. Catching only that
    # (instead of a bare `except:`) keeps KeyboardInterrupt and real bugs visible.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer data from 'punkt_tab'; fetch it as well so
# nltk.word_tokenize / nltk.sent_tokenize work regardless of the installed version.
nltk.download('punkt_tab')

# Reading Data

In [None]:
# Try several encodings in order and keep the first one that decodes the CSV.
try_encodings = ['utf-8', 'latin-1', 'utf-16', 'ISO-8859-1']
for encoding in try_encodings:
    try:
        df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding=encoding)
    except UnicodeError:
        # UnicodeError is the parent of UnicodeDecodeError, so this also covers codec
        # failures that are not reported as a decode error.
        print(f"Failed to read with encoding: {encoding}")
    else:
        print(f"Successfully read the file with encoding: {encoding}")
        break
else:
    # Every encoding failed: stop here instead of hitting a NameError on `df` later.
    raise ValueError(f"Could not decode spam.csv with any of: {try_encodings}")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

# Data Cleaning

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Discard the three 'Unnamed' columns; rebinding `df` replaces the in-place drop.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [None]:
# Preview the frame after the 'Unnamed' columns were dropped.
df.head()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

403 duplicate rows were found in the dataset, so we need to remove them.

In [None]:
# Keep the first occurrence of every row, then confirm that no duplicates remain.
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
# Renaming columns: give the generic v1/v2 columns meaningful names.
column_names = {'v1': 'label', 'v2': 'message'}
df = df.rename(columns=column_names)
df.head()

In [None]:
# Fit the encoder on the text labels (fit returns the encoder itself) and show the classes.
le = LabelEncoder().fit(df['label'])
le.classes_

In [None]:
# Numeric target column derived from the text label.
df['y'] = le.transform(df['label'])

In [None]:
# Preview the frame with the new numeric `y` column.
df.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Share of each label as a pie chart.
plt.figure(figsize=(4,4))
values = df['label'].value_counts()
# Take the slice labels from the same Series as the slice sizes. value_counts() is ordered
# by frequency while unique() is ordered by first appearance, so mixing the two can put
# the wrong label on a slice.
plt.pie(values.values, labels=values.index, autopct='%1.2f%%', startangle=0, explode=(0,0.1))
plt.show()

In [None]:
# Message count per label as a bar chart, with the count written above each bar.
label_counts = df['label'].value_counts()
# Bar names and bar heights come from the same Series so they cannot get out of step
# (unique() and value_counts() do not guarantee the same ordering).
ax = sns.barplot(x=list(label_counts.index), y=label_counts.values)
plt.ylabel('Count')

for i, v in enumerate(label_counts):
    ax.text(i, v+1, str(v), color='black', ha='center', va='bottom')

plt.show()

## Pre-Processing

In [None]:
# Lemmatizer instance used by preprocess_text.
# NOTE(review): this rebinds the name `wordnet`, shadowing the `nltk.corpus.wordnet`
# module imported at the top of the notebook; from here on `wordnet` is the lemmatizer.
wordnet = WordNetLemmatizer()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded once and then cached."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one message for vectorization.

    Steps: replace every character that is not an ASCII letter or digit with a space,
    lowercase, split on whitespace, drop English stop words, lemmatize each remaining
    token, and join the tokens back with single spaces.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned, space-separated token string (may be empty).
    """
    # Keep letters AND digits; everything else becomes a separator.
    var = re.sub("[^a-zA-Z0-9]", " ", text)

    # lowercasing
    var = var.lower()

    # The stop-word set is fetched once per call from a cached helper; the original
    # rebuilt it from the corpus for every single token.
    stop_words = _english_stopwords()
    tokens = [wordnet.lemmatize(word) for word in var.split() if word not in stop_words]

    return ' '.join(tokens)

In [None]:
# Show one message before and after preprocessing.
first_message = df['message'][0]
print(first_message)
print('>>> After Processing')
preprocess_text(first_message)

In [None]:
# Run every message through preprocess_text and keep the result in a new column.
df['processed_message'] = df['message'].map(preprocess_text)
df.head()

In [None]:
from wordcloud import WordCloud

# One generator instance, reused for every class below.
wc = WordCloud(
    background_color='white',
    width=300,
    height=300,
    min_font_size=10,
)

In [None]:
# One word cloud per class, drawn side by side.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 12))

n_classes = len(df['label'].unique())
for i in range(n_classes):
    ax = axs[i % 2]
    # All processed messages of this class, concatenated into one long string.
    class_text = df[df['y'] == i]['processed_message'].str.cat(sep=" ")
    cloud = wc.generate(class_text)
    ax.imshow(cloud)
    # Title the panel with the text label that encodes to `i`.
    ax.set_title(le.inverse_transform([i])[0])

plt.show()

In [None]:
# Build the corpus: a list with one entry per class, each entry being the flat list of
# all words that occur in that class's processed messages.
corpus = [
    [
        word
        for desc in df[df['y'] == i]['processed_message'].tolist()
        for word in desc.split()
    ]
    for i in range(len(df['label'].unique()))
]

print(len(corpus))

In [None]:
from collections import Counter

# The 40 most frequent words of corpus[0], as a (word, count) table.
ham_top_words = Counter(corpus[0]).most_common(40)
ham_df = pd.DataFrame(ham_top_words).rename(columns={0: 'word', 1: 'count'})
ham_df.head()

In [None]:
# The 40 most frequent words of corpus[1], as a (word, count) table.
spam_top_words = Counter(corpus[1]).most_common(40)
spam_df = pd.DataFrame(spam_top_words).rename(columns={0: 'word', 1: 'count'})
spam_df.head()

In [None]:
# Bar chart of the most frequent ham words.
sns.barplot(data=ham_df, x='word', y='count')
plt.xticks(rotation='vertical')
plt.title("Ham Word vs Count Plot")
plt.show()

In [None]:
# Bar chart of the most frequent spam words.
sns.barplot(data=spam_df, x='word', y='count')
plt.xticks(rotation='vertical')
plt.title("Spam Word vs Count Plot")
plt.show()

## Feature Engineering

To see whether the length of a message helps predict its label, let's add the features ```num_characters```, ```num_words``` and ```num_sentences``` to the dataframe and then check how each of them correlates with the output label.

In [None]:
# num_characters: length of the raw (unprocessed) message.
df['num_characters'] = df['message'].map(len)
df.head()

In [None]:
# num_words: number of tokens nltk.word_tokenize finds in the raw message.
df['num_words'] = df['message'].map(lambda message: len(nltk.word_tokenize(message)))
df.head()

In [None]:
# num_sentences: number of sentences nltk.sent_tokenize finds in the raw message.
df['num_sentences'] = df['message'].map(lambda message: len(nltk.sent_tokenize(message)))
df.head()

In [None]:
# Summary statistics of the three length features over all messages.
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# ham: summary statistics of the length features for rows with y == 0
df[df['y']==0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# spam: summary statistics of the length features for rows with y == 1
df[df['y']==1][['num_characters','num_words','num_sentences']].describe()

In [None]:
# Distribution of message length in characters, colored by label.
sns.histplot(data=df,x='num_characters',hue='label')
plt.show()

In [None]:
# Distribution of message length in words, colored by label.
sns.histplot(data=df,x='num_words',hue='label')
plt.show()

In [None]:
# Distribution of message length in sentences, colored by label.
sns.histplot(data=df,x='num_sentences',hue='label')
plt.show()

In [None]:
# Pairwise scatter/distribution plots of the frame's columns, colored by the encoded label `y`.
sns.pairplot(data=df,hue='y')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns (y and the three length features).
# df also holds text columns (label, message, processed_message); calling df.corr() on
# the whole frame raises on recent pandas, so restrict it to numeric dtypes first.
corr_mat = df.select_dtypes(include='number').corr()
sns.heatmap(corr_mat,annot=True,cmap='coolwarm',center=0)
plt.xticks(rotation='vertical')
plt.show()

From the above heatmap of the correlation matrix it is clear that ```num_characters``` is more strongly correlated with the output label ```y``` than ```num_words``` and ```num_sentences```.

# Building the Model

In [None]:
# NOTE(review): TfidfVectorizer is already imported in the first cell; this import is redundant.
from sklearn.feature_extraction.text import TfidfVectorizer
# Limit the vectorizer to at most 3000 features.
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
# Fit TF-IDF on the processed messages and convert the result to a dense array.
# NOTE(review): the vectorizer is fitted on all rows, before the train/test split below.
X = tfidf.fit_transform(df['processed_message']).toarray()
X.shape

In [None]:
# Target vector: the encoded label as a NumPy array.
y = df['y'].values
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split. A fixed random_state makes the split (and every metric reported below)
# reproducible on re-run; stratify=y keeps the ham/spam ratio equal in both parts.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,classification_report

In [None]:
# Candidate classifiers: three naive Bayes variants, logistic regression and an SVM.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
lg = LogisticRegression()
# SVM with a sigmoid kernel and gamma fixed at 1.0.
svc = SVC(kernel='sigmoid', gamma=1.0)

In [None]:
# Short name -> estimator instance. The insertion order of this dict is the order in
# which train_clfs_and_predict returns its metric lists.
clfs = {
    'gnb':gnb,
    'mnb':mnb,
    'bnb':bnb,
    'lg':lg,
    'svc':svc
}

In [None]:
def train_clfs_and_predict(clfs,X_train,X_test,y_train,y_test):
    """Fit every classifier on the training data and score it on the test data.

    Parameters
    ----------
    clfs : dict
        Maps a name to an estimator exposing ``fit`` and ``predict``. Each estimator
        is fitted in place.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the targets the predictions are
        scored against.

    Returns
    -------
    tuple of four lists
        ``(accuracies, precisions, confusion_matrices, classification_reports)``,
        each with one entry per classifier, in the iteration order of ``clfs``.
    """
    # One metric function per returned list, in return order.
    scorers = (accuracy_score, precision_score, confusion_matrix, classification_report)
    collected = tuple([] for _ in scorers)

    for model in clfs.values():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        for bucket, scorer in zip(collected, scorers):
            bucket.append(scorer(y_test, y_pred))

    acc, prec, conf_mat, classification_rep = collected
    return acc,prec,conf_mat,classification_rep

In [None]:
# Train and score every classifier on the TF-IDF features; show the accuracies.
accuracy, precision, conf_mat, class_rep = train_clfs_and_predict(clfs,X_train,X_test,y_train,y_test)
accuracy

In [None]:
# Precision per classifier, in the same order as `clfs`.
precision

In [None]:
# Collect the metrics column-wise; every list is in the iteration order of `clfs`.
performance = {
    'classifiers':list(clfs.keys()),
    'accuracy':accuracy,
    'precision':precision,
    'confusion_matrix':conf_mat,
    'classification_report':class_rep
}

In [None]:
# Results table, best precision first. After this sort the row order no longer matches
# the order of `clfs`.
perf_df = pd.DataFrame(performance).sort_values(by='precision',ascending=False)
perf_df

While doing feature engineering, we noticed that ```num_characters``` had a high correlation with the label ```y```. So let's try adding this feature and then check if the performance increases or not.

In [None]:
# Character counts as a single-column 2-D array (the shape needed to stack onto X).
num_chars = df.num_characters.values.reshape(-1,1)
# Compare the value range of the new feature with the range of the TF-IDF matrix.
print("num_chars Min: ",np.min(num_chars))
print("num_chars Max: ",np.max(num_chars),"\n")
print(f"X min: {np.min(X)}")
print(f"X max: {np.max(X)}")

From the above cell we see that ```num_chars``` needs to be scaled before it is added to the feature matrix ```X```.

In [None]:
# Not using StandardScaler because it gives both -ve and +ve values and MultinomialNB doesn't accept -ve values. So we're using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale the character counts.
# NOTE(review): the scaler is fitted on all rows, before the train/test split below.
scaler = MinMaxScaler()
scaled_num_chars = scaler.fit_transform(num_chars)

# now check the min and max values
print("scaled_num_chars Min: ",np.min(scaled_num_chars))
print("scaled_num_chars Max: ",np.max(scaled_num_chars))

In [None]:
# Append the scaled character count as one extra feature column.
# Slicing X back to the TF-IDF columns first makes this cell idempotent: re-running it
# no longer stacks another copy of the column onto X.
n_tfidf_features = len(tfidf.vocabulary_)
X = np.hstack((X[:, :n_tfidf_features], scaled_num_chars))

In [None]:
# Shape of the feature matrix after the extra column was appended.
X.shape

In [None]:
# Re-split the augmented feature matrix. A fixed random_state makes the split
# reproducible, so the before/after comparison is not driven by a different random
# partition on each run; stratify=y keeps the class ratio equal in both parts.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [None]:
# Train and score the same classifiers again, this time on the augmented features.
accuracy1, precision1, conf_mat1, class_rep1 = train_clfs_and_predict(clfs,X_train,X_test,y_train,y_test)

In [None]:
# The new metric lists are in the iteration order of `clfs`, but perf_df was re-ordered
# by precision. Assigning the lists directly would attach each value to whichever
# classifier sits at that row position, so first reorder them to perf_df's row order.
clf_order = list(clfs.keys())
row_positions = [clf_order.index(name) for name in perf_df['classifiers']]

perf_df['num_chars_accuracy'] = [accuracy1[pos] for pos in row_positions]
perf_df['num_chars_percision'] = [precision1[pos] for pos in row_positions]
perf_df['num_chars_confusion_matrix'] = [conf_mat1[pos] for pos in row_positions]
perf_df['num_chars_classification_report'] = [class_rep1[pos] for pos in row_positions]

In [None]:
# Results with and without the num_characters feature, side by side.
perf_df

Adding the ```num_characters``` feature didn't help; almost every classifier performed worse. So, we won't use this feature.

In [None]:
# Keep only the columns from the TF-IDF-only run.
cols_to_extract = ['classifiers','accuracy','precision','confusion_matrix','classification_report']
final_performance_df = perf_df[cols_to_extract]
final_performance_df

In [None]:
# Print each classifier's name followed by its classification report.
classifiers = perf_df['classifiers'].values
reports = perf_df['classification_report'].values

for clf, report in zip(classifiers, reports):
    print(f"{clf}:\n{report}\n\n")

In [None]:
# Accuracy and precision per classifier, drawn as two lines on the same axes.
sns.set(style="whitegrid")
for metric, legend_label in (('accuracy', 'Accuracy'), ('precision', 'Precision')):
    sns.lineplot(data=final_performance_df, x='classifiers', y=metric, marker='o', label=legend_label)

plt.title("Accuracy and Precision by Classifiers")
plt.xlabel("Classifiers")
plt.ylabel("Value")
plt.legend()
plt.show()

**RESULT**

The above plot shows that svc has the best performance with ```accuracy = 98.06%``` and ```precision = 100%```



---


