<a href="https://colab.research.google.com/github/giocarro/Data_Science_Gio/blob/main/Tareas/SMS_Spam_Detection_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset Information

The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...

The SMS Spam Collection is a set of tagged SMS messages collected for SMS spam research. It contains one set of 5,574 English SMS messages, each tagged as either ham (legitimate) or spam.

## Attributes

- SMS Messages
- Label (spam/ham)

## Import modules

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


## Loading the dataset

In [None]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# keep only the columns needed for processing; .copy() makes df an independent
# frame, so adding the clean_text column later does not trigger
# SettingWithCopyWarning
df = df[['text', 'class']].copy()
df

Unnamed: 0,text,class
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


## Preprocessing the dataset

In [None]:
# check for null values
df.isnull().sum()

text     0
class    0
dtype: int64

In [None]:
stops = set(stopwords.words('english'))
print(stops)

{"shouldn't", 'yourselves', 'here', 'ours', 'you', 'having', 'more', 'before', 'out', 'these', 'other', 'y', 'yourself', 'they', 'then', 'further', 'them', 'i', 'it', 'weren', 'very', "won't", 'he', 'mightn', 'through', 'themselves', 'than', 'myself', 'ma', 'at', 'during', "you'd", 'off', 'when', 'of', "should've", "isn't", 'as', "hasn't", 'a', 'few', 're', 'until', 'both', 'isn', 'm', "you've", 'after', 'with', 'its', 'shouldn', 'not', 'have', 'were', 'into', 'doesn', 'now', 'she', 'where', 'are', 'most', 'am', 'was', 've', 'about', 'such', 'our', 'ourselves', 'can', 'how', 'didn', 'all', 'mustn', "you'll", 'don', 'that', 'do', "you're", 't', 'if', 'each', 'shan', 'been', 'up', 'once', 'him', 'who', 'hasn', 'should', 'because', 'is', 'same', "aren't", 'had', 'hers', "wouldn't", "she's", 'while', 'between', "weren't", 'below', 'herself', 'on', 'what', 'has', 'be', "doesn't", 'by', 'ain', 'just', 'over', 'to', "hadn't", "didn't", 'whom', 'nor', 'an', 'for', 'above', 'in', 'did', "couldn

In [None]:
#STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalise one SMS message for bag-of-words style modelling.

    The message is lower-cased, every character that is not an ASCII letter or
    digit is treated as a separator, stopwords are dropped, and the remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw message.
    stopword_set : collection of str, optional
        Lower-case tokens to drop. When omitted, the notebook-level ``stops``
        set is used, so existing ``clean_text(text)`` calls keep working.

    Returns
    -------
    str
        The cleaned message (possibly empty).
    """
    if stopword_set is None:
        stopword_set = stops
    # lower-case first so one character class covers letters in either case
    normalised = re.sub(r'[^0-9a-z]', ' ', text.lower())
    # split() with no arguments also collapses runs of whitespace, so no
    # separate "remove extra spaces" pass is needed
    return " ".join(tok for tok in normalised.split() if tok not in stopword_set)

In [None]:
# clean the messages; assign() returns a new frame, so nothing is written onto
# a slice of another DataFrame (avoids SettingWithCopyWarning)
df = df.assign(clean_text=df['text'].apply(clean_text))

# Text features and labels for the pipeline-based models at the end of the
# notebook. They are captured here because `df` is later rebound to the
# matrix loaded from data_messages.csv, which has no text columns.
X = df['clean_text']
y = df['class']

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['text'].apply(clean_text)


Unnamed: 0,text,class,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah think goes usf lives around though


## Input Split

In [None]:
class_counts = df['class'].value_counts()
class_counts

ham     4825
spam     747
Name: class, dtype: int64

In [None]:
class_counts.index

Index(['ham', 'spam'], dtype='object')

In [None]:
fig = px.bar(x=class_counts.index, y=class_counts.values, labels={'x': 'Class', 'y': 'Count'}, title='Messages Class Distribution', template = 'plotly_white', text = class_counts.values)
fig.show()

In [None]:
class_1 = df[df['class'] == 'ham']
class_2 = df[df['class'] == 'spam']

In [None]:
len(class_1)/len(df['class'])

# **Creating dictionary**

In [None]:
# Taken from Abraham's R Studio code

# Training model using matrix generated in R Studio

In [None]:
# NOTE: this rebinds `df`. From here on `df` is the matrix loaded from
# data_messages.csv, not the SMS text frame used in the cells above.
df = pd.read_csv('data_messages.csv')
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V4865,V4866,V4867,V4868,V4869,V4870,V4871,V4872,V4873,message_class
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,ham
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5569,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [None]:
# row 0 - ham
# row 1 - spam
# Count messages per label. The output column names are pinned explicitly
# because the names produced by value_counts().reset_index() differ between
# pandas versions (from pandas 2.0 the counts column is called `count` and
# `message_class` would hold the labels instead of the counts).
messages = (
    df['message_class']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='message_class')
)
messages

Unnamed: 0,index,message_class
0,ham,4825
1,spam,747


In [None]:
# # X = df['clean_text']
# # y = df['class']

# messages = pd.DataFrame(df['message_class'].value_counts().reset_index())

In [None]:
# Bar chart of the label counts: x is the row position in `messages`,
# y and the bar annotations are the counts.
fig = px.bar(
    x=messages.index,
    y=messages['message_class'],
    text=messages['message_class'],
    labels={'x': 'Class (0 -ham, 1 - spam)', 'y': 'Count'},
    title='Messages Class Distribution',
    template='plotly_white',
)
fig.show()

In [None]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# Only the label column is rewritten, so the feature columns are neither
# scanned nor at risk of being altered. astype(int) pins the dtype so the
# target is numeric regardless of how the installed pandas version handles
# dtype inference in replace(). Re-running the cell is harmless.
df = df.assign(
    message_class=df['message_class'].replace({'ham': 0, 'spam': 1}).astype(int)
)

In [None]:
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V4865,V4866,V4867,V4868,V4869,V4870,V4871,V4872,V4873,message_class
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Hold out 20% of the rows for testing. Features are every column except the
# last one; the target is message_class. random_state=7 makes the split
# repeatable.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced (4825 ham vs 747 spam in the counts above) — consider
# stratify=df['message_class'].
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df['message_class'], test_size=0.2, random_state=7)


In [None]:
X_train.shape

(4457, 4873)

In [None]:
X_test.shape

(1115, 4873)

In [None]:
y_train.shape

(4457,)

In [None]:
y_train

4459    0
1921    0
5255    0
5507    0
356     1
       ..
4307    0
2550    0
537     0
1220    1
4271    0
Name: message_class, Length: 4457, dtype: int64

In [None]:
y_test.shape

(1115,)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the test split.
# NOTE(review): X_train_scaled / X_test_scaled are not used by any model cell
# visible in this notebook — every classifier below is fitted on the unscaled
# X_train. Either pass the scaled matrices to the models or drop this cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def plot_confusion_matrix(cm, labels):
    """Show a confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to draw; rows are labelled "Actual", columns "Predicted".
    labels : sequence of str
        Tick labels, used for both axes.

    Returns
    -------
    None
        The figure is displayed, not returned.
    """
    axis_titles = dict(x="Predicted", y="Actual", color="Count")
    heatmap = px.imshow(
        cm,
        labels=axis_titles,
        x=labels,
        y=labels,
        color_continuous_scale='Viridis',
        text_auto=True,
        title="Confusion Matrix",
    )
    # each cell is already annotated with its value, so hide the colour bar
    heatmap.update_layout(coloraxis_showscale=False)
    heatmap.show()

## KNN

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
knn_pred = knn.predict(X_test)

In [None]:
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"KNN Accuracy: {knn_accuracy:.2f}")

KNN Accuracy: 0.91


In [None]:
knn_precision = precision_score(y_test, knn_pred)
print(f"KNN Precission: {knn_precision:.2f}")

KNN Precission: 1.00


In [None]:
knn_recall = recall_score(y_test, knn_pred)
print(f"KNN Recall: {knn_recall:.2f}")

KNN Recall: 0.33


In [None]:
knn_f1 = f1_score(y_test, knn_pred)
print(f"KNN F1 Score: {knn_f1:.2f}")

KNN F1 Score: 0.50


In [None]:
knn_report = classification_report(y_test, knn_pred)
print("KNN Classification Report:")
print(knn_report)

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       970
           1       1.00      0.33      0.50       145

    accuracy                           0.91      1115
   macro avg       0.95      0.67      0.72      1115
weighted avg       0.92      0.91      0.89      1115



In [None]:
knn_cm = confusion_matrix(y_test, knn_pred)
plot_confusion_matrix(knn_cm, ['ham (0)', 'spam (1)'])

## Logistic Regression

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
lr_pred = lr.predict(X_test)

In [None]:
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")

Logistic Regression Accuracy: 0.98


In [None]:
lr_precision = precision_score(y_test, lr_pred)
print(f"Logistic Regression Precission: {lr_precision:.2f}")

Logistic Regression Precission: 0.98


In [None]:
lr_recall = recall_score(y_test, lr_pred)
print(f"Logistic Regression Recall: {lr_recall:.2f}")

Logistic Regression Recall: 0.89


In [None]:
lr_f1 = f1_score(y_test, lr_pred)
print(f"Logistic Regression F1 Score: {lr_f1:.2f}")

Logistic Regression F1 Score: 0.93


In [None]:
lr_report = classification_report(y_test, lr_pred)
print("Logistic Regression Classification Report:")
print(lr_report)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       970
           1       0.98      0.89      0.93       145

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
lr_cm = confusion_matrix(y_test, lr_pred)
plot_confusion_matrix(lr_cm, ['ham (0)', 'spam (1)'])

## SVM

In [None]:
svm = SVC()
svm.fit(X_train, y_train)

In [None]:
svm_pred = svm.predict(X_test)

In [None]:
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Accuracy: {svm_accuracy:.2f}")

SVM Accuracy: 0.98


In [None]:
svm_precision = precision_score(y_test, svm_pred)
print(f"SVM Precission: {svm_precision:.2f}")

SVM Precission: 1.00


In [None]:
svm_recall = recall_score(y_test, svm_pred)
# label corrected: this is the SVM metric (it was printed as "KNN Recall")
print(f"SVM Recall: {svm_recall:.2f}")

KNN Recall: 0.88


In [None]:
svm_f1 = f1_score(y_test, svm_pred)
# label corrected: this is the SVM metric (it was printed as
# "Logistic Regression F1 Score")
print(f"SVM F1 Score: {svm_f1:.2f}")

Logistic Regression F1 Score: 0.93


In [None]:
svm_report = classification_report(y_test, svm_pred)
print("SVM Classification Report:")
print(svm_report)

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       970
           1       1.00      0.88      0.93       145

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
svm_cm = confusion_matrix(y_test, svm_pred)
plot_confusion_matrix(svm_cm, ['ham (0)', 'spam (1)'])

## Naive Bayes

In [None]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
nb_pred = nb.predict(X_test)

In [None]:
nb_accuracy = accuracy_score(y_test, nb_pred)
print(f"Naive Bayes Accuracy: {nb_accuracy:.2f}")

Naive Bayes Accuracy: 0.91


In [None]:
nb_precision = precision_score(y_test, nb_pred)
print(f"Naive Bayes Precission: {nb_precision:.2f}")

Naive Bayes Precission: 0.60


In [None]:
nb_recall = recall_score(y_test, nb_pred)
print(f"Naive Bayes Recall: {nb_recall:.2f}")

Naive Bayes Recall: 0.93


In [None]:
nb_f1 = f1_score(y_test, nb_pred)
print(f"Naive Bayes F1 Score: {nb_f1:.2f}")

Naive Bayes F1 Score: 0.73


In [None]:
nb_report = classification_report(y_test, nb_pred)
print("Naive Bayes Classification Report:")
print(nb_report)

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.91      0.95       970
           1       0.60      0.93      0.73       145

    accuracy                           0.91      1115
   macro avg       0.79      0.92      0.84      1115
weighted avg       0.94      0.91      0.92      1115



In [None]:
nb_cm = confusion_matrix(y_test, nb_pred)
plot_confusion_matrix(nb_cm, ['ham (0)', 'spam (1)'])

## Decision Trees

In [None]:
# Fixed seed: the tree builder permutes features at each split, so ties between
# equally good splits can resolve differently from run to run. Seeding keeps
# the fitted tree and the metrics reported below reproducible.
dt = DecisionTreeClassifier(random_state=7)
dt.fit(X_train, y_train)

In [None]:
dt_pred = dt.predict(X_test)

In [None]:
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"Decision Trees Accuracy: {dt_accuracy:.2f}")

Decision Trees Accuracy: 0.96


In [None]:
dt_precision = precision_score(y_test, dt_pred)
print(f"Decision Trees Precission: {dt_precision:.2f}")

Decision Trees Precission: 0.89


In [None]:
dt_recall = recall_score(y_test, dt_pred)
print(f"Decision Trees Recall: {dt_recall:.2f}")

Decision Trees Recall: 0.78


In [None]:
dt_f1 = f1_score(y_test, dt_pred)
print(f"Decision Trees F1 Score: {dt_f1:.2f}")

Decision Trees F1 Score: 0.83


In [None]:
dt_report = classification_report(y_test, dt_pred)
print("Decision Trees Classification Report:")
print(dt_report)

Decision Trees Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       970
           1       0.89      0.78      0.83       145

    accuracy                           0.96      1115
   macro avg       0.93      0.88      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [None]:
dt_cm = confusion_matrix(y_test, dt_pred)
plot_confusion_matrix(dt_cm, ['ham (0)', 'spam (1)'])

## Random Forest

In [None]:
# Fixed seed: bootstrap sampling and per-split feature sampling are random, so
# without a seed the metrics reported below change on every run.
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train, y_train)

In [None]:
rf_pred = rf.predict(X_test)

In [None]:
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

Random Forest Accuracy: 0.97


In [None]:
rf_precision = precision_score(y_test, rf_pred)
print(f"Random Forest Precission: {rf_precision:.2f}")

Random Forest Precission: 0.98


In [None]:
rf_recall = recall_score(y_test, rf_pred)
print(f"Random Forest Recall: {rf_recall:.2f}")

Random Forest Recall: 0.81


In [None]:
rf_f1 = f1_score(y_test, rf_pred)
print(f"Random Forest F1 Score: {rf_f1:.2f}")

Random Forest F1 Score: 0.89


In [None]:
rf_report = classification_report(y_test, rf_pred)
print("Random Forest Classification Report:")
print(rf_report)

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       970
           1       0.98      0.81      0.89       145

    accuracy                           0.97      1115
   macro avg       0.97      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
rf_cm = confusion_matrix(y_test, rf_pred)
plot_confusion_matrix(rf_cm, ['ham (0)', 'spam (1)'])

## Comparing models

In [None]:
# One entry per fitted model, in the same order in every list, so that each
# position lines up into one row of the comparison table.
models_data = dict(
    Model=[
        'Logistic Regression',
        'K-Nearest Neighbors',
        'SVM',
        'Naive Bayes',
        'Decision Trees',
        'Random Forest',
    ],
    Accuracy=[lr_accuracy, knn_accuracy, svm_accuracy, nb_accuracy, dt_accuracy, rf_accuracy],
    Precision=[lr_precision, knn_precision, svm_precision, nb_precision, dt_precision, rf_precision],
    Recall=[lr_recall, knn_recall, svm_recall, nb_recall, dt_recall, rf_recall],
)
# 'F1-Score' is not a valid keyword name, so it is added by key
models_data['F1-Score'] = [lr_f1, knn_f1, svm_f1, nb_f1, dt_f1, rf_f1]

df_models = pd.DataFrame(models_data)
df_models

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.983857,0.984733,0.889655,0.934783
1,K-Nearest Neighbors,0.913004,1.0,0.331034,0.497409
2,SVM,0.983857,1.0,0.875862,0.933824
3,Naive Bayes,0.910314,0.6,0.931034,0.72973
4,Decision Trees,0.958744,0.889764,0.77931,0.830882
5,Random Forest,0.973094,0.975207,0.813793,0.887218


In [None]:
fig_accuracy = px.bar(df_models, x='Model', y='Accuracy', title='Model Comparison - Accuracy', template = 'plotly_white', text = 'Accuracy')
fig_accuracy.update_layout(yaxis=dict(range=[0, 1]))
fig_accuracy.show()

## Model training (taken from a YouTube channel)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

def classify(model, X, y):
    """Fit `model` on TF-IDF features of the texts and print a test report.

    The data is split 75/25 (stratified on `y`, random_state=42), a
    CountVectorizer -> TfidfTransformer -> `model` pipeline is fitted on the
    training part, and the accuracy (in percent) and the classification
    report for the held-out part are printed.

    Parameters
    ----------
    model : estimator
        Unfitted classifier used as the last pipeline step; it is fitted as
        part of the pipeline.
    X : iterable of str
        Raw text documents (CountVectorizer tokenises them).
    y : array-like
        Labels aligned with `X`; also used to stratify the split.

    Returns
    -------
    None
    """
    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, shuffle=True, stratify=y
    )
    # model training
    pipeline_model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])
    pipeline_model.fit(x_train, y_train)

    # Predict the test split once and reuse it for both metrics;
    # pipeline.score() would run the same prediction a second time.
    y_pred = pipeline_model.predict(x_test)
    print('Accuracy:', accuracy_score(y_test, y_pred)*100)
    print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

Accuracy: 96.8413496051687
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393



In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
classify(model, X, y)

Accuracy: 96.69777458722182
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.75      0.86       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393



In [None]:
from sklearn.svm import SVC
model = SVC(C=3)
classify(model, X, y)

Accuracy: 98.27709978463747
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       1.00      0.87      0.93       187

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest, and therefore the printed report, is reproducible
# (the train/test split inside classify() is already seeded).
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

Accuracy: 97.63101220387652
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1206
        spam       1.00      0.82      0.90       187

    accuracy                           0.98      1393
   macro avg       0.99      0.91      0.94      1393
weighted avg       0.98      0.98      0.98      1393

