<a href="https://colab.research.google.com/github/fvt1h/Encryptix/blob/main/SMS_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SMS Spam Detection

In [1]:
# Install the Kaggle CLI into the kernel's own environment (%pip, not !pip)
%pip install -q kaggle
# The Kaggle API token is copied from Google Drive, so Drive must already be
# mounted at /content/drive before this cell runs.
!mkdir -p ~/.kaggle
!cp '/content/drive/MyDrive/kaggle.json' ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d uciml/sms-spam-collection-dataset
# -o overwrites spam.csv without asking, so re-running the cell cannot stall
# on unzip's interactive "replace?" prompt
!unzip -o sms-spam-collection-dataset

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 57.7MB/s]
Archive:  sms-spam-collection-dataset.zip
  inflating: spam.csv                


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Exploration

In [3]:
# Load the raw export; latin-1 is requested explicitly
# (presumably the file is not valid UTF-8 — confirm if the source changes)
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

In [4]:
# Preview the first rows to see the raw column layout
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
# Summary statistics; every column is object dtype, so this reports
# count / unique / top / freq rather than numeric moments
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [7]:
# Missing values per column
df.isnull().sum()

Unnamed: 0,0
v1,0
v2,0
Unnamed: 2,5522
Unnamed: 3,5560
Unnamed: 4,5566


In [8]:
# Number of rows that exactly repeat an earlier row across all columns
df.duplicated().sum()

403

## Preprocessing

In [9]:
# Remove exact repeats, retaining the first occurrence of each row
df.drop_duplicates(inplace=True, keep='first')

In [10]:
# Discard the three unnamed columns; df.info() above shows they are almost entirely null
junk_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df.drop(columns=junk_columns, inplace=True)

In [11]:
# Give the two remaining columns descriptive names
df.rename(columns=dict(v1='category', v2='text'), inplace=True)

In [12]:
# Confirm the frame now holds only the renamed category / text columns
df.head()

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Build Model

In [13]:
# Encode labels (ham -> 0, spam -> 1)
label_codes = {'ham': 0, 'spam': 1}
# Values that are already encoded pass through unchanged, so re-running this
# cell is harmless; a plain .map(dict) would turn every 0/1 into NaN.
df['category'] = df['category'].map(lambda label: label_codes.get(label, label))
# Fail loudly on anything that is neither a known label nor an encoded value
assert df['category'].isin([0, 1]).all(), "unexpected value in 'category' column"

In [14]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk

nltk.download('stopwords')

# Built once: the stopword set and the stemmer are the same for every message,
# so recreating them inside the function only repeats identical work per row.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


# Cleaning Text function
def cleaning_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete digit runs, replace every non-word
    character with a space, split on whitespace, drop English stopwords,
    and Porter-stem the remaining tokens.

    Args:
        text: Raw message as a ``str``.

    Returns:
        The cleaned message; an empty string when no token survives.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # digits are deleted, not replaced by a space
    text = re.sub(r'\W', ' ', text)  # every non-word character becomes a separator
    # split() with no argument already collapses whitespace runs and ignores
    # leading/trailing blanks, so no separate squeeze/strip pass is needed.
    tokens = [
        _STEMMER.stem(word)
        for word in text.split()
        if word not in _STOP_WORDS
    ]
    return ' '.join(tokens)

# Replace each raw message with its cleaned form
df['text'] = [cleaning_text(raw_message) for raw_message in df['text']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
# Vectorize the text data
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['category']

In [16]:
# Split the dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Train the SVM model
from sklearn.svm import SVC

# Same configuration as a bare SVC(); the library defaults are spelled out
# so the reader can see which kernel and regularisation are in use.
model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(X_train, y_train)

## Evaluation

In [18]:
# Predict labels (0 = ham, 1 = spam) for the held-out test split
y_pred = model.predict(X_test)

In [19]:
# Calculate accuracy: share of test messages whose predicted label matches the true one
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.97678916827853


In [20]:
# Mean accuracy as reported by the estimator itself, printed under its heading
print("Model Score", model.score(X_test, y_test), sep="\n")

Model Score
0.97678916827853


In [21]:
from sklearn.metrics import classification_report

# Overall accuracy followed by per-class precision / recall / F1
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("SVM Accuracy:", accuracy)
report = classification_report(y_test, y_pred, zero_division=1)
print(report)

SVM Accuracy: 0.97678916827853
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       889
           1       1.00      0.83      0.91       145

    accuracy                           0.98      1034
   macro avg       0.99      0.92      0.95      1034
weighted avg       0.98      0.98      0.98      1034



## Prediction

In [24]:
# Prediction function for SVM
def predict_spam_svm(message):
    """Classify a single raw message with the trained SVM.

    The message is passed through ``cleaning_text`` — the same normalisation
    applied to the training corpus — before TF-IDF vectorisation. Lowercasing
    alone skips stopword removal and stemming, so unstemmed words would not
    match the stemmed vocabulary the vectorizer was fitted on.

    Args:
        message: Raw message text.

    Returns:
        ``'spam'`` if the model predicts class 1, otherwise ``'ham'``.
    """
    cleaned_message = cleaning_text(message)
    vectorized_message = vectorizer.transform([cleaned_message])
    prediction = model.predict(vectorized_message)
    return 'spam' if prediction[0] == 1 else 'ham'

# Smoke test: run the classifier on one hand-written promotional message
message = (
    "Congratulations! You've won a free ticket to the Bahamas. Call now!"
)
svm_label = predict_spam_svm(message)
print("SVM Prediction:", svm_label)

SVM Prediction: ham
