# SMS Spam Detection

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Load Data into pandas dataframe
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
sms_data = pd.read_csv(url, sep='\t', header=None, names=["label", "message"])

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [5]:
# NLTK stopwords filter for commonly uninformative words (e.g. 'the')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lmh30\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def clean_text(text):
    # filter out non-letters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()

    # Tokenize before filtering
    text = text.split()

    # Remove stopwords and apply stemming so I only have root form
    text = [ps.stem(word) for word in text if not word in stop_words]

    # Return cleaned text as one string with space separators
    text = ' '.join(text)    
    return text

In [7]:
# Preprocess each message in df
sms_data['message'] = sms_data['message'].apply(clean_text)

In [8]:
sms_data.head()

Unnamed: 0,label,message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


In [9]:
# TF-IDF Vectorizer init
tfidf_vectorizer = TfidfVectorizer()

# Fitting and transforming to create features
X = tfidf_vectorizer.fit_transform(sms_data['message'])

# Labeling
y = sms_data['label'].map({'ham': 0, 'spam': 1})

In [10]:
# Test train split 75/25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Logistic Regression b/c this is binary classification
model = LogisticRegression()

# Train!
model.fit(X_train, y_train)

In [11]:
# Evaluate with test set
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9655419956927495
Confusion Matrix:
 [[1206    2]
 [  46  139]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1208
           1       0.99      0.75      0.85       185

    accuracy                           0.97      1393
   macro avg       0.97      0.87      0.92      1393
weighted avg       0.97      0.97      0.96      1393

