# Spam Mail

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Colab-only step: open the browser upload widget, then remember the name of
# the first uploaded file so it can be read from disk afterwards.
from google.colab import files

uploaded = files.upload()
# Iterating the returned mapping yields the uploaded file names.
filename = list(uploaded)[0]

Saving spam.csv to spam.csv


In [3]:
# Parse the uploaded comma-separated file; latin1 decoding is requested
# explicitly instead of relying on the default encoding.
spam = pd.read_csv(
    filename,
    sep=',',
    encoding='latin1',
)
# Preview the first three rows of the raw frame.
spam.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [4]:
# Summarise the raw frame: per-column non-null counts, dtypes and memory usage.
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
# Keep only the label column (v1) and the message text column (v2).
# The explicit .copy() makes df an independent frame, so adding columns to it
# later is unambiguous and does not trigger pandas' SettingWithCopyWarning.
df = spam[['v1', 'v2']].copy()
df.head(3)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [6]:
# (rows, columns) of the two-column working frame.
df.shape

(5572, 2)

## Preprocessing

In [7]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

v1    0
v2    0
dtype: int64

In [8]:
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Strip every character that is not an ASCII letter or a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
df['clean_v2'] = df['v2'].str.replace('[^A-Za-z ]', '', regex=True)
df.head(3)

Unnamed: 0,v1,v2,clean_v2
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_v2'],
    df['v1'],
    test_size=0.2,
    stratify=df['v1'],
    random_state=2021,
)
# Show the shape of each of the four parts.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4457,), (1115,), (4457,), (1115,))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Count Vectorizer

In [12]:
# Bag-of-words features: unigram counts with English stop words removed.
# The vocabulary is learned from the training messages only and then reused
# unchanged for the test messages.
cvect = CountVectorizer(stop_words='english', ngram_range=(1, 1))
X_train_cv = cvect.fit_transform(X_train)
X_test_cv = cvect.transform(X_test)
# Both matrices must share the same number of columns (vocabulary size).
tuple(matrix.shape for matrix in (X_train_cv, X_test_cv))

((4457, 7260), (1115, 7260))

In [13]:
# Logistic regression trained on the count features; max_iter is set to 500.
lr1 = LogisticRegression(max_iter = 500)
# %time reports the CPU and wall-clock time taken by the fit.
%time lr1.fit(X_train_cv, y_train)

CPU times: user 81.1 ms, sys: 826 µs, total: 82 ms
Wall time: 88.2 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Predict labels for the held-out messages with the count-feature model.
pred1 = lr1.predict(X_test_cv)

In [15]:
# Fraction of held-out messages whose predicted label matches the true label
# (count-feature model).
score1 = accuracy_score(y_true=y_test, y_pred=pred1)
score1

0.9739910313901345

## TF-IDF

In [16]:
# TF-IDF features: unigram weights with English stop words removed.
# Vocabulary and idf weights are learned from the training messages only and
# then reused unchanged for the test messages.
tvect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
X_train_tv = tvect.fit_transform(X_train)
X_test_tv = tvect.transform(X_test)
# Both matrices must share the same number of columns (vocabulary size).
tuple(matrix.shape for matrix in (X_train_tv, X_test_tv))

((4457, 7260), (1115, 7260))

In [17]:
# Logistic regression trained on the TF-IDF features; max_iter is set to 500.
lr2 = LogisticRegression(max_iter = 500)
# %time reports the CPU and wall-clock time taken by the fit.
%time lr2.fit(X_train_tv, y_train)

CPU times: user 78.4 ms, sys: 473 µs, total: 78.9 ms
Wall time: 80 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Predict labels for the held-out messages with the TF-IDF model.
pred2 = lr2.predict(X_test_tv)

In [19]:
# Fraction of held-out messages whose predicted label matches the true label
# (TF-IDF model).
score2 = accuracy_score(y_true=y_test, y_pred=pred2)
score2

0.9605381165919282