# Email Spam detection EDA
Prepared By Deepa Francis<br>
For BrainStation<br>
On July 20, 2023

In [1]:
# import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

# display all columns in dataframe
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# loading the data
raw_data = pd.read_csv('raw_data.csv')

In [3]:
# Summary of raw data
print(f"Number of Rows: {raw_data.shape[0]}")
print(f"Number of Columns: {raw_data.shape[1]}")
print(f"Number of missing values: {raw_data.isna().sum().sum()}")
print(f"Number of duplicated rows: {raw_data.duplicated().sum()}")

Number of Rows: 75419
Number of Columns: 5
Number of missing values: 2856
Number of duplicated rows: 1512


In [4]:
# Quickly check the data types
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75419 entries, 0 to 75418
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       75419 non-null  int64 
 1   subject     74626 non-null  object
 2   email_to    74843 non-null  object
 3   email_from  75419 non-null  object
 4   message     73932 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.9+ MB


**Missing values**

In [10]:
clean_data = raw_data.copy()
# Check column wise count of missing values
clean_data.isna().sum()

label            0
subject        793
email_to       576
email_from       0
message       1487
dtype: int64

In [11]:
# Calculate the percentage of missing values
percentage_missing = (clean_data.isna().sum() / len(clean_data)) * 100
print(np.round(percentage_missing,2))

label         0.00
subject       1.05
email_to      0.76
email_from    0.00
message       1.97
dtype: float64


In [12]:
# Check column wise count of missing values
clean_data.isna().sum().sum()

2856

In [13]:
# Calculate the percentage of missing values
total_percentage_missing = (clean_data.isna().sum().sum() / len(clean_data)) * 100
print(np.round(total_percentage_missing,2))

3.79


In [14]:
# Delete missing value rows 
clean_data.dropna(inplace = True, axis=0)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72798 entries, 0 to 75418
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       72798 non-null  int64 
 1   subject     72798 non-null  object
 2   email_to    72798 non-null  object
 3   email_from  72798 non-null  object
 4   message     72798 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.3+ MB


**Duplicate Rows**

In [16]:
# Remaining Duplicated rows
clean_data.duplicated().sum()

1354

In [17]:
# Calculate the percentage of duplicate rows
percentage_duplicates = (clean_data.duplicated().sum() / len(clean_data
                                                            )) * 100
print("Percentage of duplicate rows:", np.round(percentage_duplicates,2))

Percentage of duplicate rows: 1.86


We decide to drop duplicate rows because these could be same message content, from same email, sent to same person at different times and we dont have the timestamp to get any additional information from it.

In [18]:
# Drop duplicated rows 
clean_data.drop_duplicates(inplace=True)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71444 entries, 0 to 75418
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       71444 non-null  int64 
 1   subject     71444 non-null  object
 2   email_to    71444 non-null  object
 3   email_from  71444 non-null  object
 4   message     71444 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.3+ MB


In [19]:
clean_data.shape

(71444, 5)

In [20]:
# Summary of clean data
print(f"Number of Rows: {clean_data.shape[0]}")
print(f"Number of Columns: {clean_data.shape[1]}")
print(f"Number of missing values: {clean_data.isna().sum().sum()}")
print(f"Number of duplicated rows: {clean_data.duplicated().sum()}")

Number of Rows: 71444
Number of Columns: 5
Number of missing values: 0
Number of duplicated rows: 0


In [21]:
clean_data.head()

Unnamed: 0,label,subject,email_to,email_from,message
0,1,"Generic Cialis, branded quality@",the00@speedy.uwaterloo.ca,"""Tomas Jacobs"" <RickyAmes@aol.com>",Content-Type: text/html;\nContent-Transfer-Enc...
1,0,Typo in /debian/README,debian-mirrors@lists.debian.org,Yan Morin <yan.morin@savoirfairelinux.com>,"Hi, i've just updated from the gulus and I che..."
2,1,authentic viagra,<the00@plg.uwaterloo.ca>,"""Sheila Crenshaw"" <7stocknews@tractionmarketin...","Content-Type: text/plain;\n\tcharset=""iso-8859..."
3,1,Nice talking with ya,opt4@speedy.uwaterloo.ca,"""Stormy Dempsey"" <vqucsmdfgvsg@ruraltek.com>","Hey Billy, \n\nit was really fun going out the..."
4,1,or trembling; stomach cramps; trouble in sleep...,ktwarwic@speedy.uwaterloo.ca,"""Christi T. Jernigan"" <dcube@totalink.net>",Content-Type: multipart/alternative;\n ...
