In [19]:
# General

import numpy as np
import pandas as pd
import re
import os
import pickle

# EDA

import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from collections import Counter
import nltk
# Tokenizer models needed by nltk.word_tokenize / nltk.sent_tokenize below.
# Newer NLTK releases load the 'punkt_tab' resource rather than the pickled
# 'punkt' models, so both are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/colinhong/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [63]:
# Read the raw dataset from the working directory, decoding the file as latin-1,
# and preview the first rows.
data = pd.read_csv(
    "spam.csv",
    encoding='latin-1',
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [64]:
# Keep only the first two columns (label and message text) and give them
# readable names. The trailing "Unnamed" columns are empty in the preview above.
#
# A positional slice plus a copy replaces the in-place drop of positions 2-4,
# so this cell can be re-run safely: the old form raised on a second execution
# because those positions no longer pointed at the unwanted columns.
data = data.iloc[:, :2].copy()
data.columns = ['class', 'message']
data.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
def _word_count(text):
    """Return the number of NLTK word tokens in `text`."""
    return len(nltk.word_tokenize(text))


def _sentence_count(text):
    """Return the number of NLTK sentence tokens in `text`."""
    return len(nltk.sent_tokenize(text))


# Size features per message: raw character length, token count, sentence count.
data['no_of_chars'] = data['message'].map(len)
data['no_of_words'] = data['message'].map(_word_count)
data['no_of_sentences'] = data['message'].map(_sentence_count)

# Numeric target: 0 for ham, 1 for spam.
data['Spam'] = data['class'].map({'ham': 0, 'spam': 1})

data.head()

Unnamed: 0,class,message,no_of_chars,no_of_words,no_of_sentences,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",111,24,2,0
1,ham,Ok lar... Joking wif u oni...,29,8,2,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,1
3,ham,U dun say so early hor... U c already then say...,49,13,1,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,15,1,0


## Count Plot

In [26]:
# Tally the messages in each class, most frequent class first.
temp = pd.DataFrame({
    'Class': ['Ham', 'Spam'],
    'Count': [int((data['class'] == label).sum()) for label in ('ham', 'spam')],
})
temp = temp.sort_values(by='Count', ascending=False)

# One bar per class, with the count printed above each bar.
fig = px.bar(
    temp,
    x='Class',
    y='Count',
    color="Class",
    text_auto='',
    width=600,
    color_discrete_sequence=["teal", "gray"],
    template='plotly_dark',
    title="Count Plot",
)

# Hide grid lines and keep the value labels horizontal and outside the bars.
(
    fig
    .update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
    .show()
)

## Exploring Number of Characters per Message

In [41]:
# Overlaid histograms of message length in characters, one trace per class.
# The bars are drawn on top of each other, so they are made semi-transparent;
# with fully opaque bars the class drawn last hides the other wherever they overlap.
fig = px.histogram(data, x = 'no_of_chars', color = 'class',
                   template = 'plotly_dark',
                   color_discrete_sequence = ["teal", "gray"],
                   title = "Histogram for Ham vs Spam based on Number of Characters per Message",
                   barmode = 'overlay',
                   opacity = 0.6,
                   width = 1000)

fig.update_xaxes(showgrid = False)
fig.update_yaxes(showgrid = False)
fig.show()

In [39]:
# Strip plot of message length in characters, one strip per class.
fig = px.strip(
    data,
    x='no_of_chars',
    color='class',
    template='plotly_dark',
    color_discrete_sequence=["teal", "gray"],
    title="Strip plot for Ham vs Spam based on Number of Characters per Message",
    width=1000,
)

# Drop the grid lines on both axes before rendering.
fig.update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

## Exploring Number of Words per Message

In [43]:
# Overlaid histograms of message length in word tokens, one trace per class.
# The bars are drawn on top of each other, so they are made semi-transparent;
# with fully opaque bars the class drawn last hides the other wherever they overlap.
fig = px.histogram(data, x = 'no_of_words', color = 'class',
                   template = 'plotly_dark',
                   color_discrete_sequence = ["teal", "gray"],
                   title = "Histogram for Ham vs Spam based on Number of Words per Message",
                   barmode = 'overlay',
                   opacity = 0.6,
                   width = 1000)

fig.update_xaxes(showgrid = False)
fig.update_yaxes(showgrid = False)
fig.show()

In [44]:
# Strip plot of message length in word tokens, one strip per class.
fig = px.strip(
    data,
    x='no_of_words',
    color='class',
    template='plotly_dark',
    color_discrete_sequence=["teal", "gray"],
    title="Strip plot for Ham vs Spam based on Number of Words per Message",
    width=1000,
)

# Drop the grid lines on both axes before rendering.
fig.update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

## Exploring Number of Sentences per Message

In [31]:
# Overlaid histograms of the number of sentences per message, one trace per class.
# The bars are drawn on top of each other, so they are made semi-transparent;
# with fully opaque bars the class drawn last hides the other wherever they overlap.
# The width is set to 1000 to match the other distribution plots in this notebook.
fig = px.histogram(data, x = 'no_of_sentences',
                    color = 'class', template = 'plotly_dark',
                    color_discrete_sequence = ["teal", "gray"],
                    title = "Histogram for Ham vs Spam based on Number of Sentences per Message",
                    barmode = 'overlay',
                    opacity = 0.6,
                    width = 1000)
fig.update_xaxes(showgrid = False)
fig.update_yaxes(showgrid = False)
fig.show()

In [38]:
# Strip plot of the number of sentences per message, one strip per class.
fig = px.strip(
    data,
    x='no_of_sentences',
    color='class',
    template='plotly_dark',
    color_discrete_sequence=["teal", "gray"],
    title="Strip plot for Ham vs Spam based on Number of Sentences per Message",
    width=1000,
)

# Drop the grid lines on both axes before rendering.
fig.update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

## Correlation Matrix

In [70]:
# Pull out the three numeric size features that feed the correlation matrix.
corrData = data.loc[:, ["no_of_chars", "no_of_words", "no_of_sentences"]]
corrData.head()

Unnamed: 0,no_of_chars,no_of_words,no_of_sentences
0,111,24,2
1,29,8,2
2,155,37,2
3,49,13,1
4,61,15,1


In [74]:

# Heatmap of the pairwise correlations between the size features,
# with each coefficient printed inside its cell.
fig = px.imshow(
    corrData.corr(),
    text_auto=True,
    color_continuous_scale='blues',
    template='plotly_dark',
    title="Correlation Matrix",
)
fig.show()