In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


In [9]:
datasets = pd.read_csv('spam1.csv') 
print(datasets)
datasets = datasets.rename(columns={"v1":"label", "v2":"text"})
datasets['label'] = datasets['label'].replace({'spam':1, 'ham':0})
datasets

       v1                                                 v2
0    spam  Free entry in 2 a wkly comp to win FA Cup fina...
1    spam  FreeMsg Hey there darling it's been 3 week's n...
2    spam  WINNER!! As a valued network customer you have...
3    spam  Had your mobile 11 months or more? U R entitle...
4    spam  SIX chances to win CASH! From 100 to 20,000 po...
..    ...                                                ...
508  spam  This is the 2nd time we have tried 2 contact u...
509   ham              Will �_ b going to esplanade fr home?
510   ham  Pity, * was in mood for that. So...any other s...
511   ham  The guy did some bitching but I acted like i'd...
512   ham                         Rofl. Its true to its name

[513 rows x 2 columns]


Unnamed: 0,label,text
0,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...
2,1,WINNER!! As a valued network customer you have...
3,1,Had your mobile 11 months or more? U R entitle...
4,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
508,1,This is the 2nd time we have tried 2 contact u...
509,0,Will �_ b going to esplanade fr home?
510,0,"Pity, * was in mood for that. So...any other s..."
511,0,The guy did some bitching but I acted like i'd...


In [10]:
datasets.describe()

Unnamed: 0,label
count,513.0
mean,0.335283
std,0.47255
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [11]:
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   513 non-null    int64 
 1   text    513 non-null    object
dtypes: int64(1), object(1)
memory usage: 8.1+ KB


In [12]:
datasets['label'].value_counts() # 1-> spam, 0->ham

0    341
1    172
Name: label, dtype: int64

##Analysis

To analyze the text data, we have to turn the words into numerical numbers. 
We have multiple choices to accomplish this step: 

1) Binary Term Frequency :  count presence(1) or absence(0) for term in document

2) Bag of Words Frequency:  captures the frequency of term in document

3) Term Frequency: 

4) TFIDF :

in this way, if a term appears frequently in a document, it’s important; if a term appears in many documents, it’s not a unique identifier.

Word2Vec.

In [13]:
text = datasets.iloc[:,1]
print(text)

# converting to lower case
lower_case_text = []
lower_case_text = [d.lower() for d in text]
print(lower_case_text)

# removing punctuations
sans_punctuation_text = []
import string
for i in lower_case_text:
    # print(i)
    # print("**********************************")
    sans_punctuation_text.append(i.translate(str.maketrans("","", string.punctuation)))

# tokenization
preprocessed_text = [[w for w in d.split()] for d in sans_punctuation_text]
print(preprocessed_text)


# count frequencies
frequency_list = []
import pprint
from collections import Counter

frequency_list = [Counter(d) for d in preprocessed_text]
pprint.pprint(frequency_list)

0      Free entry in 2 a wkly comp to win FA Cup fina...
1      FreeMsg Hey there darling it's been 3 week's n...
2      WINNER!! As a valued network customer you have...
3      Had your mobile 11 months or more? U R entitle...
4      SIX chances to win CASH! From 100 to 20,000 po...
                             ...                        
508    This is the 2nd time we have tried 2 contact u...
509                Will �_ b going to esplanade fr home?
510    Pity, * was in mood for that. So...any other s...
511    The guy did some bitching but I acted like i'd...
512                           Rofl. Its true to its name
Name: text, Length: 513, dtype: object
["free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's", "freemsg hey there darling it's been 3 week's now and no word back! i'd like some fun you up for it still? tb ok! xxx std chgs to send, �1.50 to rcv", 'winner!! as a valued n

          '21m': 1,
          'from': 1,
          'aberdeen': 1,
          'united': 1,
          'kingdom': 1,
          'check': 1,
          'him': 1,
          'out': 1,
          'httpimg': 1,
          'acwicmb3cktz8r74': 1,
          'no': 1,
          'dates': 1,
          'send': 1,
          'hide': 1}),
 Counter({'and': 2,
          'themob': 1,
          'check': 1,
          'out': 1,
          'our': 1,
          'newest': 1,
          'selection': 1,
          'of': 1,
          'content': 1,
          'games': 1,
          'tones': 1,
          'gossip': 1,
          'babes': 1,
          'sport': 1,
          'keep': 1,
          'your': 1,
          'mobile': 1,
          'fit': 1,
          'funky': 1,
          'text': 1,
          'wap': 1,
          'to': 1,
          '82468': 1}),
 Counter({'think': 1,
          'ur': 1,
          'smart': 1,
          'win': 1,
          '�200': 1,
          'this': 1,
          'week': 1,
          'in': 1,
          'our': 1,

          'send': 1,
          'stop': 1,
          '08452810071': 1,
          '16': 1}),
 Counter({'i': 3,
          'im': 2,
          'sue': 2,
          'text': 2,
          'hi': 1,
          'am': 1,
          '20': 1,
          'years': 1,
          'old': 1,
          'and': 1,
          'work': 1,
          'as': 1,
          'a': 1,
          'lapdancer': 1,
          'love': 1,
          'sex': 1,
          'me': 1,
          'live': 1,
          'my': 1,
          'bedroom': 1,
          'now': 1,
          'to': 1,
          '89555': 1,
          'by': 1,
          'textoperator': 1,
          'g2': 1,
          '1da': 1,
          '150ppmsg': 1,
          '18': 1}),
 Counter({'forwarded': 1,
          'from': 1,
          '448712404000please': 1,
          'call': 1,
          '08712404000': 1,
          'immediately': 1,
          'as': 1,
          'there': 1,
          'is': 1,
          'an': 1,
          'urgent': 1,
          'message': 1,
          'waiting': 1,
 

          '10k': 1,
          '5k': 1,
          '�500': 1,
          'cash': 1,
          'or': 1,
          '�100': 1,
          'travel': 1,
          'voucher': 1,
          'call': 1,
          'now': 1,
          '09064011000': 1,
          'ntt': 1,
          'po': 1,
          'box': 1,
          'cr01327bt': 1,
          'fixedline': 1,
          'cost': 1,
          '150ppm': 1,
          'mobile': 1,
          'vary': 1}),
 Counter({'sunshine': 1,
          'quiz': 1,
          'wkly': 1,
          'q': 1,
          'win': 1,
          'a': 1,
          'top': 1,
          'sony': 1,
          'dvd': 1,
          'player': 1,
          'if': 1,
          'u': 1,
          'know': 1,
          'which': 1,
          'country': 1,
          'liverpool': 1,
          'played': 1,
          'in': 1,
          'mid': 1,
          'week': 1,
          'txt': 1,
          'ansr': 1,
          'to': 1,
          '82277': 1,
          '�150': 1,
          'sptyrone': 1}),
 Counter({'w

          'tat': 1,
          'vidnot': 1,
          'finishd': 1}),
 Counter({'k': 1,
          'im': 1,
          'leaving': 1,
          'soon': 1,
          'be': 1,
          'there': 1,
          'a': 1,
          'little': 1,
          'after': 1,
          '9': 1}),
 Counter({'yeah': 1,
          'work': 1,
          'is': 1,
          'fine': 1,
          'started': 1,
          'last': 1,
          'week': 1,
          'all': 1,
          'the': 1,
          'same': 1,
          'stuff': 1,
          'as': 1,
          'before': 1,
          'dull': 1,
          'but': 1,
          'easy': 1,
          'and': 1,
          'guys': 1,
          'are': 1,
          'fun': 1}),
 Counter({'you': 2,
          'do': 1,
          'your': 1,
          'studies': 1,
          'alone': 1,
          'without': 1,
          'anyones': 1,
          'help': 1,
          'if': 1,
          'cant': 1,
          'no': 1,
          'need': 1,
          'to': 1,
          'study': 1}),
 Counter(

          'lik': 1,
          'you': 1,
          'r': 1,
          'sending': 1,
          'me': 1}),
 Counter({'i': 1,
          'think': 1,
          'steyn': 1,
          'surely': 1,
          'get': 1,
          'one': 1,
          'wicket': 1}),
 Counter({'neither': 1,
          'in': 1,
          'sterm': 1,
          'voice': 1,
          'im': 1,
          'studying': 1,
          'all': 1,
          'fine': 1,
          'with': 1,
          'me': 1,
          'not': 1,
          'sure': 1,
          'the': 1,
          'thing': 1,
          'will': 1,
          'be': 1,
          'resolved': 1,
          'tho': 1,
          'anyway': 1,
          'have': 1,
          'a': 1,
          'fab': 1,
          'hols': 1}),
 Counter({'garbage': 1,
          'bags': 1,
          'eggs': 1,
          'jam': 1,
          'bread': 1,
          'hannaford': 1,
          'wheat': 1,
          'chex': 1}),
 Counter({'no': 1,
          'its': 1,
          'not': 1,
          'pride': 1,
  

 Counter({'otherwise': 1,
          'had': 1,
          'part': 1,
          'time': 1,
          'job': 1,
          'natuition': 1}),
 Counter({'oh': 1,
          'yeah': 1,
          'and': 1,
          'my': 1,
          'diet': 1,
          'just': 1,
          'flew': 1,
          'out': 1,
          'the': 1,
          'window': 1}),
 Counter({'santa': 2,
          'your': 2,
          'call': 2,
          'calling': 1,
          'would': 1,
          'little': 1,
          'ones': 1,
          'like': 1,
          'a': 1,
          'from': 1,
          'xmas': 1,
          'eve': 1,
          '09058094583': 1,
          'to': 1,
          'book': 1,
          'time': 1}),
 Counter({'you': 1, 'didnt': 1, 'complete': 1, 'your': 1, 'gist': 1, 'oh': 1}),
 Counter({'er': 1,
          'yeah': 1,
          'i': 1,
          'will': 1,
          'b': 1,
          'there': 1,
          'at': 1,
          '1526': 1,
          'sorry': 1,
          'just': 1,
          'tell': 1,
        

          'unsubscribe': 1,
          'with': 1,
          'stop': 1,
          'no': 1,
          'extra': 1,
          'charge': 1,
          'help': 1,
          '08702840625comuk': 1,
          '220cm2': 1,
          '9ae': 1}),
 Counter({'why': 1,
          'did': 1,
          'i': 1,
          'wake': 1,
          'up': 1,
          'on': 1,
          'my': 1,
          'own': 1,
          'gt': 1}),
 Counter({'now': 1,
          'get': 1,
          'step': 1,
          '2': 1,
          'outta': 1,
          'the': 1,
          'way': 1,
          'congrats': 1,
          'again': 1}),
 Counter({'love': 1, 'has': 1, 'one': 1, 'law': 1}),
 Counter({'private': 1,
          'your': 1,
          '2003': 1,
          'account': 1,
          'statement': 1,
          'for': 1,
          '07808247860': 1,
          'shows': 1,
          '800': 1,
          'unredeemed': 1,
          's': 1,
          'i': 1,
          'm': 1,
          'points': 1,
          'call': 1,
          '08719

#Next we use CountVectorizer:

More Details and example at:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

count_vector.fit(text)
count_vector.get_feature_names()

doc_array = count_vector.transform(text).toarray()

frequency_matrix = pd.DataFrame(doc_array, columns = count_vector.get_feature_names())
frequency_matrix

Unnamed: 0,00,000,0121,02,0207,02073162414,021,03,04,050703,...,yourinclusive,yours,yourself,yoville,yr,yrs,yuo,yup,zed,zouk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
511,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
from sklearn.model_selection import train_test_split

#split data set into train and test sets
train_x, test_x, train_y, test_y = train_test_split(datasets['text'], datasets['label'], test_size = 0.2, random_state = 127) # roll no 127 and 80%-20% split

In [16]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(train_x)

# Transform testing data and return the matrix. 
testing_data = count_vector.transform(test_x)

**Naive Bayes**

In [17]:
from sklearn.naive_bayes import MultinomialNB #it gives more promising result and data is repeating
naive_bayes = MultinomialNB()
final_model = naive_bayes.fit(training_data,train_y)

In [18]:
#making prediction
prediction = naive_bayes.predict(testing_data)
print("prediction:",prediction)

prediction: [0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1
 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0
 1 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]


In [19]:
print('Accuracy score: {}'.format(accuracy_score(test_y, prediction)))
print("Confusion Matrix: \n{}".format(confusion_matrix(test_y, prediction)))
print('Precision score: {}'.format(precision_score(test_y, prediction)))
print('Recall score: {}'.format(recall_score(test_y, prediction)))

Accuracy score: 0.970873786407767
Confusion Matrix: 
[[63  0]
 [ 3 37]]
Precision score: 1.0
Recall score: 0.925


**Decision Tree**

In [20]:
from sklearn.tree import DecisionTreeClassifier

#Create a Decision Tree Classifier (using Gini)
DT = DecisionTreeClassifier(max_leaf_nodes = 20,criterion = "gini")

#Train the model using the training sets
final_model = DT.fit(training_data, train_y)

In [21]:
prediction = DT.predict(testing_data)
print("prediction:",prediction)

prediction: [0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1
 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0
 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]


In [22]:
print('Accuracy score: {}'.format(accuracy_score(test_y, prediction)))
print("Confusion Matrix: \n{}".format(confusion_matrix(test_y, prediction)))
print('Precision score: {}'.format(precision_score(test_y, prediction)))
print('Recall score: {}'.format(recall_score(test_y, prediction)))

Accuracy score: 0.9029126213592233
Confusion Matrix: 
[[61  2]
 [ 8 32]]
Precision score: 0.9411764705882353
Recall score: 0.8


**Optional Exercise:**
Try this on full spam.csv file and bigram matching instead of unigram matching 