# Spam Ham Classifier

In [1]:
import nltk
import pandas as pd

In [2]:
# Read the tab-separated corpus; the two column names are supplied explicitly via `names`.
# NOTE(review): with pandas' default quote handling, messages containing stray double
# quotes may get merged across lines — confirm the row count against the raw file.
spam_collection = pd.read_csv(
    'SpamCollection',
    names=["response", "message"],
    sep="\t",
)

In [3]:
spam_collection.head(3)

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
# Features: a one-column DataFrame holding the raw message text
# (the double brackets keep it a DataFrame rather than a Series).
X = spam_collection[['message']]
# Labels: the 'response' column as a Series.
y = spam_collection['response']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=34,
)

# Data Extraction and Cleaning

### First we work through extraction and cleaning on a single message, then wrap the same steps in a function and apply it to all the messages

In [7]:
msg = X_train.iloc[11,0]

In [8]:
msg

"Haha mayb u're rite... U know me well. Da feeling of being liked by someone is gd lor. U faster go find one then all gals in our group attached liao."

In [9]:
import string

In [10]:
# Keep only the characters of the sample message that are not in string.punctuation.
char_list = [ch for ch in msg if ch not in string.punctuation]
char_list

['H',
 'a',
 'h',
 'a',
 ' ',
 'm',
 'a',
 'y',
 'b',
 ' ',
 'u',
 'r',
 'e',
 ' ',
 'r',
 'i',
 't',
 'e',
 ' ',
 'U',
 ' ',
 'k',
 'n',
 'o',
 'w',
 ' ',
 'm',
 'e',
 ' ',
 'w',
 'e',
 'l',
 'l',
 ' ',
 'D',
 'a',
 ' ',
 'f',
 'e',
 'e',
 'l',
 'i',
 'n',
 'g',
 ' ',
 'o',
 'f',
 ' ',
 'b',
 'e',
 'i',
 'n',
 'g',
 ' ',
 'l',
 'i',
 'k',
 'e',
 'd',
 ' ',
 'b',
 'y',
 ' ',
 's',
 'o',
 'm',
 'e',
 'o',
 'n',
 'e',
 ' ',
 'i',
 's',
 ' ',
 'g',
 'd',
 ' ',
 'l',
 'o',
 'r',
 ' ',
 'U',
 ' ',
 'f',
 'a',
 's',
 't',
 'e',
 'r',
 ' ',
 'g',
 'o',
 ' ',
 'f',
 'i',
 'n',
 'd',
 ' ',
 'o',
 'n',
 'e',
 ' ',
 't',
 'h',
 'e',
 'n',
 ' ',
 'a',
 'l',
 'l',
 ' ',
 'g',
 'a',
 'l',
 's',
 ' ',
 'i',
 'n',
 ' ',
 'o',
 'u',
 'r',
 ' ',
 'g',
 'r',
 'o',
 'u',
 'p',
 ' ',
 'a',
 't',
 't',
 'a',
 'c',
 'h',
 'e',
 'd',
 ' ',
 'l',
 'i',
 'a',
 'o']

In [11]:
message = "".join(char_list)
message

'Haha mayb ure rite U know me well Da feeling of being liked by someone is gd lor U faster go find one then all gals in our group attached liao'

### Tokenizing the messages

In [12]:
from nltk import word_tokenize

In [13]:
token_list = word_tokenize(message)
token_list

['Haha',
 'mayb',
 'ure',
 'rite',
 'U',
 'know',
 'me',
 'well',
 'Da',
 'feeling',
 'of',
 'being',
 'liked',
 'by',
 'someone',
 'is',
 'gd',
 'lor',
 'U',
 'faster',
 'go',
 'find',
 'one',
 'then',
 'all',
 'gals',
 'in',
 'our',
 'group',
 'attached',
 'liao']

### Removing the stopwords

In [14]:
from nltk.corpus import stopwords 

In [15]:
# The stop-word list does not change between tokens: fetch it once and keep it
# in a set so each membership test is a hash lookup rather than a list scan.
english_stopwords = set(stopwords.words('english'))
# Lower-case each token once, then keep it only if it is not a stop word.
lowered_tokens = (word.lower() for word in token_list)
word_list = [word for word in lowered_tokens if word not in english_stopwords]
word_list

['haha',
 'mayb',
 'ure',
 'rite',
 'u',
 'know',
 'well',
 'da',
 'feeling',
 'liked',
 'someone',
 'gd',
 'lor',
 'u',
 'faster',
 'go',
 'find',
 'one',
 'gals',
 'group',
 'attached',
 'liao']

### Defining a function to apply the above process on all the messages

In [16]:
def cleanup_text(msg):
    """Turn a raw message string into a list of lower-cased, stop-word-free tokens.

    Steps: drop every character found in ``string.punctuation``, tokenize the
    remaining text with ``word_tokenize``, lower-case each token, and discard
    tokens that appear in NLTK's English stop-word list.

    Parameters
    ----------
    msg : str
        Raw message text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates are kept).
    """
    # The stop-word list is loop-invariant: fetch it once per call and hold it
    # in a set so each membership test is a hash lookup, not a list scan.
    english_stopwords = set(stopwords.words('english'))
    no_punct = "".join(char for char in msg if char not in string.punctuation)
    # Lower-case each token a single time before filtering.
    lowered_tokens = (token.lower() for token in word_tokenize(no_punct))
    return [token for token in lowered_tokens if token not in english_stopwords]

In [17]:
X_train['message'].head(10).apply(cleanup_text)   # example of output using the function

386                             [took, mr, owl, 3, licks]
5231    [realise, busy, guy, im, trying, bother, get, ...
1512    [oops, sorry, check, dont, mind, picking, tomo...
1332                      [good, morning, plz, call, sir]
3840    [erm, …, ill, pick, 645pm, thatll, give, enoug...
3114                        [wat, time, liao, still, got]
310                      [garage, keys, arent, bookshelf]
1113                          [means, still, think, teju]
4457    [want, mapquest, something, look, usf, dogwood...
2471    [eat, old, airport, road, 630, oredi, got, lot...
Name: message, dtype: object

In [18]:
X_train.head(3)

Unnamed: 0,message
386,It took Mr owl 3 licks
5231,I realise you are a busy guy and i'm trying no...
1512,Oops sorry. Just to check that you don't mind ...


# Feature extraction and data preprocessing

## Extracting features using Count Vectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
vec = CountVectorizer(analyzer=cleanup_text)

In [21]:
vec.fit(X_train.message)

CountVectorizer(analyzer=<function cleanup_text at 0x1a1dfc3048>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [22]:
sp_matrix = vec.transform(X_train.message)

In [23]:
# Dense copy of the sparse count matrix, used by the following cells to inspect
# individual rows. This materialises every zero entry, so keep using sp_matrix
# itself for the modelling steps.
arr = sp_matrix.toarray()

In [24]:
# Column indices of the non-zero counts in row 3 of the dense matrix.
arr[3].nonzero()[0].tolist()

[1619, 3227, 4717, 5432, 6373]

In [25]:
arr.shape

(4179, 8082)

## Weighting terms by relevance using TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
tr = TfidfTransformer()

In [28]:
tr.fit(sp_matrix)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [29]:
fitted = tr.transform(sp_matrix)

In [30]:
fitted

<4179x8082 sparse matrix of type '<class 'numpy.float64'>'
	with 37729 stored elements in Compressed Sparse Row format>

## Building the MultinomialNB model

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
model = MultinomialNB()

In [33]:
# Give the TF-IDF weighted training matrix a descriptive name.
X_train_transformed = fitted

# Fit the Naive Bayes classifier on the weighted features and the training labels.
model.fit(X_train_transformed, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
t1 = vec.transform(X_test.message)

In [35]:
t2 = tr.transform(t1)

In [36]:
t2

<1393x8082 sparse matrix of type '<class 'numpy.float64'>'
	with 10932 stored elements in Compressed Sparse Row format>

In [37]:
X_test_transformed = t2

In [38]:
# Predict labels for the held-out messages (X_test_transformed is the same object as t2).
model.predict(X_test_transformed)

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [39]:
# Score the classifier on the held-out split (X_test_transformed is the same object as t2).
model.score(X_test_transformed, y_test)

0.9576453697056713

## Model accuracy on the held-out test set is about 96%