In [2]:
# Importing the libraries for data analysis, data preparation & model building

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

try:
    # Current location of train_test_split (scikit-learn >= 0.18)
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy scikit-learn only: the cross_validation module was removed in 0.20
    from sklearn.cross_validation import train_test_split



In [29]:
# Read the tab-separated SMS corpus into a DataFrame.
# Column names are supplied explicitly: 'spam' holds the label, 'txt' the message text.
df = pd.read_csv(
    "../Data-Science/SMSSpamCollection",
    sep='\t',
    names=['spam', 'txt'],
)

In [30]:
# Preview the first rows of the freshly loaded frame (labels are still the raw strings here)
df.head()

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
# Summarize the frame: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
spam    5572 non-null object
txt     5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [32]:
# Encode the "spam" label column as integers: ham -> 0, spam -> 1.
# An explicit mapping replaces pd.get_dummies(df.spam)['spam'] because:
#   * recent pandas returns boolean dummy columns by default rather than integers, and
#   * re-running the cell on the already-encoded column raised KeyError('spam').
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to execute more than once.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = df['spam'].map(label_codes)

# Fail loudly on any label outside the mapping instead of silently producing NaN targets.
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected label(s) in 'spam' column: %r"
        % sorted(set(df.loc[unmapped, 'spam'].astype(str)))
    )

df['spam'] = encoded_labels.astype('int64')

In [67]:
# Inspect the first 10 rows to confirm the label column now holds 0/1 values
df.head(10)

Unnamed: 0,spam,txt
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [60]:
# Inspect the last 10 rows as a second spot check of the encoded labels
df.tail(10)

Unnamed: 0,spam,txt
5562,0,Ok lor... Sony ericsson salesman... I ask shuh...
5563,0,Ard 6 like dat lor.
5564,0,Why don't you wait 'til at least wednesday to ...
5565,0,Huh y lei...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...
5571,0,Rofl. Its true to its name


In [48]:
# Class balance: number of ham (0) and spam (1) messages
df['spam'].value_counts()

0    4825
1     747
Name: spam, dtype: int64

In [49]:
# Target vector: the encoded spam/ham label column
y = df['spam']

In [50]:
# Instantiating TF-IDF vectorizer
# The NLTK stop-word corpus must be present locally; fetch it once if the lookup fails
# so that the notebook survives a fresh environment / Restart & Run All.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

# `stopset` is kept as a set for de-duplication and fast membership checks.
stopset = set(english_stopwords)

# Current scikit-learn validates `stop_words` as 'english', a list, or None, and rejects a set.
# A sorted list is passed so the argument is both accepted and deterministic.
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

In [51]:
# Learn the vocabulary and IDF weights from the message text and build the TF-IDF feature matrix
X = vectorizer.fit_transform(df['txt'])

In [52]:
# Feature matrix dimensions: (number of messages, vocabulary size)
X.shape

(5572, 8587)

In [53]:
# Target length; should match the number of rows in X
y.shape

(5572,)

In [54]:
# Hold out a test partition using the library's default split proportions;
# the fixed seed keeps the partition reproducible across runs.
# NOTE(review): the TF-IDF vectorizer was fitted on all messages before this split,
# so the IDF weights also reflect the held-out messages - consider fitting on the
# training partition only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [55]:
# Train a multinomial Naive Bayes classifier on the training partition.
# The fit call stays as the last expression so the fitted estimator is echoed as cell output.
nbc = naive_bayes.MultinomialNB()
nbc.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
# ROC AUC on the held-out partition, scored with the predicted probability of class 1 (spam)
spam_probabilities = nbc.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, spam_probabilities)

0.98589322144123448

In [66]:
# Checking whether a new SMS is spam or ham using our NB model.
# The message must pass through the already-fitted vectorizer (transform, not fit_transform)
# so it is projected onto the same vocabulary the classifier was trained on.
check_array = np.array(["win free cash by clicking the below link"])
check_array_vector = vectorizer.transform(check_array)
predicted_label = nbc.predict(check_array_vector)
print("This SMS is a spam: ", predicted_label)

This SMS is a spam:  [1]
