In [115]:
import pprint
import re
from collections import Counter
from string import punctuation

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB

try:
    # Home of train_test_split since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module (removed in 0.20).
    from sklearn.cross_validation import train_test_split


In [116]:
# Default location of the tab-separated SMS file. This is a machine-specific
# absolute path; pass `path` to get_data() to load the file from elsewhere.
DEFAULT_SMS_PATH = "/Users/darshandoshi/Desktop/git-repositiories/nanodegree_datascience/bayes theorem/spam classifier/smsspamcollection/SMSSpamCollection.txt"


def get_data(path=DEFAULT_SMS_PATH):
    """Load the tab-separated SMS file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to ``DEFAULT_SMS_PATH`` so existing
        zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the two tab-separated fields
        named ``'label'`` and ``'sms_message'``.
    """
    return pd.read_table(path, sep="\t", names=['label', 'sms_message'])
    

In [117]:
def preview_data(df):
    """Print the leading rows of ``df`` (whatever ``df.head()`` returns) to stdout."""
    leading_rows = df.head()
    print(leading_rows)

In [118]:
def data_processing(df):
    """Return a copy of ``df`` whose ``'label'`` column is encoded numerically.

    ``'ham'`` becomes 0 and ``'spam'`` becomes 1; any other value maps to NaN
    (the behaviour of ``Series.map`` for keys missing from the mapping).

    The input frame is left untouched, so calling this function again on the
    same raw frame yields the same result instead of turning already-encoded
    labels into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'label'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and the encoded ``'label'`` column.
    """
    label_codes = {'ham': 0, 'spam': 1}
    # assign() builds a new frame rather than writing into the caller's object.
    return df.assign(label=df['label'].map(label_codes))

In [119]:
def train_test(data, test_size=None, random_state=1):
    """Split the messages and labels into training and testing subsets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``'sms_message'`` and ``'label'`` columns.
    test_size : float, int or None, optional
        Forwarded to ``train_test_split``. ``None`` (the default) leaves the
        choice to scikit-learn, exactly as when the argument is omitted.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 1, the value this
        notebook has always used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data['sms_message'],
        data['label'],
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [120]:
def bag_of_words(X_train,X_test):
    """Turn the train/test inputs into matrices with a single CountVectorizer.

    The vectorizer is fitted on ``X_train`` only; ``X_test`` is merely
    transformed with that already-fitted vectorizer.

    Returns
    -------
    tuple
        ``(training_matrix, testing_matrix)``.
    """
    vectorizer = CountVectorizer()

    # fit_transform both fits the vectorizer on the training input and encodes it.
    train_matrix = vectorizer.fit_transform(X_train)

    # The test input goes through transform() only; it is never used for fitting.
    test_matrix = vectorizer.transform(X_test)

    return train_matrix, test_matrix

In [121]:
def model(train_data,y_train):
    """Create a ``MultinomialNB`` classifier, fit it on the given data and return it."""
    # fit() returns the estimator itself, so the fitted classifier can be returned directly.
    return MultinomialNB().fit(train_data, y_train)

In [122]:
def pred(model,test_data):
    """Return whatever ``model.predict`` produces for ``test_data``."""
    return model.predict(test_data)

In [123]:
def evaluate(y_test,y_pred):
    """Print accuracy, precision, recall and F1 for ``y_pred`` against ``y_test``.

    One line is printed per metric, in that order; nothing is returned.
    """
    reports = (
        ('Accuracy score: ', accuracy_score),
        ('Precision score: ', precision_score),
        ('Recall score: ', recall_score),
        ('F1 score: ', f1_score),
    )
    for heading, metric in reports:
        print(heading, format(metric(y_test, y_pred)))

In [124]:
def main():
    """Run the full pipeline: load, preview, encode, split, vectorize, fit, predict, evaluate."""
    messages = get_data()
    preview_data(messages)
    messages = data_processing(messages)

    X_train, X_test, y_train, y_test = train_test(messages)
    train_matrix, test_matrix = bag_of_words(X_train, X_test)

    classifier = model(train_matrix, y_train)
    predictions = pred(classifier, test_matrix)
    evaluate(y_test, predictions)


In [125]:
# Entry point: run the pipeline when this code executes as the top-level module.
if __name__ == "__main__":
    main()


  label                                        sms_message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


In [60]:
# Toy corpus used to walk through a bag-of-words implementation by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1: lower-case every document.
lower_case_documents = [text.lower() for text in documents]
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [68]:
# Step 2: strip punctuation from every lower-cased document.
sans_punctuation_documents = []

# Raw string so that "\," stays a literal backslash + comma without triggering
# an invalid-escape warning. NOTE: this hand-written set is not used below;
# the stripping relies on `punctuation` imported from the `string` module.
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

# The deletion table depends only on `punctuation`, so build it once instead
# of once per document.
s = str.maketrans("", "", punctuation)

for i in lower_case_documents:
    sans_punctuation_documents.append(i.translate(s))

print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [55]:
# Quick check of the punctuation-stripping approach on a single sentence.
# Uses `punctuation` directly: the visible import cell only does
# `from string import punctuation`, so `string.punctuation` would raise
# NameError on a fresh kernel.
tr = str.maketrans("", "", punctuation)
s = "hello, call hello you tomorrow?"
s.translate(tr)


'hello call hello you tomorrow'

In [69]:
# Step 3: tokenise each punctuation-free document on whitespace.
preprocessed_documents = [text.split() for text in sans_punctuation_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [70]:
# Step 4: count word occurrences per document, one Counter per token list.
# (`pprint` and `Counter` come from the notebook's top import cell.)
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)

[Counter({'how': 1, 'you': 1, 'hello': 1, 'are': 1}),
 Counter({'win': 2, 'money': 1, 'home': 1, 'from': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]
