In [None]:
# A leading '!' is IPython/Jupyter syntax (not plain Python) that runs the rest
# of the line as a shell command. `ls` is a Unix shell command, so this cell
# presumably expects a Unix-like shell.
print("List all the files in the current directory\n")
!ls
# The required data table can be found under smsspamcollection/SMSSpamCollection
print("\n List all the files inside the smsspamcollection directory\n")
!ls smsspamcollection

In [None]:
import pandas as pd

# Location of the tab-separated SMS data set, relative to the notebook
sms_path = 'smsspamcollection/SMSSpamCollection'

# The file is read with header=None, so no row is treated as column names;
# descriptive names are assigned right after loading.
df = pd.read_table(sms_path, sep='\t', header=None)
df.columns = ['label', 'sms_message']

# Show the first 5 rows
df.head()

#### Convert ham and spam labels to binary variables 0 and 1

In [None]:
'''
Encode ham and spam labels to 0 and 1
'''
# ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on already-encoded labels would wipe the column.
# Only encode while raw string labels are still present.
if df['label'].isin(list(label_codes)).any():
    df['label'] = df['label'].map(label_codes)

# A bare `df.shape` in the middle of a cell is never displayed; print it instead
print('Shape of the dataset: {}'.format(df.shape))
df.head()

#### Bag of Words from Scratch

In [None]:
'''
Lower-case every document
'''
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

# str.lower() returns a new string, so the original documents stay untouched
lower_case_documents = [document.lower() for document in documents]
print(lower_case_documents)

In [None]:
'''
Strip punctuation from every lower-cased document
'''
import string

# One translation table that deletes every character in string.punctuation
punctuation_remover = str.maketrans('', '', string.punctuation)

sans_punctuation_documents = [document.translate(punctuation_remover)
                              for document in lower_case_documents]
print(sans_punctuation_documents)

In [None]:
'''
Tokenize
'''
# split() with no argument splits on any run of whitespace. split(" ") would
# emit empty-string tokens wherever two spaces meet (for example where a
# stand-alone punctuation mark was removed) and would not split on tabs or
# newlines.
preprocessed_documents = [document.split() for document in sans_punctuation_documents]
print(preprocessed_documents)

In [None]:
'''
Count Vectorizer: count how often each word occurs in each document
'''
import pprint
from collections import Counter

# One Counter (word -> number of occurrences) per tokenized document
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)

#### Bag of Words using scikit-learn

In [None]:
'''Solution'''
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer with no arguments, i.e. with the library's default settings
count_vector = CountVectorizer()

In [None]:
'''
Practice note:
Print the 'count_vector' object, which is an instance of 'CountVectorizer'
'''
print(count_vector)

In [None]:
'''
The get_feature_names_out() method returns our feature names for this dataset,
which is the set of words that make up our vocabulary for 'documents'.
'''
count_vector.fit(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# tolist() is used to keep displaying a plain Python list as before.
count_vector.get_feature_names_out().tolist()

In [None]:
'''
Build the count matrix for the documents
'''
# transform() produces the counts; toarray() converts them into the array displayed below
doc_term_counts = count_vector.transform(documents)
doc_array = doc_term_counts.toarray()
doc_array

In [None]:
'''
Convert matrix to dataframe with columns being features
'''
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and supplies one column label per feature.
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names_out())
frequency_matrix

#### Creating Train and Test Datasets

In [None]:
'''Split the messages and their labels into training and testing sets'''

from sklearn.model_selection import train_test_split

# Features are the raw message texts, targets are the encoded labels
messages = df['sms_message']
labels = df['label']

# random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(messages, labels, random_state=1)

# Report how many rows ended up in each set
total_rows = df.shape[0]
training_rows = X_train.shape[0]
testing_rows = X_test.shape[0]
print('Number of rows in the total set: {}'.format(total_rows))
print('Number of rows in the training set: {}'.format(training_rows))
print('Number of rows in the test set: {}'.format(testing_rows))

In [None]:
'''
Instantiate CountVectorizer and fit/transform for Training and Testing Data
'''
# Create a fresh CountVectorizer instance (a class, not a method). This rebinds
# 'count_vector', which earlier cells fitted on the toy 'documents' list.
count_vector = CountVectorizer()

# Fit the vectorizer on the training messages and return their count matrix
training_data = count_vector.fit_transform(X_train)

# Only transform the test messages: the vectorizer is deliberately not fitted on
# the test data.
testing_data = count_vector.transform(X_test)

#### Naive Bayes through Scikit-Learn

In [None]:
'''Fit Naive Bayes on Training Data'''
from sklearn.naive_bayes import MultinomialNB
# Create the classifier with no arguments, i.e. with the library's default settings
naive_bayes = MultinomialNB()
# Train on the vectorized training messages and their labels; as the last
# expression of the cell, the result of fit() is also displayed.
naive_bayes.fit(training_data, y_train)

In [None]:
'''Predict on Testing Data'''
# Predict with the fitted classifier on the vectorized test messages
predictions = naive_bayes.predict(testing_data)

#### Evaluating the Model

In [None]:
'''Model Evaluation: compare the predictions with the true test labels'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# (caption, scoring function) pairs, reported in this order
scorers = (('Accuracy score: ', accuracy_score),
           ('Precision score: ', precision_score),
           ('Recall score: ', recall_score),
           ('F1 score: ', f1_score))

# Every scorer is called with the true labels first and the predictions second
for caption, scorer in scorers:
    print(caption, format(scorer(y_test, predictions)))