13 March 2024
# <center>Lab 9 Assignment - CS 4315<center>
<center>Doug Andrade</center>

#### 1. Load the SMS file into a pandas dataframe.

In [1]:
#Import Python modules
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Import CountVectorizer for conversion of text to a token count matrix
from sklearn.feature_extraction.text import CountVectorizer
# Import word_tokenize for dividing strings to list of substrings
from nltk import word_tokenize
# Import WordNetLemmatizer for reducing words to base form
from nltk.stem import WordNetLemmatizer 
# Import search() to search for specific regular expressions
from re import search

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [2]:
# Read-in the csv file as a Pandas data frame
spam_df = pd.read_csv(filepath_or_buffer = 'SMSSpamCollection.csv', 
                      sep = '\t',
                      header = 0)

spam_df.head(3)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
# Create a target binary labeled dataframe (1 = spam)
spam_binary = spam_df.copy()
spam_binary['Label'] = spam_binary['Label'].replace(to_replace = {'ham': 0, 'spam': 1})

spam_binary.head(3)

Unnamed: 0,Label,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


#### 3. Using lemmatization to create count vectors for the train and test SMS messages

In [4]:
print('Note: I elected to count vector using lemmatization prior to splitting into training/test set because I found that if I did this step afterward there would be a mis-match of total keys. This would prevent predicting on the test set since the prediction test input set size would not match the training set input size used to fit the model.')

Note: I elected to count vector using lemmatization prior to splitting into training/test set because I found that if I did this step afterward there would be a mis-match of total keys. This would prevent predicting on the test set since the prediction test input set size would not match the training set input size used to fit the model.


In [5]:
# Create a custom tokenizer with lemmatization
class LemmaTokenizer:
    def __init__(self):
        # Initialize the word reduction function
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        # Regular expressions filter for numeric characters and short words
        regex_num_punctuation = '(\d+)|([^\w\s])'
        regex_little_words = r'(\b\w{1,2}\b)'
        # Tokenize and lemmatize tokens not in the regular expression filter
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)
                if not search(pattern = regex_num_punctuation, string = t) and not 
                search(pattern = regex_little_words, string = t)]

In [6]:
# Initialize the text to token matrix function with lemmatization
text2vec_lemma = CountVectorizer(tokenizer = LemmaTokenizer(),
                                 stop_words = ['english', 'ha', 'le', 'wa'],
                                 lowercase = True)

# Apply the text vectorizer and lemmatization to the data frame's "SMS" column
text2vec_lemma.fit(spam_binary['SMS'])

# Get key (word) form the .vocabulary_ dictionary of key-value pairs
keys = list(text2vec_lemma.vocabulary_.keys())
# Sort the list of the keys alphabetically
keys.sort()

# Transforms the fitted count matrix to vector of total count of each token
vecs_lemma_train = text2vec_lemma.transform(spam_binary['SMS'])

# Create a new DataFrame with count vectors and concatenate it with spam_df
vecs_train = pd.DataFrame(vecs_lemma_train.toarray(),
                          columns = keys)

# Combine the new tokenized vector data frame with the original
spam_binary = pd.concat([spam_binary, vecs_train],
                        axis = 1)

spam_binary.drop(labels = ['SMS'], axis = 1, inplace = True)

spam_binary.head()



Unnamed: 0,Label,____,aah,aaniye,aaooooright,aathi,abbey,abdomen,abeg,abel,...,zed,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 2. Split the SMS data using a 90/10 training/test split.

In [7]:
spam_train, spam_test = train_test_split(spam_binary,
                                         train_size = 0.9,
                                         random_state = 42)

print('The training set shape is: %s \nThe test set shape is: %s' % (str(spam_train.shape), str(spam_test.shape)))

The training set shape is: (5014, 6624) 
The test set shape is: (558, 6624)


#### 4. Fit a SPAM Multinomial Naïve Bayes classification model using the train data.

In [8]:
mnb = MultinomialNB()
mnb.fit(spam_train[spam_train.columns[1:]],
        spam_train[spam_train.columns[0]])

#### 5. Predict SPAM or not using the test SMS data.

In [9]:
mnb_preds = mnb.predict(spam_test[spam_test.columns[1:]])
mnb_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,

#### 6. Create a classification report for the test SPAM predictions.

In [10]:
print(classification_report(spam_test[spam_test.columns[0]].to_numpy(), 
                            mnb_preds))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       485
           1       0.86      0.90      0.88        73

    accuracy                           0.97       558
   macro avg       0.92      0.94      0.93       558
weighted avg       0.97      0.97      0.97       558



#### 7. Load the armada.csv data, drop the 'Battle' column, convert 'Spanish Involvement' to binary, and replace 'Defeat' with 0, 'Draw' with 1, and 'Victory' with 2 in the 'Portuguese Outcome' column to create labels.

In [11]:
# Read-in the csv file as a Pandas dataframe
armada_df = pd.read_csv(filepath_or_buffer = 'armada.csv')

armada_df.drop(labels = ['Battle'], axis = 1, inplace = True)

# Make "Spanish Invovlement binary"
armada_df['Spanish Involvement'].replace(to_replace = ['No','Yes'], value = [0., 1.], inplace = True)

# Replace 'Defeat' with 0, 'Draw' with 1, and 'Victory' with 2
armada_df['Portuguese Outcome'].replace(to_replace = ['Defeat', 'Draw', 'Victory'], value = [0., 1., 2.], inplace = True)

armada_df.head(3)

Unnamed: 0,Year,Portuguese Ships,Dutch Ships,English Ships,Ratio of Portuguese Ships to Dutch/British Ships,Spanish Involvement,Portuguese Outcome
0,1601,6,3,0,2.0,0.0,1.0
1,1606,14,11,0,1.273,0.0,1.0
2,1606,6,9,0,0.667,0.0,0.0


#### 8. Split the armada.csv data using an 80/20 train/test split.

In [12]:
armada_train, armada_test = train_test_split(armada_df,
                                             train_size = 0.8,
                                             random_state = 42)

In [13]:
armada_train.head(3)

Unnamed: 0,Year,Portuguese Ships,Dutch Ships,English Ships,Ratio of Portuguese Ships to Dutch/British Ships,Spanish Involvement,Portuguese Outcome
17,1625,35,20,0,1.75,1.0,2.0
22,1639,51,11,0,4.636,1.0,1.0
11,1637,6,7,0,0.857,0.0,1.0


In [14]:
armada_test.head(3)

Unnamed: 0,Year,Portuguese Ships,Dutch Ships,English Ships,Ratio of Portuguese Ships to Dutch/British Ships,Spanish Involvement,Portuguese Outcome
9,1625,8,4,4,1.0,0.0,1.0
25,1645,6,7,0,0.857,1.0,0.0
8,1622,4,4,2,0.667,0.0,0.0


#### 9. Z-Score all non-label columns of the train and test dataframes.

In [15]:
armada_train_z = (armada_train[armada_train.columns[:-1]] - armada_train[armada_train.columns[:-1]].mean(axis = 0)) / armada_train[armada_train.columns[:-1]].std(axis = 0)
armada_train_z.head(3)

Unnamed: 0,Year,Portuguese Ships,Dutch Ships,English Ships,Ratio of Portuguese Ships to Dutch/British Ships,Spanish Involvement
17,-0.192882,1.133715,0.179608,-0.300537,0.523451,0.977008
22,0.568754,2.053226,-0.183273,-0.300537,3.401687,0.977008
11,0.459949,-0.532898,-0.344553,-0.300537,-0.367146,-0.977008


In [16]:
armada_test_z = (armada_test[armada_train.columns[:-1]] - armada_train[armada_train.columns[:-1]].mean(axis = 0)) / armada_train[armada_train.columns[:-1]].std(axis = 0)
armada_test_z.head(3)

Unnamed: 0,Year,Portuguese Ships,Dutch Ships,English Ships,Ratio of Portuguese Ships to Dutch/British Ships,Spanish Involvement
9,-0.192882,-0.41796,-0.465513,0.300537,-0.224531,-0.977008
25,0.89517,-0.532898,-0.344553,-0.300537,-0.367146,0.977008
8,-0.35609,-0.647837,-0.465513,0.0,-0.556635,-0.977008


#### 10. Predict 'Portuguese Outcome' for the test data using Gaussian Naïve Bayes.

In [17]:
gnb = GaussianNB()

gnb.fit(armada_train_z,
        armada_train[['Portuguese Outcome']].to_numpy().flatten())

In [18]:
gnb_preds = gnb.predict(armada_test_z)
gnb_preds

array([1., 2., 1., 2., 2., 2.])

#### 11. Create a classification report for the test 'Portuguese Outcome' predictions.

In [19]:
gnb.predict_proba(armada_test_z)

array([[1.39345180e-01, 8.60654820e-01, 0.00000000e+00],
       [3.87374020e-06, 2.59549808e-04, 9.99736576e-01],
       [4.44583320e-02, 9.55541668e-01, 0.00000000e+00],
       [3.07550519e-05, 8.54239551e-04, 9.99115005e-01],
       [1.75568917e-06, 1.10429804e-04, 9.99887815e-01],
       [4.98077639e-06, 4.94487477e-04, 9.99500532e-01]])

In [20]:
print(classification_report(armada_test[armada_test.columns[-1]].to_numpy(),
                            gnb_preds))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         2
         1.0       0.50      0.33      0.40         3
         2.0       0.25      1.00      0.40         1

    accuracy                           0.33         6
   macro avg       0.25      0.44      0.27         6
weighted avg       0.29      0.33      0.27         6



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
