***GOOGLE API AUTHORIZATION***<br>
via OAuth 2.0

In [None]:
# Attach Google Drive to this Colab runtime so later cells can read the
# OAuth client-secret file and write the saved dataset.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Install correct versions so that OAuth Flow has run_console() method
!pip install 'google-api-python-client==1.7.2'
!pip install 'google-auth==1.8.0'
!pip install 'google-auth-httplib2==0.0.3'
!pip install 'google-auth-oauthlib==0.4.1'
!pip install nltk==3.5

In [None]:
# Authorize read-only access to the Gmail account whose emails will be scraped
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# OAuth client-secret JSON (stored on the mounted Drive) and the requested scope
credentials = '/content/drive/MyDrive/client_secret.apps.googleusercontent.com.json'
scope = ['https://www.googleapis.com/auth/gmail.readonly']

# Build the installed-app flow from the secret file, then run the console
# flow to obtain the credentials object used to build the Gmail service.
# NOTE(review): run_console() appears to rely on the out-of-band (copy/paste)
# OAuth flow, which Google has been phasing out -- confirm it still works for
# this OAuth client.
flow = InstalledAppFlow.from_client_secrets_file(
    client_secrets_file=credentials,
    scopes=scope,
)
creds = flow.run_console()

***CLEANING AND FORMATTING EMAILS***

In [None]:
import random

# Gmail API client built from the credentials obtained in the OAuth step
service = build('gmail', 'v1', credentials=creds)

# Up to 100 message references with no label filter: the non-SPAM sample.
# NOTE(review): messages.list is documented to leave out SPAM/TRASH unless
# includeSpamTrash is set -- confirm, since the labels below depend on it.
inbox_request = service.users().messages().list(userId='me', maxResults=100)
messages = inbox_request.execute().get('messages', [])

# Up to 50 message references that carry the SPAM label
spam_request = service.users().messages().list(userId='me', labelIds=['SPAM'], maxResults=50)
spam_messages = spam_request.execute().get('messages', [])

# Merge both samples into one list and shuffle it in place
messages = [*messages, *spam_messages]
random.shuffle(messages)

In [None]:
#breaking down emails
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#download necessary nltk dependencies (stop-word list and "punkt" tokenizer data)
nltk.download("stopwords")
nltk.download("punkt")
stop_words = set(stopwords.words("english"))

#initialize all variables
# x          : one list of word IDs per message
# y          : one label per message (1 = SPAM, 0 = not SPAM)
# word_token : word -> integer ID, assigned in order of first appearance
# itr        : next unused word ID (the vocabulary size once the loop ends)
x = []
y = []
word_token = {}
itr = 0

#iterate over each message
for message in messages:

  #fetch the message; only its short 'snippet' text is used as content
  msg = service.users().messages().get(userId='me', id=message['id']).execute()
  text = msg['snippet']
  # NOTE(review): Gmail snippets may contain HTML entities (e.g. "&amp;"),
  # which the filter below would reduce to stray tokens such as "amp" --
  # confirm, and consider html.unescape(text) before normalizing.

  #normalize the text (keep letters and spaces only, lowercased) then tokenize
  text = "".join([i.lower() for i in text if (i.isalpha() or i == " ")])
  words = word_tokenize(text)

  #convert the tokenized message into a list of word IDs
  w = []
  for word in words:
    #accept words only if they are longer than 2 characters and not a stopword
    if word not in stop_words and len(word) > 2:
      #assign the next free ID the first time a word is seen
      if word not in word_token:
        word_token[word] = itr
        itr += 1
      w.append(word_token[word])
  x.append(w)

  #append a value indicating if a message is SPAM or not
  y.append(1 if 'SPAM' in msg.get('labelIds', []) else 0)

#creating bag-of-words: one row per message, one column per word ID, each cell
#holding how many times that word occurs in the message. Preallocating the
#matrix avoids the deprecated np.row_stack and also works for zero messages.
bag_of_words = np.zeros((len(x), itr))
for row, tokens in enumerate(x):
  for token in tokens:
    bag_of_words[row, token] += 1




***SAVE FORMATTED DATA TO .npy FILE***

In [None]:
import os

# Turn the label list into a column vector and append it as the last column
# of the bag-of-words matrix, so features and labels are stored together.
new_y = np.asarray(y).reshape(-1, 1)
all_data = np.hstack((bag_of_words, new_y))

# np.save does not create missing folders, so make sure the target exists.
save_path = '/content/drive/MyDrive/CptS_437/email_data.npy'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
np.save(save_path, all_data)

***CLASSIFIER WITH RUNTIME VARIABLES***

In [None]:
import random

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# initialize variables for training and testing
n_evaluations = 100                               # random splits averaged per split size
evaluation_runs = [10,20,30,40,50,60,70,80,90]    # % of the data used for training
split_rng = random.Random(437)                    # fixed seed so the curves are reproducible
train_acc=[]
test_acc=[]

# For every training percentage, average the accuracy over n_evaluations random splits
for train_pct in evaluation_runs:
  # Derive the test fraction from the integer percentage. Repeatedly
  # subtracting .1 from a float drifts (e.g. 0.7000000000000001), and since
  # train_test_split rounds the test count up, the drift can move a sample
  # from the training set to the test set.
  testsize = (100 - train_pct) / 100
  train_accuracies = []
  test_accuracies = []
  for _ in range(n_evaluations):
    X_train, X_test, y_train, y_test = train_test_split(
        bag_of_words, y, test_size=testsize, random_state=split_rng.randint(0, 1000))

    # Train Naive Bayes classifier
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Predict on test data
    y_pred = clf.predict(X_test)

    # Store accuracies for this split
    train_accuracies.append(clf.score(X_train, y_train))
    test_accuracies.append(accuracy_score(y_test, y_pred))

  # Mean over the splits actually run (not a hard-coded 100)
  train_acc.append(sum(train_accuracies) / n_evaluations)
  test_acc.append(sum(test_accuracies) / n_evaluations)

# Plotting the mean training and testing accuracies against the training percentage
plt.plot(evaluation_runs, train_acc, label='Training Accuracy', marker='o')
plt.plot(evaluation_runs, test_acc, label='Testing Accuracy', marker='o')

plt.xlabel('% of Training data (remainder is testing)')
plt.ylabel('Accuracy')
plt.title('Training and Testing Accuracies')
plt.ylim(.5, 1)

plt.legend()
plt.show()

***RUN CLASSIFIER WITH SAVED .npy FILE***

In [None]:
# Imports are repeated here so this section can run on a fresh kernel using
# only the saved .npy file, without re-running the Gmail scraping cells.
import random

import numpy as np
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Get .npy file from Drive
drive.mount('/content/drive', force_remount=True)
all_data = np.load('/content/drive/MyDrive/CptS_437/email_data.npy')
# Every column except the last is a bag-of-words feature; the last is the label
x = all_data[:,:-1]
new_y = all_data[:, -1].astype(int)

# initialize variables for training and testing
n_evaluations = 100                               # random splits averaged per split size
evaluation_runs = [10,20,30,40,50,60,70,80,90]    # % of the data used for training
split_rng = random.Random(437)                    # fixed seed so the curves are reproducible
train_acc=[]
test_acc=[]

# For every training percentage, average the accuracy over n_evaluations random splits
for train_pct in evaluation_runs:
  # Derive the test fraction from the integer percentage. Repeatedly
  # subtracting .1 from a float drifts (e.g. 0.7000000000000001), and since
  # train_test_split rounds the test count up, the drift can move a sample
  # from the training set to the test set.
  testsize = (100 - train_pct) / 100
  train_accuracies = []
  test_accuracies = []
  for _ in range(n_evaluations):
    X_train, X_test, y_train, y_test = train_test_split(
        x, new_y, test_size=testsize, random_state=split_rng.randint(0, 1000))

    # Train Naive Bayes classifier
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Predict on test data
    y_pred = clf.predict(X_test)

    # Store accuracies for this split
    train_accuracies.append(clf.score(X_train, y_train))
    test_accuracies.append(accuracy_score(y_test, y_pred))

  # Mean over the splits actually run (not a hard-coded 100)
  train_acc.append(sum(train_accuracies) / n_evaluations)
  test_acc.append(sum(test_accuracies) / n_evaluations)

# Plotting the mean training and testing accuracies against the training percentage
plt.plot(evaluation_runs, train_acc, label='Training Accuracy', marker='o')
plt.plot(evaluation_runs, test_acc, label='Testing Accuracy', marker='o')

plt.xlabel('% of Training data (remainder is testing)')
plt.ylabel('Accuracy')
plt.title('Training and Testing Accuracies')
plt.ylim(.5, 1)

plt.legend()
plt.show()