In [None]:
import random

import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics

In [None]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os.path
import pickle

def read_gmail():
    """
    Fetch the unread messages of the authenticated user's Gmail inbox and mark them read.

    Credentials are cached in ``token.pickle`` in the working directory. When that
    cache is missing or not valid, they are either refreshed or obtained through an
    interactive local-server login driven by ``credentials.json``, and then written
    back to ``token.pickle``.

    Returns a list holding the ``messages.get`` response for every message listed in
    the single ``messages.list`` call (labels INBOX + UNREAD). Further result pages,
    if the API reports any, are not requested.

    Side effect: the UNREAD label is removed from every returned message, so a later
    call will not return the same messages again.
    """
    SCOPES = ['https://www.googleapis.com/auth/gmail.modify']
    unread_messages = []
    creds = None

    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    # NOTE(security): pickle.load can execute arbitrary code from the file; this is
    # only acceptable while token.pickle is written exclusively by this function.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('gmail', 'v1', credentials=creds)
    messages_api = service.users().messages()

    # Call the Gmail API
    inbox_unreads = messages_api.list(userId='me', labelIds=['INBOX', 'UNREAD']).execute()

    # Iterate over the actual 'messages' list rather than trusting
    # 'resultSizeEstimate': the estimate is not an exact count, and the 'messages'
    # key may be missing from the response when nothing matches.
    for unread in inbox_unreads.get('messages', []):
        unread_messages.append(messages_api.get(userId='me', id=unread['id']).execute())
        messages_api.modify(userId='me', id=unread['id'], body={"removeLabelIds": ["UNREAD"]}).execute()

    return unread_messages

# Pull the currently unread inbox messages; read_gmail() also removes their UNREAD label.
new_gmail_messages = read_gmail()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from datetime import datetime
from dateutil import parser
import base64
import re

def purify_email_address(str_with_email):
    """
    Given a string that embeds an email address among other characters
    (for example a display name followed by the address in angle brackets)
    Returns the first email address found, and nothing else
    Raises IndexError when the string holds no email address
    """
    address_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    candidates = address_pattern.findall(str_with_email)
    # Only the first match is of interest; indexing raises if there was none.
    return candidates[0]

def separate_url_text(str_with_urls):
    """
    Given a string that may embed http:// or https:// urls,
    e.g. https://google.com or http://site.com?q=123
    Returns a 2-tuple: the set of distinct urls that were found, and the input
    string with every url cut out (surrounding whitespace is left untouched)
    """
    url_pattern = re.compile(r'(https?:\/\/(?:(?:[^\s()<>]))+)')
    # A url runs until the first whitespace, parenthesis or angle bracket.
    found = {hit for hit in url_pattern.findall(str_with_urls)}
    stripped = url_pattern.sub('', str_with_urls)
    return found, stripped

def remove_special_characters(str_with_special_chars):
    """
    Given a string containing any number of special characters
    Returns a string free of the punctuation/symbol characters listed in
    special_char_regex (whitespace, letters and digits are kept)
    """
    special_char_regex = r'[><+\|_\-=*#$\)(\?\&:.,!@%\^}{]'
    # str.replace works on literal text, not regular expressions: this removes the
    # two-character sequence backslash + "W" only. The raw string keeps that exact
    # behaviour while avoiding the invalid "\W" escape sequence the plain literal
    # contained (newer Python versions warn about such escapes).
    str_with_special_chars = str_with_special_chars.replace(r'\W', '')
    return re.sub(special_char_regex, '', str_with_special_chars)

def clean_stem_words(words):
    """
    Given an iterable of word strings
    Returns a list with the English stop words dropped and every remaining
    word reduced to its Porter stem, in the original order
    """
    # Build the lookup set once; the previous version rebuilt it for every word.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words if word not in stop_words]

def parse_gmail_headers_array(headers_array, msg_data):
    """
    Given the Gmail headers list
    [{name: "stuff", value: "value"}, {name: "other", value: "other"}]
    and the msg_data dict being built for the message, copies the headers of
    interest into msg_data (which is modified in place; nothing is returned):

      Subject    -> 'subject' (lower-cased) and 'subject-clean' (special
                    characters removed, stop words dropped, words stemmed)
      From       -> address appended to 'participants', 'participants-count' + 1
      Cc         -> first address of every ", "-separated entry appended to
                    'participants', 'participants-count' + number appended
      References -> 'thread-length' = number of space-separated references + 1
      Date       -> 'time-rec' as a POSIX timestamp

    msg_data must already contain 'participants' (list) and
    'participants-count' (int).
    """
    email_regex = r'[\w\.-]+@[\w\.-]+'

    for header in headers_array:
        name = header['name']
        if name == 'Subject':
            msg_data['subject'] = header['value'].lower()
            msg_data['subject-clean'] = " ".join(clean_stem_words(
                remove_special_characters(msg_data['subject']).split()
            ))
        elif name == 'From':
            frm = purify_email_address(header['value'])
            msg_data['participants'].append(frm)
            msg_data['participants-count'] += 1
        elif name == "Cc":
            cc_addresses = []
            for participant in header['value'].split(", "):
                participant_emails = re.findall(email_regex, participant)
                if participant_emails:
                    cc_addresses.append(participant_emails[0])
            msg_data['participants'].extend(cc_addresses)
            # Count only what this header contributed. Adding the length of the
            # whole participants list (as before) counted the sender twice and
            # the Cc addresses on top of it.
            msg_data['participants-count'] += len(cc_addresses)
        elif name == "References":
            msg_data["thread-length"] = (len(header['value'].split(" ")) + 1)
        elif name == "Date":
            msg_data["time-rec"] = parser.parse(header['value']).timestamp()

def parse_gmail_body(gmail_body_parts, msg_data):
    """
    Given a list of Gmail message parts and the msg_data dict being built,
    walks the parts (recursing into nested 'parts') and updates msg_data in place:

      text/plain part with inline data -> 'urls' (set of urls found) and 'words'
                                          (url-free, cleaned, stemmed body text)
      text/html part with inline data  -> ignored
      other part with inline data      -> reported as "unrecognized mimeType"
      part with an 'attachmentId'      -> 'attachments' + 1

    Nothing is returned. When several text/plain parts are present the last one
    processed wins, because 'urls' and 'words' are overwritten.
    """
    for part in gmail_body_parts:
        encoded = part['body'].get('data')
        if encoded:
            mime_type = part['mimeType']
            if mime_type == "text/html":
                continue
            if mime_type != "text/plain":
                print("unrecognized mimeType")
                continue

            # Decode only the part that is actually used. Decoding every part up
            # front made any inline non-text payload (or non-UTF-8 text) raise
            # UnicodeDecodeError before its mimeType was even looked at.
            # errors="replace" keeps a badly encoded plain-text body usable.
            decoded_bytes = base64.urlsafe_b64decode(encoded)
            decoded_str = decoded_bytes.decode("utf-8", errors="replace").lower()

            msg_data['urls'], decoded_str = separate_url_text(decoded_str)
            decoded_str = remove_special_characters(decoded_str)
            msg_data['words'] = " ".join(clean_stem_words(decoded_str.split()))
        else:
            if part['body'].get('attachmentId'):
                msg_data['attachments'] += 1
            if part.get('parts'):
                parse_gmail_body(part['parts'], msg_data)
                
def unpack_message(msg):
    """
    Given one Gmail API message resource (needs 'id' and 'payload' with 'headers')
    Returns a flat dict of features for that message:

      id, subject, subject-clean, words, time-proc (timestamp of this call),
      attachments, thread-length, participants-count, urls, participants
      (plus 'time-rec' when the message carries a Date header)

    Header-derived fields come from parse_gmail_headers_array, body-derived fields
    from parse_gmail_body.
    """
    msg_data = {
        "id": msg['id'],
        "subject": "",
        "subject-clean": "",
        "words": "",
        "time-proc": datetime.now().timestamp(),
        "attachments": 0,
        "thread-length": 0,
        "participants-count": 0,
        "urls": [],
        "participants": []
    }

    payload = msg['payload']
    parse_gmail_headers_array(payload['headers'], msg_data)

    if payload.get('parts'):
        parse_gmail_body(payload['parts'], msg_data)
    elif payload.get('body') and payload.get('mimeType'):
        # A payload without 'parts' (a non-multipart message) carries its content
        # directly in payload['body']; previously such messages were left with
        # empty 'words'/'urls'. The payload has the same shape as a part, so it is
        # handed over as a one-element part list.
        parse_gmail_body([payload], msg_data)

    return msg_data

# Flatten every fetched Gmail message into the feature dict built by unpack_message.
unpacked_messages = [unpack_message(raw_message) for raw_message in new_gmail_messages]

# Integer label code for each action name.
action_map = dict(ignore=0, read=1, respond=2)

def prototype_dataframe(messages):
    """
    Given a non-empty list of message dicts as produced by unpack_message
    Returns a DataFrame indexed by message id whose columns are the keys of the
    first message that are not in the dropped list below, followed by
    'link-count', 'text' and 'label'

    Every message dict is modified in place: 'text' (clean subject, body words
    and participants joined by spaces), 'link-count' (number of urls) and a
    placeholder 'label' are added to it.
    """
    dropped_keys = (
        "id",
        "subject",
        "words",
        "participants",
        "subject-clean",
        "time-rec",
        "time-proc",
        "urls",
    )
    feature_columns = []
    for key in messages[0].keys():
        if key not in dropped_keys:
            feature_columns.append(key)
    row_ids = [message["id"] for message in messages]

    for message in messages:
        message['text'] = " ".join(
            [message['subject-clean'], message['words'], " ".join(message['participants'])]
        )
        message['link-count'] = len(message['urls'])
        # Generate some random guesses for now... remove later
        if message['thread-length'] > 1:
            placeholder_label = random.randint(1, 2)
        elif message['attachments'] > 0:
            placeholder_label = action_map["respond"]
        else:
            placeholder_label = random.randint(0, 2)
        message['label'] = placeholder_label

    return pd.DataFrame(
        messages,
        columns=feature_columns + ["link-count", "text", "label"],
        index=row_ids,
    )

# Feature table for the unpacked messages, indexed by Gmail message id.
df = prototype_dataframe(unpacked_messages)


In [None]:
# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned from
# the training rows only. Fitting the vectorizer on the whole corpus before the split
# (as before) lets test-set statistics leak into the training features.
train_text, test_text, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.3, random_state=42, stratify=df['label']
)

tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))
train_messages = tfidf_vect.fit_transform(train_text)
test_messages = tfidf_vect.transform(test_text)

# Full-corpus matrix, kept under its original name; it is now encoded with the
# vocabulary fitted on the training rows.
X_ngrams = tfidf_vect.transform(df['text'])

In [None]:
mnb = MultinomialNB()
%time mnb.fit(train_messages, train_labels)
mnbpred = mnb.predict(test_messages)

print('Multinomial Naive Bayes F1 Score :', metrics.f1_score(test_labels, mnbpred, average='weighted'))
# cross-validation using confusion matrix
pd.DataFrame(
    metrics.confusion_matrix(test_labels, mnbpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)


In [None]:
abc = AdaBoostClassifier(mnb)
%time abc.fit(train_messages, train_labels)
abcpred = abc.predict(test_messages)

print('AdaBossted Naive Bayes F1 Score :', metrics.f1_score(test_labels, abcpred, average='weighted'))
# cross-validation using confusion matrix
pd.DataFrame(
    metrics.confusion_matrix(test_labels, abcpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)

In [None]:
rf = RandomForestClassifier(random_state=42, max_depth=4, bootstrap=False)
%time rf.fit(train_messages, train_labels)
rfpred = rf.predict(test_messages)

print('Random Forest F1 Score :', metrics.f1_score(test_labels, rfpred, average='weighted'))
# cross-validation using confusion matrix
pd.DataFrame(
    metrics.confusion_matrix(test_labels, rfpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)

In [None]:
abcrf = AdaBoostClassifier(rf)
%time abcrf.fit(train_messages, train_labels)
abcpred = abcrf.predict(test_messages)

print('AdaBossted Random Forest F1 Score :', metrics.f1_score(test_labels, abcpred, average='weighted'))
# cross-validation using confusion matrix
pd.DataFrame(
    metrics.confusion_matrix(test_labels, abcpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)

In [None]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
   
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, refit = True, verbose = 3, n_jobs=6) 
%time grid.fit(train_messages, train_labels)
pgpred = grid.predict(test_messages)
print('SVM F1 Score :', metrics.f1_score(test_labels, pgpred, average='weighted'))
print(grid.best_params_)
pd.DataFrame(
    metrics.confusion_matrix(test_labels, pgpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)

In [None]:
from sklearn import svm
svc = SVC()
%time svc.fit(train_messages, train_labels)
svcpred = svc.predict(test_messages)

print('SVM F1 Score :', metrics.f1_score(test_labels, svcpred, average='weighted'))
pd.DataFrame(
    metrics.confusion_matrix(test_labels, svcpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)

In [None]:
param_grid = {'C': [0.1, 1, 10, 100],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'gamma':['scale', 'auto'],
              'kernel': ['linear']}  
   
grid = GridSearchCV(SVC(random_state=42), param_grid=param_grid, cv= 5) 
%time grid.fit(train_messages, train_labels)
svcgpred = grid.predict(test_messages)
print('SVM F1 Score :', metrics.f1_score(test_labels, svcgpred, average='weighted'))
print(grid.best_params_)
pd.DataFrame(
    metrics.confusion_matrix(test_labels, svcgpred),
    index=[['actual', 'actual', 'actual'], list(action_map.keys())], 
    columns=[['predicted', 'predicted', 'predicted'], list(action_map.keys())]
)

In [None]:
# Action name -> integer label code (same mapping as the one defined in the data-preparation cell).
action_map = {"ignore": 0, "read": 1, "respond": 2}

def new_data_dataframe(messages):
    """
    Given a non-empty list of message dicts as produced by unpack_message
    Returns a DataFrame indexed by message id whose columns are the keys of the
    first message that are not in the dropped list below ('subject' is kept),
    followed by 'link-count', 'text' and 'label'

    Every message dict is modified in place: 'text' (clean subject, body words
    and participants joined by spaces), 'link-count' (number of urls) and
    'label' (always -1, i.e. not labelled yet) are added to it.
    """
    dropped_keys = (
        "id",
        "words",
        "participants",
        "subject-clean",
        "time-rec",
        "time-proc",
        "urls",
    )
    kept_columns = []
    for key in messages[0].keys():
        if key not in dropped_keys:
            kept_columns.append(key)
    row_ids = [message["id"] for message in messages]

    for message in messages:
        message['text'] = " ".join(
            [message['subject-clean'], message['words'], " ".join(message['participants'])]
        )
        message['link-count'] = len(message['urls'])
        message['label'] = -1

    return pd.DataFrame(
        messages,
        columns=kept_columns + ["link-count", "text", "label"],
        index=row_ids,
    )

# Fetch the messages that are unread now. The earlier read_gmail() call already
# removed the UNREAD label from the first batch, so this can legitimately be empty.
ngms = read_gmail()
ums = [unpack_message(msg) for msg in ngms]

if ums:
    udf = new_data_dataframe(ums)
    new_preds = mnb.predict(tfidf_vect.transform(udf['text']))
    # One prediction per row, in row order: assign the whole column at once
    # instead of writing it cell by cell through iterrows().
    udf['label'] = new_preds
else:
    # new_data_dataframe reads messages[0], so it cannot be called with an empty batch.
    new_preds = []
    udf = pd.DataFrame()
    print("No new unread messages to classify.")
udf
# TODO: write the new labels back to the model (async process)

In [None]:
# pd.read_excel('../web_api/emails.xlsx', engine='openpyxl').describe()