In [None]:
import email
import email.parser  # `import email` alone does not guarantee the parser submodule is loaded
import glob
import os
import string
from collections import defaultdict
from pprint import pprint

import pandas as pd
# word_tokenize is called by the gensim preprocessing cell; it relies on NLTK's
# tokenizer data being installed in the environment.
from nltk.tokenize import word_tokenize

In [None]:
def get_body(email_msg):
    """Return the text body of a parsed email message.

    Args:
        email_msg: An ``email.message.Message`` instance, e.g. the result of
            ``email.parser.Parser().parse(...)``.

    Returns:
        str: For a non-multipart message, its payload with surrounding
        whitespace stripped. For a multipart message, the stripped payloads
        of every leaf part, concatenated in document order with no separator.
        An empty string when the message carries no string payload.
    """
    if not email_msg.is_multipart():
        payload = email_msg.get_payload()
        # get_payload() returns None for a message that was built without a body
        return payload.strip() if isinstance(payload, str) else ''

    # walk() visits the message tree depth-first. Container parts hold a list
    # of sub-messages rather than text, so only leaf parts contribute; this
    # also covers nested multipart messages, where calling .strip() on a
    # sub-part's list payload would raise AttributeError.
    pieces = []
    for part in email_msg.walk():
        if part.is_multipart():
            continue
        payload = part.get_payload()
        if isinstance(payload, str):
            pieces.append(payload.strip())
    return ''.join(pieces)

In [None]:
# Parse every raw email file with the standard-library email parser and collect
# the header fields and body text used by the models further down.
email_parser = email.parser.Parser()
from_list = []
subject_list = []
organization_list = []
body_list = []

# glob yields matches in arbitrary, OS-dependent order. Sorting makes the
# position of each email stable, so positional lookups into the derived lists
# (e.g. documents[3]) refer to the same email on every run.
for filepath in sorted(glob.glob('../emails/*')):
    if not os.path.isfile(filepath):
        continue  # the wildcard can also match sub-directories, which cannot be opened
    # ISO-8859-1 maps every byte to a character, so decoding never fails
    with open(filepath, 'r', encoding="ISO-8859-1") as file:
        email_obj = email_parser.parse(file)
    # Missing headers fall back to '' so every list stays the same length
    from_list.append(email_obj.get('From', ''))
    subject_list.append(email_obj.get('Subject', ''))
    organization_list.append(email_obj.get('Organization', ''))
    body_list.append(get_body(email_obj))

email_df = pd.DataFrame({
    'from': from_list,
    'subject': subject_list,
    'organization': organization_list,
    'body': body_list,
})

In [None]:
# with open('../emails/00026','r',encoding="ISO-8859-1") as file:
#     email_obj = email_parser.parse(file)
#     pprint(email_obj.get("From"))
#     pprint(email_obj.get("Subject"))
#     pprint(email_obj.get("Organization"))
#     print()
#     pprint(get_body(email_obj))

In [None]:
from gensim import corpora

# Build one document per email from its subject line and body. The newline
# separator stops the last word of the subject and the first word of the body
# from being fused into a single bogus token.
documents = [f'{subject}\n{body}' for subject, body in zip(subject_list, body_list)]

# Hand-picked stop words for this exploratory pass. A TF-IDF vectorizer handles
# common words more robustly than this naive filter.
stoplist = set('for a of the and to in re article subject who had out them been know how then did that than get does get some when his hers were more just that you like are with this was have would they their but from can what there would will all one about has not can any your one'.split())

# Lower-case and tokenize each document with NLTK's word_tokenize, keeping only
# alphanumeric tokens longer than two characters that are not stop words.
texts = [
    [
        word
        for word in word_tokenize(document.lower())
        if word not in stoplist and word.isalnum() and len(word) > 2
    ]
    for document in documents
]

# Count how often each token occurs across the whole corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Drop tokens that appear only once or twice in the corpus
texts = [
    [token for token in text if frequency[token] > 2]
    for text in texts
]

# Map each remaining token to an integer id
dictionary = corpora.Dictionary(texts)
print(dictionary)
# Convert every token list into gensim's sparse bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# Tried an HDP model because it does not require knowing the number of topics a priori.
# It reported 150 topics, which is far too many, and the topics were incoherent.
# NOTE(review): 150 equals gensim's default top-level truncation (T=150), so that
# count is probably the truncation cap rather than a learned value — confirm.
from gensim.models import HdpModel
# Fixed seed so a re-run of the notebook reproduces the same topics
hdp = HdpModel(corpus, id2word=dictionary, random_state=42)

In [None]:
from gensim.models import LdaModel
# Fit an LDA model to the bag-of-words representation of the documents; adjust
# num_topics until coherent topics appear.
# Swept through a range of topic counts: a few coherent topics showed up, but
# overall performance was not great.
# Fixed seed so the topics shown below are the same on every run.
lda = LdaModel(corpus, num_topics=6, id2word=dictionary, random_state=42)

# Show the top words per topic.
# Even with LDA there still do not appear to be coherent topics.
lda.show_topics()

In [None]:
# Switching to sklearn and their Clustering API
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AffinityPropagation

In [None]:
# Build one document per email from its subject line and body. The newline
# separator stops the last word of the subject and the first word of the body
# from being fused into a single bogus token.
documents = [f'{subject}\n{body}' for subject, body in zip(subject_list, body_list)]

# Use TfidfVectorizer to create a weighted bag-of-words representation of the documents.
# min_df=3 drops terms that occur in fewer than three documents, which removes
# a lot of junk words.
vectorizer = TfidfVectorizer(min_df=3, stop_words='english')
X = vectorizer.fit_transform(documents)

In [None]:
# We don't know the number of clusters, so we swept through a range and found
# that 5 or 6 appeared to be right:
# Religion/God
# Encryption
# Windows/IT help
# Israel, Armenia, Turkey: Middle East
# Hockey/Sports
# random_state pins the centroid initialisation so cluster ids are stable between
# runs; the saved model's ids are interpreted later, so they must not reshuffle.
# n_init is set explicitly because its default differs across scikit-learn versions.
model = KMeans(n_clusters=6, n_init=10, random_state=42)
model.fit(X)

In [None]:
# For each cluster, feature indices sorted by descending centroid weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides.
_get_terms = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
terms = _get_terms()

In [None]:
# List the 15 highest-ranked terms of each cluster, one term per line
for cluster_id in range(model.n_clusters):
    print(f'Cluster {cluster_id}:')
    top_term_ids = order_centroids[cluster_id, :15]
    for term_id in top_term_ids:
        print(f'{terms[term_id]}')

In [None]:
# Persist the fitted k-means model to disk
from joblib import dump, load
dump(value=model, filename='kmeans_model.joblib')

In [None]:
# Persist the fitted TF-IDF vectorizer to disk
from joblib import dump, load
dump(value=vectorizer, filename='tfidf_vectorizer.joblib')

In [64]:
import requests

In [65]:
# The article about Jerusalem receives a top topic of 4, which matches what we
# have seen from the top words associated with each cluster.
# NOTE(review): index 3 depends on the order in which the email files were
# listed when `documents` was built — confirm it still points at that article.
bdy = documents[3]

In [66]:
# POST the sample email to the locally running prediction service. Without a
# timeout, requests waits indefinitely, so the cell would hang if the service
# is not up.
resp = requests.post('http://127.0.0.1:8000/predict', json={'email': bdy}, timeout=10)

In [67]:
# Decode the service's JSON reply and display it as the cell output
resp.json()

{'top_topic': 4, 'topic_distribution': [], 'email_topics': []}