## Setup (must run cells marked *)


`pip install beautifulsoup4 bertopic keybert keyphrase_vectorizers libpff-python scikit-learn`

#### Imports*


In [None]:
import os
import random
import textwrap

import pandas as pd

#### Get emails from PST file(s)

In [None]:
import pypff

def get_messages_from_pst(pst_path, output_folder):
    """Export the messages found in a PST file as text files.

    Opens ``pst_path`` with pypff and hands every item directly under the
    root folder to ``walk_folder_for_messages``, which writes the messages
    it finds into ``output_folder``.

    Args:
        pst_path: Path of the PST file to read.
        output_folder: Directory that receives the exported files; it is
            created if it does not exist yet.
    """
    # exist_ok=True already covers the "folder exists" case, so no separate
    # isdir() check is needed.
    os.makedirs(output_folder, exist_ok=True)
    file_ = pypff.open(pst_path)
    try:
        root = file_.get_root_folder()
        for x in root.sub_items:
            walk_folder_for_messages(x, output_folder=output_folder)
    finally:
        # Release the PST handle even if exporting a message fails.
        file_.close()


def walk_folder_for_messages(folder, output_folder):
    """Recursively write each message under ``folder`` to its own text file.

    Each pypff message is saved as its subject, a newline, and its plain-text
    body, in a file named after the message identifier. Sub-folders are
    walked recursively; any other kind of item is ignored.

    Args:
        folder: pypff item whose ``sub_items`` are walked.
        output_folder: Existing directory that receives the text files.
    """
    for item in folder.sub_items:
        if isinstance(item, pypff.message):
            subject = item.subject
            body = item.plain_text_body
            # A message without a plain-text part has no body to decode;
            # keep the subject instead of crashing on None. Undecodable
            # bytes are replaced rather than aborting the whole export.
            text = body.decode(errors='replace') if body else ''
            message = f'{subject}\n{text}'
            out_path = os.path.join(output_folder, str(item.identifier))
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(message)
        elif isinstance(item, pypff.folder):
            walk_folder_for_messages(item, output_folder=output_folder)

# Destination shared by all PST files; each message becomes one text file.
output_folder = 'email_text'
pst_path_list = ['sample.pst', 'test.pst']  # Specify paths

# Export the messages of every listed PST file into the same folder.
for email_path in pst_path_list:
    get_messages_from_pst(email_path, output_folder=output_folder)

#### ... Or from files with full headers, etc.


In [None]:
from bs4 import BeautifulSoup

def get_email_text_and_subject(path, combine=True, stripHTML=True):
    """Extract the subject and the newest body text from a raw email file.

    The subject is taken from the first line starting with ``Subject:``.
    The body runs from the first blank line up to (not including) the first
    line containing ``-----Original Message-----``, or to the end of the
    file when no such line is found.

    Args:
        path: Path of a UTF-8 text file holding one email with its headers.
        combine: If true, return a single string made of the subject, a
            newline, and the body; otherwise return a dict.
        stripHTML: If true, pass the body through BeautifulSoup and keep
            only its text.

    Returns:
        ``'<subject>\\n<text>'`` when ``combine`` is true, else
        ``{'main_text': text, 'subject': subject}``. ``subject`` is ``None``
        when no ``Subject:`` line was seen.
    """
    # Context manager so the handle is closed as soon as the file is read.
    with open(path, encoding='utf-8') as f:
        lines = f.read().split('\n')
    start_index, end_index = None, None
    subject = None
    for index, line in enumerate(lines):
        if line.startswith('Subject:') and subject is None:
            subject = line.split(':', 1)[1].strip()
        if not line and start_index is None:
            # First blank line: everything after it is treated as body.
            start_index = index
        elif '-----Original Message-----' in line:
            # Quoted earlier message starts here; stop before it.
            end_index = index
            break

    text = '\n'.join(lines[start_index:end_index]).strip()
    if stripHTML:
        text = BeautifulSoup(text, 'html.parser').text
    if combine:
        return f'{subject}\n{text}'
    return {'main_text': text, 'subject': subject}

source_folder = None # Specify source folder
output_folder = 'email_text'
# Make sure the destination exists; this cell may be run without the PST cell.
os.makedirs(output_folder, exist_ok=True)
for root, dirs, filenames in os.walk(source_folder):
    for filename in filenames:
        # get_email_text_and_subject() opens the file itself, so it needs
        # the path rather than an already-open file object.
        email = get_email_text_and_subject(os.path.join(root, filename))
        # NOTE: files with the same name in different sub-folders overwrite
        # each other, because only the bare filename is used for the output.
        with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as f:
            f.write(email)

#### Specify folder containing outputted email text, one email per file*

In [None]:
# Folder read by the keyword-extraction and topic-modeling cells below;
# it should contain one text file per email.
email_folder = 'email_text'

## Keyword Extraction

#### Setup


In [None]:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Shared KeyBERT instance; get_keywords() below calls its extract_keywords().
kw_model = KeyBERT()

def get_keywords(text, kwargs):
    """Print the keywords that KeyBERT extracts from ``text``.

    Args:
        text: Document to extract keywords from.
        kwargs: Dict of keyword arguments forwarded to
            ``kw_model.extract_keywords``. The ``'vectorizer'`` entry is
            handled specially: ``'keyphrase'`` is replaced by a
            ``KeyphraseCountVectorizer``, any other truthy value by a
            ``CountVectorizer`` built from ``'keyphrase_ngram_range'`` and
            ``'stop_words'``; a falsy or missing value is forwarded as is.
            The caller's dict is left untouched.
    """
    # Work on a copy: replacing 'vectorizer' in the caller's dict would turn
    # the 'keyphrase' setting into an object, and a second call with the
    # same dict would then silently fall into the CountVectorizer branch.
    kwargs = dict(kwargs)
    if kwargs.get('vectorizer'):
        if kwargs['vectorizer'] == 'keyphrase':
            kwargs['vectorizer'] = KeyphraseCountVectorizer()
        else:
            kwargs['vectorizer'] = CountVectorizer(
                ngram_range=kwargs.get('keyphrase_ngram_range', (1, 1)),
                # .get(): 'stop_words' is optional; None is CountVectorizer's default.
                stop_words=kwargs.get('stop_words'),
            )
    keywords = kw_model.extract_keywords(text, **kwargs)
    # extract_keywords yields (keyword, score) pairs; only the keyword is shown.
    keywords = [x[0] for x in keywords]
    print('KEYWORDS:\n')
    print('\n'.join(keywords))

#### Extract keywords from a random or specified email (see KWARGS)

In [None]:
# Pick a random email from the folder; assign a fixed path here instead to
# inspect a specific one.
email_path = os.path.join(email_folder, random.choice(os.listdir(email_folder)))
# Context manager so the file handle is closed right after reading.
with open(email_path, encoding='utf-8') as f:
    email = f.read()

KWARGS = {
    'keyphrase_ngram_range': (1, 3), # Min, max word count for keywords
    'use_mmr': True, # Increases diversity of keywords
    'diversity': .5, # Set diversity between 0 and 1 if using MMR
    'vectorizer': 'keyphrase', # ("keyphrase", True, False) How to represent document. Keyphrase vectorizer should be more coherent
    'stop_words': 'english'
}

print(f'FILENAME: {email_path}\n')
# Wrap to 100 columns so long emails stay readable in the cell output.
print('TEXT:\n ', "\n".join(textwrap.wrap(email, 100)), '\n')
get_keywords(email, KWARGS)

# Run on same email with different args
# print()
# KWARGS['vectorizer'] = True
# get_keywords(email, KWARGS)

## Topic Modeling

#### Setup

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

#### Get topics and view topic hierarchy (see comments)

In [None]:
# Select subset or all emails in email folder
# Select subset or all emails in email folder: (start, stop) slice bounds
# applied to the folder listing; use (None, None) for all files.
slice_docs = (None, 1000)
docs = []
for filename in os.listdir(email_folder)[slice_docs[0]:slice_docs[1]]:
    # Context manager so each handle is closed before the next file is opened.
    with open(os.path.join(email_folder, filename), encoding='utf-8') as f:
        docs.append(f.read())

# Document representations to chain and feed into topic model
representations = [
    KeyBERTInspired(), # Should make topics more coherent
    MaximalMarginalRelevance(diversity=0.3), # Makes topics more diverse
]

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

topic_model = BERTopic(
        representation_model=representations, # Representations (see above)
        ctfidf_model=ctfidf_model,  # Prevents very frequent words in data from being candidate topics
        nr_topics='auto' # Topic reduction. Set number of desired topics, 'auto' for auto-reduction,
        # or None. Set to None if there aren't enough topics.
        )
topics, probabilities = topic_model.fit_transform(docs)
hierarchical_topics = topic_model.hierarchical_topics(docs)
# Keep this as the last expression so the notebook renders the figure.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

#### View topics as table

In [None]:
# Per the heading above, shows the fitted model's topics as a table.
topic_model.get_topic_info()

#### Show docs per topic(s)

In [None]:
# Select topic(s) by int
topics_to_show = [0, 1]
# Show first n docs
n_docs_to_show = 10
# Show first n characters of each doc
n_chars_per_doc = 500

df = pd.DataFrame({"Document": docs, "Topic": topics})
df = df.loc[df['Topic'].isin(topics_to_show)].head(n_docs_to_show)
docs_ = df['Document'].to_list()
random.shuffle(docs_)
for i, doc in enumerate(docs_):
    print(f'DOC {i + 1}\n----------\n')
    print('\n'.join(textwrap.wrap(doc[:n_chars_per_doc] + ' ...' if n_chars_per_doc else '')), '\n')