
### Email tools
#### Setup (Must run cells marked *)


JS setup
- Run `npm install pst-extractor` in the `process_pst` folder.

- Comment out lines 162-165 in *process_pst/node_modules/pst-extractor/dist/PSTFolder.class.js*: 

  ```
  if ((emailRow && emailRow.itemIndex == -1) || !emailRow) {
    // no more!
    return null;
  }
  ```

Process PST files
- `node process_pst.js <input folder or file> <output folder>`

Python dependencies

- `pip install beautifulsoup4 bertopic flair keybert keyphrase_vectorizers scikit-learn`

Imports *


In [None]:
import os
import json
import random
import textwrap

import pandas as pd

Specify email folder *

In [None]:
# Folder to scan for extracted emails (presumably the <output folder> passed to
# process_pst.js -- confirm). Must be set before running this cell.
email_folder = None
# Read by the keyword-extraction cell, which samples an email while this is falsy.
text = None

# Fail with an actionable message instead of the opaque TypeError os.walk
# raises when handed None.
if email_folder is None:
    raise ValueError('Set email_folder to the folder containing the extracted emails.')

# Every file whose name ends in "_Note.json", at any depth below email_folder.
email_paths = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(email_folder)
    for name in names
    if name.endswith('_Note.json')
]

#### Keyword Extraction
- Setup

In [None]:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Single KeyBERT instance shared by every get_keywords call.
kw_model = KeyBERT()

def get_keywords(text, kwargs):
    """Extract keywords from ``text`` with KeyBERT and print them.

    Args:
        text: Document to extract keywords from.
        kwargs: Dict of keyword arguments forwarded to
            ``kw_model.extract_keywords``. A truthy ``'vectorizer'`` entry is
            swapped for a vectorizer instance before forwarding:
            ``'keyphrase'`` selects ``KeyphraseCountVectorizer``; any other
            truthy value selects a ``CountVectorizer`` built from
            ``'keyphrase_ngram_range'`` (default ``(1, 1)``) and
            ``'stop_words'`` (default ``None``). A falsy or missing entry is
            forwarded untouched.

    The caller's dict is never modified, so the same dict can be reused for
    several calls.
    """
    # Work on a shallow copy: writing the vectorizer instance back into the
    # caller's dict would make a second call with the same dict take the
    # CountVectorizer branch instead of the requested 'keyphrase' one.
    options = dict(kwargs)
    vectorizer_choice = options.get('vectorizer')
    if vectorizer_choice:
        if vectorizer_choice == 'keyphrase':
            options['vectorizer'] = KeyphraseCountVectorizer()
        else:
            options['vectorizer'] = CountVectorizer(
                ngram_range=options.get('keyphrase_ngram_range', (1, 1)),
                # .get avoids a KeyError when the caller gives no stop words.
                stop_words=options.get('stop_words'),
            )
    results = kw_model.extract_keywords(text, **options)
    # Keep only the first element of each result (the keyword itself).
    keywords = [result[0] for result in results]
    print('KEYWORDS:\n')
    print('\n'.join(keywords))

- Extract keywords from a random or specified email (see KWARGS)

In [None]:
# Pick a random email that has body text. Skipped when `text` is already truthy.
if not text:
    if not email_paths:
        raise ValueError('email_paths is empty; run the "Specify email folder" cell first.')
    # Visit the paths in random order, each at most once, so the search ends
    # even when no email has body text (drawing with replacement could spin forever).
    for candidate_path in random.sample(email_paths, len(email_paths)):
        # Context manager closes the file handle as soon as the JSON is parsed.
        with open(candidate_path, encoding='utf-8') as email_file:
            body = json.load(email_file).get('bodyText')
        if body:
            email_path, text = candidate_path, body
            break
    else:
        raise ValueError('None of the emails in email_paths has body text.')


KWARGS = {
    'keyphrase_ngram_range': (1, 3), # Min, max word count for keywords
    'use_mmr': True, # Increases diversity of keywords
    'diversity': .5, # Set diversity between 0 and 1 if using MMR
    'vectorizer': 'keyphrase', # ("keyphrase", True, False) How to represent document. Keyphrase vectorizer should be more coherent
    'stop_words': 'english'
}

print(f'FILENAME: {email_path}\n')
print('TEXT:\n ', "\n".join(textwrap.wrap(text, 100)), '\n')
get_keywords(text, KWARGS)

# Run on same email with different args
# print()
# KWARGS['vectorizer'] = True
# get_keywords(text, KWARGS)

# Reset so the next run of this cell samples a fresh email.
text = None

#### Topic Modeling

- Imports

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer

- Get topics and view topic hierarchy (see comments)

In [None]:
# Select subset or all emails in email folder: (start, stop) slice bounds
# applied to email_paths; None leaves that side unbounded.
slice_docs = (None, None)
# Alternative loader that reads every file in email_folder as raw text:
# docs = [open(os.path.join(email_folder, filename), encoding='utf-8').read() for 
#         filename in os.listdir(email_folder)[slice_docs[0]:slice_docs[1]]]
docs = []
for path in email_paths[slice_docs[0]:slice_docs[1]]:
    # Context manager closes each file handle right after parsing.
    with open(path, encoding='utf-8') as email_file:
        # `or ''` covers both a missing key and an explicit null value, which
        # would otherwise reach .strip() as None and raise AttributeError.
        body = json.load(email_file).get('bodyText') or ''
    # Drop emails whose body is empty or whitespace-only.
    if body.strip():
        docs.append(body)
# Document representations to chain and feed into topic model
representations = [
    KeyBERTInspired(), # Should make topics more coherent
    MaximalMarginalRelevance(diversity=0.3), # Makes topics more diverse
]

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

topic_model = BERTopic(
        representation_model=representations, # Representations (see above)
        ctfidf_model=ctfidf_model,  # Prevents very frequent words in data from being candidate topics
        nr_topics='auto' # Topic reduction. Set number of desired topics, 'auto' for auto-reduction, 
        # or None. Set to None if there aren't enough topics. 
        )
topics, probabilities = topic_model.fit_transform(docs)
hierarchical_topics = topic_model.hierarchical_topics(docs)
# Last expression in the cell so the notebook renders the returned figure.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

- View topics as table

In [None]:
# Last expression in the cell, so the notebook displays the value returned by
# get_topic_info() on the fitted topic_model.
topic_model.get_topic_info()

- Show docs per topic(s)

In [None]:
# Select topic(s) by number (must be list). Get topic numbers from table or 
# labels in topic tree, i.e. "1_trees_grass_nature" topic number is 1. 
topics_to_show = [0]
# Show first n docs
n_docs_to_show = 10
# Show first n characters of each doc; None or 0 shows the whole doc
n_chars_per_doc = 500

# Pair each document with its assigned topic, then keep the first
# n_docs_to_show rows belonging to the selected topics.
df = pd.DataFrame({"Document": docs, "Topic": topics})
df = df.loc[df['Topic'].isin(topics_to_show)].head(n_docs_to_show)
docs_ = df['Document'].to_list()
# Display the selected docs in random order.
random.shuffle(docs_)
for i, doc in enumerate(docs_):
    print(f'DOC {i + 1}\n----------\n')
    # Truncate only when a limit is set. The previous conditional expression
    # bound as `(doc[:n] + ' ...') if n else ''`, so a falsy limit printed
    # nothing at all instead of the full document.
    preview = doc[:n_chars_per_doc] if n_chars_per_doc else doc
    # Mark the preview with an ellipsis only if text was actually cut off.
    if len(preview) < len(doc):
        preview += ' ...'
    print('\n'.join(textwrap.wrap(preview)), '\n')

#### Get vendors, person-org and sender-recipient pairs

In [None]:
from collections import Counter
# get_keywords is imported under an alias so it does not replace the notebook's
# own get_keywords(text, kwargs) from the "Keyword Extraction" section; with a
# plain import, re-running that section afterwards would call this
# differently-shaped function instead.
from get_kws_and_entities import (
    get_keywords as get_keywords_for_docs,
    get_entities,
    get_person_org_pairs,
    get_all_sender_recipient_pairs,
)

# Folder of documents to process. Defaults to email_folder from the
# "Specify email folder" cell; replace with an explicit path to run this cell
# without running that one.
DOCS_FOLDER = email_folder
# Folder that receives every output file below. Must be set before running.
OUTPUT = None

# Fail early: the f-strings below would otherwise silently build "None/..." paths.
if OUTPUT is None:
    raise ValueError('Set OUTPUT to the folder where results should be written.')

KW_PATH = f'{OUTPUT}/kws.json'
KW_BATCH_SIZE = None  # Forwarded as batch_size to the keyword step
ENTITY_PATH = f'{OUTPUT}/entities.json'
RELEVANCE_PATH = f'{OUTPUT}/relevance.json'
RELEVANCE_LABEL = 'invoice synset only'
REF_SYNSETS = ['invoice.n.01']  # presumably WordNet synset ids -- confirm against get_kws_and_entities
ENTITY_PAIRS_PATH = f'{OUTPUT}/person-org pairs.csv'
ORGS_PATH = f'{OUTPUT}/vendors.csv'
TO_FROM_PAIRS_PATH = f'{OUTPUT}/to_from_pairs.csv'
RANKED_ENTITIES_PATH = f'{OUTPUT}/ranked_entities.txt'
KW_KWARGS = {'top_n': 10}
# Maximum number of rows written to the pair CSVs.
MAX_CSV_ROWS = 5000
# Toggles for the three stages below; each stage is given the paths written by
# the earlier ones, so disable a stage only if its output already exists.
GET_KWS = True
GET_ENTITIES = True
GET_PAIRS = True

# exist_ok makes this safe to re-run and avoids a separate isdir check.
os.makedirs(OUTPUT, exist_ok=True)

if GET_KWS:
    get_keywords_for_docs(
        KW_PATH,
        DOCS_FOLDER,
        KW_KWARGS,
        batch_size=KW_BATCH_SIZE)

if GET_ENTITIES:
    get_entities(
        ENTITY_PATH,
        DOCS_FOLDER,
        KW_PATH,
        RELEVANCE_PATH,
        RELEVANCE_LABEL,
        relevance_func_args={
            'ref_synsets': REF_SYNSETS
        })

if GET_PAIRS:
    person_org_pairs = get_person_org_pairs(
        RELEVANCE_LABEL,
        RELEVANCE_PATH,
        ENTITY_PATH,
        KW_PATH,
        RANKED_ENTITIES_PATH
    )
    person_org_pairs.head(MAX_CSV_ROWS).to_csv(ENTITY_PAIRS_PATH)
    # Count how often each value appears in the 'org' column and write the
    # counts, most frequent first.
    org_counts = Counter(person_org_pairs.loc[:, 'org'])
    orgs = pd.DataFrame({
        'org': list(org_counts.keys()),
        'count': list(org_counts.values()),
    })
    orgs.sort_values(by='count', ascending=False).to_csv(ORGS_PATH)
    to_from_pairs = get_all_sender_recipient_pairs(
            RELEVANCE_PATH, RELEVANCE_LABEL)
    to_from_pairs.head(MAX_CSV_ROWS).to_csv(TO_FROM_PAIRS_PATH)