`pip install flair ipywidgets nbformat nltk pandas plotly tqdm`

In [None]:
from collections import Counter
import json
import os

import pandas as pd
import plotly.express as px
from tqdm .notebook import tqdm

#### Get entities
This cell only needs to be run once: it extracts the entities from the emails and saves them to `output`. To process more emails, add files to `email_dir` or increase `n_emails`, then run the cell again.

In [None]:
# Entity tags to keep (should only be ORG for vendors).
# The trailing comma matters: ('ORG') is just the string 'ORG', which would
# turn `tag in entities` into a substring test instead of a membership test.
entities = ('ORG',)
# Folder containing the email text files.
email_dir = 'email_text'
# Find entities in only the first N emails in email_dir (None = all emails).
n_emails = None
# Path to the JSON file the results are written to.
output = 'entity_results.json'

from flair.data import Sentence
from flair.models import SequenceTagger
from nltk import sent_tokenize

def _load_results(path):
    """Return results previously saved at *path*, or a fresh empty structure."""
    fresh = {'processed': [], 'entities': {}}
    try:
        with open(path, encoding='utf-8') as f:
            results = json.load(f)
    except FileNotFoundError:
        return fresh
    except json.JSONDecodeError:
        # An unreadable results file cannot be resumed from; start over.
        return fresh
    results.setdefault('processed', [])
    # Sets are serialized as JSON lists, so turn them back into sets.
    for info in results.setdefault('entities', {}).values():
        info['tags'] = set(info['tags'])
        info['files'] = set(info['files'])
    return results


def _save_results(results, path):
    """Write *results* to *path* as JSON, serializing sets as lists."""
    # Write to a temporary file and swap it in, so an interrupted run cannot
    # leave a truncated JSON file behind.
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, default=list)
    os.replace(tmp_path, path)


def get_entities(tags, folder, output):
    """Tag named entities in the emails in *folder* and save them to *output*.

    The JSON written to *output* has two keys: 'processed', the list of file
    names already handled, and 'entities', a mapping from entity text to its
    'tags', the 'files' it occurred in, and its total 'count'.

    If *output* already exists, its contents are loaded first and files listed
    under 'processed' are skipped, so the function can be re-run after adding
    emails or raising the limit. Results are also saved after every 100 newly
    processed files.

    Args:
        tags: Entity tag or iterable of entity tags to keep (e.g. ('ORG',)).
        folder: Directory containing the UTF-8 email text files.
        output: Path of the JSON results file.

    Note:
        The number of emails considered is read from the module-level
        ``n_emails`` (None means all files in *folder*).
    """
    # A bare string would make `entity.tag in tags` a substring test.
    if isinstance(tags, str):
        tags = (tags,)
    tags = frozenset(tags)

    results_dict = _load_results(output)
    entity_dict = results_dict['entities']
    already_done = set(results_dict['processed'])

    # os.listdir order is arbitrary; sort so "first N" is stable across runs.
    filenames = sorted(os.listdir(folder))[:n_emails]
    pending = [name for name in filenames if name not in already_done]

    # Loading the model is expensive; skip it when there is nothing to do.
    tagger = SequenceTagger.load("flair/ner-english-fast") if pending else None

    for n_done, filename in enumerate(tqdm(pending), start=1):
        with open(os.path.join(folder, filename), encoding='utf-8') as f:
            email = f.read()
        for text in sent_tokenize(email):
            sent = Sentence(text)
            tagger.predict(sent)
            for entity in sent.get_spans('ner'):
                if entity.tag not in tags:
                    continue
                info = entity_dict.setdefault(
                    entity.text, {'tags': set(), 'files': set(), 'count': 0})
                info['tags'].add(entity.tag)
                info['files'].add(filename)
                info['count'] += 1
        # Mark the file as processed only once its entities are recorded, so a
        # checkpoint never claims a file whose entities are missing.
        results_dict['processed'].append(filename)
        if not n_done % 100:
            _save_results(results_dict, output)

    _save_results(results_dict, output)

# Run the extraction with the settings defined in this cell.
get_entities(tags=entities, folder=email_dir, output=output)

### Graph entities 

In [None]:
# Path to the entity JSON file to graph.
entity_file = 'entity_results.json'
# Only include entities that occur at least this frequently in the emails.
min_count = 5
# Top n entities to graph (there will be way too many to graph all of them).
top_n_entities = 20

def graph_entities(data, min_count=0, top_n_entities=None):
    """Show a bar chart of the most frequent entities in a results file.

    Args:
        data: Path to a JSON file holding an 'entities' mapping from entity
            text to a dict that has a 'count' value.
        min_count: Only entities whose count is at least this are included.
        top_n_entities: Maximum number of entities to plot, most frequent
            first (None plots every entity that passes *min_count*).
    """
    with open(data, encoding='utf-8') as f:
        results = json.load(f)

    counts = Counter({
        word: info['count']
        for word, info in results['entities'].items()
        if info['count'] >= min_count
    })
    top = counts.most_common(top_n_entities)

    # Name the columns up front so 'word' and 'count' exist even when no
    # entity passes the filter; otherwise the empty frame has no columns and
    # plotting fails.
    df = pd.DataFrame(top, columns=['word', 'count'])
    fig = px.bar(df, x='word', y='count')
    fig.show()

# Plot the most frequent entities using the settings defined in this cell.
graph_entities(
    data=entity_file,
    min_count=min_count,
    top_n_entities=top_n_entities,
)