**IMPORTS**

In [222]:
import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')  
from nltk.corpus import stopwords
from nltk.util import bigrams

from collections import Counter

import re
import math


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**LOADING CORPUS**

In [223]:
!ls "/content/drive/MyDrive/AuthorshipAnalysisFiles"

allen-p   bass-e     buy-r	 delainey-d		     fossum-d
arnold-j  beck-s     campbell-l  enron_mail_20150507.tar.gz
badeer-r  brawner-s  carson-m	 farmer-d


In [224]:
# files are organized under
#   /project_dir
#     /name           
#       /_sent_mail   
#         /1.          

# define common path substr's
project_dir = "/content/drive/MyDrive/AuthorshipAnalysisFiles"
sent_suffix = "_sent_mail"

In [225]:
# define distinct authors' directories
project_folder_ls_text = !ls -F $project_dir
project_folder_items = project_folder_ls_text.nlstr.split()

project_authors = [item.strip('/') for item in project_folder_items if item.endswith('/')]

NUM_AUTHORS = 9999 # truncate # of authors (& amount of data/processing) here, if desired
project_authors = project_authors[0:NUM_AUTHORS]

authors_paths =       { author: f"{project_dir}/{author}" for author in project_authors}
authors_sent_paths =  { author: f"{project_dir}/{author}/{sent_suffix}" for author in project_authors}

# DEBUG OUTPUT
print(project_folder_items)
for folder in project_authors:
  print(folder)
for path in authors_paths.values():
  print(path)
for path in authors_sent_paths.values():
  print(path)

['allen-p/', 'bass-e/', 'buy-r/', 'delainey-d/', 'fossum-d/', 'arnold-j/', 'beck-s/', 'campbell-l/', 'enron_mail_20150507.tar.gz', 'badeer-r/', 'brawner-s/', 'carson-m/', 'farmer-d/']
allen-p
bass-e
buy-r
delainey-d
fossum-d
arnold-j
beck-s
campbell-l
badeer-r
brawner-s
carson-m
farmer-d
/content/drive/MyDrive/AuthorshipAnalysisFiles/allen-p
/content/drive/MyDrive/AuthorshipAnalysisFiles/bass-e
/content/drive/MyDrive/AuthorshipAnalysisFiles/buy-r
/content/drive/MyDrive/AuthorshipAnalysisFiles/delainey-d
/content/drive/MyDrive/AuthorshipAnalysisFiles/fossum-d
/content/drive/MyDrive/AuthorshipAnalysisFiles/arnold-j
/content/drive/MyDrive/AuthorshipAnalysisFiles/beck-s
/content/drive/MyDrive/AuthorshipAnalysisFiles/campbell-l
/content/drive/MyDrive/AuthorshipAnalysisFiles/badeer-r
/content/drive/MyDrive/AuthorshipAnalysisFiles/brawner-s
/content/drive/MyDrive/AuthorshipAnalysisFiles/carson-m
/content/drive/MyDrive/AuthorshipAnalysisFiles/farmer-d
/content/drive/MyDrive/AuthorshipAnalysisF

In [226]:
# define list of authors' email paths
authors_sent_email_paths = {}

for author in project_authors:
  path = authors_sent_paths[author]
  sent_dir_ls = !ls $path 
  sent_dir_items = sent_dir_ls.nlstr.split()
  path_list = []
  for item in sent_dir_items:
    path_list.append(f"{path}/{item}")
  authors_sent_email_paths[author] = path_list

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors[0:2]:
  print(author)
  path_list = authors_sent_email_paths[author]
  for path in path_list[0:2]:
    print(f"\t{path}")

SAMPLE
allen-p
	/content/drive/MyDrive/AuthorshipAnalysisFiles/allen-p/_sent_mail/1.
	/content/drive/MyDrive/AuthorshipAnalysisFiles/allen-p/_sent_mail/137.
bass-e
	/content/drive/MyDrive/AuthorshipAnalysisFiles/bass-e/_sent_mail/1.
	/content/drive/MyDrive/AuthorshipAnalysisFiles/bass-e/_sent_mail/1104.


In [227]:
# running this cell w/ 3 authors takes ~= 3 minutes

# define list of author's emails
authors_emails = {}
for author in project_authors:
  path_list = authors_sent_email_paths[author]
  email_list = []
  for path in path_list:
    with open(path, "r") as f:
      email_list.append(f.read())
      f.close()
  authors_emails[author] = email_list

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors[0:3]:
  print(author)
  email_list = authors_emails[author]
  for email in email_list[0:2]:
    print(f"EMAIL START\n{email}\nEMAIL END\n\n\n")

SAMPLE
allen-p
EMAIL START
Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 
EMAIL END



EMAIL START
Message-ID: <23377438.1075855688269.JavaMail.evans@thyme>
Date: Tue, 12 Sep 2000 06:06:00 -0700 (PDT)
From: phillip.allen@enron.com
To: stagecoachmama@hotmail.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: stagecoachmama@hotmail.com
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail
X-Origin: Allen-P
X-FileName: pallen.nsf

Lucy,


You wrote

**PROCESSING EMAILS**

In [228]:
# define list of authors' emails: w/o headers

# forgive me for my RegEx sins

# get a list of emails w/o headers
def get_email_bodies(emails: list):
  # assume header lines are of the form
  #   Header-Name: some <string> of AR8itrary Symbols \n  
  first_divider_line_regex = r"^\-{5,} .*(?=\n)"
  divider_line_regex = r"(?<=\n)\-{5,} .*(?=\n)"
  first_header_line_regex = r"^[a-zA-Z\-]*:\s.*(?=\n)"
  header_line_regex = r"(?<=\n)[a-zA-Z\-]*:\s.*(?=\n)"
  empty_line_regex = r"\n(?=\n)"
  name_line_regex = r"(?<=\n)[\w@]*(\s[\w@]*)?,?(?=\n)"       # <-- catches one or two words (hopefully names) w/ optional ','
  name_end_regex = r"(?<=\n)[\w@]*(\s[\w@]*)?$"               # <-- catches one or two words (hopefully names) at end of string
  date_regex = r"(?<=\s)(\d{1,2})/(\d{1,2})/(\d{2,4})(?=\s)"  # e.g. XX/YY/ZZ or X/Y/ZZZZ or variations (first. approximation)
  time_regex = r"(?<=(\s|\-))(\d{1,2}):(\d{2})(:\d{2})?( AM| PM)?(?=(\s|\-))"  # e.g. 01:02 AM or 01:02:03 PM or variations (first. approximation)

  email_bodies = []
  for email in emails:
    email_body = email
    # remove header lines
    email_body = re.sub(divider_line_regex,      "", email_body)
    email_body = re.sub(first_divider_line_regex,"", email_body)
    email_body = re.sub(header_line_regex,       "", email_body)
    email_body = re.sub(first_header_line_regex, "", email_body)
    # remove signature / address line (not precise, first-order-approximation)
    email_body = re.sub(name_line_regex,         "", email_body)
    email_body = re.sub(name_end_regex,          "", email_body)
    # remove dates / times
    email_body = re.sub(date_regex,              "DATE", email_body)
    email_body = re.sub(time_regex,              "TIME", email_body)
    # remove blank lines
    email_body = re.sub(empty_line_regex,        "", email_body)
    email_body = email_body.strip()
    email_bodies.append(email_body)
  return email_bodies

authors_email_bodies = {}
for author in project_authors:
  email_list = authors_emails[author]
  authors_email_bodies[author] = get_email_bodies(email_list)

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors[0:3]:
  print(author)
  email_list = authors_email_bodies[author]
  for email in email_list[0:5]:
    print(f"EMAIL START\n{email}EMAIL END\n\n\n")
  print("*************")


SAMPLE
allen-p
EMAIL START
Here is our forecastEMAIL END



EMAIL START
You wrote fewer checks this month.  Spent more money on Materials and less on 
Labor.
   June  July  August
Total Materials  2929  4085  4801
Services  53  581  464
Labor   3187  3428  2770
Here are my questions on the August bank statement (attached):
1.  Check 1406  Walmart    Description and unit?
2.  Check 1410  Crumps     Detail description and unit?
3.  Check 1411  Lucy      What is this?
4.  Check 1415  Papes      Detail description and units?
5.  Checks 1416, 1417, and 1425  Why overtime?
6.  Check 1428    Ralph's   What unit?
7.  Check 1438    Walmart?    Description and unit?  
Try and pull together the support for these items and get back to me.EMAIL END



EMAIL START
DATE TIME
I got this request.  On the gas side, I think Kean/Lay need an update to a table you prepared for me a few months ago, which I've attached..  Can you oblige?  Thanks,
DATE TIME
Steve has asked that you update the power point belo

In [229]:
# define list of authors' emails, tokenized
authors_tokenized_emails = {}
for author in project_authors:
  tokenized_emails = []
  for email in authors_email_bodies[author]:
    tokenized = nltk.word_tokenize(email)
    tokenized_emails.append(tokenized)
  authors_tokenized_emails[author] = tokenized_emails

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors:
  print(author)
  email_list = authors_tokenized_emails[author]
  for email_tokens in email_list[0:2]:
    print(f"\t{email_tokens}")

SAMPLE
allen-p
	['Here', 'is', 'our', 'forecast']
	['You', 'wrote', 'fewer', 'checks', 'this', 'month', '.', 'Spent', 'more', 'money', 'on', 'Materials', 'and', 'less', 'on', 'Labor', '.', 'June', 'July', 'August', 'Total', 'Materials', '2929', '4085', '4801', 'Services', '53', '581', '464', 'Labor', '3187', '3428', '2770', 'Here', 'are', 'my', 'questions', 'on', 'the', 'August', 'bank', 'statement', '(', 'attached', ')', ':', '1', '.', 'Check', '1406', 'Walmart', 'Description', 'and', 'unit', '?', '2', '.', 'Check', '1410', 'Crumps', 'Detail', 'description', 'and', 'unit', '?', '3', '.', 'Check', '1411', 'Lucy', 'What', 'is', 'this', '?', '4', '.', 'Check', '1415', 'Papes', 'Detail', 'description', 'and', 'units', '?', '5', '.', 'Checks', '1416', ',', '1417', ',', 'and', '1425', 'Why', 'overtime', '?', '6', '.', 'Check', '1428', 'Ralph', "'s", 'What', 'unit', '?', '7', '.', 'Check', '1438', 'Walmart', '?', 'Description', 'and', 'unit', '?', 'Try', 'and', 'pull', 'together', 'the', 'su

In [230]:
# remove stopwords from emails
eng_stopwords = set(stopwords.words('english'))

for author in project_authors:
  tokenized_emails = authors_tokenized_emails[author]
  for email_token_list in tokenized_emails:
    for token in email_token_list:
      if token in eng_stopwords:
        email_token_list.remove(token)

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors:
  print(author)
  email_list = authors_tokenized_emails[author]
  for email_tokens in email_list[0:2]:
    print(f"\t{email_tokens}")

SAMPLE
allen-p
	['Here', 'our', 'forecast']
	['You', 'wrote', 'fewer', 'checks', 'month', '.', 'Spent', 'money', 'Materials', 'less', 'Labor', '.', 'June', 'July', 'August', 'Total', 'Materials', '2929', '4085', '4801', 'Services', '53', '581', '464', 'Labor', '3187', '3428', '2770', 'Here', 'my', 'questions', 'August', 'bank', 'statement', '(', 'attached', ')', ':', '1', '.', 'Check', '1406', 'Walmart', 'Description', 'unit', '?', '2', '.', 'Check', '1410', 'Crumps', 'Detail', 'description', 'unit', '?', '3', '.', 'Check', '1411', 'Lucy', 'What', 'this', '?', '4', '.', 'Check', '1415', 'Papes', 'Detail', 'description', 'units', '?', '5', '.', 'Checks', '1416', ',', '1417', ',', '1425', 'Why', 'overtime', '?', '6', '.', 'Check', '1428', 'Ralph', "'s", 'What', 'unit', '?', '7', '.', 'Check', '1438', 'Walmart', '?', 'Description', 'unit', '?', 'Try', 'pull', 'together', 'the', 'support', 'these', 'items', 'get', 'back', 'me', '.']
bass-e
	['From', ':', 'Larry', 'Joe', 'Hunter', 'DATE', '

**TRAINING / TESTING SPLIT**

In [231]:
# separate train/test data, define as token lists
train_test_ratio = 0.9 # e.g., for 100 emails, 90 are training and 10 are testing

authors_training_tokenized_emails = {}
authors_testing_tokenized_emails = {}
for author in project_authors:
  emails = authors_tokenized_emails[author]
  num_emails = len(emails)
  training_cutoff = math.floor(train_test_ratio * num_emails)
  training = emails[:training_cutoff]
  testing  = emails[training_cutoff:]
  print(author, num_emails, len(training), len(testing))
  authors_training_tokenized_emails[author] = training
  authors_testing_tokenized_emails[author] = testing

print()
print("all emails:  \t", sum(len(authors_tokenized_emails[author]) for author in project_authors))
print("all training:\t", sum(len(authors_training_tokenized_emails[author]) for author in project_authors))
print("all testing: \t", sum(len(authors_testing_tokenized_emails[author]) for author in project_authors))

allen-p 602 541 61
bass-e 1409 1268 141
buy-r 165 148 17
delainey-d 875 787 88
fossum-d 1099 989 110
arnold-j 814 732 82
beck-s 1093 983 110
campbell-l 200 180 20
badeer-r 52 46 6
brawner-s 145 130 15
carson-m 172 154 18
farmer-d 747 672 75

all emails:  	 7373
all training:	 6630
all testing: 	 743


**PROCESS TRAINING EMAILS**

In [232]:
# for each author, join all email tokens into one list
EMAIL_START_TOK = "EMAIL-START"
EMAIL_END_TOK = "EMAIL-END"

author_tokens = {}
for author in project_authors:
  all_email_tokens = []
  for email_token_list in authors_training_tokenized_emails[author]:
    all_email_tokens.append(EMAIL_START_TOK)
    all_email_tokens.extend(email_token_list)
    all_email_tokens.append(EMAIL_END_TOK)
  author_tokens[author] = all_email_tokens

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors:
  all_email_tokens = author_tokens[author]
  print(author, all_email_tokens[0:20])

SAMPLE
allen-p ['EMAIL-START', 'Here', 'our', 'forecast', 'EMAIL-END', 'EMAIL-START', 'You', 'wrote', 'fewer', 'checks', 'month', '.', 'Spent', 'money', 'Materials', 'less', 'Labor', '.', 'June', 'July']
bass-e ['EMAIL-START', 'From', ':', 'Larry', 'Joe', 'Hunter', 'DATE', 'TIME', 'Can', 'adjust', 'TAGG', 'shortname', 'MIRANTAMEENE', '(', 'currently', 'MIRANTAMEENECAN', ')', '?', 'EMAIL-END', 'EMAIL-START']
buy-r ['EMAIL-START', 'ted.murphy', '@', 'enron.com', ',', 'mark.ruane', '@', 'enron.com', ',', 'steve.young', '@', 'enron.com', 'pam.metoyer', '@', 'enron.com', ',', 'rita.hennessy', '@', 'enron.com', ',']
delainey-d ['EMAIL-START', 'brian.redmond', '@', 'enron.com', ',', 'max.yzaguirre', '@', 'enron.com', ',', 'rob.milnthorp', '@', 'enron.com', 'Guys', ',', 'details', 'the', 'ESA', 'MEH', 'turbines', '-']
fossum-d ['EMAIL-START', 'Dammit', 'Jenkins', ',', "n't", 'even', 'joke', 'stuff', 'like', '!', 'If', 'market', 'ever', 'realized', 'I', 'worked', ',', 'stock', 'would', 'go']
ar

**PROCESSING BIGRAMS**

In [233]:
# for each author, compute bigrams
author_bigrams = {}
for author in project_authors:
  author_bigrams[author] = list(bigrams(author_tokens[author]))

# DEBUG OUTPUT
print("SAMPLE")
for author in project_authors:
  all_bigrams = author_bigrams[author]
  print(author, all_bigrams[0:20])

SAMPLE
allen-p [('EMAIL-START', 'Here'), ('Here', 'our'), ('our', 'forecast'), ('forecast', 'EMAIL-END'), ('EMAIL-END', 'EMAIL-START'), ('EMAIL-START', 'You'), ('You', 'wrote'), ('wrote', 'fewer'), ('fewer', 'checks'), ('checks', 'month'), ('month', '.'), ('.', 'Spent'), ('Spent', 'money'), ('money', 'Materials'), ('Materials', 'less'), ('less', 'Labor'), ('Labor', '.'), ('.', 'June'), ('June', 'July'), ('July', 'August')]
bass-e [('EMAIL-START', 'From'), ('From', ':'), (':', 'Larry'), ('Larry', 'Joe'), ('Joe', 'Hunter'), ('Hunter', 'DATE'), ('DATE', 'TIME'), ('TIME', 'Can'), ('Can', 'adjust'), ('adjust', 'TAGG'), ('TAGG', 'shortname'), ('shortname', 'MIRANTAMEENE'), ('MIRANTAMEENE', '('), ('(', 'currently'), ('currently', 'MIRANTAMEENECAN'), ('MIRANTAMEENECAN', ')'), (')', '?'), ('?', 'EMAIL-END'), ('EMAIL-END', 'EMAIL-START'), ('EMAIL-START', "o'neal.winfree")]
buy-r [('EMAIL-START', 'ted.murphy'), ('ted.murphy', '@'), ('@', 'enron.com'), ('enron.com', ','), (',', 'mark.ruane'), ('ma

In [234]:
# for each author, compute bigram (relative) frequencies

author_bigram_frequencies = {}
for author in project_authors:
  # get count
  bigram_counter = Counter(author_bigrams[author])
  # get rel. freq's
  num_bigrams = sum(bigram_counter.values())
  bigram_rel_frequency = Counter({bigram:freq/num_bigrams for bigram, freq in bigram_counter.items()})
  author_bigram_frequencies[author] = bigram_rel_frequency


print("SAMPLE")
print("Top-20 most common bigrams:")
for author in project_authors:
  bigram_freqs = author_bigram_frequencies[author]
  print(author, bigram_freqs.most_common()[0:20])

print()
print("Middle-X most common bigrams:")
for author in project_authors:
  bigram_freqs = author_bigram_frequencies[author]
  print(author, bigram_freqs.most_common()[100:120])

SAMPLE
Top-20 most common bigrams:
allen-p [(('--', '--'), 0.041531894781928015), (('@', 'ECT'), 0.012480245764839677), (('ECT', ','), 0.011505457338236814), (('EMAIL-END', 'EMAIL-START'), 0.007975541672205237), (('@', 'ENRON'), 0.005878269602847564), (('ENRON', ','), 0.005449953476006912), (('.', 'I'), 0.00508071543562704), (('.', 'EMAIL-END'), 0.004268391746791321), (('*', '*'), 0.00378099753348989), (('@', 'Enron'), 0.0036480718389531363), (('--', '-'), 0.0033526814066492387), (('Enron', ','), 0.0032492947553428744), (('.', 'The'), 0.003219755712112485), (('DATE', 'TIME'), 0.00314590810403651), (('TIME', '--'), 0.0029834433662693665), (('@', 'EES'), 0.002939134801423782), (('EES', ','), 0.002924365279808587), (('@', 'EnronXGate'), 0.002555127239428715), (('EnronXGate', ','), 0.00254035771781352), (('EMAIL-START', 'TIME'), 0.0021120415909728686)]
bass-e [(('--', '--'), 0.03588527915095367), (('>', '>'), 0.02824967190750126), (('*', '*'), 0.011100679008823484), (('@', 'ECT'), 0.010089

**PREDICT AUTHOR FROM TEST DATA**

DEMO, ONE AUTHOR, REUSED TEST/TRAINING DATA

In [235]:
def get_predicted_author_from_tokens(test_email_tokens):
  test_email_bigrams = list(bigrams(test_email_tokens))

  author_test_expected_prob_log = {}
  for author in project_authors:
    bigram_freqs = author_bigram_frequencies[author]
    unseen_bigram_probability = 0.000001 # TODO: calculate this in a more principled way
    # do all probability calculations in log space, avoid min. float value/precision
    email_probability_log = math.log(1.0)
    # print(author, "\tunseen bigram freq:", unseen_bigram_probability, "\tleast common bigram freq:", bigram_freqs.most_common()[-1][1])
    for bigram in test_email_bigrams:
      bigram_probability = bigram_freqs.get(bigram) if bigram in bigram_freqs else unseen_bigram_probability
      # print(email_probability_log, '+', bigram_probability)
      email_probability_log += math.log(bigram_probability)
    author_test_expected_prob_log[author] = email_probability_log

  most_likely_author_and_prob = [None, -math.inf] # {author: probability}
  for author in project_authors:
    predicted_prob = author_test_expected_prob_log[author]
    if predicted_prob > most_likely_author_and_prob[1]:
      most_likely_author_and_prob[0] = author
      most_likely_author_and_prob[1] = predicted_prob

  # DEBUG OUTPUT (function internal)
  # print(test_email_tokens)
  # for author in project_authors:
    # print(author, author_test_expected_prob_log[author])

  return most_likely_author_and_prob

def get_predicted_author(test_email):
  test_email_tokenized = nltk.word_tokenize(test_email)
  prediction = get_predicted_author_from_tokens(test_email_tokenized)
  return prediction

test_email_tokens = authors_testing_tokenized_emails['allen-p'][1]
print("predicted author (from tokens w/ cleanup):\t", get_predicted_author_from_tokens(test_email_tokens))
# test_email = authors_emails['allen-p'][1]
# print("predicted author (raw email):\t\t\t", get_predicted_author(test_email))


predicted author (from tokens w/ cleanup):	 ['allen-p', -1134.1135390384936]


In [236]:
# get predictions for all emails (TODO: only check test emails)
authors_predictions = {}
for author in project_authors:
  email_predictions = []
  for email in authors_testing_tokenized_emails[author]:
    predicted_author = get_predicted_author_from_tokens(email)[0]
    email_predictions.append(predicted_author)
  authors_predictions[author] = email_predictions

print("SAMPLE")
print("emails...")
for author in project_authors:
  print("written by:", author)
  for prediction in authors_predictions[author][0:5]:
    print("\tpredicted author:", prediction)
print("(each line is a new email)")

SAMPLE
emails...
written by: allen-p
	predicted author: allen-p
	predicted author: allen-p
	predicted author: beck-s
	predicted author: farmer-d
	predicted author: allen-p
written by: bass-e
	predicted author: bass-e
	predicted author: arnold-j
	predicted author: beck-s
	predicted author: bass-e
	predicted author: farmer-d
written by: buy-r
	predicted author: delainey-d
	predicted author: farmer-d
	predicted author: buy-r
	predicted author: allen-p
	predicted author: buy-r
written by: delainey-d
	predicted author: delainey-d
	predicted author: delainey-d
	predicted author: fossum-d
	predicted author: delainey-d
	predicted author: delainey-d
written by: fossum-d
	predicted author: fossum-d
	predicted author: fossum-d
	predicted author: fossum-d
	predicted author: beck-s
	predicted author: fossum-d
written by: arnold-j
	predicted author: arnold-j
	predicted author: beck-s
	predicted author: farmer-d
	predicted author: bass-e
	predicted author: arnold-j
written by: beck-s
	predicted autho

**TEST PRECISION & RECALL**

In [237]:
# determine precision, recall for each author
num_emails_by_author = {}
num_true_positives_by_author = {}
num_total_positives_by_author = {}

# calc. num_emails_by_author, num_emails_total
for author in project_authors:
  num_emails_by_author[author] = len(authors_testing_tokenized_emails[author])
num_emails_total = sum(num_emails_by_author.values())

# initialize positive counts
for author in project_authors:
  num_true_positives_by_author[author] = 0
  num_total_positives_by_author[author] = 0
# calc. num positives
for author in project_authors:
  for predicted_auth in authors_predictions[author]:
    if author == predicted_auth:
      num_true_positives_by_author[predicted_auth]+= 1
    num_total_positives_by_author[predicted_auth] += 1

# derive precision, by author (true_pos / tot_pos)
authors_precision = {}
for author in project_authors:
  num_true_pos = num_true_positives_by_author[author]
  num_tot_pos = num_total_positives_by_author[author]
  precision = num_true_pos / num_tot_pos if num_tot_pos != 0 else 0
  authors_precision[author] = precision

# derive recall, by author (true_pos / num_by_auth)
authors_recall = {}
for author in project_authors:
  num_true_pos = num_true_positives_by_author[author]
  num_by_auth = num_emails_by_author[author]
  recall = num_true_pos / num_by_auth if num_by_auth != 0 else 0
  authors_recall[author] = recall


print("num test emails    \t", num_emails_total)
print("num_by_author      \t", num_emails_by_author)
print('num_true_positives \t', num_true_positives_by_author)
print('num_total_positives\t', num_total_positives_by_author)
print()
print('precision')
for entry in authors_precision.items():
  print("\t", entry)
print('recall')
for entry in authors_recall.items():
  print("\t", entry)

num test emails    	 743
num_by_author      	 {'allen-p': 61, 'bass-e': 141, 'buy-r': 17, 'delainey-d': 88, 'fossum-d': 110, 'arnold-j': 82, 'beck-s': 110, 'campbell-l': 20, 'badeer-r': 6, 'brawner-s': 15, 'carson-m': 18, 'farmer-d': 75}
num_true_positives 	 {'allen-p': 40, 'bass-e': 114, 'buy-r': 5, 'delainey-d': 74, 'fossum-d': 105, 'arnold-j': 36, 'beck-s': 100, 'campbell-l': 11, 'badeer-r': 2, 'brawner-s': 7, 'carson-m': 8, 'farmer-d': 65}
num_total_positives	 {'allen-p': 62, 'bass-e': 137, 'buy-r': 8, 'delainey-d': 93, 'fossum-d': 120, 'arnold-j': 40, 'beck-s': 135, 'campbell-l': 13, 'badeer-r': 4, 'brawner-s': 10, 'carson-m': 26, 'farmer-d': 95}

precision
	 ('allen-p', 0.6451612903225806)
	 ('bass-e', 0.8321167883211679)
	 ('buy-r', 0.625)
	 ('delainey-d', 0.7956989247311828)
	 ('fossum-d', 0.875)
	 ('arnold-j', 0.9)
	 ('beck-s', 0.7407407407407407)
	 ('campbell-l', 0.8461538461538461)
	 ('badeer-r', 0.5)
	 ('brawner-s', 0.7)
	 ('carson-m', 0.3076923076923077)
	 ('farmer-d', 0.6

In [238]:
# determine precision, recall across all authors

all_true_pos = sum(num_true_positives_by_author.values()) 
all_pos = sum(num_total_positives_by_author.values())
num_emails_total # defined in previous block

# overall precision
precision = all_true_pos / all_pos
print(f"all-author precision:\t {precision} = {all_true_pos} / {all_pos}")

#overall recall
recall = all_true_pos / num_emails_total
print(f"all-author recall:   \t {recall} = {all_true_pos} / {num_emails_total}")

print("(These numbers are the same because, across all authors, every email is a 'positive' result for someone; i.e. 'all_pos' == 'num_emails_total')")
print("(This program always guesses that the author is a known author.)")


all-author precision:	 0.7631224764468372 = 567 / 743
all-author recall:   	 0.7631224764468372 = 567 / 743
(These numbers are the same because, across all authors, every email is a 'positive' result for someone; i.e. 'all_pos' == 'num_emails_total')
(This program always guesses that the author is a known author.)
