In [1]:
from dateutil import parser
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import random
import json

In [2]:
def processData(filename, separator):
    data = pd.read_csv(filename, sep=separator) 
    data["Timestamp"] = [parser.parse(x) for x in data["Timestamp"]]
    data.sort_values(by=['Sender', 'Timestamp'])
    return data

In [3]:
def extractEmail(sender):
    if '<' in sender:
        return re.search(r'<(.+)>', sender).group(1)
    return sender

In [4]:
def isNew(df, idx1, idx2):
    if df['Sender'][idx1] != df['Sender'][idx2]:
        return True
    if (df['Timestamp'][idx2] - df['Timestamp'][idx1]).total_seconds() > 40:
        return True
    return False

In [5]:
def addEmbedding(senders, sender, transitions):
    if sender not in senders:
        senders[sender] = []
    senders[sender].append(transitions)

In [6]:
def analyzeSenderActivity(df):
    senders = {}
    prevIdx = df.index[0]
    transitions = [df['Action'][0]]


    for currIdx in df.index[1:]:
        sender = extractEmail(df['Sender'][currIdx])
        
        if isNew(df, prevIdx, currIdx):
            if prevSender not in senders:
                senders[prevSender] = []
            # senders[prevSender].append(transitions)
            senders[prevSender].append(" -> ".join(transitions))
            transitions = []
            
        action = df['Action'][currIdx]
        if action == 'DURATION':
            transitions.append('OPEN')
            transitions.append('CLOSED')
            transitions.append('DURATION')
        elif action == 'ENGAGED' and action in transitions:
            continue
        else:
            transitions.append(action)

        prevIdx = currIdx
        prevSender = sender

    return senders

In [7]:
def stripToLastActivity(senders):
    lastUpdated = {}
    for key, value in senders.items():
        lastUpdated[key] = value[-1]
        
    return lastUpdated

In [8]:
def stripMostCommonActivity(senders):
    lastUpdated = {}
    for key, value in senders.items():
        lastUpdated[key] = max(set(value), key=value.count)
        
    return lastUpdated

In [9]:
def cluster(d):
    flipped = {} 

    for key, value in d.items(): 
        if value not in flipped: 
            flipped[value] = [key] 
        else: 
            flipped[value].append(key) 

    return flipped

In [10]:
# df = processData('tmp.csv', '{')
df = processData('tabbed.csv', '\t')
senders = analyzeSenderActivity(df)
print(json.dumps(senders, indent=1))

{
 "messages-noreply@linkedin.com": [
  "FROM -> SUBJECT -> DATE -> OPEN -> CLOSED -> DURATION -> ENGAGED",
  "FROM -> DELETE"
 ],
 "chestnutsquare@studenthousing.com": [
  "FROM",
  "SUBJECT -> DATE -> ENGAGED",
  "FROM -> DELETE",
  "FROM -> DELETE"
 ],
 "no-reply@jumpstart.me": [
  "FROM -> DELETE",
  "FROM",
  "DELETE",
  "FROM -> DELETE"
 ],
 "paypal@mail.paypal.com": [
  "FROM -> SUBJECT -> DATE -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE"
 ],
 "OliveGarden@e.olivegarden.com": [
  "FROM -> DATE -> SUBJECT",
  "OPEN -> CLOSED -> DURATION -> ENGAGED",
  "DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE"
 ],
 "mail@mail.adobe.com": [
  "FROM -> DELETE",
  "FROM -> DELETE",
  "FROM -> DELETE"
 ],
 "calendar-notification@google.com": [
  "FROM -> SUBJECT -> OPEN -> CLOSED -> DURATION -> ENGAGED -> IMPORTANT -> FROM -> SUBJECT -> OPEN -> CLOSED -> DURATION -> IMPO

In [12]:
def calculate_probabilities(actions):
    count = len(actions)

    hist_freq = {}
    hist_discounted = {}
    weighted_probability = {}

    for idx, action in enumerate(actions):
        hist_freq = calculate_histogram(hist_freq, action)
        hist_discounted = calculate_discounted_histogram(hist_discounted, action, idx, count)  

    for action in list(dict.fromkeys(actions)):
        weighted_probability[action] = hist_freq[action] + hist_discounted[action]

    normalized_weighted_probability = calculate_normalized_histogram(weighted_probability)
    return normalized_weighted_probability



def calculate_histogram(prev_hist, action):
    if action in prev_hist:
        prev_hist[action] += 1
    else:
        prev_hist[action] = 1
    return prev_hist

def calculate_discounted_histogram(prev_hist, action, loc, total):
    weight = (loc+1) / total

    if action in prev_hist:
        prev_hist[action] += weight
    else:
        prev_hist[action] = weight
    return prev_hist

def calculate_normalized_histogram(hist):
    total_count = sum(hist.values())
    normalized_hist = {}

    for action, freq in hist.items():
        normalized_hist[action] = freq / total_count

    return normalized_hist

In [13]:
def classify(predicted_distribution):
    max_k = None
    max_v = 0
    
    for k, v in predicted_distribution.items():
        if v > 0.5:
            return k
        if v > max_v:
            max_v = v
            max_k = k
    return max_k

In [14]:
def classify_rest_emails(example, starting_index=1):
    total_correct = 0
    total_classified = 0
    total_wrong = 0
    
    for i in range(starting_index, len(example)):
        train_example = example[:i]
        predicted_distribution = calculate_probabilities(train_example)
        classified_result = classify(predicted_distribution)
        
        ## Check if classified result is correct
        # print("{} == {}".format(classified_result, example[i]))
        total_classified += 1
        if classified_result == example[i]:
            total_correct += 1
        else:
            total_wrong += 1
    
    return total_classified, total_correct, total_wrong

In [15]:
def classify_all_senders(senders, history):
    classified_senders = 0
    correct_senders = 0
    wrong_senders = 0
    
    classify_senders = {}
    for sender in senders:
        if len(senders[sender]) > history + 1:
            classified_senders += 1
            classify_senders[sender] = senders[sender]

    total_classified = 0
    total_correct = 0
    total_wrong = 0

    for sender in classify_senders:
        count, corr, wrong = classify_rest_emails(classify_senders[sender], history)
        total_classified += count
        total_correct += corr
        total_wrong += wrong
        
        if corr == count:
            correct_senders += 1
        elif wrong == count:
            wrong_senders += 1
            
        
    return total_classified, total_correct, total_wrong, classified_senders, correct_senders, wrong_senders

In [16]:
for i in range(10):
    total_classified, total_correct, total_wrong, classified_senders, correct_senders, wrong_senders = classify_all_senders(senders,i)
    print("\nUsing History No. {}".format(i))
    print("Accuracy: {0:10.4f}".format(total_correct / total_classified))
    print("Completely Correct: {0:10.4f}".format(correct_senders / classified_senders))
    print("Completely Incorrect: {0:10.4f}".format(wrong_senders / classified_senders))
#     print("Total: {}\nCorrect: {}\nWrong: {}".format(total_classified, total_correct, total_wrong))


Using History No. 0
Accuracy:     0.5976
Completely Correct:     0.0000
Completely Incorrect:     0.2627

Using History No. 1
Accuracy:     0.7361
Completely Correct:     0.3514
Completely Incorrect:     0.1892

Using History No. 2
Accuracy:     0.7946
Completely Correct:     0.3958
Completely Incorrect:     0.1042

Using History No. 3
Accuracy:     0.8242
Completely Correct:     0.4359
Completely Incorrect:     0.1026

Using History No. 4
Accuracy:     0.8605
Completely Correct:     0.5625
Completely Incorrect:     0.0312

Using History No. 5
Accuracy:     0.8593
Completely Correct:     0.5385
Completely Incorrect:     0.0000

Using History No. 6
Accuracy:     0.8462
Completely Correct:     0.5217
Completely Incorrect:     0.0435

Using History No. 7
Accuracy:     0.8373
Completely Correct:     0.5238
Completely Incorrect:     0.0476

Using History No. 8
Accuracy:     0.8315
Completely Correct:     0.4706
Completely Incorrect:     0.0588

Using History No. 9
Accuracy:     0.8323
Comp