# Code for Chapter 4 

In this case study we will attempt to write a "priority inbox" algorithm for ranking email by some measures of importance. We will define these measures based on a set of email features, which moves beyond the simple word counts used in Chapter 3.

Set the global paths

In [None]:
import os

# Directory holding the messages, and the relative path of every entry in it
dataSpamDir = '../03-Classification/data/easy_ham/'
mailPaths = [dataSpamDir + fileName for fileName in os.listdir(dataSpamDir)]

We define a set of functions that extract the data for the feature set we have defined to rank email importance. This includes the following: message body, message source, message subject, and the date the message was sent.

In [None]:
# Returns the lines of a given email message
def readMsg(path):
    """Read the email file at ``path`` and return its content as a list of lines.

    The file is decoded as latin-1 and each returned line keeps its trailing
    newline. The file is opened in a ``with`` block so the handle is closed
    as soon as the lines have been read.
    """
    with open(path, encoding="latin-1") as msgFile:
        return msgFile.readlines()

In [None]:
# Similar to the function from Chapter 3, this returns only the message body for a given email.
def getBodyMsg(lines):
    """Return the body of a message given as a list of lines.

    The body is everything after the first blank line (a line that is exactly
    ``'\\n'``), joined back into one string. A message without such a
    separator line has no body, so an empty string is returned instead of
    letting ``list.index`` raise ``ValueError``.
    """
    try:
        separatorIndex = lines.index('\n')
    except ValueError:
        # No blank line: the message consists of headers only.
        return ''
    return ''.join(lines[separatorIndex + 1:])

In [None]:
# Returns the email address of the sender for a given email message
import re

def getSendersEmail(lines):
    """Return the lower-cased sender address found on the first ``From:`` line.

    Only the first line starting with ``From:`` is inspected. An empty string
    is returned when there is no such line, or when that line contains
    nothing that looks like an email address (previously this case raised
    ``AttributeError`` because the failed regex match was used unchecked).
    """
    for line in lines:
        if line.startswith('From:'):
            match = re.search(r'[\w\.-]+@[\w\.-]+', line)
            return match.group(0).lower() if match else ''

    return ''

In [None]:
# Returns the subject string for a given email message
def getSubject(lines):
    """Return the lower-cased, stripped text of the first ``Subject:`` line.

    An empty string is returned when no line starts with ``Subject:``.
    """
    marker = 'Subject:'
    firstHit = next((ln for ln in lines if ln.startswith(marker)), None)
    if firstHit is None:
        return ''
    return firstHit[len(marker):].strip().lower()

In [None]:
# Returns the date a given email message was sent
from datetime import datetime
import re

def tryParsingDate(text):
    """Parse a date header value into a ``datetime``.

    Everything from the last occurrence of each known trailing marker is cut
    off (the markers are lower-case, so ``text`` is expected to be lower-cased
    already), then the remaining string is tried against each supported
    format in turn. The first successful parse is returned.

    Raises:
        ValueError: if none of the formats matches.
    """
    # Applied one after another, in this exact order.
    trailingMarkers = (
        '(edt)', '(cest)', '(pdt)', '(bst)', '(ist)', '(cdt)', '(est)',
        '(eest)', '(msd)', '(gmt)', '(pst)', 'ut', 'edt',
    )
    knownFormats = (
        '%a, %d %b %Y %H:%M:%S %z',
        '%a, %d %b %Y %H:%M:%S %Z',
        '%d %b %Y %H:%M:%S %z',
        '%a, %d %b %Y %H:%M:%S',
    )

    date = text
    for marker in trailingMarkers:
        date = date.rsplit(marker, 1)[0].strip()

    for fmt in knownFormats:
        try:
            parsed = datetime.strptime(date, fmt)
        except ValueError:
            continue
        return parsed
    raise ValueError('no valid date format found ', date)

def getDate(lines):
    """Return the parsed date of a message, or ``''`` when none is found.

    The first line starting with ``Date:`` or ``X-Original-Date:`` supplies
    the value; it is stripped, lower-cased and handed to ``tryParsingDate``.
    If no such line exists, or its value is empty, an empty string is
    returned.
    """
    headerNames = ('Date:', 'X-Original-Date:')
    rawDate = ''
    for line in lines:
        matched = next((name for name in headerNames if line.startswith(name)), None)
        if matched is not None:
            rawDate = line[len(matched):].strip()
            break

    if not rawDate:
        return ''

    return tryParsingDate(rawDate.lower())

## Create DataFrame with data

In [None]:
# This loop ties all of the above helper functions together.
# It builds a DataFrame with one row per message, containing the feature set
# used to categorize messages as priority or normal HAM
import pandas as pd

# Collect one record per message and build the DataFrame in a single step.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame
# on every call, which made this loop quadratic in the number of messages.
records = []

for mailPath in mailPaths:
    # Skip the notebook checkpoint directory; it is not an email message.
    if '.ipynb_checkpoints' in mailPath:
        continue
    msgLines = readMsg(mailPath)
    date = getDate(msgLines)
    email = getSendersEmail(msgLines)
    subject = getSubject(msgLines)
    bodyMsg = getBodyMsg(msgLines)
    records.append({'Date': date, 'Email': email, 'Subject': subject, 'Body': bodyMsg, 'Path': mailPath})

# Passing `columns` keeps the column order fixed even when there are no records.
df = pd.DataFrame(records, columns=['Date', 'Email', 'Subject', 'Body', 'Path'])

In [None]:
# Parse every date as a UTC timestamp, then put the messages in chronological order
df['Date'] = pd.to_datetime(df['Date'], utc=True)
df = df.sort_values('Date')

In [None]:
# Preview the first rows of the message DataFrame
df.head()

In [None]:
# (number of messages, number of columns)
df.shape

Create train and test dataset

In [None]:
# We will use the first half of the messages in df to train our priority inbox algorithm.
# Later, we will use the second half to test it.
import numpy as np

# Split the messages in half by position: the first `rows` messages form the
# training set and all remaining ones the test set. The test slice starts at
# `rows` (not `rows + 1`) so that no message is silently dropped.
rows = len(df) // 2
# Explicit copies keep later column assignments from touching `df`.
df_train = df.iloc[:rows].copy()
df_test = df.iloc[rows:].copy()

print([df_train.shape, df_test.shape])

Group messages by thread

In [None]:
def cleanSubject(subject):
    """Return ``subject`` without a single leading ``'re: '`` reply prefix.

    Subjects that do not start with the prefix are returned unchanged.
    (The original fell through without a ``return`` in that case and yielded
    ``None``, so every non-reply subject collapsed into one value.)
    The prefix is lower-case, so ``subject`` is expected to be lower-cased.
    """
    replyPrefix = 're: '
    if subject.startswith(replyPrefix):
        return subject[len(replyPrefix):]
    return subject
    
        
# Normalise each subject so that replies share the subject of their thread
df_train['Clean_Subject'] = df_train.apply(lambda mail: cleanSubject(mail['Subject']), axis=1)

# Number the distinct subjects in order of first appearance ...
uniqueSubjects = pd.DataFrame({'Clean_Subject': df_train['Clean_Subject'].unique()})
uniqueSubjects['Thread_Index'] = uniqueSubjects.index

# ... and attach that number to every message as its thread index
df_train = df_train.merge(uniqueSubjects, on='Clean_Subject')

Group messages by mail

In [None]:
# Number the distinct sender addresses in order of first appearance
uniqueMails = pd.DataFrame({'Email': df_train['Email'].unique()})
uniqueMails['Email_Index'] = uniqueMails.index

# Attach that number to every message as its sender index
df_train = df_train.merge(uniqueMails, on='Email')

Show thread and email popularity

In [None]:
# The ten most frequent thread subjects
df_train['Clean_Subject'].value_counts().head(10)

In [None]:
# The ten most frequent sender addresses
df_train['Email'].value_counts().head(10)

Calculating the length of each thread 

In [None]:
# Length of each thread: seconds between its earliest and its latest message.
# A single groupby replaces the per-thread filtering loop and the row-by-row
# DataFrame.append, which was removed in pandas 2.0.
# sort=False keeps the threads in order of first appearance, like unique() did.
thread_dates = df_train.groupby('Thread_Index', sort=False)['Date']
thread_spans = thread_dates.max() - thread_dates.min()

df_threads = pd.DataFrame({
    'Thread_Index': thread_spans.index,
    'Length': thread_spans.dt.total_seconds().to_numpy(),
})

df_threads.sort_values(by=['Length'], ascending = False).head()

Calculating the number of mails per second in each thread

In [None]:
# Number of messages in each thread, indexed by Thread_Index
mails_in_threads = df_train['Thread_Index'].value_counts()

def calculateMailsPerSecond(row):
    """Return the message rate (messages per second) of one thread.

    ``row`` must provide ``'Thread_Index'`` and ``'Length'`` (the thread
    duration in seconds). The rate is the thread's message count, looked up
    in the module-level ``mails_in_threads``, divided by that length.
    Returns 0 for thread index 0 and for threads of zero length.
    """
    thread_index = row['Thread_Index']
    thread_length = row['Length']

    # A zero length would divide by zero.
    # NOTE(review): thread index 0 is also forced to a rate of 0; the reason
    # is not visible here -- confirm this special case is still intended.
    if (thread_index == 0) or (thread_length == 0):
        return 0

    return mails_in_threads[thread_index] / thread_length
    
        
# Compute the message rate of every thread, row by row, and preview the result
df_threads['Mails_Per_Second'] = df_threads.apply(calculateMailsPerSecond, axis=1)
df_threads.head()

## Calculating scoring
The score is the product of two other scores:
 - the mail popularity score: the more often a sender's address appears, the higher the score,
 - the thread popularity score: the more messages per second a thread contains, the higher the score.
 
We add 1 to each base value before taking the logarithm, because the logarithm of 0 is undefined. Additionally, we add 10 to each logarithm, because the logarithm is negative for values in (0, 1).

In [None]:
import math
# Number of training messages sent from each address, indexed by address
mails_popularity = df_train['Email'].value_counts()

def calculateScore(row):
    """Return the priority score of one message.

    The score is the product of two factors, each of the form
    ``10 + ln(value + 1)``:

    * the number of messages sent from the message's address
      (looked up in the module-level ``mails_popularity``), and
    * the messages-per-second rate of the message's thread
      (looked up in the module-level ``df_threads``).

    ``row`` must provide ``'Email'`` and ``'Thread_Index'``.
    """
    mail_popularity = mails_popularity[row['Email']]
    mail_score = 10 + math.log(mail_popularity + 1)

    thread_rates = df_threads.loc[
        df_threads['Thread_Index'] == row['Thread_Index'], 'Mails_Per_Second'
    ]
    # Take the scalar explicitly: handing a one-element Series to math.log
    # relies on an implicit float(Series) conversion that pandas deprecated.
    mails_per_second = float(thread_rates.iloc[0])
    mails_per_second_score = 10 + math.log(mails_per_second + 1)

    return mail_score * mails_per_second_score

# Score every training message, row by row
df_train['Score'] = df_train.apply(calculateScore, axis=1)

Show the result. Messages that belong to an active thread and come from a frequent sender get a higher score.

In [None]:
# The highest-scoring training messages
df_train.sort_values('Score', ascending=False).head()

## To be continued