In [None]:
%matplotlib inline
import re
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob


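Input format: the raw data must be a tab-separated file with at least the following columns
- Timestamp (header == 'SubmissionDate')
- Email thread text (header == 'Emails')
- Request identifier (header == 'RequestID'), used below for the per-month email counts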


In [None]:
# Load the tab-separated email export and index the rows by submission time
# so they can be resampled per calendar month.
df = pd.read_table("./raw_data_tfidf.txt")
df['SubmissionDate'] = pd.to_datetime(df['SubmissionDate'])
df.index = df['SubmissionDate']

# Non-null email threads per month, plotted as a quick look at the volume.
total_cts_per_mth = df['Emails'].resample('M').count().to_frame(name='mth_counts')
total_cts_per_mth.plot();

In [None]:
# Single-word triggers grouped by the category label they count towards.
# ('negctrl' presumably stands for "negative control" -- TODO confirm.)
_category_terms = {
    'training': ('dusa', 'utrain'),
    'ldrd': ('ldrd',),
    'cui': ('ouo', 'cui'),
    'dc': ('dc',),
    'author': ('author',),
    'negctrl': ('coffee',),
    'polite': ('please', 'thank', 'thanks'),
}

# Flattened lookup used by textblob_run: lower-cased token -> category label.
keywords = {
    term: label
    for label, terms in _category_terms.items()
    for term in terms
}

# Two-word phrases -> category label.
cat = dict([('cost code', 'ldrd')])

# Tokens containing 'X', then 'W' or 'X', then two characters from A-Z / 0-9
# are counted towards the 'ldrd' category in textblob_run.
ldrd_cc_pattern = 'X(W|X)([A-Z0-9]{2})'
ldrd_cc = re.compile(ldrd_cc_pattern)

## textblob -- keyword and bigram category flags for one email thread
def textblob_run(df):
    """Flag which keyword categories occur in a single email thread.

    Meant to be used as ``frame.apply(textblob_run, axis=1)``: despite its
    name, ``df`` is one row (a Series) holding the thread text under
    ``'Emails'``. Relies on the module-level ``keywords``, ``cat`` and
    ``ldrd_cc`` lookups.

    Returns a copy of the row with these entries added:
      target_kw   -- comma-separated keywords found (lower-cased, each once,
                     in order of first appearance)
      negctrl_kw  -- comma-separated 'negctrl' keywords found
      negctrl_ct, polite_ct, training_counts, ldrd_counts, author_counts,
      cui_counts, dc_counts, urgent_ct
                  -- booleans, True when the category was hit at least once

    ``urgent_ct`` is set when the word "by" is directly followed by a token
    that TextBlob's part-of-speech tagger labels 'CD' (cardinal number).
    """
    text = df['Emails']
    # A missing thread arrives as NaN (a float), which TextBlob rejects.
    email_threads = TextBlob('' if pd.isna(text) else str(text))

    count = dict.fromkeys(
        ('negctrl', 'polite', 'ldrd', 'training', 'cui', 'author', 'dc', 'urgent'),
        0,
    )
    kw = []
    neg = []
    seen = set()

    # keywords -- dict.fromkeys de-duplicates tokens like set() would, but
    # keeps first-appearance order so the joined strings are reproducible.
    for word in dict.fromkeys(email_threads.words):
        wl = word.lower()
        if wl in keywords:
            if wl in seen:
                # 'Please' and 'please' are the same keyword; report it once.
                continue
            seen.add(wl)
            label = keywords[wl]
            count[label] += 1
            if label == 'negctrl':
                neg.append(wl)
            else:
                kw.append(wl)
        elif word.startswith('LA-CP-'):
            count['cui'] += 1
        elif ldrd_cc.search(word):
            count['ldrd'] += 1

    # ngrams
    for bigrams in email_threads.ngrams(n=2):
        # Lower-cased so phrase matching is case-insensitive like the
        # single-keyword matching above.
        bg = " ".join(bigrams).lower()
        label = cat.get(bg)
        if label in count:
            count[label] += 1
        # Compare the first *word* of the pair; indexing the joined string
        # would only yield its first character and could never equal 'by'.
        if bigrams[0].lower() == 'by':
            tags = TextBlob(str(bigrams[1])).tags
            # tags can be empty when the token is dropped by the tagger.
            if tags and tags[0][1] == 'CD':
                count['urgent'] += 1

    # return -- work on a copy so the row handed in by apply() is not mutated
    row = df.copy()
    row['target_kw'] = ", ".join(kw)
    row['negctrl_kw'] = ", ".join(neg)
    row['negctrl_ct'] = count['negctrl'] > 0
    row['polite_ct'] = count['polite'] > 0
    row['training_counts'] = count['training'] > 0
    row['ldrd_counts'] = count['ldrd'] > 0
    row['author_counts'] = count['author'] > 0
    row['cui_counts'] = count['cui'] > 0
    row['dc_counts'] = count['dc'] > 0
    row['urgent_ct'] = count['urgent'] > 0
    return row



In [None]:
# Annotate every row with the keyword/category flags (one textblob_run call per row).
df = df.apply(textblob_run, axis='columns')
# Optional: persist the annotated table.
#df.to_csv('./raw_data_nlp.txt', sep="\t")

In [None]:
def _bool_to_int(x):
    """Map True -> 1 and False -> 0; return every other value unchanged."""
    # Deliberately `==` rather than `is`, matching the original comparisons
    # (so NumPy booleans and numeric 1/0 are handled the same way as before).
    if x == True:  # noqa: E712
        return 1
    if x == False:  # noqa: E712
        return 0
    return x


# One element-wise pass instead of two. DataFrame.applymap was renamed to
# DataFrame.map in pandas 2.1 (applymap now warns), so prefer the new name
# and fall back on older pandas. Looked up on the class so a column that
# happens to be called 'map' cannot shadow the method.
_elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
df = _elementwise(df, _bool_to_int)
df.head()

In [None]:
# Flag columns produced by textblob_run that are summed into monthly totals.
col_to_resample_ct = [
    'negctrl_ct',
    'polite_ct',
    'training_counts',
    'ldrd_counts',
    'author_counts',
    'cui_counts',
    'dc_counts',
]

## file containing the number of records per day
## tab separated, with <recordID><status record><timestamp>
recordsfile = "recID-status-ts.txt"
recdf = pd.read_table(recordsfile)
recdf['datetm'] = pd.to_datetime(recdf['datetm'])
recdf.index = recdf['datetm']

## create matrix of data per month: one summed column per flag
cts_per_mth = pd.DataFrame(
    {name: df[name].resample('M').sum() for name in col_to_resample_ct}
)

# Monthly counts of non-null request IDs and of record statuses; both are
# aligned to the month index established by the flag columns above.
cts_per_mth['all_emails'] = df['RequestID'].resample('M').count()
cts_per_mth['rassti_rec'] = recdf['status'].resample('M').count()

#cts_per_mth.to_csv("./counts_per_month.txt", sep="\t")
cts_per_mth.head()