In [None]:
%matplotlib inline
import enchant
import nltk
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

#nltk.download('stopwords')
from nltk.corpus import stopwords

The raw input data should be a tab-separated file with 2 columns:
- Timestamp (header == 'SubmissionDate')
- Email thread text (header == 'Emails')


In [None]:
## Load the raw e-mail export and index every row by its submission time
df = pd.read_table("./raw_data_tfidf.txt")
df['SubmissionDate'] = pd.to_datetime(df['SubmissionDate'])
df.index = df['SubmissionDate']

## Concatenate ("; "-separated) all e-mail texts that fall in the same month
monthly_emails = df.groupby(pd.Grouper(freq="M"))['Emails'].apply("; ".join)
dfm = pd.DataFrame()
dfm['Emails'] = monthly_emails

# Persist the one-row-per-month table as a tab-separated file
dfm.to_csv('./raw_data_tfidf_month.txt', sep="\t")

In [None]:
ok_words = []
d = enchant.Dict("en_US")
def cleanup_text_first_pass(df):
    """Split one row's e-mail text into kept, removed and to-be-checked tokens.

    Intended to be used with ``DataFrame.apply(..., axis=1)``: ``df`` is a
    single row exposing an ``'Emails'`` text field.

    Each token produced by TextBlob is classified as follows:

    * not recognised by the ``en_US`` dictionary ``d`` -> ``tocheck``
      (stored lower-cased), regardless of the other two tests;
    * otherwise an English stop word or a pure number -> ``filteredwords``;
    * otherwise -> ``processedEmails``.

    Returns the same row with the three space-joined fields
    ``processedEmails``, ``filteredwords`` and ``tocheck`` added.
    """
    # Load the stop-word list once per call and use a set: the original
    # re-read the corpus list and scanned it linearly for every token.
    english_stopwords = set(stopwords.words('english'))

    email_texts = TextBlob(df['Emails'])
    to_keep = []
    to_rm = []
    to_check = []
    for x in email_texts.words:
        lowered = x.lower()
        stripped = x.strip()
        if not d.check(stripped.lower()):
            # Unknown to the dictionary: may be an acronym or jargon, so it
            # is set aside for manual review rather than dropped.
            to_check.append(lowered)
        elif lowered in english_stopwords or stripped.isdigit():
            to_rm.append(x)
        else:
            to_keep.append(x)
    df['processedEmails'] = ' '.join(to_keep)
    df['filteredwords'] = ' '.join(to_rm)
    df['tocheck'] = ' '.join(to_check)
    return df

In [None]:
# First cleaning pass: run cleanup_text_first_pass on every monthly row,
# then write the annotated table out for inspection.
dfm_preprocessed = dfm.apply(cleanup_text_first_pass, axis='columns')
dfm_preprocessed.to_csv(path_or_buf="./tf_idf_month_preprocessedtext.txt", sep="\t")

Before going further, we need to double-check some of the tokens, as acronyms and other actual words might have been filtered out. This process can be partly automated by looking at the overall frequency of the excluded tokens.

In [None]:
### Get data for tokens to double check:
# Pool the "tocheck" tokens of every month into one space-separated string,
# then split it back into individual tokens (computed once and reused).
all_to_check = ' '.join(dfm_preprocessed.tocheck)
all_words_to_check = all_to_check.split(' ')
check_words = TextBlob(all_to_check)
tot_check = len(all_words_to_check)
print("Total number of tokens: ", tot_check)
print("Number of unique tokens: ", len(set(all_words_to_check)))
#34170
#179460

# Count occurrences of every non-blank token.
# (The original loop indexed an undefined name `all_words`, which raised a
# NameError on a fresh kernel; the token list is `all_words_to_check`.)
ct_words = {}
for x in all_words_to_check:
    if x.strip():
        ct_words[x] = ct_words.get(x, 0) + 1

print("count done --", len(ct_words), " (should be number of unique tokens).")

# Relative frequency of each token with respect to the total token count
freq_words = {}
for uw in ct_words:
    freq_words[uw] = ct_words[uw] / tot_check

# sort tokens by frequency, store in new DF
sorted_freq_words = sorted(freq_words.items(), key=lambda t: t[1], reverse=True)
sorteddf = pd.DataFrame(sorted_freq_words)
sorteddf.columns = ['token', 'freq']

In [None]:
## Histogram of the token frequencies, with a log-scaled count axis
fig, ax = plt.subplots()
ax.hist(sorteddf['freq'], bins=100)
ax.set_yscale('log')
plt.show()

In [None]:
## Frequency cut-off picked from the distribution above
## -- we double-checked tokens appearing > 2 times
threshold_freq = 0.0001
is_frequent = sorteddf['freq'] > threshold_freq
over_couple_time_tokens = sorteddf.loc[is_frequent]
over_couple_time_tokens.shape
# Tokens above the cut-off are written out for manual review
over_couple_time_tokens.to_csv(path_or_buf="words_to_check.tsv", sep="\t")

`words_to_check.tsv` contains the tokens that need to be visually inspected. Tokens that should ultimately be re-introduced can be saved in "ok_words.txt", which is then read in so that those tokens are included in the analysis.

In [None]:
# Read the manually approved tokens: one per line, surrounding whitespace removed
ok_words = []
with open('./ok_words.txt', 'r') as ow:
    ok_words.extend(line.strip() for line in ow)

def cleanup_text(df):
    """Tokenise one row's e-mail text and sort the tokens into categories.

    Intended to be used with ``DataFrame.apply(..., axis=1)``: ``df`` is a
    single row exposing an ``'Emails'`` text field.

    Every TextBlob word is stripped of surrounding apostrophes and split on
    ``'.'``; each non-blank piece is then classified:

    * not recognised by the ``en_US`` dictionary ``d``:
      kept if its lower-cased form is listed in ``ok_words``, otherwise
      stored lower-cased in ``tocheck``;
    * otherwise an English stop word or a pure number -> ``filteredwords``;
    * otherwise -> ``processedEmails``.

    Returns the same row with the space-joined fields ``allwords`` (every
    non-blank piece, lower-cased), ``processedEmails``, ``filteredwords``
    and ``tocheck`` added.
    """
    # Build both lookups once per call as sets: the original reloaded the
    # stop-word list and scanned the ok_words list linearly for every token.
    english_stopwords = set(stopwords.words('english'))
    approved_words = set(ok_words)

    # to do the initial splitting/lowercasing etc
    email_threads = TextBlob(df['Emails'])
    to_keep = []
    to_rm = []
    to_check = []
    all_the_words = []
    for x in email_threads.words:
        # split on '.' and strip apostrophes
        for w in x.strip("'").split("."):
            stripped = w.strip()
            if not stripped:
                continue
            lowered = w.lower()
            all_the_words.append(lowered)
            if not d.check(stripped.lower()):
                # Unknown to the dictionary: keep only if manually approved,
                # otherwise leave it in the review list.
                if lowered in approved_words:
                    to_keep.append(w)
                else:
                    to_check.append(lowered)
            elif lowered in english_stopwords or stripped.isdigit():
                to_rm.append(w)
            else:
                to_keep.append(w)
    df['allwords'] = ' '.join(all_the_words)
    df['processedEmails'] = ' '.join(to_keep)
    df['filteredwords'] = ' '.join(to_rm)
    df['tocheck'] = ' '.join(to_check)
    return df

In [None]:
## Second cleaning pass: classify the tokens of every monthly row with cleanup_text
dfmprocessed = dfm.apply(cleanup_text, axis='columns')
dfmprocessed.head(5)

In [None]:
# Corpus-wide tally: total number of tokens, then number of distinct tokens
totalwords = ' '.join(dfmprocessed['allwords'])
all_ze_words = totalwords.split(' ')
totw_check = len(all_ze_words)
distinct_ze_words = set(all_ze_words)
print(totw_check)
print(len(distinct_ze_words))

In [None]:
# TF-IDF analysis: one document per month, built from the kept tokens
tfidfm = TfidfVectorizer()
corpus_month = list(dfmprocessed.processedEmails)
resp_m = tfidfm.fit_transform(corpus_month)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# to the old one so the notebook keeps running on older installations.
if hasattr(tfidfm, "get_feature_names_out"):
    feature_names_mth = tfidfm.get_feature_names_out().tolist()
else:
    feature_names_mth = tfidfm.get_feature_names()

#per month
monthly_words = []   # every term that made it into some month's top 20
dense_month = resp_m.todense()
monthly_top20 = []   # one "term:score;term:score;..." string per month
for mh in range(resp_m.shape[0]):
    month_data = dense_month[mh].tolist()[0]
    # keep only the terms that actually occur this month (score > 0)
    score_month = [(word_id, score) for word_id, score in enumerate(month_data) if score > 0]
    # highest score first; the sort is stable, so ties stay in vocabulary order
    rs_scores = sorted(score_month, key=lambda t: t[1] * -1)[:20]
    top20 = []
    for word_id, score in rs_scores:
        phrase = feature_names_mth[word_id]
        top20.append(phrase + ":" + str(score))
        monthly_words.append(phrase)
    monthly_top20.append(";".join(top20))


In [None]:
# Attach each month's "term:score" summary string as a new column and preview it
dfmprocessed['top20'] = monthly_top20
dfmprocessed.head(5)

In [None]:
# Export only the per-month top-20 column (the index is written alongside it)
top20_only = dfmprocessed[['top20']]
top20_only.to_csv(path_or_buf="./tfidf_month_processedtext_top20.txt", sep="\t")

In [None]:
## Look at TF-IDF for specific words appearing in top20:
words_to_check = ['please', 'thanks', 'dusa', 'training', 'utrain', 'ldrd', 'code', 'dc', 'author']

# Map each word of interest to its column index in the TF-IDF matrix.
# A word that is absent from the fitted vocabulary simply gets no entry.
wtck = {}
for i, feature in enumerate(feature_names_mth):
    if feature in words_to_check:
        wtck[feature] = i

wtfidf = []   # one "word:score;word:score;..." string per month
for mh in range(resp_m.shape[0]):
    month_data = dense_month[mh].tolist()[0]
    tfidfs = []
    for wtc in words_to_check:
        # Look the column up with .get(): indexing wtck[wtc] directly raised
        # a KeyError for any word that is not in the vocabulary.
        col = wtck.get(wtc)
        if col is not None and month_data[col] > 0:
            score = month_data[col]
        else:
            # -1 marks "no positive TF-IDF score for this word this month"
            score = -1
        tfidfs.append(wtc + ":" + str(score))

    wtfidf.append(";".join(tfidfs))
dfmprocessed['wordstfidf'] = wtfidf
dfmprocessed.head()

In [None]:
# Export only the selected-word TF-IDF column (the index is written alongside it)
wordstfidf_only = dfmprocessed[['wordstfidf']]
wordstfidf_only.to_csv(path_or_buf="./top20_select_tfidf.txt", sep="\t")