This notebook follows the tutorial found at:
https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/


In [15]:
import mailparser
import sys
from os import listdir
from os.path import isfile, join
import pandas as pd
import email
import numpy as np
from talon.signature.bruteforce import extract_signature
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import rouge

#One-time setup: uncomment and run the lines below once per environment.
#nltk.download('punkt')
#nltk.download('stopwords')
#! wget http://nlp.stanford.edu/data/glove.6B.zip
#! unzip glove*.zip
#!pip install py-rouge


Load the dataframes produced by the Data_Wrangling notebook.

In [20]:
#Load Data
#Locations of the pickled dataframes, relative to the directory the notebook is run from.
#PICKLE_LOC = "../data/dataframes/wrangled_enron_df.pkl" #Single Mailbox
ENRON_PICKLE_LOC = "../data/dataframes/wrangled_enron_full_df.pkl"
BC3_EMAIL_PICKLE_LOC = "../data/dataframes/wrangled_BC3_email_df.pkl"
BC3_SUMMARY_PICKLE_LOC = "../data/dataframes/wrangled_BC3_summary_df.pkl"

#NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
enron_df = pd.read_pickle(ENRON_PICKLE_LOC)
BC3_emails_df = pd.read_pickle(BC3_EMAIL_PICKLE_LOC)
BC3_summary_df = pd.read_pickle(BC3_SUMMARY_PICKLE_LOC)

In [21]:
#Outputs a subset of an email dataframe masked by the person and a timeframe.
def subset_emails(df, start_date, end_date, person):
    """Return the rows of ``df`` for one employee within an inclusive date range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a 'Date' and an 'Employee' column.
    start_date, end_date :
        Inclusive bounds compared against the 'Date' column.
    person :
        Value matched exactly against the 'Employee' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``.
    """
    # Build the mask from the frame that was passed in (not a global), so the
    # boolean index always aligns with the rows being selected.
    in_window = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    summarization_mask = in_window & (df['Employee'] == person)
    return df.loc[summarization_mask]

In [22]:
#Retrieve original sentences and index them. This will be used to generate the extracted summaries.
def get_extractive_sentences(df):
    """Flatten the 'Extractive_Sentences' column into ``[position, sentence]`` pairs.

    ``position`` is the 0-based row position of the email within ``df`` (not its
    index label), which lets a summary sentence be re-associated with the email
    it came from via ``.iloc``.

    Returns
    -------
    list of [int, sentence]
        One two-element list per sentence, in row order.
    """
    return [
        [counter, item]
        for counter, sublist in enumerate(df.Extractive_Sentences.tolist())
        for item in sublist
    ]

In [23]:
#Pull out clean tokenized sentences.
def get_tokenized_sentences(df):
    """Return every entry of the 'Tokenized_Body' column as one flat list.

    Each cell of the column is iterated and its items are appended in row order.
    """
    flattened = []
    for per_email in df.Tokenized_Body.tolist():
        flattened.extend(per_email)
    return flattened

In [24]:
#get glove word vectors
def extract_word_vectors(glove_path='glove.6B.300d.txt'):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its numeric
    coefficients.

    Parameters
    ----------
    glove_path : str
        Path of the vector file; defaults to the file name used by this notebook.

    Returns
    -------
    dict
        Maps each token to a float32 numpy array of its coefficients.
    """
    word_embeddings = {}
    # The context manager closes the file even if a line fails to parse.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return word_embeddings

In [25]:
#Create sentence_vectors
def create_sentence_vectors(clean_sentences, word_embeddings, dim=None):
    """Build one vector per sentence by (approximately) averaging its word vectors.

    Parameters
    ----------
    clean_sentences : iterable of str
        Sentences whose words are separated by whitespace.
    word_embeddings : dict
        Maps a word to its numpy vector; unknown words count as a zero vector.
    dim : int, optional
        Vector length. When omitted it is taken from the first embedding, or
        300 if ``word_embeddings`` is empty.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``dim`` per input sentence, in input order.
    """
    if dim is None:
        dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 300
    unknown = np.zeros((dim,))

    sentence_vectors = []
    for sentence in clean_sentences:
        words = sentence.split()
        if words:
            # The +0.001 in the denominator is kept from the original formula.
            v = sum(word_embeddings.get(w, unknown) for w in words) / (len(words) + 0.001)
        else:
            # Empty AND whitespace-only sentences get a zero vector; previously a
            # whitespace-only sentence produced the scalar 0.0 instead of a vector.
            v = np.zeros((dim,))
        sentence_vectors.append(v)
    return sentence_vectors

In [26]:
#Returns a list of sorted scores with the index of the email the extracted sentence came from.
def rank_sentences(sentences, sentence_vectors):
    """Rank sentences with PageRank over their cosine-similarity graph.

    Parameters
    ----------
    sentences : list
        ``[email_position, sentence]`` pairs.
    sentence_vectors : list of numpy.ndarray
        ``sentence_vectors[i]`` is the vector for ``sentences[i]``; all vectors
        must have the same length.

    Returns
    -------
    list of tuple
        ``(score, email_position, sentence)`` tuples sorted from highest to
        lowest score. An empty input yields an empty list.
    """
    n = len(sentences)
    if n == 0:
        return []

    # Stack the first n vectors into an (n, dim) matrix so all pairwise
    # similarities come from a single call instead of n*n separate calls.
    vectors = np.vstack([np.asarray(sentence_vectors[i]).reshape(1, -1) for i in range(n)])
    sim_mat = np.array(cosine_similarity(vectors), dtype=float)
    # A sentence is not compared with itself, so the diagonal stays zero.
    np.fill_diagonal(sim_mat, 0.0)

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    #Pair each sentence with its PageRank score then sort.
    ranked_sentences = sorted(((scores[i], s[0], s[1]) for i, s in enumerate(sentences)), reverse=True)
    return ranked_sentences

In [27]:
#color scheme to help distinguish summarization text.
class bcolors:
    """Escape-sequence constants concatenated into printed text for styling."""

    # Sequence that ends any styling started by the codes below.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Colors.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

def display_summary(enron_masked_df, ranked_sentences):
    """Print the top-ranked sentences with the date, subject and sender of their emails.

    Parameters
    ----------
    enron_masked_df : pandas.DataFrame
        The emails that were summarized; needs 'Date', 'Subject' and 'From' columns.
    ranked_sentences : list
        ``(score, email_position, sentence)`` entries, best first, where
        ``email_position`` is a row position in ``enron_masked_df``.

    The number of sentences printed is one tenth of the number of emails
    (integer division), so nothing is printed for fewer than 10 emails.
    """
    # Specify number of sentences as a fraction of total emails, but never ask
    # for more sentences than were actually ranked (avoids an IndexError).
    sn = min(len(enron_masked_df) // 10, len(ranked_sentences))

    # Generate summary
    for entry in ranked_sentences[:sn]:
        email_position = entry[1]
        #pull date and subject from original email
        email_date = str(enron_masked_df['Date'].iloc[email_position])
        email_subject = str(enron_masked_df['Subject'].iloc[email_position])
        email_from = str(enron_masked_df['From'].iloc[email_position])
        print( bcolors.BOLD + "Date: "+ email_date  +
              " Subject: " + email_subject +
              " From: " + email_from + bcolors.ENDC +
              "\nSummary: " + str(entry[2]))

In [28]:
#Function to wrap up summarization process
def summarize_emails(word_embeddings, masked_df):
    """Run the full pipeline on ``masked_df``: extract, vectorize, rank, display.

    Prints the number of emails, prints the summary via ``display_summary``,
    and returns the ranked sentence list produced by ``rank_sentences``.
    """
    print("Total number of emails to summarize: " + str(len(masked_df)))

    # Original sentences paired with the position of their email.
    indexed_sentences = get_extractive_sentences(masked_df)
    # Cleaned sentences are what gets embedded.
    vectors = create_sentence_vectors(get_tokenized_sentences(masked_df), word_embeddings)

    ranking = rank_sentences(indexed_sentences, vectors)
    display_summary(masked_df, ranking)
    return ranking

In [29]:
#Extract word vectors. Only needs to be done once per kernel session; the result is reused by every summarize_emails call below.
word_embeddings = extract_word_vectors()

# Summarizing the BC3 Dataset and Evaluating with ROUGE
Using: https://pypi.org/project/py-rouge/

ROUGE is an evaluation metric used to test machine-generated summaries against a human "gold standard". Using the same TextRank summarization method used on the Enron dataset, the following evaluates the algorithm against the BC3 Corpus. This is one of the few email datasets that contain human summarizations.

In [71]:
#Look into summarizing single email
masked_df = BC3_emails_df[:1]

#Select the human summaries whose Listno and Email_num match that email.
first_email = masked_df.iloc[0]
same_listno = BC3_summary_df['Listno'] == first_email['Listno']
same_email_num = BC3_summary_df['Email_num'] == str(masked_df['Email_num'].iloc[0])
masked_summaries = BC3_summary_df['Summary'].loc[same_listno & same_email_num]

ranked_sentences = summarize_emails(word_embeddings, masked_df)

Total number of emails to summarize: 1


In [72]:
#Display the human summaries selected above. There are three different human summaries for the same email.
masked_summaries

117    Jacob suggested to hold two week meetings, the...
122    Jacob suggests that future IETF meetings be sp...
127    The topic is the logistics of scheduling IETF ...
Name: Summary, dtype: object

In [79]:
#Evaluator settings, collected in one place and passed to rouge.Rouge.
rouge_options = dict(
    metrics=['rouge-n'],
    max_n=1,
    limit_length=True,
    length_limit=100,
    length_limit_type='words',
    alpha=0.5, # Default F1_score
    weight_factor=1.2,
    stemming=True,
)
evaluator = rouge.Rouge(**rouge_options)

#Hypothesis is the top-ranked extracted sentence; reference is the first human summary.
top_ranked = ranked_sentences[0]
hypothesis = top_ranked[2]
reference = masked_summaries.iloc[0]
full_body = masked_df['Body'].iloc[0]

for label, text in (("Full Email: ", full_body),
                    ("ML Summary: ", hypothesis),
                    ("Human Summary: ", reference)):
    print(label + text + '\n')

Full Email: The IETF meetings tend to become too large, creating logistics and planning problems. I suggest that future meetings are held for two weeks, with applications and user services issues the first week, and all other issues the second week. Those who so wish could attend both weeks, and other people could attend only one week. Those who choose to attend both weeks would be able to cover more groups and do better liaisons between the different areas. The Friday of the first week could discuss applications issues which might be of special interest to the other areas, and the Monday of the second week would schedule other groups which might be of special interest to applications people, so some people could attend MondayMonday or FridayFriday. Jacob Palme &lt;jpalme@dsv.su.se&gt; (Stockholm University and KTH) for more info see URL: http://www.dsv.su.se/~jpalme

ML Summary: The Friday of the first week could discuss applications issues which might be of special interest to the ot

The following shows the R-1 scores. Current benchmarks for text summarization can be found at: https://summari.es/

In [80]:
#Score the reference against itself.
#NOTE(review): presumably a best-case baseline for comparison; it is computed here but never displayed.
perfect_score = evaluator.get_scores(reference, reference)
#Score the extracted sentence (hypothesis) against the human summary (reference).
scores = evaluator.get_scores(hypothesis, reference)
print(scores)

{'rouge-1': {'f': 0.33333333333333337, 'p': 0.2708333333333333, 'r': 0.43333333333333335}}


# Summarizing Enron Dataset

In [17]:
#Define emails to be summarized: one mailbox over an inclusive date window.
start_date = '2001-10-01 00:00:00'
end_date = '2001-10-14 23:59:59'
person = 'skilling-j'
#Filter the Enron dataframe to that mailbox and window, then rank and print its summary sentences.
masked_df = subset_emails(enron_df, start_date, end_date, person)
ranked_sentences = summarize_emails(word_embeddings, masked_df)

Total number of emails to summarize: 72
[1mDate: 2001-10-08 08:36:38 Subject: Update - Basel Conference New York From: news@ibcuk.co.uk[0m
Summary: However, as the business community appears to be trying to get back to normal as far as possible, we have decided that the conference should proceed as planned.
[1mDate: 2001-10-02 19:04:24 Subject: Jeffrey Skilling, your October E-lert is now available From: mccann@nc.rr.com[0m
Summary: A complete story on the origin of Halloween will be on the Business Cafe Web site October 2531 at http://www.BusinessCafeOnline.com Your October 2001 issue of Elert for personal development in business is now available on http://www.BusinessCafeOnline.com Included in the October 2001 issue are these three articles: Flying Off Over Office Politics What Men Can Learn
[1mDate: 2001-10-04 03:13:05 Subject: h: Eyeforenergy Briefing From: bruno@eyeforenergy.com[0m
Summary: EDITORIAL A Focus on the latest developments in Europe ARTICLES European Utility Sect

In [19]:
#Example of a full email: the body of the email that the top-ranked sentence came from.
#ranked_sentences[0][1] is that email's row position in masked_df.
masked_df['Body'].iloc[ranked_sentences[0][1]]

"Dear Mr Skilling, http://www.ibcfinancial.com/bm1272/?source=bm1272em2 Update on IBC's major international conference: BASEL MEETING THE PRACTICAL CHALLENGES 31st October and 1st November 2001 New York We have obviously considered very carefully whether to continue with this event in the light of the recent terrible events in New York. However, as the business community appears to be trying to get back to normal as far as possible, we have decided that the conference should proceed as planned. We therefore hope you will take this excellent opportunity to hear the industry response to the latest Basel proposals on Capital Adequacy and Risk Management, and to gain practical advice on meeting the significant business challenges that these proposals pose to the management of risk within the financial services industry. Conference highlights include: A keynote address from William Rutledge, Executive Vice President, FEDERAL RESERVE BANK OF NEW YORK An impressive panel of leading industry s

In [20]:
#Summarization from another inbox, over the same inclusive date window.
start_date = '2001-10-01 00:00:00'
end_date = '2001-10-14 23:59:59'
person = 'arnold-j'
#Filter the Enron dataframe to that mailbox and window, then rank and print its summary sentences.
masked_df = subset_emails(enron_df, start_date, end_date, person)
ranked_sentences = summarize_emails(word_embeddings, masked_df)

Total number of emails to summarize: 234
[1mDate: 2001-10-05 01:39:58 Subject: When will you accept Credit Cards?               wugiptuyduicmw From: herthateng4882@excite.com[0m
Summary: If you would like to speak to someone right now we would be more then happy to answer any questions you might have please provide: Name: Your Phone Number: Best time to call: Merchant Status will help you increase sales by an incredible 50% to 100%.
[1mDate: 2001-10-05 01:39:58 Subject: When will you accept Credit Cards?               wugiptuyduicmw From: herthateng4882@excite.com[0m
Summary: If you would like to speak to someone right now we would be more then happy to answer any questions you might have please provide: Name: Your Phone Number: Best time to call: Merchant Status will help you increase sales by an incredible 50% to 100%.
[1mDate: 2001-10-05 01:39:58 Subject: When will you accept Credit Cards?               wugiptuyduicmw From: herthateng4882@excite.com[0m
Summary: If you would li

In [21]:
#Example of a full email: the body of the email that the top-ranked sentence came from.
#ranked_sentences[0][1] is that email's row position in masked_df.
masked_df['Body'].iloc[ranked_sentences[0][1]]

'HOW TO SUBSTANTIALLY INCREASE SALES: MessageId: <200110042136812.SM00207@gmgfbljvm.networksolutions.com> Date: Thu, 4 Oct 2001 21:39:51 0400 Easily accept major credit cards right away! If you would like to speak to someone right now we would be more then happy to answer any questions you might have please provide: Name: Your Phone Number: Best time to call: Merchant Status will help you increase sales by an incredible 50% to 100%. Stop losing valuable sales! With one phone call you can be: Accepting all major credit cards! Accepting checks over the net or by Fax! Accepting real time processing for member sites! Gaining customer loyalty and trust! Close the sale now. No more wondering if "The check is in the mail" We specialize in helping businesses who are just starting out with no credit poor credit or even if you have great credit. Almost everyone is approved! (All information is kept securely and will never be shared with a third party) If you wish to be removed from our mailing l

In [22]:
#One more example: a third inbox over the same inclusive date window.
start_date = '2001-10-01 00:00:00'
end_date = '2001-10-14 23:59:59'
person = 'lenhart-m'
#Filter the Enron dataframe to that mailbox and window, then rank and print its summary sentences.
masked_df = subset_emails(enron_df, start_date, end_date, person)
ranked_sentences = summarize_emails(word_embeddings, masked_df)

Total number of emails to summarize: 121
[1mDate: 2001-10-03 20:00:52 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: i think i might need to pay up to get a house
[1mDate: 2001-10-05 14:51:13 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: i can think of a few things
[1mDate: 2001-10-05 15:30:00 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: i will let you know later.
[1mDate: 2001-10-09 19:26:43 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: this is a photo of the girl i think you look like but it isn't really a good pic.
[1mDate: 2001-10-04 16:24:25 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: i think i need to pay more than i want to.
[1mDate: 2001-10-10 17:10:04 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: i dont think you look like her anymore.
[1mDate: 2001-10-10 17:01:34 Subject: RE: From: matthew.lenhart@enron.com[0m
Summary: let me know what she says.
[1mDate: 2001-10-09 20:32:30 Subject: RE: From: m

In [23]:
#Example of a full email: the body of the email that the top-ranked sentence came from.
#ranked_sentences[0][1] is that email's row position in masked_df.
masked_df['Body'].iloc[ranked_sentences[0][1]]

'lets bet your dinner on it right now. i think i might need to pay up to get a house'