In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [112]:
# Project-local modules. They are imported by name first so that the
# importlib.reload() calls below have module objects to operate on.
import model
import train
import evaluate
import train_util
import data_util.data
import data_util.batcher
import data_util.config
import data_util.preprocess

# Re-execute each module so edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): reload order can matter — if train/evaluate import names from
# model or data_util, reloading them before those dependencies would leave
# them bound to the old objects. Confirm, or consider %autoreload instead.
importlib.reload(train)
importlib.reload(model)
importlib.reload(evaluate)
importlib.reload(train_util)
importlib.reload(data_util.config)
importlib.reload(data_util.data)
importlib.reload(data_util.batcher)
importlib.reload(data_util.preprocess)

# Re-run the wildcard imports so notebook-level names point at the reloaded
# definitions.
# NOTE(review): names used later in this notebook (config, Vocab, TaskExample,
# ENT_TAGS, nlp, vocab_process_text, ...) are not defined in any visible cell
# and presumably arrive through these star imports — confirm which module
# provides each, and prefer explicit imports.
from train import *
from evaluate import *
from model import *
from train_util import *
from data_util.data import *
from data_util.batcher import *
from data_util.preprocess import *

In [10]:
# Shell escape: print the kernel's current working directory.
!pwd

/Users/rowancassius/Desktop/capstone/LSTM_Summarizer


In [11]:
# Display config.log_root, the directory that the data and vocab paths in this
# notebook are joined onto.
config.log_root

'/Users/rowancassius/Desktop/capstone/LSTM_Summarizer'

In [12]:
# Display the current value of data_path.
# NOTE(review): in this notebook data_path is only assigned in a later cell, so
# on a fresh kernel this cell likely raises NameError unless one of the
# wildcard imports supplies the name — confirm, or move it below the load cell.
data_path

'/data/context_task_data.tsv'

In [13]:
# Read the labelled task table (tab-separated) from the project's data
# directory and replace every missing value with an empty string.
data_path = os.path.join(config.log_root, 'data/context_task_data.tsv')
dat = pd.read_csv(data_path, sep='\t').fillna('')

# Shuffle all rows under a fixed global NumPy seed so the split is repeatable.
np.random.seed(111)
dat = dat.sample(frac=1)

# First 80% of the shuffled rows form the training set, the rest the test set.
train_size = int(0.8 * len(dat))
train_data, test_data = dat.iloc[:train_size], dat.iloc[train_size:]



In [5]:
# Show full cell contents when displaying frames (long text columns are not
# truncated). `None` is the supported "no limit" value; negative integers have
# been deprecated since pandas 1.0. Older pandas releases validate this option
# as an int and raise ValueError for None, so fall back to the legacy -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [6]:
# Peek at five randomly chosen training rows. No random_state is passed, so
# the rows shown depend on the state of the global NumPy RNG at run time.
train_data.sample(5)

Unnamed: 0,#,Task,Context,TaskSentence,Summary,Labeler,NoRequestInContext,Urgent,NotRequest,Unsure/Discuss,RandomNumber
1166,5482,Please add Angela Davis to all e-mails regarding Project Tahiti.,,Please add Angela Davis to all e-mails regarding Project Tahiti.,Add Angela Davis to emails regarding Project Tahiti,Natalie,1.0,0.0,0.0,,0.504486
93,94,Susan: Please change the signature block as set forth below and I'll,,Susan: Please change the signature block as set forth below and I'll review.,Change signature block,Rowan,,,,,0.555939
915,5231,"comments, please call Jack Cashin at 202/ 5085-499 .","The FERC staff report on western markets and the causes of the Summer 2000 Price Abormalities, entitled, Part I of Staff Report on U.S.. Bulk Power Markets, is available at the following website: http://www.FERC.Fed.US/Electric/BulkPower.htm.","If you have any questions or comments, please call Jack Cashin at 202/ 5085-499 .",Call Jack Cashin at 202-5085-499 with questions,Natalie,1.0,0.0,0.0,,0.619421
782,5098,Could you get me a copy please?,"I'm going over the Park & Loan discounts on CNG for May.. I need to see the invoice to compare what we were billed to what CNG is showing on the discount letter.. Gloria, Terry does not have the invoice.",Could you get me a copy please?,Give SENDER a copy of the invoice for CNG,Natalie,1.0,0.0,0.0,,0.420132
162,163,Review the space plans and give your comments,"Let's schedule a meeting for Mark Taylor, you and me with the consultant who we already had conversations with.. We can formulate our response.. Mark. Pursuant to your request, I called Deborah Bubenko regarding what information she needs from you regarding space planning for the new building.. Deborah had Kim Kaase extension 35349 return my call.. Listed below are the next steps:",Review the space plans and give your comments,Review space plans,Rowan,1.0,,,,0.104132


##  Vocabulary Curation

In [23]:
# Load the auxiliary word list stored at data/vocab/eng_1000.txt under the
# project root. Its `words` column is appended to the vocabulary corpus when
# vocab_sents is built.
eng_path = os.path.join(config.log_root, 'data/vocab/eng_1000.txt')
eng = pd.read_csv(eng_path)

In [102]:
# Corpus used to fit the vocabulary: every training context, every training
# task sentence and the extra word list, each passed through
# vocab_process_text in that order.
vocab_sents = list(map(vocab_process_text, [
    *train_data['Context'].tolist(),
    *train_data['TaskSentence'].tolist(),
    *eng['words'].tolist(),
]))

In [103]:
# Fit a Keras tokenizer on the cleaned corpus plus the entity tags.
# Lower-casing and character filtering are both disabled, so tokens keep the
# exact form produced by the text cleaning step.
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts([*vocab_sents, *ENT_TAGS])

# Every entity tag must have received an index of its own.
assert all(tag in tokenizer.word_index for tag in ENT_TAGS)

In [104]:
# Number of distinct tokens the tokenizer counted while fitting.
len(tokenizer.word_counts)

6635

In [105]:
# Write the tokenizer's words to data/vocab/vocab.txt through the project
# helper words2vocabfile (file format is defined by that helper, not shown here).
words2vocabfile(tokenizer.word_index.keys(), os.path.join(config.log_root, 'data/vocab/vocab.txt'))

In [113]:
# Build the project Vocab object from data/vocab/vocab.txt.
# NOTE(review): the saved output reports 6639 words versus 6635 tokenizer
# entries — presumably Vocab adds its own special tokens on load; confirm.
vocab = Vocab(os.path.join(config.log_root, 'data/vocab/vocab.txt'))

Finished constructing vocabulary of 6639 total words. Last word added: LANGUAGE


In [109]:
# Number of entries in the Vocab's word-to-id mapping (reads a private attribute).
len(vocab._word_to_id.keys())

6639

In [110]:
# Rewrite data/vocab/vocab.txt from the keys of the Vocab's word-to-id mapping.
# NOTE(review): this overwrites the same file the Vocab was loaded from. If the
# mapping contains tokens that Vocab inserts by itself on load, a later reload
# from this file may duplicate or reject them — confirm this round-trip is
# intended.
words2vocabfile(vocab._word_to_id.keys(), os.path.join(config.log_root, 'data/vocab/vocab.txt'))

## Text cleaning development

In [393]:
# Pick one training row by index label and run each of its text fields through
# the matching cleaning function: article cleaning for the context and task
# sentence, summary cleaning for the summary.
i = 925
context = article_process_text(train_data.loc[i, 'Context'])
task = article_process_text(train_data.loc[i, 'TaskSentence'])
summary = summary_process_text(train_data.loc[i, 'Summary'])

In [394]:
# Re-import and reload data_util.batcher so on-disk edits to that module are
# picked up before the following cells run.
# NOTE(review): this repeats the reload done in the import cell near the top of
# the notebook; %autoreload would remove the need for both.
import data_util.batcher
importlib.reload(data_util.batcher)
from data_util.batcher import *

In [395]:
# Wrap the cleaned context, task sentence and summary in a TaskExample built
# against the vocabulary.
ex = TaskExample(context, task, summary, vocab)

In [396]:
# Print the example's context, task and summary text.
ex.pretty_print()

CONTEXT: Message sent from the pjm-customer-info mailing list at pjm-customer-info@majordomo.pjm.com:. The new PJM Enhanced Energy Scheduler EES will go into production at 10:00 this morning 4/17/00 at https://ees.pjm.com/mui/index.htm . Schedules may be submitted for energy that begins on or after tomorrow, 4/18/00. 

TASK:    For questions please contact PJM at 610-6662-270 . 

SUMMARY: contact PJM at 610-6662-270 with questions


In [397]:
# Translate each id in the example's decoder input back into its vocabulary word.
list(map(vocab.id2word, ex.dec_input))

['[START]', 'contact', 'ORG', 'at', 'PHONENUMBER', 'with', 'questions']

In [398]:
# Length of ex.enc_input (presumably the encoder-side token ids — confirm).
len(ex.enc_input)

51

In [402]:
# Decoder input ids with the first element dropped (that first id decodes to
# '[START]' in the id-to-word cell of this notebook).
ex.dec_input[1:]

[230, 970, 2988, 1387, 2268, 3035]

In [401]:
# Target ids with the last element dropped — presumably shown for side-by-side
# comparison with ex.dec_input[1:]; confirm.
ex.target[:-1]

[230, 4178, 2988, 4184, 2268, 3035]

In [403]:
# Mapping from entity surface text to entity label recorded on the example.
ex.entity_label_map

{'pjm-customer-info@majordomo.pjm.com': 'EMAILADDRESS',
 'PJM Enhanced Energy Scheduler EES': 'ORG',
 '10:00 this morning': 'TIME',
 'https://ees.pjm.com/mui/index.htm': 'WEBSITE',
 'PJM': 'ORG',
 '610-6662-270': 'PHONENUMBER'}

In [392]:
# Display ex.article_oovs (presumably the out-of-vocabulary words found in the
# example's article text — confirm).
# NOTE(review): this cell's execution count is lower than that of the cell that
# selects example 925, so the saved output likely belongs to a different
# example; re-run before relying on it.
ex.article_oovs

['Benji', ' ', 'Dinner', '22nd']

In [357]:
# Run a hand-written sentence through the nlp pipeline to spot-check entity
# tagging (email address, website, organisation, phone number).
# NOTE(review): nlp is not defined in any visible cell — presumably a spaCy
# pipeline exported by one of the wildcard imports; confirm.
doc = nlp("Hey George, please email me asap at rowan@gmail.net. Take the survey at extraction.com. Also my friend at Google wants you to edit Article III, and call him at (445) 7680099")

In [345]:
# Entity text -> entity label for every entity detected in the parsed document.
dict((ent.text, ent.label_) for ent in doc.ents)

{'George': 'PERSON',
 'rowan@gmail.net': 'EMAILADDRESS',
 'extraction.com': 'WEBSITE',
 'Google': 'ORG',
 'Article III': 'PRODUCT',
 '(445) 7680099': 'PHONENUMBER'}

In [346]:
# List the document's tokens to see how the pipeline split the sentence.
list(doc)

[Hey,
 George,
 ,,
 please,
 email,
 me,
 asap,
 at,
 rowan@gmail.net,
 .,
 Take,
 the,
 survey,
 at,
 extraction.com,
 .,
 Also,
 my,
 friend,
 at,
 Google,
 wants,
 you,
 to,
 edit,
 Article III,
 ,,
 and,
 call,
 him,
 at,
 (445) 7680099]

In [221]:
# Check which entities, if any, the pipeline finds in a short phrase.
nlp('Term Sheet').ents

()

In [None]:
from spacy.tokenizer import wor