In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# load real data
# The TSV location can be overridden with the CONTEXT_TASK_DATA environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local path is used unchanged.
data_path = os.environ.get(
    'CONTEXT_TASK_DATA',
    '/Users/rowancassius/Desktop/capstone/data/context_task_data.tsv',
)
# Tab-separated file; the resulting DataFrame is kept in `dat`.
dat = pd.read_csv(data_path, sep='\t')

In [5]:
%%writefile preproces.py
#!/usr/bin/python

# imports
import re
import numpy as np
from nltk.tokenize import sent_tokenize

# Regexes
# Every pattern containing a backslash is written as a raw string so that
# sequences such as \s, \?, \. and \- reach the regex engine untouched instead
# of being treated as (invalid) string escapes by the Python parser.
HTML = r'</?\w+/?>|>|<'          # attribute-less tag like <p>, </div>, <br/>, or a stray angle bracket
BR = r'</?br/?>'                 # one <br>, </br> or <br/> tag
BRBR = BR+BR                     # two such tags back to back
MARK = r'</?mark/?>|>|<'         # <mark> / </mark> tag, or a stray angle bracket
WHITE = r'\s+'                   # run of whitespace
HYPHENS = r'---+'                # three or more consecutive hyphens
YW = "you wrote:"                # literal text
NON_NUM = '[^0-9]'               # any single character that is not an ASCII digit

# One or two digits, a colon, two digits, one whitespace character, then AM or PM.
EMAIL_TIME = r"[0-9]?[0-9]:[0-9][0-9]\s[AP]M"
# Email address: local part, '@', domain, and a top-level part of 2+ letters.
EA = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
# Loose phone pattern: optional prefix of up to 2 digits, optional group of up
# to 3 digits (parentheses optional), then 3 digits and 4 digits; each part may
# be separated by up to three whitespace, dot or hyphen characters.
PH = r"(\d{0,2}[\s\.-]{0,3}\(?\d{0,3}\)?[\s\.-]{0,3}\d{3}[\s\.-]{0,3}\d{4})"
# Run of characters that are NOT in the allowed set (letters, digits,
# whitespace and the listed punctuation).
BLACK = r"[^A-Za-z0-9\s\?!,'\.;:/\-@*%#~&]+"


# Strings
BREAK = 'BREAK'
FORWARD = 'Forwarded by'
SPACE = ' '
EMAIL = ' EMAILADDRESS '         # placeholder text, padded with spaces
PHONE = ' PHONE '                # placeholder text, padded with spaces

# components
# Literal header-field labels; kept as a list because it is concatenated with
# another list when the combined pattern is built.
email_components = [
    'Date:', 'From:', 'To:', 'Subject:', 'Re:',
    'Mime-Version:', 'Content-Type:', 'Content-Transfer-Encoding:',
    '-From:', '-To:', '-cc:', '-bcc:',
    '-Folder:', '-Origin:', '-FileName:',
]

##### Preprocessing Functions ####

def make_regex(lst):
    """Combine pattern strings into one alternation.

    The pieces are joined verbatim with ``|``; nothing is escaped or grouped.

    Args:
        lst: Iterable of regex pattern strings.

    Returns:
        A single pattern string matching any one of the pieces.
    """
    alternation = '|'
    return alternation.join(lst)

def phone_repl(matchobj):
    """Replacement callback for ``re.sub``.

    Args:
        matchobj: Match object for the text being replaced.

    Returns:
        The result of passing the whole matched text to format_phone_number.
    """
    matched_text = matchobj.group(0)
    return format_phone_number(matched_text)

# Matches a double <br> tag, a run of three or more hyphens, or "you wrote:".
break_regex = make_regex([BRBR, HYPHENS, YW])
# Matches any of the header-field labels, or a clock time such as "10:45 AM".
comp_regex = make_regex([*email_components, EMAIL_TIME])

def clean(text):
    """Normalise raw message text through a fixed sequence of substitutions.

    In order: spans matching break_regex become BREAK, HTML tags and stray
    angle brackets become a space, phone-like numbers are rewritten by
    phone_repl, runs of characters outside the allowed set become a space,
    and whitespace runs collapse to a single space. The result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: the break pattern relies on <br> tags that the HTML
    # step would otherwise remove.
    substitutions = (
        (break_regex, BREAK),
        (HTML, SPACE),
        (PH, phone_repl),
        (BLACK, SPACE),
        (WHITE, SPACE),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def clean_info(text):
    """Mask contact details in a string.

    Email addresses are replaced with the EMAIL placeholder and phone-like
    numbers with the PHONE placeholder; whitespace runs are then collapsed to
    a single space and the result is stripped.

    Args:
        text: The string to mask.

    Returns:
        The masked string.
    """
    masking_steps = (
        (EA, EMAIL),
        (PH, PHONE),
        (WHITE, SPACE),
    )
    for pattern, placeholder in masking_steps:
        text = re.sub(pattern, placeholder, text)
    return text.strip()
        
    
def trim_sents(sents, max_tokens=75):
    """Keep the trailing sentences that fit within a token budget.

    Tokens are counted by splitting each sentence on whitespace. The sentence
    at position i is kept when the combined token count of sentences i through
    the last one does not exceed ``max_tokens``.

    Args:
        sents: Sequence of sentence strings.
        max_tokens: Largest combined token count allowed.

    Returns:
        List of the kept sentences, in their original order.
    """
    token_counts = [len(sentence.split()) for sentence in sents]
    kept = []
    for start in range(len(token_counts)):
        # Tokens from this sentence through to the end of the sequence.
        tail_total = sum(token_counts[start:])
        if tail_total <= max_tokens:
            kept.append(sents[start])
    return kept


def format_phone_number(phone_number):
    """Format a phone number string in NANP style.

    All non-digit characters are discarded, then the digits are split from
    the right into a 4-digit line number, the 3-digit exchange before it, and
    whatever precedes those (an area code, optionally preceded by a country
    code).

    Args:
        phone_number: Text containing the digits of a phone number; any
            separators or brackets are ignored.

    Returns:
        The digit groups joined by hyphens and padded with one space on each
        side, e.g. ' 555-1234 ', ' 212-555-1234 ' or ' 1-212-555-1234 '.
    """
    # remove non digits
    digs = re.sub(NON_NUM, '', phone_number)
    # break into area and local: the last 4 digits are the line number and the
    # 3 digits before them are the exchange (NXX-XXXX)
    area, loc3, loc4 = digs[:-7], digs[-7:-4], digs[-4:]

    if len(area) == 0:
        return f' {loc3}-{loc4} '
    elif len(area) <= 3:
        return f' {area}-{loc3}-{loc4} '
    else:
        # Anything beyond the 3-digit area code is treated as a country code.
        country, area = area[:-3], area[-3:]
        return f' {country}-{area}-{loc3}-{loc4} '

Writing preproces.py


In [6]:
# Import the module, then reload it: reload re-executes preprocess.py from
# disk, so a copy already cached in this kernel is replaced by the current file.
import preprocess 
importlib.reload(preprocess)
# The star import runs after the reload so the names copied into the notebook
# namespace come from the reloaded module.
from preprocess import *

In [7]:
# Display the loaded table as the cell's output.
dat

Unnamed: 0,#,Task,Context,TaskSentence,Summary,Labeler,NoRequestInContext,Urgent,NotRequest,Unsure/Discuss,RandomNumber
0,1,Please forward back with any changes you might...,Chris - Attached is the spreadsheet shared wit...,Please forward back with any changes you might...,Forward spreadsheet back to SENDER with changes,Rowan,1.0,,,,0.147883
1,2,Please communicate with Mark Holsworth.,I agree with your proposal.,Please communicate with Mark Holsworth.,Communicate with Mark Holsworth,Rowan,,,,,0.532136
2,3,Andy - can you please forward this email to yo...,,Andy - can you please forward this email to yo...,Forward email to book person to change deals,Rowan,,,,,0.815909
3,4,please schedule,,please schedule,Please schedule,Rowan,,,,,0.146584
4,5,Could you give me a brief description of what ...,,Could you give me a brief description of what ...,Describe this for SENDER,Rowan,,,,,0.092197
...,...,...,...,...,...,...,...,...,...,...,...
1230,5546,Can you review the list and make any necessary...,Attached is a list of footnotes we have come u...,Can you review the list and make any necessary...,Review footnotes for transactional report and ...,Natalie,1.0,0.0,0.0,,0.829084
1231,5547,Please review and let me know how we want to t...,Attached is a report for significant changes i...,Please review and let me know how we want to t...,Review report for significant changes in nom'd...,Natalie,1.0,0.0,0.0,,0.438604
1232,5548,"Alternatively, please forward this e-mail to a...",* An impressive panel of leading industry spea...,"Alternatively, please forward this e-mail to a...",Forward email to interested colleagues,Natalie,1.0,0.0,1.0,,0.800775
1233,5549,Can you take a first stab?,Gus -. Many of these are environmental in nature.,Can you take a first stab?,Try IT,Natalie,1.0,0.0,0.0,,0.143425


Summary Modifications:
* lowercase first letter
* filter punctuation

Article Modifications (ideal):
* 