In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the capstone data files. Override with the CAPSTONE_DATA_DIR
# environment variable so the notebook runs on machines other than the author's;
# the original absolute path is kept as the default.
DATA_DIR = os.environ.get('CAPSTONE_DATA_DIR', '/Users/rowancassius/Desktop/capstone/data')

# Tab-separated file of task sentences with their annotation columns.
df = pd.read_csv(os.path.join(DATA_DIR, 'context_task_data.tsv'), sep='\t')

In [4]:
# Summary statistics of the numeric columns; the `count` row gives the number
# of non-null values per column.
df.describe()

Unnamed: 0,#,NoRequestInContext,Urgent,NotRequest,RandomNumber
count,2820.0,1155.0,2395.0,699.0,2820.0
mean,3013.021631,0.948052,0.041754,0.406295,0.498823
std,1998.136736,0.222018,0.200067,0.491493,0.28729
min,1.0,0.0,0.0,0.0,0.000344
25%,708.75,1.0,0.0,0.0,0.252866
50%,3688.5,1.0,0.0,0.0,0.494803
75%,4729.25,1.0,0.0,1.0,0.750499
max,5550.0,1.0,1.0,1.0,0.99957


In [5]:
# Class balance of the `Urgent` label (value_counts skips missing labels by default).
df['Urgent'].value_counts()

0.0    2295
1.0     100
Name: Urgent, dtype: int64

In [8]:
# Show full sentence text instead of truncating long cells.
# `None` is the supported way to disable truncation; `-1` is deprecated and is
# only used as a fallback for old pandas versions that do not accept `None`.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [10]:
# Eyeball a random sample of sentences labelled urgent.
# `random_state` makes the displayed sample reproducible across re-runs;
# `.loc` selects rows and column in one step instead of chained indexing.
df.loc[df['Urgent'] == 1, ['TaskSentence']].sample(20, random_state=111)

Unnamed: 0,TaskSentence
2253,Could you sign off ASAP.
816,"The card will remain on the server for about two weeks, so please pick it up as soon as you can."
2769,PLEASE READ THE ATTACHED FILE Ethical Wall Procedure AND PRINT AND SIGN THE EMPLOYEE CERTIFICATION below .
126,PLEASE REVIEW THE NOTICE AND DISCLAIMER BELOW
352,Please remove any items you have in the refrigerator immediately.
2676,Please review and let me know as soon as possible if you have changes as I need to send this to the conference people today.
2177,Sue: Please review ASAP after Jeff.
460,"PLEASE FORWARD YOUR SHIRT SIZE TO ME S, M, L, XL, ETC. AS SOON AS POSSIBLE,"
48,plz respond with comments asap.
2092,"Seating will be limited, so please reserve your seat as soon as possible via E-Mail to jennifer.wilson@enron.com."


In [11]:
def clean(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

In [21]:
# Keep only rows that carry an `Urgent` label and store the label as an integer.
is_na = df['Urgent'].isna()
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice of the original (that SettingWithCopyWarning would be
# hidden by the global warnings filter set at the top of the notebook).
df = df.loc[~is_na].copy()
df['Urgent'] = df['Urgent'].astype(int)

In [22]:
# Shuffle all rows with a fixed seed, then use the first 80% for training and
# the remaining 20% for testing.
np.random.seed(111)
df = df.sample(frac=1)

train_size = int(.8*df.shape[0])
# Positional slicing via `.iloc`, and `.copy()` so that later edits to the
# train/test frames (e.g. manual label corrections) act on independent frames
# rather than on slices of `df`.
train_data = df.iloc[:train_size].copy()
test_data = df.iloc[train_size:].copy()

In [42]:
# TF-IDF features over the raw sentences. `lowercase=False` keeps the original
# casing, so differently cased spellings of a word are separate tokens.
tfidf = TfidfVectorizer(lowercase=False)
# Fit the vocabulary/IDF weights on the training sentences only ...
x_train = tfidf.fit_transform(train_data['TaskSentence'])
# ... and reuse the fitted vectorizer for the test sentences.
x_test = tfidf.transform(test_data['TaskSentence'])

In [139]:
# Target vectors: the integer `Urgent` label for each split.
# NOTE(review): the execution counts show that the manual relabelling cells
# (In [136]-In [138], located at the end of the notebook) were run before this
# cell, so the recorded results below appear to include those label edits.
# A fresh top-to-bottom run would extract the labels before they are edited.
y_train = train_data['Urgent']
y_test = test_data['Urgent']

In [140]:
# Check the container type returned by the vectorizer.
type(x_train)

scipy.sparse.csr.csr_matrix

In [141]:
# Dense view of the test feature matrix, for inspection only.
x_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [142]:
# (number of training sentences, vocabulary size)
x_train.shape

(1916, 4347)

In [143]:
# Number of test labels.
y_test.shape

(479,)

In [144]:
from sklearn.linear_model import LogisticRegression

In [145]:
# Logistic regression with class 1 (urgent) weighted far above class 0 to
# counter the label imbalance seen in the value counts above.
lr = LogisticRegression(class_weight={0: 0.05, 1: 0.95})
# Fit directly on the sparse TF-IDF matrix; the estimator accepts sparse input,
# so converting to a dense array first only costs memory.
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight={0: 0.05, 1: 0.95}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [146]:
# Predict labels for the test sentences; the sparse matrix is accepted as-is,
# so no dense conversion is needed.
y_pred = lr.predict(X=x_test)

In [147]:
from sklearn.metrics import classification_report

In [148]:
# Per-class precision/recall/F1 as a nested dict.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    output_dict=True,
)

In [149]:
# Display the metrics dict produced by classification_report.
report

{'0': {'precision': 0.9847161572052402,
  'recall': 1.0,
  'f1-score': 0.9922992299229924,
  'support': 451},
 '1': {'precision': 1.0,
  'recall': 0.75,
  'f1-score': 0.8571428571428571,
  'support': 28},
 'accuracy': 0.9853862212943633,
 'macro avg': {'precision': 0.99235807860262,
  'recall': 0.875,
  'f1-score': 0.9247210435329247,
  'support': 479},
 'weighted avg': {'precision': 0.9856095759907376,
  'recall': 0.9853862212943633,
  'f1-score': 0.9843986486331305,
  'support': 479}}

In [150]:
# Boolean masks over the test rows, used by the error analysis that follows.
correct = y_pred == y_test   # prediction agrees with the label
err = ~correct               # prediction disagrees with the label
pos = y_test.eq(1)           # rows labelled urgent

In [151]:
# True Positives: correctly predicted rows whose label is urgent.
# An explicit boolean mask (`y_test == 1`) is used instead of the integer
# labels themselves, and `.loc` replaces the chained column/row indexing.
test_data.loc[correct & (y_test == 1), ['TaskSentence']]

Unnamed: 0,TaskSentence
1737,Please give me your comments ASAP.
632,"etc. , please respond to this e-mail ASAP."
2581,"It is currently in Stage, and as soon as you both get a chance to test it, and give it your ok, I can get it moved to production."
991,"Please respond with a deal number, or further suggestions for resolution so that I can clear this up as soon as possible."
2616,Please review attached letter to customers ASAP.
266,PLEASE CONFIRM THIS .
1093,Please send as soon as possible.
2582,Please login to iPayit and resolve this invoice as soon as possible.
1001,Please contact us as soon as possible with any further comments and to coordinate execution.
1895,Please forward this information to me as soon as possible.


In [152]:
# False Negatives: misclassified rows whose label is urgent.
# An explicit boolean mask (`y_test == 1`) is used instead of the integer
# labels themselves, and `.loc` replaces the chained column/row indexing.
test_data.loc[err & (y_test == 1), ['TaskSentence']]

Unnamed: 0,TaskSentence
566,"Stephanie - As discussed, please shut down all Duke entities immediately for all CAISO products on EOL."
2425,"Would you please check into this and make sure they post it ASAP, especially since our time frame is quite tight."
48,plz respond with comments asap.
730,"If you have received it in error, please notify the sender immediately and delete the original."
2692,"If you have any questions or are not certain of the status of a transaction, please contact us immediately on:"
1283,Pls send first thing.
1190,Can we please get a list of the attorneys in your groups that work on EGM cases by the end of today?


In [153]:
# False Positives: misclassified rows whose label is not urgent.
# `~y_test` on the integer labels is a bitwise NOT (0 -> -1, 1 -> -2), not a
# logical negation, so the label is compared explicitly to build a boolean mask.
test_data.loc[err & (y_test == 0), ['TaskSentence']]

Unnamed: 0,TaskSentence


In [136]:
# Manually mark these rows (by index label) as urgent.
# A single `.loc` assignment replaces the chained `test_data['Urgent'][i] = 1`,
# which writes through an intermediate Series and is not guaranteed to update
# `test_data`. Only labels actually present in the test split are touched.
# NOTE(review): this cell sits at the end of the notebook but its execution
# count precedes the cell that extracts `y_test`; confirm the intended order.
relabel_mask = test_data.index.isin([2581, 991, 2568])
test_data.loc[relabel_mask, 'Urgent'] = 1

In [137]:
# Mark row 2581 as urgent with a direct `.loc` write instead of chained
# assignment; the membership check avoids `.loc` appending a new row if the
# label is not in the test split.
if 2581 in test_data.index:
    test_data.loc[2581, 'Urgent'] = 1

In [138]:
# Inspect every column of the row with index label 2581.
test_data.loc[2581]

#                     4697                                                                                                                                                                                                                                                                                                                                                                                 
Task                  Stage, and as soon as you both get a chance to test it, and give it your ok,                                                                                                                                                                                                                                                                                                         
Context               Hi Kate, Cara,. I'm the new Enpower QA here in Houston, replacing Nadine who has moved on to another project.. There was a bug found in the Fwd Obl report, concerning the desk choice dis