In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the input data. Set the CAPSTONE_DATA_DIR environment
# variable to point somewhere else; the default is the original location, so
# existing runs behave exactly as before.
DATA_DIR = os.environ.get('CAPSTONE_DATA_DIR', '/Users/rowancassius/Desktop/capstone/data')

# Tab-separated input file; later cells read its 'TaskSentence' and 'Urgent' columns.
df = pd.read_csv(os.path.join(DATA_DIR, 'context_task_data.tsv'), sep='\t')

In [4]:
# Summary statistics for the numeric columns; the per-column `count` row shows
# how many non-missing values each column has.
df.describe()

Unnamed: 0,#,NoRequestInContext,Urgent,NotRequest,RandomNumber
count,2820.0,1155.0,2395.0,699.0,2820.0
mean,3013.021631,0.948052,0.041754,0.406295,0.498823
std,1998.136736,0.222018,0.200067,0.491493,0.28729
min,1.0,0.0,0.0,0.0,0.000344
25%,708.75,1.0,0.0,0.0,0.252866
50%,3688.5,1.0,0.0,0.0,0.494803
75%,4729.25,1.0,0.0,1.0,0.750499
max,5550.0,1.0,1.0,1.0,0.99957


In [5]:
# Class balance of the 'Urgent' label (value_counts skips missing values by default).
df['Urgent'].value_counts()

0.0    2295
1.0     100
Name: Urgent, dtype: int64

In [8]:
# Show full cell contents instead of truncating long sentences.
# `None` is the supported "no limit" value; the old `-1` sentinel was deprecated
# in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)

In [10]:
# Eyeball a random selection of sentences whose 'Urgent' label is 1.
urgent_rows = df['Urgent'] == 1
df.loc[urgent_rows, ['TaskSentence']].sample(20)

Unnamed: 0,TaskSentence
2253,Could you sign off ASAP.
816,"The card will remain on the server for about two weeks, so please pick it up as soon as you can."
2769,PLEASE READ THE ATTACHED FILE Ethical Wall Procedure AND PRINT AND SIGN THE EMPLOYEE CERTIFICATION below .
126,PLEASE REVIEW THE NOTICE AND DISCLAIMER BELOW
352,Please remove any items you have in the refrigerator immediately.
2676,Please review and let me know as soon as possible if you have changes as I need to send this to the conference people today.
2177,Sue: Please review ASAP after Jeff.
460,"PLEASE FORWARD YOUR SHIRT SIZE TO ME S, M, L, XL, ETC. AS SOON AS POSSIBLE,"
48,plz respond with comments asap.
2092,"Seating will be limited, so please reserve your seat as soon as possible via E-Mail to jennifer.wilson@enron.com."


In [11]:
def clean(text):
    """Return ``text`` converted to lower case.

    Missing values -- ``None`` or a float NaN, which is what pandas uses for an
    empty cell in a text column -- are mapped to the empty string instead of
    raising ``AttributeError``.

    Parameters
    ----------
    text : str or None or float
        The sentence to normalise, or a missing-value marker.

    Returns
    -------
    str
        The lower-cased text, or ``''`` for a missing value.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return text.lower()

In [21]:
# Keep only the rows that carry an 'Urgent' label.
is_na = df['Urgent'].isna()
# .copy() yields an independent frame, so the column assignment below writes to
# `df` itself rather than to a slice of the unfiltered frame (the pattern that
# triggers pandas' SettingWithCopyWarning).
df = df.loc[~is_na].copy()
# With the missing labels gone, the float labels can be cast to integer classes.
df['Urgent'] = df['Urgent'].astype(int)

In [22]:
# Seed NumPy's global RNG, which DataFrame.sample draws from when no
# random_state is passed.
np.random.seed(111)
# Restore index order before shuffling. This cell rebinds `df`, so without the
# sort a re-run would shuffle the already-shuffled frame and produce a
# different train/test split. On a first run the index is still in its original
# ascending order (assuming the default index from read_csv), so the sort is a
# no-op and the split is unchanged.
df = df.sort_index().sample(frac=1)

# 80/20 positional split of the shuffled rows.
# NOTE(review): the split is not stratified; with few positive labels the test
# set's class balance can vary noticeably with the seed.
train_size = int(.8*df.shape[0])
train_data = df.iloc[:train_size]
test_data = df.iloc[train_size:]

In [42]:
# Fit the TF-IDF vocabulary and weights on the training sentences only, then
# map both splits into that feature space. Case is preserved (lowercase=False).
train_sentences, test_sentences = train_data['TaskSentence'], test_data['TaskSentence']
tfidf = TfidfVectorizer(lowercase=False)
x_train = tfidf.fit_transform(train_sentences)
x_test = tfidf.transform(test_sentences)

In [44]:
# Target labels for each split, row-aligned with x_train / x_test.
y_train, y_test = train_data['Urgent'], test_data['Urgent']

In [45]:
# Check the container type returned by the vectorizer.
type(x_train)

scipy.sparse.csr.csr_matrix

In [46]:
# Dense view of the test features. This materialises the full matrix just to
# display it; NumPy's repr elides the middle rows and columns.
x_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
# (number of training sentences, vocabulary size)
x_train.shape

(1916, 4347)

In [48]:
# Number of held-out labels.
y_test.shape

(479,)

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
# Logistic-regression classifier with scikit-learn's default settings.
lr = LogisticRegression()
# Fit on the sparse TF-IDF matrix directly: LogisticRegression accepts sparse
# input, so converting with .toarray() only costs memory.
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
# Predict labels for the held-out sentences. The sparse matrix is passed as-is;
# predict accepts sparse input, so no dense copy is needed.
y_pred = lr.predict(X=x_test)

In [52]:
from sklearn.metrics import classification_report

In [53]:
# Per-class precision / recall / F1 plus averages, returned as a nested dict.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    output_dict=True,
)

In [54]:
# Display the metrics dict built from y_test and y_pred.
report

{'0': {'precision': 0.9537815126050421,
  'recall': 1.0,
  'f1-score': 0.9763440860215054,
  'support': 454},
 '1': {'precision': 1.0,
  'recall': 0.12,
  'f1-score': 0.21428571428571425,
  'support': 25},
 'accuracy': 0.954070981210856,
 'macro avg': {'precision': 0.9768907563025211,
  'recall': 0.56,
  'f1-score': 0.5953149001536098,
  'support': 479},
 'weighted avg': {'precision': 0.9561937509868249,
  'recall': 0.954070981210856,
  'f1-score': 0.936570684573917,
  'support': 479}}