# Deciding whether to escalate a customer support issue

## Part 1: Load and examine the data

In [1]:
# Path of the CSV file that is read into `df` with pd.read_csv below.
dataset = 'inbound.csv'

In [2]:
import pandas as pd                               
from sklearn.model_selection import train_test_split
import nltk
# Fetch NLTK's 'punkt' tokenizer data, which nltk.word_tokenize (used during
# preprocessing) relies on.
# NOTE(review): recent NLTK releases may require the 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
%%time
# Read the CSV named by `dataset` and preview its first rows.
df = pd.read_csv(dataset)
display(df.head())

Unnamed: 0,tweet_id,author_id,created_at,in_reply_to,text,escalate
0,2,115712,Tue Oct 31 22:11:45 +0000 2017,sprintcare,@sprintcare and how do you propose we do that,False
1,3,115712,Tue Oct 31 22:08:27 +0000 2017,sprintcare,@sprintcare I have sent several private messag...,True
2,5,115712,Tue Oct 31 21:49:35 +0000 2017,sprintcare,@sprintcare I did.,False
3,16,115713,Tue Oct 31 20:00:43 +0000 2017,sprintcare,@sprintcare Since I signed up with you....Sinc...,False
4,22,115716,Tue Oct 31 22:16:48 +0000 2017,Ask_Spectrum,@Ask_Spectrum Would you like me to email you a...,False


CPU times: user 1.09 s, sys: 187 ms, total: 1.27 s
Wall time: 1.27 s


In [4]:
# Report the dataset size and how the two `escalate` classes are distributed.
print(f'Number of rows in dataset: {len(df)}')
print(df['escalate'].value_counts())

Number of rows in dataset: 520793
False    417800
True     102993
Name: escalate, dtype: int64


## Part 2: Split the data into training and validation sets

In [5]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible. Only the DataFrame is passed: the `escalate` column
# travels with its rows, so splitting it a second time and discarding the
# result into `_` (which also shadows IPython's last-output variable) is
# unnecessary. The resulting split is the same as before.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=0)
print(f'{len(train_df)} rows in training data')
print(f'{len(val_df)} rows in validation data')

416634 rows in training data
104159 rows in validation data


## Part 3: Tokenize the text and build the labelled training and validation datasets

In [6]:
%%time

def preprocess(df):
    """Run ``transform_instance`` over every row of ``df`` and collect the results.

    Each row is handed to ``transform_instance`` as a plain list of its column
    values, in column order. The per-row results are assembled into a new
    DataFrame, which is returned; ``df`` itself is not modified.
    """
    records = [transform_instance(values) for values in df.values.tolist()]
    return pd.DataFrame(records)

def transform_instance(row):
    """Turn one raw row into a ``[label, tokenized_text]`` pair wrapped in a list.

    Parameters
    ----------
    row : list
        Column values of one row. Index 4 is read as the tweet text and
        index 5 as the ``escalate`` flag.

    Returns
    -------
    list
        A one-element list ``[[label, text]]``. ``label`` is ``"__label__1"``
        when the flag equals ``True`` and ``"__label__0"`` otherwise. ``text``
        is the lower-cased tweet, tokenized with ``nltk.word_tokenize`` and
        re-joined with single spaces. The extra level of nesting makes
        ``pd.DataFrame`` keep the pair together in a single cell.
    """
    # An empty text field is read from CSV as NaN (a float), which has no
    # .lower(); treat any non-string text as empty instead of raising.
    text = row[4] if isinstance(row[4], str) else ""
    # The label comes from the `escalate` flag. `== True` (rather than plain
    # truthiness) is kept on purpose so that a NaN flag still maps to label 0.
    label = "__label__1" if row[5] == True else "__label__0"  # noqa: E712
    tokens = nltk.word_tokenize(text.lower())
    return [[label, " ".join(tokens)]]

# Preprocess the validation split and preview the first rows of the result.
transformed_validation_rows = preprocess(val_df)
display(transformed_validation_rows.head())

Unnamed: 0,0
0,"[__label__1, @ 115990 no joke ... this is one ..."
1,"[__label__0, @ amazonhelp primeira camada ... ..."
2,"[__label__1, @ microsofthelps my mistake]"
3,"[__label__1, @ 770932 @ americanair they notor..."
4,"[__label__1, @ amazonhelp neither man seems to..."


CPU times: user 25.3 s, sys: 94 ms, total: 25.4 s
Wall time: 25.4 s


In [7]:
%%time
# Preprocess the training split the same way and preview the first rows.
transformed_train_rows = preprocess(train_df)
display(transformed_train_rows.head())

Unnamed: 0,0
0,"[__label__0, @ amazonhelp et en plus se faire ..."
1,"[__label__1, @ morrisons @ 641226 standard rep..."
2,"[__label__1, @ idea_cares @ 1936 @ 116590 this..."
3,"[__label__0, @ askamex yes i did weeks ago and..."
4,"[__label__0, @ amazonhelp i do n't want your s..."


CPU times: user 1min 39s, sys: 386 ms, total: 1min 39s
Wall time: 1min 39s


In [8]:
# Name the single packed column "both", then unpack each [label, text] pair
# into separate "text" and "label" columns — identically for both frames.
for frame in (transformed_validation_rows, transformed_train_rows):
    frame.columns = ["both"]
    frame["text"] = frame["both"].apply(lambda pair: pair[1])
    frame["label"] = frame["both"].apply(lambda pair: pair[0])

In [9]:
# Preview the validation frame with its "both", "text" and "label" columns.
transformed_validation_rows.head()

Unnamed: 0,both,text,label
0,"[__label__1, @ 115990 no joke ... this is one ...",@ 115990 no joke ... this is one of the worst ...,__label__1
1,"[__label__0, @ amazonhelp primeira camada ... ...",@ amazonhelp primeira camada ... https : //t.c...,__label__0
2,"[__label__1, @ microsofthelps my mistake]",@ microsofthelps my mistake,__label__1
3,"[__label__1, @ 770932 @ americanair they notor...",@ 770932 @ americanair they notoriously do n't...,__label__1
4,"[__label__1, @ amazonhelp neither man seems to...",@ amazonhelp neither man seems to know how to ...,__label__1


In [10]:
# Preview the training frame with its "both", "text" and "label" columns.
transformed_train_rows.head()

Unnamed: 0,both,text,label
0,"[__label__0, @ amazonhelp et en plus se faire ...",@ amazonhelp et en plus se faire engueuler par...,__label__0
1,"[__label__1, @ morrisons @ 641226 standard rep...",@ morrisons @ 641226 standard reply . morrison...,__label__1
2,"[__label__1, @ idea_cares @ 1936 @ 116590 this...",@ idea_cares @ 1936 @ 116590 this is gr8 ... u...,__label__1
3,"[__label__0, @ askamex yes i did weeks ago and...",@ askamex yes i did weeks ago and no response yet,__label__0
4,"[__label__0, @ amazonhelp i do n't want your s...",@ amazonhelp i do n't want your stupid automat...,__label__0


In [11]:
# Derive the integer target: 0 for "__label__0", 1 for any other label.
for frame in (transformed_validation_rows, transformed_train_rows):
    frame["label_code"] = frame["label"].ne("__label__0").astype("int64")

In [12]:
# Preview the validation frame, now including the integer "label_code" column.
transformed_validation_rows.head()

Unnamed: 0,both,text,label,label_code
0,"[__label__1, @ 115990 no joke ... this is one ...",@ 115990 no joke ... this is one of the worst ...,__label__1,1
1,"[__label__0, @ amazonhelp primeira camada ... ...",@ amazonhelp primeira camada ... https : //t.c...,__label__0,0
2,"[__label__1, @ microsofthelps my mistake]",@ microsofthelps my mistake,__label__1,1
3,"[__label__1, @ 770932 @ americanair they notor...",@ 770932 @ americanair they notoriously do n't...,__label__1,1
4,"[__label__1, @ amazonhelp neither man seems to...",@ amazonhelp neither man seems to know how to ...,__label__1,1


In [13]:
# Preview the training frame, now including the integer "label_code" column.
transformed_train_rows.head()

Unnamed: 0,both,text,label,label_code
0,"[__label__0, @ amazonhelp et en plus se faire ...",@ amazonhelp et en plus se faire engueuler par...,__label__0,0
1,"[__label__1, @ morrisons @ 641226 standard rep...",@ morrisons @ 641226 standard reply . morrison...,__label__1,1
2,"[__label__1, @ idea_cares @ 1936 @ 116590 this...",@ idea_cares @ 1936 @ 116590 this is gr8 ... u...,__label__1,1
3,"[__label__0, @ askamex yes i did weeks ago and...",@ askamex yes i did weeks ago and no response yet,__label__0,0
4,"[__label__0, @ amazonhelp i do n't want your s...",@ amazonhelp i do n't want your stupid automat...,__label__0,0


## Part 4: Train the model

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression

In [15]:
# Learn the vocabulary from the training text only and build its term-count matrix.
# NOTE(review): CountVectorizer() re-tokenizes the text with its default
# token_pattern, which keeps only tokens of two or more word characters, so
# punctuation and pieces such as "n't" produced by the NLTK step are likely
# discarded here — confirm this is intended.
count_vect = CountVectorizer()
train_input = count_vect.fit_transform(transformed_train_rows["text"])

In [16]:
# Encode the validation text with the already-fitted vectorizer (transform only, no refit).
validation_input = count_vect.transform(transformed_validation_rows["text"])

In [17]:
# Fit the tf-idf weighting on the training counts and apply it to them.
tfidf = TfidfTransformer()
train_idf = tfidf.fit_transform(train_input)

In [18]:
# Apply the tf-idf weighting fitted on the training counts to the validation counts.
validation_idf = tfidf.transform(validation_input)

In [19]:
# Three candidate classifiers, all created with the library's default settings.
# NOTE(review): BernoulliNB binarizes its input by default, so it presumably
# sees only term presence/absence rather than the tf-idf weights — confirm.
classifier_1 = BernoulliNB()
classifier_2 = MultinomialNB()
classifier_3 = LogisticRegression()

In [20]:
# Fit every classifier on the same tf-idf training matrix and integer labels.
# The value returned by the last call is what the cell displays.
classifier_1.fit(train_idf, transformed_train_rows["label_code"])
classifier_2.fit(train_idf, transformed_train_rows["label_code"])
classifier_3.fit(train_idf, transformed_train_rows["label_code"])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Validation accuracy of classifier_1 (BernoulliNB): share of rows whose prediction equals label_code.
(transformed_validation_rows["label_code"] == classifier_1.predict(validation_idf)).mean()

0.807736249387955

In [22]:
# Validation accuracy of classifier_2 (MultinomialNB): share of rows whose prediction equals label_code.
(transformed_validation_rows["label_code"] == classifier_2.predict(validation_idf)).mean()

0.8082642882516153

In [23]:
# Validation accuracy of classifier_3 (LogisticRegression): share of rows whose prediction equals label_code.
(transformed_validation_rows["label_code"] == classifier_3.predict(validation_idf)).mean()

0.90742038614042

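### Logistic Regression performed best on the validation data

Logistic Regression reaches about 0.907 accuracy. Both Naive Bayes models score about 0.808, barely above the roughly 0.80 obtained by always predicting `False` (417,800 of the 520,793 rows are not escalated), so their accuracy overstates how useful they are.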

## Part 5: Test the model

In [24]:
# A hand-written example to classify.
tweet = "I'm not angry!"

# Tokenize with NLTK and re-join with spaces, wrapped in a one-element list.
# NOTE(review): unlike transform_instance, the text is not lower-cased here.
# CountVectorizer lower-cases by default, so this should not change the
# prediction — confirm, or add .lower() for consistency.
tokenized_tweet = [' '.join(nltk.word_tokenize(tweet))]

In [25]:
# Show the tokenized example.
tokenized_tweet

["I 'm not angry !"]

In [26]:
# Encode the example with the vectorizer and tf-idf transformer that were
# fitted on the training data (transform only, nothing is refitted).
data = pd.DataFrame({"text":tokenized_tweet})
input_x = count_vect.transform(data["text"])
input_tfidf = tfidf.transform(input_x)

In [27]:
# Prediction of classifier_1 (BernoulliNB) for the example; label_code 1 corresponds to "__label__1".
classifier_1.predict(input_tfidf)

array([0])

In [28]:
# Prediction of classifier_2 (MultinomialNB) for the example; label_code 1 corresponds to "__label__1".
classifier_2.predict(input_tfidf)

array([1])

In [29]:
# Prediction of classifier_3 (LogisticRegression) for the example; label_code 1 corresponds to "__label__1".
classifier_3.predict(input_tfidf)

array([1])