# Customer Service Query Classification 

### Import libaries
---

In [1]:
import pandas as pd
import numpy as np
import missingno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import get_scorer 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import random

random.seed(42)

In [2]:
# dictionary to store metrics of different models

modelMetrics = {
                'svc': {'tfidf': [], 'tfidf_ngrams': [], 'word2vec': [], 'doc2vec': []},
                'xgboost': {'tfidf': [], 'tfidf_ngrams': [], 'word2vec': [], 'doc2vec': []},
                'knn': {'tfidf': [], 'tfidf_ngrams': [], 'word2vec': [], 'doc2vec': []}
                }

### Load data
---

In [3]:
df = pd.read_csv("data/Customer_Service_Questions_Multiclass.csv")

### Initial data preparation
---

In [4]:
# check for missing values
# missingno.matrix(df) # there are no missing values

In [None]:
#'''
# distribution of the topics column
sns.countplot(x=df["topic"], color = "grey")
plt.title("Distribution by Department")
plt.xlabel("Topic")
plt.ylabel("Frequency")
plt.xticks(rotation=65)
#'''

In [6]:
#'''
# encoding the labels
encoder = LabelEncoder()

encoder.fit(df['topic'])
df['category'] = encoder.transform(df['topic'])
#'''

## Train-test split
---

In [7]:
# train-test split
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df.question, df.category, stratify=df.category, test_size=0.2, random_state=42)

## Get data in different formats

1. TF-IDF vector
2. TF-IDF vector of n-grams
3. Word vectors (GloVe)
4. Document vectors (Doc2Vec)
---

TF-IDF vectors

In [8]:
# TF-IDF vector
vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', 
                            stop_words='english', max_df=0.95, min_df=0.05, max_features=500
                            )
vectorizer.fit(df.question)

dfTfidf_train = vectorizer.transform(xtrain)
dfTfidf_test = vectorizer.transform(xtest)

TF-IDF n-grams vectors

In [9]:
# TF-IDF vector of n-grams
ngrams_vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', ngram_range=(2,3), 
                            max_df=0.95, min_df=0.05, max_features=500
                            )
ngrams_vectorizer.fit(df.question)

dfTfidf_ngrams_train = ngrams_vectorizer.transform(xtrain)
dfTfidf_ngrams_test = ngrams_vectorizer.transform(xtest)

Word vectors

In [10]:
# Word vectors 
from gensim.models import Word2Vec

wordvec = Word2Vec(xtrain, window=8, min_count=2, sample=1e-3, sg=1, workers=8)
vocab = set(wordvec.wv.index_to_key)

num_features = 100

def average_word_vectors(tokens, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    ntokens = 0.
    for t in tokens:
        if t in vocabulary: 
            ntokens = ntokens + 1.
            feature_vector = np.add(feature_vector, model.wv[t])
    if ntokens:
        feature_vector = np.divide(feature_vector, ntokens)
    return feature_vector


word2vec_train = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
               for sent_tokens in xtrain]
avg_word2vec_train = np.array(word2vec_train)

word2vec_test = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
              for sent_tokens in xtest]
avg_word2vec_test = np.array(word2vec_test)

print('Train features shape:', avg_word2vec_train.shape, 
      '\nTest features shape:', avg_word2vec_test.shape)

Train features shape: (4000, 100) 
Test features shape: (1000, 100)


Document vectors

In [11]:
# TF-IDF vector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(xtrain)]
docvec = Doc2Vec(vector_size=100, window=3, min_count=4, workers=4, epochs=40)
docvec.build_vocab(docs)
docvec.train(docs, total_examples=docvec.corpus_count, epochs=docvec.epochs)

from gensim.utils import simple_preprocess
xtrainTokenized = [simple_preprocess(h) for h in xtrain]
xtestTokenized = [simple_preprocess(h) for h in xtest]

docvec_train = [docvec.infer_vector(i) for i in xtrainTokenized]
docvec_test =  [docvec.infer_vector(i) for i in xtestTokenized]

Models to consider:
-
1. One multi-class classifier (e.g., Naive Bayes, Logistic, Decision Tree, SVM)
2. One ensemble classifier whose code is also provided (e.g., Random Forest, XGBoost)
3. One other model of your choice whose code is NOT provided in class handouts

Input features to consider for each model:
-
1. TF-IDF vector of tokenized words
2. TF-IDF vector of n-grams (of range 4-5)
3. Word vectors (Glove, Word2Vec, or FastText)
4. Document vectors (Doc2Vec)

#### Build model

In [12]:
# svc
from sklearn.svm import SVC
svc = SVC()

#### Train model and make predictions

# SVC

In [13]:
# tf-idf
model_SVC_tfidf = svc.fit(dfTfidf_train, ytrain)
svc_tfidf_pred = model_SVC_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_SVC_ngramTfidf = svc.fit(dfTfidf_ngrams_train, ytrain)
svc_ngram_tfidf_pred = model_SVC_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_SVC_word2vec = svc.fit(avg_word2vec_train, ytrain)
svc_word2vec_pred = model_SVC_word2vec.predict(avg_word2vec_test)

# document vectors
model_SVC_doc2vec = svc.fit(docvec_train, ytrain)
svc_doc2vec_pred = model_SVC_doc2vec.predict(docvec_test)

#### Evaluate model

In [None]:
acc = get_scorer("accuracy")

acc._score_func(ytrue, ypred)

In [14]:
# svc tf-idf
print("\nTF-IDF\n", classification_report(ytest, svc_tfidf_pred)) # not removing stopwords improves the model significantly


TF-IDF
               precision    recall  f1-score   support

           0       0.90      0.83      0.87        90
           1       0.53      0.83      0.65       167
           2       0.69      0.49      0.57       161
           3       0.67      0.65      0.66       168
           4       0.98      0.97      0.98       153
           5       0.70      0.49      0.57       101
           6       0.93      0.92      0.92       160

    accuracy                           0.75      1000
   macro avg       0.77      0.74      0.75      1000
weighted avg       0.76      0.75      0.75      1000



In [15]:
# svc n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, svc_ngram_tfidf_pred)) 


TF-IDF n-grams
               precision    recall  f1-score   support

           0       0.85      0.74      0.79        90
           1       0.81      0.89      0.85       167
           2       0.88      0.66      0.76       161
           3       0.60      0.83      0.69       168
           4       0.78      0.82      0.80       153
           5       0.77      0.48      0.59       101
           6       0.85      0.84      0.85       160

    accuracy                           0.77      1000
   macro avg       0.79      0.75      0.76      1000
weighted avg       0.79      0.77      0.77      1000



In [16]:
# word vectors
print("\nWord vectors\n", classification_report(ytest, svc_word2vec_pred)) 


Word vectors
               precision    recall  f1-score   support

           0       0.90      0.10      0.18        90
           1       0.47      0.87      0.61       167
           2       0.74      0.52      0.61       161
           3       0.45      0.39      0.41       168
           4       0.54      0.56      0.55       153
           5       0.78      0.07      0.13       101
           6       0.47      0.74      0.58       160

    accuracy                           0.52      1000
   macro avg       0.62      0.47      0.44      1000
weighted avg       0.59      0.52      0.48      1000



In [17]:
# document vectors
print("\nDocument Vectors\n", classification_report(ytest, svc_doc2vec_pred)) 


Document Vectors
               precision    recall  f1-score   support

           0       1.00      0.14      0.25        90
           1       0.80      0.60      0.68       167
           2       0.23      0.35      0.28       161
           3       0.19      0.34      0.25       168
           4       0.44      0.31      0.37       153
           5       0.98      0.41      0.57       101
           6       0.23      0.25      0.24       160

    accuracy                           0.36      1000
   macro avg       0.55      0.34      0.38      1000
weighted avg       0.50      0.36      0.38      1000



### Hyperparameter optimization

In [18]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf', 'sigmoid']} 
  
grid = GridSearchCV(svc, param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(dfTfidf_train, ytrain)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
grid_predictions = grid.predict(dfTfidf_test)
  
# print classification report
print(classification_report(ytest, grid_predictions))

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.672 total time=   0.3s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.677 total time=   0.3s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.639 total time=   0.3s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.657 total time=   0.3s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.657 total time=   0.3s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=   0.7s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.693 total time=   0.4s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.681 total time=   0.4s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.682 total time=   0.5s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=   0.5s
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.651 total time=   0.4s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid

### Select Model

# XGBoost

Works well with scaled/normalized data

In [19]:
# xgboost
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

In [20]:
# tf-idf
model_xgb_tfidf = xgb_cl.fit(dfTfidf_train, ytrain)
xgb_tfidf_pred = model_xgb_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_xgb_ngramTfidf = xgb_cl.fit(dfTfidf_ngrams_train, ytrain)
xgb_ngram_tfidf_pred = model_xgb_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_xgb_word2vec = xgb_cl.fit(avg_word2vec_train, ytrain)
xgb_word2vec_pred = model_xgb_word2vec.predict(avg_word2vec_test)

# document vectors
model_xgb_doc2vec = xgb_cl.fit(docvec_train, ytrain)
xgb_doc2vec_pred = model_xgb_doc2vec.predict(docvec_test)

In [21]:
# xgboost tf-idf
print("\nTF-IDF\n", classification_report(ytest, xgb_tfidf_pred)) # not removing stopwords improves the model significantly


TF-IDF
               precision    recall  f1-score   support

           0       0.95      0.82      0.88        90
           1       0.54      0.84      0.66       167
           2       0.75      0.50      0.60       161
           3       0.68      0.68      0.68       168
           4       0.93      0.98      0.95       153
           5       0.81      0.53      0.64       101
           6       0.96      0.94      0.95       160

    accuracy                           0.76      1000
   macro avg       0.80      0.76      0.77      1000
weighted avg       0.79      0.76      0.76      1000



In [22]:
# xgboost n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, xgb_ngram_tfidf_pred)) 


TF-IDF n-grams
               precision    recall  f1-score   support

           0       0.91      0.79      0.85        90
           1       0.81      0.90      0.86       167
           2       0.87      0.70      0.77       161
           3       0.60      0.82      0.69       168
           4       0.78      0.82      0.80       153
           5       0.86      0.50      0.64       101
           6       0.87      0.87      0.87       160

    accuracy                           0.79      1000
   macro avg       0.82      0.77      0.78      1000
weighted avg       0.81      0.79      0.79      1000



In [23]:
# xgboost word vectors
print("\nWord vectors\n", classification_report(ytest, xgb_word2vec_pred)) 


Word vectors
               precision    recall  f1-score   support

           0       0.86      0.86      0.86        90
           1       0.94      0.90      0.92       167
           2       0.76      0.81      0.78       161
           3       0.73      0.68      0.70       168
           4       0.89      0.93      0.91       153
           5       0.89      0.81      0.85       101
           6       0.88      0.94      0.91       160

    accuracy                           0.84      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.84      0.84      1000



In [24]:
# xgboost document vectors
print("\nDocument Vectors\n", classification_report(ytest, xgb_doc2vec_pred)) 


Document Vectors
               precision    recall  f1-score   support

           0       0.51      0.22      0.31        90
           1       0.81      0.66      0.73       167
           2       0.22      0.30      0.26       161
           3       0.21      0.27      0.24       168
           4       0.36      0.37      0.36       153
           5       0.94      0.59      0.73       101
           6       0.27      0.29      0.28       160

    accuracy                           0.39      1000
   macro avg       0.48      0.39      0.42      1000
weighted avg       0.45      0.39      0.41      1000



### Hyperparameter optimization

# KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

In [26]:
# tf-idf
model_knn_tfidf = knn_clf.fit(dfTfidf_train, ytrain)
knn_tfidf_pred = model_knn_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_knn_ngramTfidf = knn_clf.fit(dfTfidf_ngrams_train, ytrain)
knn_ngram_tfidf_pred = model_knn_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_knn_word2vec = knn_clf.fit(avg_word2vec_train, ytrain)
knn_word2vec_pred = model_knn_word2vec.predict(avg_word2vec_test)

# document vectors
model_knn_doc2vec = knn_clf.fit(docvec_train, ytrain)
knn_doc2vec_pred = model_knn_doc2vec.predict(docvec_test)

In [27]:
# knn n-grams tf-idf
print("\nTF-IDF\n", classification_report(ytest, knn_tfidf_pred))


TF-IDF
               precision    recall  f1-score   support

           0       0.38      0.93      0.54        90
           1       0.75      0.54      0.63       167
           2       0.63      0.42      0.50       161
           3       0.63      0.56      0.59       168
           4       0.92      0.95      0.94       153
           5       0.70      0.58      0.64       101
           6       0.93      0.94      0.93       160

    accuracy                           0.69      1000
   macro avg       0.71      0.70      0.68      1000
weighted avg       0.73      0.69      0.69      1000



In [28]:
# knn n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, knn_ngram_tfidf_pred)) 


TF-IDF n-grams
               precision    recall  f1-score   support

           0       0.76      0.74      0.75        90
           1       0.56      0.87      0.68       167
           2       0.83      0.68      0.75       161
           3       0.66      0.63      0.64       168
           4       0.72      0.79      0.76       153
           5       0.78      0.43      0.55       101
           6       0.87      0.76      0.81       160

    accuracy                           0.71      1000
   macro avg       0.74      0.70      0.71      1000
weighted avg       0.74      0.71      0.71      1000



In [29]:
# knn word vectors
print("\nWord vectors\n", classification_report(ytest, knn_word2vec_pred)) 


Word vectors
               precision    recall  f1-score   support

           0       0.67      0.88      0.76        90
           1       0.89      0.84      0.87       167
           2       0.76      0.83      0.79       161
           3       0.79      0.48      0.59       168
           4       0.80      0.86      0.83       153
           5       0.86      0.75      0.80       101
           6       0.74      0.91      0.81       160

    accuracy                           0.79      1000
   macro avg       0.79      0.79      0.78      1000
weighted avg       0.79      0.79      0.78      1000



In [30]:
# knn document vectors
print("\nDocument Vectors\n", classification_report(ytest, knn_doc2vec_pred)) 


Document Vectors
               precision    recall  f1-score   support

           0       0.16      0.20      0.17        90
           1       0.49      0.61      0.54       167
           2       0.18      0.20      0.19       161
           3       0.22      0.23      0.22       168
           4       0.36      0.31      0.33       153
           5       0.35      0.38      0.36       101
           6       0.29      0.16      0.21       160

    accuracy                           0.30      1000
   macro avg       0.29      0.30      0.29      1000
weighted avg       0.30      0.30      0.30      1000



# BERT 

We use PyTorch in this case. Refere here: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f and here: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894.

In [None]:
# pip install transformers

#from transformers import BertTokenizer

#tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Explainable AI: LIME/SHAP

In [None]:
# LIME requires a model pipeline as input; so we have to convert the TDIDF vectorizer into a pipeline

from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

pipeline = make_pipeline(tfidf, model)
list_X_test = list(X_test)                         # Save X_test as a list of strings
class_names = {0: 'non-disaster', 1:'disaster'}  # Save class names in a dictionary for interpretability

In [None]:
# Create the LIME explainer, with class names for interpretability
exp = LimeTextExplainer(class_names=["Non-disaster", "Disaster"])

# Choose a random single prediction and explain the chosen prediction using the probability results 
# of the logistic regression predict_proba
idx = 18
idx_exp = exp.explain_instance(list_X_test[idx], pipeline.predict_proba)

# Print results
print('Document ID: %d' % idx)
print('Tweet: ', list_X_test[idx])
print('Probability disaster =', pipeline.predict_proba([list_X_test[idx]]).round(3)[0,1])
print('True class: %s' % class_names.get(list(y_test)[idx]))

In [None]:
# Display LIME results graphically, showing contribution of each word

idx_exp.show_in_notebook(text=True)

# SHAP

In [None]:
%%time
import shap

# SHAP is quite computation intensive; we sample data from train and test set to reduce time taken
X_train_sample = shap.sample(X_train_tfidf, 200)
X_test_sample = shap.sample(X_test_tfidf, 10)

# Using SHAP's KernelExplainer (very slow)
exp = shap.KernelExplainer(model.predict, X_train_sample)

# Calculate shap values of test sample using the explainer 
shap_values = exp.shap_values(X_test_sample)

In [None]:
# Create summary plot of shap values
# To do this, we must first convert test samples to a dataframe in order to add feature values 
# to non-tabular data for the visualisation 

color_test = pd.DataFrame(X_test_sample.todense())
shap.summary_plot(shap_values, color_test, feature_names=tfidf.get_feature_names())

In [None]:
# Violin plot to better display the distribution of shapley values 
shap.summary_plot(shap_values, color_test, feature_names=tfidf.get_feature_names(), plot_type="violin")

In [None]:
shap.initjs()
shap.force_plot(exp.expected_value, shap_values[1,:], 
                color_test.iloc[1,:], feature_names=tfidf.get_feature_names())