# Customer Service Query Classification 

### Import libaries
---

In [19]:
import pandas as pd
import numpy as np
import missingno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import random

random.seed(42)

In [3]:
# dictionary to store metrics of different models

modelMetrics = {
                'svc': {'tfidf': [], 'tfidf_ngrams': [], 'word2vec': [], 'doc2vec': []},
                'xgboost': {'tfidf': [], 'tfidf_ngrams': [], 'word2vec': [], 'doc2vec': []},
                'knn': {'tfidf': [], 'tfidf_ngrams': [], 'word2vec': [], 'doc2vec': []}
                }

### Load data
---

In [4]:
df = pd.read_csv("data/Customer_Service_Questions_Multiclass.csv")

### Initial data preparation
---

In [None]:
# check for missing values
# missingno.matrix(df) # there are no missing values

In [None]:
#'''
# distribution of the topics column
sns.countplot(x=df["topic"], color = "grey")
plt.title("Distribution by Department")
plt.xlabel("Topic")
plt.ylabel("Frequency")
plt.xticks(rotation=65)
#'''

In [5]:
#'''
# encoding the labels
encoder = LabelEncoder()

encoder.fit(df['topic'])
df['category'] = encoder.transform(df['topic'])
#'''

## Train-test split
---

In [6]:
# train-test split
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df.question, df.category, stratify=df.category, test_size=0.2, random_state=42)

## Get data in different formats

1. TF-IDF vector
2. TF-IDF vector of n-grams
3. Word vectors (GloVe)
4. Document vectors (Doc2Vec)
---

TF-IDF vectors

In [7]:
# TF-IDF vector
vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', 
                            stop_words='english', max_df=0.95, min_df=0.05, max_features=500
                            )
vectorizer.fit(df.question)

dfTfidf_train = vectorizer.transform(xtrain)
dfTfidf_test = vectorizer.transform(xtest)

TF-IDF n-grams vectors

In [8]:
# TF-IDF vector of n-grams
ngrams_vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', ngram_range=(2,3), 
                            max_df=0.95, min_df=0.05, max_features=500
                            )
ngrams_vectorizer.fit(df.question)

dfTfidf_ngrams_train = ngrams_vectorizer.transform(xtrain)
dfTfidf_ngrams_test = ngrams_vectorizer.transform(xtest)

Word vectors

In [10]:
# Word vectors 
from gensim.models import Word2Vec

wordvec = Word2Vec(xtrain, window=8, min_count=2, sample=1e-3, sg=1, workers=8)
vocab = set(wordvec.wv.index_to_key)

num_features = 100

def average_word_vectors(tokens, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    ntokens = 0.
    for t in tokens:
        if t in vocabulary: 
            ntokens = ntokens + 1.
            feature_vector = np.add(feature_vector, model.wv[t])
    if ntokens:
        feature_vector = np.divide(feature_vector, ntokens)
    return feature_vector


word2vec_train = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
               for sent_tokens in xtrain]
avg_word2vec_train = np.array(word2vec_train)

word2vec_test = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
              for sent_tokens in xtest]
avg_word2vec_test = np.array(word2vec_test)

print('Train features shape:', avg_word2vec_train.shape, 
      '\nTest features shape:', avg_word2vec_test.shape)

Train features shape: (4000, 100) 
Test features shape: (1000, 100)


Document vectors

In [11]:
# TF-IDF vector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(xtrain)]
docvec = Doc2Vec(vector_size=100, window=3, min_count=4, workers=4, epochs=40)
docvec.build_vocab(docs)
docvec.train(docs, total_examples=docvec.corpus_count, epochs=docvec.epochs)

from gensim.utils import simple_preprocess
xtrainTokenized = [simple_preprocess(h) for h in xtrain]
xtestTokenized = [simple_preprocess(h) for h in xtest]

docvec_train = [docvec.infer_vector(i) for i in xtrainTokenized]
docvec_test =  [docvec.infer_vector(i) for i in xtestTokenized]

Models to consider:
-
1. One multi-class classifier (e.g., Naive Bayes, Logistic, Decision Tree, SVM)
2. One ensemble classifier whose code is also provided (e.g., Random Forest, XGBoost)
3. One other model of your choice whose code is NOT provided in class handouts

Input features to consider for each model:
-
1. TF-IDF vector of tokenized words
2. TF-IDF vector of n-grams (of range 4-5)
3. Word vectors (Glove, Word2Vec, or FastText)
4. Document vectors (Doc2Vec)

#### Build model

In [12]:
# svc
from sklearn.svm import SVC
svc = SVC()

#### Train model and make predictions

# SVC

In [13]:
# tf-idf
model_SVC_tfidf = svc.fit(dfTfidf_train, ytrain)
svc_tfidf_pred = model_SVC_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_SVC_ngramTfidf = svc.fit(dfTfidf_ngrams_train, ytrain)
svc_ngram_tfidf_pred = model_SVC_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_SVC_word2vec = svc.fit(avg_word2vec_train, ytrain)
svc_word2vec_pred = model_SVC_word2vec.predict(avg_word2vec_test)

# document vectors
model_SVC_doc2vec = svc.fit(docvec_train, ytrain)
svc_doc2vec_pred = model_SVC_doc2vec.predict(docvec_test)

#### Evaluate model

In [14]:
# svc tf-idf
print("\nTF-IDF\n", classification_report(ytest, svc_tfidf_pred)) # not removing stopwords improves the model significantly


TF-IDF
               precision    recall  f1-score   support

           0       0.90      0.83      0.87        90
           1       0.53      0.83      0.65       167
           2       0.69      0.49      0.57       161
           3       0.67      0.65      0.66       168
           4       0.98      0.97      0.98       153
           5       0.70      0.49      0.57       101
           6       0.93      0.92      0.92       160

    accuracy                           0.75      1000
   macro avg       0.77      0.74      0.75      1000
weighted avg       0.76      0.75      0.75      1000



In [None]:
# svc n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, svc_ngram_tfidf_pred)) 

In [None]:
# word vectors
print("\nWord vectors\n", classification_report(ytest, svc_word2vec_pred)) 

In [None]:
# document vectors
print("\nDocument Vectors\n", classification_report(ytest, svc_doc2vec_pred)) 

### Hyperparameter optimization

In [21]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf', 'sigmoid']} 
  
grid = GridSearchCV(svc, param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(dfTfidf_train, ytrain)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.672 total time=   0.4s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.677 total time=   0.5s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.639 total time=   0.3s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.657 total time=   0.3s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.657 total time=   0.3s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=   0.5s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.693 total time=   0.6s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.681 total time=   0.6s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.682 total time=   0.4s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.705 total time=   0.5s
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.651 total time=   0.5s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid

In [22]:
# print best parameter after tuning
print(grid.best_params_)

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}


In [23]:
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

SVC(C=100, gamma=1)


In [24]:
grid_predictions = grid.predict(dfTfidf_test)
  
# print classification report
print(classification_report(ytest, grid_predictions))

              precision    recall  f1-score   support

           0       0.90      0.83      0.87        90
           1       0.54      0.84      0.66       167
           2       0.74      0.47      0.58       161
           3       0.66      0.65      0.65       168
           4       0.94      0.97      0.96       153
           5       0.78      0.51      0.62       101
           6       0.94      0.94      0.94       160

    accuracy                           0.75      1000
   macro avg       0.78      0.75      0.75      1000
weighted avg       0.77      0.75      0.75      1000



### Select Model

# XGBoost

Works well with scaled/normalized data

In [None]:
# pip install xgboost

In [None]:
#dfTfidf_train
#dfTfidf_ngrams_train
#avg_word2vec_train
#docvec_train

In [None]:
# xgboost
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

In [None]:
# tf-idf
model_xgb_tfidf = xgb_cl.fit(dfTfidf_train, ytrain)
xgb_tfidf_pred = model_xgb_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_xgb_ngramTfidf = xgb_cl.fit(dfTfidf_ngrams_train, ytrain)
xgb_ngram_tfidf_pred = model_xgb_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_xgb_word2vec = xgb_cl.fit(avg_word2vec_train, ytrain)
xgb_word2vec_pred = model_xgb_word2vec.predict(avg_word2vec_test)

# document vectors
model_xgb_doc2vec = xgb_cl.fit(docvec_train, ytrain)
xgb_doc2vec_pred = model_xgb_doc2vec.predict(docvec_test)

In [None]:
# xgboost tf-idf
print("\nTF-IDF\n", classification_report(ytest, xgb_tfidf_pred)) # not removing stopwords improves the model significantly

In [None]:
# xgboost n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, xgb_ngram_tfidf_pred)) 

In [None]:
# xgboost word vectors
print("\nWord vectors\n", classification_report(ytest, xgb_word2vec_pred)) 

In [None]:
# xgboost document vectors
print("\nDocument Vectors\n", classification_report(ytest, xgb_doc2vec_pred)) 

### Hyperparameter optimization

# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

In [10]:
# tf-idf
model_knn_tfidf = knn_clf.fit(dfTfidf_train, ytrain)
knn_tfidf_pred = model_knn_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_knn_ngramTfidf = knn_clf.fit(dfTfidf_ngrams_train, ytrain)
knn_ngram_tfidf_pred = model_knn_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_knn_word2vec = knn_clf.fit(avg_word2vec_train, ytrain)
knn_word2vec_pred = model_knn_word2vec.predict(avg_word2vec_test)

# document vectors
model_knn_doc2vec = knn_clf.fit(docvec_train, ytrain)
knn_doc2vec_pred = model_knn_doc2vec.predict(docvec_test)

In [13]:
# knn n-grams tf-idf
print("\nTF-IDF\n", classification_report(ytest, knn_tfidf_pred))


TF-IDF
               precision    recall  f1-score   support

           0       0.91      0.83      0.87        90
           1       0.72      0.56      0.63       167
           2       0.39      0.58      0.47       161
           3       0.67      0.62      0.65       168
           4       0.94      0.95      0.94       153
           5       0.67      0.52      0.59       101
           6       0.92      0.91      0.91       160

    accuracy                           0.71      1000
   macro avg       0.75      0.71      0.72      1000
weighted avg       0.74      0.71      0.72      1000



In [15]:
# knn n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, knn_ngram_tfidf_pred)) 


TF-IDF n-grams
               precision    recall  f1-score   support

           0       0.80      0.72      0.76        90
           1       0.79      0.85      0.82       167
           2       0.63      0.81      0.71       161
           3       0.67      0.63      0.65       168
           4       0.72      0.75      0.73       153
           5       0.76      0.47      0.58       101
           6       0.83      0.79      0.81       160

    accuracy                           0.73      1000
   macro avg       0.74      0.72      0.72      1000
weighted avg       0.74      0.73      0.73      1000



In [16]:
# knn word vectors
print("\nWord vectors\n", classification_report(ytest, knn_word2vec_pred)) 


Word vectors
               precision    recall  f1-score   support

           0       0.62      0.92      0.74        90
           1       0.88      0.80      0.84       167
           2       0.72      0.80      0.76       161
           3       0.75      0.42      0.54       168
           4       0.76      0.87      0.81       153
           5       0.84      0.70      0.76       101
           6       0.78      0.89      0.83       160

    accuracy                           0.76      1000
   macro avg       0.76      0.77      0.75      1000
weighted avg       0.77      0.76      0.75      1000



In [17]:
# knn document vectors
print("\nDocument Vectors\n", classification_report(ytest, knn_doc2vec_pred)) 


Document Vectors
               precision    recall  f1-score   support

           0       0.20      0.28      0.23        90
           1       0.46      0.63      0.53       167
           2       0.22      0.22      0.22       161
           3       0.20      0.18      0.19       168
           4       0.31      0.31      0.31       153
           5       0.34      0.38      0.36       101
           6       0.19      0.09      0.12       160

    accuracy                           0.29      1000
   macro avg       0.27      0.30      0.28      1000
weighted avg       0.28      0.29      0.28      1000



# BERT 

We use PyTorch in this case. Refere here: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f and here: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894.

In [None]:
# pip install transformers

#from transformers import BertTokenizer

#tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Explainable AI: LIME/SHAP