# Customer Service Query Classification 

### Import libaries
---

In [1]:
#!pip install missingno

In [1]:
import pandas as pd
import numpy as np
import missingno
import matplotlib.pyplot as plt
import seaborn as sns

import random
random.seed(42)

### Load data
---

In [2]:
df = pd.read_csv("data/Customer_Service_Questions_Multiclass.csv")

### Initial data preparation
---

In [5]:
# check for missing values
# missingno.matrix(df) # there are no missing values

In [6]:
'''
# distribution of the topics column
sns.countplot(x=df["topic"])
plt.xlabel("Topic")
plt.ylabel("Frequency")
plt.xticks(rotation=65)
'''

'\n# distribution of the topics column\nsns.countplot(x=df["topic"])\nplt.xlabel("Topic")\nplt.ylabel("Frequency")\nplt.xticks(rotation=65)\n'

In [3]:
#'''
# encoding the labels
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit(df['topic'])
df['category'] = encoder.transform(df['topic'])
#'''

## Train-test split
---

In [4]:
# train-test split
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df.question, df.category, stratify=df.category, test_size=0.2, random_state=42)

## Get data in different formats

1. TF-IDF vector
2. TF-IDF vector of n-grams
3. Word vectors (GloVe)
4. Document vectors (Doc2Vec)
---

TF-IDF vectors

In [5]:
# TF-IDF vector
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', 
                            stop_words='english', max_df=0.95, min_df=0.05, max_features=500
                            )
vectorizer.fit(df.question)

dfTfidf_train = vectorizer.transform(xtrain)
dfTfidf_test = vectorizer.transform(xtest)

TF-IDF n-grams vectors

In [6]:
# TF-IDF vector of n-grams
ngrams_vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', ngram_range=(2,3), 
                            max_df=0.95, min_df=0.05, max_features=500
                            )
ngrams_vectorizer.fit(df.question)

dfTfidf_ngrams_train = ngrams_vectorizer.transform(xtrain)
dfTfidf_ngrams_test = ngrams_vectorizer.transform(xtest)

Word vectors

In [7]:
# Word vectors 
from gensim.models import Word2Vec

wordvec = Word2Vec(xtrain, window=8, min_count=2, sample=1e-3, sg=1, workers=8)
vocab = set(wordvec.wv.index_to_key)

num_features = 100

def average_word_vectors(tokens, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    ntokens = 0.
    for t in tokens:
        if t in vocabulary: 
            ntokens = ntokens + 1.
            feature_vector = np.add(feature_vector, model.wv[t])
    if ntokens:
        feature_vector = np.divide(feature_vector, ntokens)
    return feature_vector


word2vec_train = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
               for sent_tokens in xtrain]
avg_word2vec_train = np.array(word2vec_train)

word2vec_test = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
              for sent_tokens in xtest]
avg_word2vec_test = np.array(word2vec_test)

print('Train features shape:', avg_word2vec_train.shape, 
      '\nTest features shape:', avg_word2vec_test.shape)

Train features shape: (4000, 100) 
Test features shape: (1000, 100)


Document vectors

In [8]:
# TF-IDF vector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(xtrain)]
docvec = Doc2Vec(vector_size=100, window=3, min_count=4, workers=4, epochs=40)
docvec.build_vocab(docs)
docvec.train(docs, total_examples=docvec.corpus_count, epochs=docvec.epochs)

from gensim.utils import simple_preprocess
xtrainTokenized = [simple_preprocess(h) for h in xtrain]
xtestTokenized = [simple_preprocess(h) for h in xtest]

docvec_train = [docvec.infer_vector(i) for i in xtrainTokenized]
docvec_test =  [docvec.infer_vector(i) for i in xtestTokenized]

Models to consider:
-
1. One multi-class classifier (e.g., Naive Bayes, Logistic, Decision Tree, SVM)
2. One ensemble classifier whose code is also provided (e.g., Random Forest, XGBoost)
3. One other model of your choice whose code is NOT provided in class handouts

Input features to consider for each model:
-
1. TF-IDF vector of tokenized words
2. TF-IDF vector of n-grams (of range 4-5)
3. Word vectors (Glove, Word2Vec, or FastText)
4. Document vectors (Doc2Vec)

#### Build model

In [13]:
# svc
from sklearn.svm import SVC
svc = SVC()

#### Train model and make predictions

# SVC

In [14]:
# tf-idf
model_SVC_tfidf = svc.fit(dfTfidf_train, ytrain)
svc_tfidf_pred = model_SVC_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_SVC_ngramTfidf = svc.fit(dfTfidf_ngrams_train, ytrain)
svc_ngram_tfidf_pred = model_SVC_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_SVC_word2vec = svc.fit(avg_word2vec_train, ytrain)
svc_word2vec_pred = model_SVC_word2vec.predict(avg_word2vec_test)

# document vectors
model_SVC_doc2vec = svc.fit(docvec_train, ytrain)
svc_doc2vec_pred = model_SVC_doc2vec.predict(docvec_test)

#### Evaluate model

In [15]:
# svc tf-idf
from sklearn.metrics import classification_report
print("\nTF-IDF\n", classification_report(ytest, svc_tfidf_pred)) # not removing stopwords improves the model significantly


TF-IDF
                         precision    recall  f1-score   support

           Omnichannel       0.90      0.83      0.87        90
  Product Availability       0.53      0.83      0.65       167
    Product Comparison       0.69      0.49      0.57       161
Product Specifications       0.67      0.65      0.66       168
     Returns & Refunds       0.98      0.97      0.98       153
      Sales/Promotions       0.70      0.49      0.57       101
              Shipping       0.93      0.92      0.92       160

              accuracy                           0.75      1000
             macro avg       0.77      0.74      0.75      1000
          weighted avg       0.76      0.75      0.75      1000



In [16]:
# svc n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, svc_ngram_tfidf_pred)) 


TF-IDF n-grams
                         precision    recall  f1-score   support

           Omnichannel       0.85      0.74      0.79        90
  Product Availability       0.81      0.89      0.85       167
    Product Comparison       0.88      0.66      0.76       161
Product Specifications       0.60      0.83      0.69       168
     Returns & Refunds       0.78      0.82      0.80       153
      Sales/Promotions       0.77      0.48      0.59       101
              Shipping       0.85      0.84      0.85       160

              accuracy                           0.77      1000
             macro avg       0.79      0.75      0.76      1000
          weighted avg       0.79      0.77      0.77      1000



In [17]:
# word vectors
print("\nWord vectors\n", classification_report(ytest, svc_word2vec_pred)) 


Word vectors
                         precision    recall  f1-score   support

           Omnichannel       0.86      0.07      0.12        90
  Product Availability       0.48      0.86      0.62       167
    Product Comparison       0.72      0.48      0.57       161
Product Specifications       0.45      0.40      0.43       168
     Returns & Refunds       0.55      0.59      0.57       153
      Sales/Promotions       0.87      0.13      0.22       101
              Shipping       0.46      0.73      0.56       160

              accuracy                           0.52      1000
             macro avg       0.63      0.47      0.44      1000
          weighted avg       0.59      0.52      0.48      1000



In [18]:
# document vectors
print("\nDocument Vectors\n", classification_report(ytest, svc_doc2vec_pred)) 


Document Vectors
                         precision    recall  f1-score   support

           Omnichannel       1.00      0.16      0.27        90
  Product Availability       0.82      0.60      0.69       167
    Product Comparison       0.21      0.30      0.25       161
Product Specifications       0.22      0.36      0.27       168
     Returns & Refunds       0.44      0.32      0.37       153
      Sales/Promotions       1.00      0.46      0.63       101
              Shipping       0.24      0.28      0.26       160

              accuracy                           0.36      1000
             macro avg       0.56      0.35      0.39      1000
          weighted avg       0.50      0.36      0.39      1000



### Hyperparameter optimization

### Select Model

# XGBoost

Works well with scaled/normalized data

In [34]:
# pip install xgboost

In [35]:
#dfTfidf_train
#dfTfidf_ngrams_train
#avg_word2vec_train
#docvec_train

In [9]:
# xgboost
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

In [10]:
# tf-idf
model_xgb_tfidf = xgb_cl.fit(dfTfidf_train, ytrain)
xgb_tfidf_pred = model_xgb_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_xgb_ngramTfidf = xgb_cl.fit(dfTfidf_ngrams_train, ytrain)
xgb_ngram_tfidf_pred = model_xgb_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_xgb_word2vec = xgb_cl.fit(avg_word2vec_train, ytrain)
xgb_word2vec_pred = model_xgb_word2vec.predict(avg_word2vec_test)

# document vectors
model_xgb_doc2vec = xgb_cl.fit(docvec_train, ytrain)
xgb_doc2vec_pred = model_xgb_doc2vec.predict(docvec_test)

In [12]:
# svc tf-idf
print("\nTF-IDF\n", classification_report(ytest, xgb_tfidf_pred)) # not removing stopwords improves the model significantly


TF-IDF
               precision    recall  f1-score   support

           0       0.95      0.82      0.88        90
           1       0.54      0.84      0.66       167
           2       0.75      0.50      0.60       161
           3       0.68      0.68      0.68       168
           4       0.93      0.98      0.95       153
           5       0.81      0.53      0.64       101
           6       0.96      0.94      0.95       160

    accuracy                           0.76      1000
   macro avg       0.80      0.76      0.77      1000
weighted avg       0.79      0.76      0.76      1000



In [13]:
# svc n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, xgb_ngram_tfidf_pred)) 


TF-IDF n-grams
               precision    recall  f1-score   support

           0       0.91      0.79      0.85        90
           1       0.81      0.90      0.86       167
           2       0.87      0.70      0.77       161
           3       0.60      0.82      0.69       168
           4       0.78      0.82      0.80       153
           5       0.86      0.50      0.64       101
           6       0.87      0.87      0.87       160

    accuracy                           0.79      1000
   macro avg       0.82      0.77      0.78      1000
weighted avg       0.81      0.79      0.79      1000



In [14]:
# word vectors
print("\nWord vectors\n", classification_report(ytest, xgb_word2vec_pred)) 


Word vectors
               precision    recall  f1-score   support

           0       0.91      0.86      0.88        90
           1       0.89      0.89      0.89       167
           2       0.74      0.81      0.77       161
           3       0.72      0.69      0.71       168
           4       0.90      0.90      0.90       153
           5       0.87      0.80      0.84       101
           6       0.88      0.91      0.89       160

    accuracy                           0.83      1000
   macro avg       0.84      0.83      0.84      1000
weighted avg       0.84      0.83      0.83      1000



In [15]:
# document vectors
print("\nDocument Vectors\n", classification_report(ytest, xgb_doc2vec_pred)) 


Document Vectors
               precision    recall  f1-score   support

           0       0.56      0.26      0.35        90
           1       0.82      0.65      0.72       167
           2       0.22      0.28      0.24       161
           3       0.25      0.33      0.28       168
           4       0.48      0.37      0.42       153
           5       0.84      0.61      0.71       101
           6       0.21      0.28      0.24       160

    accuracy                           0.39      1000
   macro avg       0.48      0.40      0.42      1000
weighted avg       0.46      0.39      0.41      1000



### Hyperparameter optimization

# BERT 

We use PyTorch in this case. Refere here: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f.

In [None]:
# pip install transformers

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

### Deploy model?