# Customer Service Query Classification 

### Import libaries
---

In [None]:
#!pip install missingno

In [81]:
import pandas as pd
import numpy as np
import missingno
import matplotlib.pyplot as plt
import seaborn as sns

import random
random.seed(42)

### Load data
---

In [2]:
df = pd.read_csv("data/Customer_Service_Questions_Multiclass.csv")

In [3]:
df.head()

Unnamed: 0,question,topic
0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,I'm going to be out of the country for about a...,Shipping
2,I was wondering if you'd be able to overnight ...,Shipping
3,The Swingline electronic stapler (472555) look...,Shipping
4,I think this cosmetic bag would work great for...,Shipping


### Initial data preparation
---

In [None]:
# check for missing values
# missingno.matrix(df) # there are no missing values

In [None]:
'''
# distribution of the topics column
sns.countplot(x=df["topic"])
plt.xlabel("Topic")
plt.ylabel("Frequency")
plt.xticks(rotation=65)
'''

In [None]:
'''
# encoding the labels
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['category'] = encoder.fit_transform(df['topic'])
'''

## Train-test split
---

In [4]:
# train-test split
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df.question, df.topic, stratify=df.topic, test_size=0.2, random_state=42)

## Get data in different formats

1. TF-IDF vector
2. TF-IDF vector of n-grams
3. Word vectors (GloVe)
4. Document vectors (Doc2Vec)
---

1. TF-IDF vectors

In [None]:
# TF-IDF vector
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', 
                            stop_words='english', max_df=0.95, min_df=0.05, max_features=500
                            )
vectorizer.fit(df.question)

dfTfidf_train = vectorizer.transform(xtrain)
dfTfidf_test = vectorizer.transform(xtest)

2. TF-IDF n-grams vectors

In [76]:
# TF-IDF vector of n-grams
ngrams_vectorizer = TfidfVectorizer(
                            strip_accents="unicode", lowercase=True, analyzer='word', ngram_range=(2,3), 
                            max_df=0.95, min_df=0.05, max_features=500
                            )
ngrams_vectorizer.fit(df.question)

dfTfidf_ngrams_train = ngrams_vectorizer.transform(xtrain)
dfTfidf_ngrams_test = ngrams_vectorizer.transform(xtest)

3. Word vectors

In [82]:
# Word vectors 
from gensim.models import Word2Vec

wordvec = Word2Vec(xtrain, window=8, min_count=2, sample=1e-3, sg=1, workers=8)
vocab = set(wordvec.wv.index_to_key)

num_features = 100

def average_word_vectors(tokens, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    ntokens = 0.
    for t in tokens:
        if t in vocabulary: 
            ntokens = ntokens + 1.
            feature_vector = np.add(feature_vector, model.wv[t])
    if ntokens:
        feature_vector = np.divide(feature_vector, ntokens)
    return feature_vector


word2vec_train = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
               for sent_tokens in xtrain]
avg_word2vec_train = np.array(word2vec_train)

word2vec_test = [average_word_vectors(sent_tokens, wordvec, vocab, num_features) 
              for sent_tokens in xtest]
avg_word2vec_test = np.array(word2vec_test)

print('Train features shape:', avg_word2vec_train.shape, 
      '\nTest features shape:', avg_word2vec_test.shape)

Train features shape: (4000, 100) 
Test features shape: (1000, 100)


4. Document vectors

In [87]:
# TF-IDF vector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(xtrain)]
docvec = Doc2Vec(vector_size=100, window=3, min_count=4, workers=4, epochs=40)
docvec.build_vocab(docs)
docvec.train(docs, total_examples=docvec.corpus_count, epochs=docvec.epochs)

from gensim.utils import simple_preprocess
xtrainTokenized = [simple_preprocess(h) for h in xtrain]
xtestTokenized = [simple_preprocess(h) for h in xtest]

docvec_train = [docvec.infer_vector(i) for i in xtrainTokenized]
docvec_test =  [docvec.infer_vector(i) for i in xtestTokenized]

Models to consider:
-
1. One multi-class classifier (e.g., Naive Bayes, Logistic, Decision Tree, SVM)
2. One ensemble classifier whose code is also provided (e.g., Random Forest, XGBoost)
3. One other model of your choice whose code is NOT provided in class handouts

Input features to consider for each model:
-
1. TF-IDF vector of tokenized words
2. TF-IDF vector of n-grams (of range 4-5)
3. Word vectors (Glove, Word2Vec, or FastText)
4. Document vectors (Doc2Vec)

#### Build model

In [55]:
# svc
from sklearn.svm import SVC
svc = SVC()

#### Train model and make predictions

# SVC

In [88]:
# tf-idf
model_SVC_tfidf = svc.fit(dfTfidf_train, ytrain)
svc_tfidf_pred = model_SVC_tfidf.predict(dfTfidf_test)

# n-grams tf-idf
model_SVC_ngramTfidf = svc.fit(dfTfidf_ngrams_train, ytrain)
svc_ngram_tfidf_pred = model_SVC_ngramTfidf.predict(dfTfidf_ngrams_test)

# word vectors
model_word2vec = svc.fit(avg_word2vec_train, ytrain)
svc_word2vec_pred = model_word2vec.predict(avg_word2vec_test)

# document vectors
model_doc2vec = svc.fit(docvec_train, ytrain)
svc_doc2vec_pred = model_doc2vec.predict(docvec_test)

#### Evaluate model

In [91]:
# svc tf-idf
from sklearn.metrics import classification_report
print("\nTF-IDF\n", classification_report(ytest, svc_tfidf_pred)) # not removing stopwords improves the model significantly


TF-IDF
                         precision    recall  f1-score   support

           Omnichannel       0.90      0.83      0.87        90
  Product Availability       0.53      0.83      0.65       167
    Product Comparison       0.69      0.49      0.57       161
Product Specifications       0.67      0.65      0.66       168
     Returns & Refunds       0.98      0.97      0.98       153
      Sales/Promotions       0.70      0.49      0.57       101
              Shipping       0.93      0.92      0.92       160

              accuracy                           0.75      1000
             macro avg       0.77      0.74      0.75      1000
          weighted avg       0.76      0.75      0.75      1000



In [92]:
# svc n-grams tf-idf
print("\nTF-IDF n-grams\n", classification_report(ytest, svc_ngram_tfidf_pred)) 


TF-IDF n-grams
                         precision    recall  f1-score   support

           Omnichannel       0.85      0.74      0.79        90
  Product Availability       0.81      0.89      0.85       167
    Product Comparison       0.88      0.66      0.76       161
Product Specifications       0.60      0.83      0.69       168
     Returns & Refunds       0.78      0.82      0.80       153
      Sales/Promotions       0.77      0.48      0.59       101
              Shipping       0.85      0.84      0.85       160

              accuracy                           0.77      1000
             macro avg       0.79      0.75      0.76      1000
          weighted avg       0.79      0.77      0.77      1000



In [93]:
# word vectors
print("\nWord vectors\n", classification_report(ytest, svc_word2vec_pred)) 


Word vectors
                         precision    recall  f1-score   support

           Omnichannel       0.86      0.07      0.12        90
  Product Availability       0.48      0.87      0.62       167
    Product Comparison       0.74      0.50      0.59       161
Product Specifications       0.47      0.46      0.46       168
     Returns & Refunds       0.54      0.58      0.56       153
      Sales/Promotions       0.65      0.11      0.19       101
              Shipping       0.51      0.76      0.61       160

              accuracy                           0.53      1000
             macro avg       0.61      0.48      0.45      1000
          weighted avg       0.59      0.53      0.49      1000



In [97]:
# document vectors
print("\nDocument Vectors\n", classification_report(ytest, svc_doc2vec_pred)) 


Document Vectors
                         precision    recall  f1-score   support

           Omnichannel       1.00      0.16      0.27        90
  Product Availability       0.91      0.58      0.71       167
    Product Comparison       0.21      0.32      0.25       161
Product Specifications       0.22      0.36      0.28       168
     Returns & Refunds       0.41      0.36      0.38       153
      Sales/Promotions       0.96      0.44      0.60       101
              Shipping       0.26      0.29      0.27       160

              accuracy                           0.37      1000
             macro avg       0.57      0.36      0.39      1000
          weighted avg       0.51      0.37      0.39      1000



### Hyperparameter optimization

### Select Model

# XGBoost

### Deploy model?