## Classifying Customer Queries 

In [1]:
#importing libs
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC

import numpy as np
import pandas as pd

In [2]:
#data cleaning
data_url=("https://raw.githubusercontent.com/busyML/Customer-Queries-Classification-NLP/master/NLP%20INSTANTEACH%20ML%20-%20Sheet1.csv")
data=pd.read_csv(data_url)
data.head()

Unnamed: 0,Input Query,Category
0,be or do,0
1,comparatives and superlatives,0
2,how to teach overcoming obstacles concept wit...,1
3,Present perfect simple or continuous,0
4,-,0


0 - Low level query <br>
1 - Urgent query (needs immediate attention)

In [12]:
#shuffling dataset to curb any unnecessary biases
data = data.sample(frac = 1, random_state=11)
data.head()
data.shape

(1084, 2)

## Vectorization
Method 1. Counting word frequency - Bag of words

In [10]:
count_vec = CountVectorizer()
count_example = (count_vec.fit_transform(data["Input Query"].values.astype('U'))).toarray()
count_example = pd.DataFrame(count_example)
vocab_list = list(count_vec.get_feature_names())

i=0
for i in range(len(count_example.columns)):
    count_example.rename(columns={i: vocab_list[i]}, inplace=True)


In [8]:
count_example.to_csv("countexample.csv")

Method 2: Hashing Vectorization

In [17]:
vec = HashingVectorizer(n_features = 2**10, norm="l1")
vec_counts = (vec.fit_transform(data["Input Query"].values.astype('U'))).toarray()
train = pd.DataFrame(vec_counts)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.027778,...,0.0,0.0,0.0,0.0,0.0,-0.027778,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
vec_counts.shape

(1084, 1024)

In [21]:
train_rows, train_cols = train.shape
pca_compressor = PCA(0.99)
comp_train = pd.DataFrame(pca_compressor.fit_transform(train))
comp_rows, comp_cols = comp_train.shape
print("number of columns before compression:",train_cols,"\n","number of columns after compression:" ,comp_cols)
comp_train.head()

number of columns before compression: 1024 
 number of columns after compression: 371


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,361,362,363,364,365,366,367,368,369,370
0,-0.018548,-0.009943,-0.041752,-0.009834,0.038122,-0.020786,0.03267,-0.002762,-0.022673,0.002803,...,0.000165,0.002007,0.000524,0.005563,0.005353,0.003982,-0.002125,0.005118,-0.002865,-0.002994
1,0.144641,-0.030201,-0.024569,-0.010797,0.046357,0.01659,0.104466,-0.018984,-0.016666,0.089007,...,0.000126,-0.004897,-0.001377,-0.005838,-0.000881,0.003819,0.00419,0.005479,-0.001715,-0.003087
2,-0.036206,-0.031946,0.059021,-0.153236,0.376132,0.819627,-0.13438,-0.002157,0.215932,-0.243564,...,0.000546,-0.000513,-4.7e-05,0.000207,-0.00024,-0.00101,0.001648,-0.00038,0.000367,0.001204
3,-0.017365,-0.006946,-0.032016,-0.010879,0.022846,-0.024185,-0.008863,0.004951,-0.010268,-0.015265,...,0.00011,0.000158,8.5e-05,3.7e-05,-8.8e-05,2.8e-05,1.1e-05,-3e-05,-6e-05,8.1e-05
4,0.311567,-0.059424,-0.079415,0.240529,-0.139072,0.096719,-0.001947,0.005382,-0.000205,0.01451,...,-0.002816,-0.003351,0.000699,0.001242,0.000354,2.3e-05,0.001958,-0.001988,-0.00283,-0.000235


## Training

In [22]:
train_ans = data['Category']
train_ans.head()

640    1
336    0
972    0
177    0
328    0
Name: Category, dtype: int64

In [24]:
#adjusting the weights
class_weights = {0:0.13, 1:0.87}
#fitting SVC
svc_model = LinearSVC(C=7, dual = True, loss="squared_hinge", penalty = "l2", tol=1e-7, class_weight=class_weights)
svc_model.fit(comp_train, train_ans)

LinearSVC(C=7, class_weight={0: 0.13, 1: 0.87}, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=1e-07,
     verbose=0)

## Evaluation

In [26]:
TP=0
TN=0
FP=0
FN=0

for i in range(len(train)):
    if svc_model.predict(comp_train.iloc[[i,]])==1 and train_ans.iloc[i]==1:
        TP = TP + 1
    if svc_model.predict(comp_train.iloc[[i,]])==1 and train_ans.iloc[i]==0:
        FP = FP + 1
    if svc_model.predict(comp_train.iloc[[i,]])==0 and train_ans.iloc[i]==0:
        TN = TN + 1
    if svc_model.predict(comp_train.iloc[[i,]])==0 and train_ans.iloc[i]==1:
        FN = FN + 1
        
print("Model", "True Pos - ",TP, "False Pos -",FP,"True Neg - ", TN, "False neg - ",FN)

model_accuracy = (TP+TN)/(len(train_ans))
model_precision = TP/ (TP+FP)
model_recall = TP/(TP+FN)
model_f1 = 2*(model_precision * model_recall) / (model_precision + model_recall)

print("Model's Accuracy:", model_accuracy * 100, '%')
print("Model´s Precision:", model_precision *100,'%')
print("Model´s Recall:", model_recall *100,'%')
print("Model´s F1:", model_f1 *100,'%')


Model True Pos -  191 False Pos - 125 True Neg -  767 False neg -  1
Model's Accuracy: 88.37638376383764 %
Model´s Precision: 60.44303797468354 %
Model´s Recall: 99.47916666666666 %
Model´s F1: 75.1968503937008 %


## Prediction

In [32]:
def query_classifier(new_input):
  #we get the raw text into a "list" format with these brackets
  new_input=[new_input]

  #Now we need to format it in the same way we formated our training data. we first apply to it the hash vectorization to get it into the same format
  new_input_vectorized = vec.fit_transform(new_input)
  new_input_vectorized=pd.DataFrame(new_input_vectorized.toarray())
  
  #Now that we have the hash vectors, we can compress it using PCA (we actually need to add to the training set because PCA compresses the data in function of other data)
  compressing_new_input= train.append(new_input_vectorized, ignore_index=True)
  pca_input_compressor= PCA(n_components=comp_cols, svd_solver='full')
  
  #We compress the data...
  compressing_new_input= pd.DataFrame(pca_input_compressor.fit_transform(compressing_new_input))
  
  #And now we extract the last row that corresponds to the last row which is our new formatted input that we want to predict
  new_input_compressed = compressing_new_input.iloc[[(len(compressing_new_input.index)-1),]]

  #Now we use the ".predict" function to classify the text as "0" or "1"
  prediction=svc_model.predict(new_input_compressed)
  
  if prediction==0:
    print(prediction)
    print("Not to worry, we can deal with this query automatically. This is not an urgent request!")
  else:
    print(prediction)
    print("Human, help please! This request is too complex and specific... Please do it manually")
    

new_input= (input("input new text:"))


query_classifier(new_input)

input new text: do you have this


[0]
Not to worry, we can deal with this query automatically. This is not an urgent request!
