In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

In [3]:
#Load the dataset

urls = "https://github.com/sambit9238/Machine-Learning/raw/master/question_topic.csv"

df = pd.read_csv(urls)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,question_text,question_topic
0,0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,1,I'm going to be out of the country for about a...,Shipping
2,2,I was wondering if you'd be able to overnight ...,Shipping
3,3,The Swingline electronic stapler (472555) look...,Shipping
4,4,I think this cosmetic bag would work great for...,Shipping


In [5]:
df.shape

(5000, 3)

In [6]:
set(df["question_topic"])

{'Omnichannel',
 'Product Availability',
 'Product Comparison',
 'Product Specifications',
 'Returns & Refunds',
 'Sales/Promotions',
 'Shipping'}

In [7]:
from collections import Counter
Counter(df["question_topic"])

Counter({'Sales/Promotions': 505,
         'Shipping': 799,
         'Product Availability': 833,
         'Product Specifications': 839,
         'Omnichannel': 450,
         'Product Comparison': 806,
         'Returns & Refunds': 768})

In [8]:
#pre-processing
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [9]:
df.columns


Index(['Unnamed: 0', 'question_text', 'question_topic'], dtype='object')

In [10]:
#train test split
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["question_topic"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [11]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [12]:
#pipeline of feature engineering and model
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [14]:
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

In [15]:
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.9683999999999999
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [16]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [17]:
#fit model with training data
model.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced')))])

In [18]:
#evaluation on test data
pred = model.predict(X_test)

In [19]:
model.classes_

array(['Omnichannel', 'Product Availability', 'Product Comparison',
       'Product Specifications', 'Returns & Refunds', 'Sales/Promotions',
       'Shipping'], dtype='<U22')

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[128,   0,   0,   0,   0,   0,   0],
       [  0, 252,   0,   5,   0,   5,   0],
       [  0,   0, 223,   2,   0,   0,   0],
       [  0,   1,   6, 254,   0,   1,   0],
       [  0,   0,   0,   0, 230,   1,   0],
       [  0,   0,   0,   0,   0, 146,   0],
       [  2,   0,   0,   0,   0,   0, 244]], dtype=int64)

In [21]:
accuracy_score(y_test, pred)

0.9846666666666667

In [28]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

                        precision    recall  f1-score   support

           Omnichannel       1.00      0.98      0.99       130
  Product Availability       0.96      1.00      0.98       253
    Product Comparison       0.99      0.97      0.98       229
Product Specifications       0.97      0.97      0.97       261
     Returns & Refunds       1.00      1.00      1.00       230
      Sales/Promotions       1.00      0.95      0.98       153
              Shipping       0.99      1.00      1.00       244

              accuracy                           0.98      1500
             macro avg       0.99      0.98      0.98      1500
          weighted avg       0.98      0.98      0.98      1500



In [23]:
#save the model
import joblib
joblib.dump(model, 'model_question_topic.pkl', compress=1)

['model_question_topic.pkl']

# Deployment

In [25]:
import joblib
model = joblib.load('model_question_topic.pkl')

In [26]:
question = input()

I was wondering if you'd be able to overnight 


In [27]:
model.predict([question])[0]

'Shipping'