## Platt scaling for posterior probabilities in Linear Support Vector Classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import pandas_profiling
%matplotlib inline

In [2]:
# Load the review dataset; the relative path is resolved against the current
# working directory, so the CSV must sit next to the notebook (or the kernel's cwd).
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
# Preview the first five rows to check the columns that were parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
import re

In [4]:
# Number of most frequent words of interest (not used within this cell).
top_N = 100

# Join every review into one lower-cased string; Series.str.cat omits missing
# values when no `na_rep` is given, so empty reviews are skipped.
a = df['Review Text'].str.lower().str.cat(sep=' ')

# Replace every run of non-letter characters (punctuation, digits, whitespace)
# with a single space, leaving only ASCII letters separated by spaces.
b = re.sub('[^A-Za-z]+', ' ', a)

# Build the stop-word list from both the `stop_words` package and NLTK.
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)
# The combined list contains duplicates and list membership is a linear scan;
# a set gives constant-time lookups while filtering the whole corpus.
stop_word_set = set(stop_words)

# Tokenise once and drop the stop words in a single pass (the original built
# this list twice, discarding the first result).
word_tokens = word_tokenize(b)
filtered_sentence = [w for w in word_tokens if w not in stop_word_set]

# Keep only words longer than two characters (drops 1- and 2-letter tokens).
without_single_chr = [word for word in filtered_sentence if len(word) > 2]

# Drop purely numeric tokens. The regex above already removed all digits, so
# this is a safety net rather than an active filter.
cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]

In [5]:
# Discard every row that has a missing value in any column.
# NOTE(review): how='any' also removes reviews whose only gap is another column
# (the head() preview shows rows with an empty Title) — confirm this is intended.
df = df.dropna(axis=0, how='any')

# Keep only the two extreme ratings so the task becomes binary (1 vs 5).
is_extreme = df['Rating'].isin([1, 5])
rating_class = df[is_extreme]

# Features are the raw review texts; the target is the star rating.
X_review = rating_class['Review Text']
y = rating_class['Rating']

In [6]:
import string
def text_process(review, stop_words=None):
    """Tokenise one review: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    review : str
        Raw review text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (the original behaviour).

    Returns
    -------
    list of str
        The remaining words in their original order and original casing;
        only the stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call. The original re-read the NLTK list for
    # every single word, which dominated the runtime of vectorisation.
    stop_set = frozenset(stop_words)
    punctuation = frozenset(string.punctuation)

    # Remove punctuation character by character, then rebuild the string so
    # that e.g. "don't" becomes "dont" before splitting.
    nopunc = ''.join(ch for ch in review if ch not in punctuation)
    return [word for word in nopunc.split() if word.lower() not in stop_set]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Learn the TF-IDF vocabulary and vectorise the reviews in a single pass.
# fit_transform runs the (slow) text_process analyzer once per review instead
# of twice, as the separate fit + transform did. Reading from `rating_class`
# rather than from `X_review` itself keeps this cell re-runnable: X_review is
# rebound from raw text to a sparse matrix here.
bow_transformer = TfidfVectorizer(analyzer=text_process)
X_review = bow_transformer.fit_transform(rating_class['Review Text'])

In [8]:
# Hold out 30% of the reviews for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X_review, y, test_size=0.3, random_state=101)
from sklearn.svm import SVC
# probability=True enables Platt scaling, which scikit-learn calibrates with an
# internal cross-validation that shuffles the data. Seeding it makes the
# predict_proba values reproducible across Restart & Run All.
svc = SVC(kernel='linear', probability=True, random_state=101)
svc.fit(X_train, y_train)
# Mean accuracy on the training data itself (not a generalisation estimate).
svc.score(X_train, y_train)

0.9909698169223157

In [9]:
# Platt-scaled class probabilities for every held-out review; one row per
# review, one column per class in the order given by svc.classes_.
svc.predict_proba(X_test)

array([[5.93218029e-01, 4.06781971e-01],
       [6.63991133e-06, 9.99993360e-01],
       [2.00763752e-08, 9.99999980e-01],
       ...,
       [3.01416406e-03, 9.96985836e-01],
       [5.82383546e-06, 9.99994176e-01],
       [1.40613952e-06, 9.99998594e-01]])

In [10]:
# Class labels seen during fit, in the column order used by predict_proba.
svc.classes_

array([1, 5], dtype=int64)

In [11]:
# Show the vocabulary terms present in the second held-out review (row 1).
# Slice that single row first so only one document is inverse-transformed,
# rather than the whole test matrix; [1:2] keeps the input two-dimensional.
bow_transformer.inverse_transform(X_test[1:2])[0]

array(['Love', 'comfy', 'fit', 'great', 'jacket', 'love', 'much',
       'traveling', 'way', 'wears'], dtype='<U32')

In [12]:
# According to the linear SVC, the review below has a posterior probability of
# roughly 99.99% of being rated 5.
# NOTE(review): no cell shows how label 7698 was chosen or computes its
# probability, and the tokens displayed for test row 1 do not match this text —
# TODO confirm which row the figure refers to.
df.loc[7698, 'Review Text']

"I love these socks so much! i ordered the beige and red pair with the polar bear design and they are my new favorite socks. these are comfy, with great stretch and just the right amount of cushion. however, the best feature is the cute little frill around the cuff! i want to order more, but i don't care for the other designs as much as the pair i ordered. instead, i will be keeping an eye on this brand to see what they come out with next!"

In [13]:
# Check the claim: vectorise that single review with the fitted TF-IDF
# transformer and ask the classifier for its hard label (expected: 5).
sample_review = df.loc[7698, 'Review Text']
transformed_rating = bow_transformer.transform([sample_review])
svc.predict(transformed_rating)[0]

5

In [14]:
from sklearn.metrics import confusion_matrix, classification_report
# Hard class predictions for the held-out reviews.
predict = svc.predict(X_test)
# Print the confusion matrix (rows: true rating, columns: predicted rating),
# a blank gap, then the per-class precision / recall / F1 report.
print(confusion_matrix(y_test, predict), '\n', classification_report(y_test, predict), sep='\n')

[[  98  101]
 [   8 3258]]


             precision    recall  f1-score   support

          1       0.92      0.49      0.64       199
          5       0.97      1.00      0.98      3266

avg / total       0.97      0.97      0.96      3465

