In [133]:
import pandas as pd
import numpy as np
import seaborn as sns
import string

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Load the raw review table from disk and preview its first five rows.
dat = pd.read_csv(filepath_or_buffer='../data/data.csv')
dat.head(n=5)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [70]:
# Drop every leftover index column, i.e. every column whose name starts with
# 'Unnamed'. The preview of `dat` lists two of them ('Unnamed: 0.1' and
# 'Unnamed: 0'), so dropping only 'Unnamed: 0' would leave one behind.
# Selecting by prefix also avoids a KeyError if a column is already absent.
dat1 = dat.drop(columns=[col for col in dat.columns if str(col).startswith('Unnamed')])

In [71]:
# Keep only the rows in which no column is missing.
cleandat = dat1.dropna(axis=0, how='any')

In [72]:
# Build two views of the cleaned data, each with a fresh 0..n-1 index:
#   dat_no_rat - without 'Rating' (keeps 'Recommended IND')
#   dat_no_Rec - without 'Recommended IND' (keeps 'Rating')
dat_no_rat = cleandat.drop(columns=['Rating']).reset_index(drop=True)
dat_no_Rec = cleandat.drop(columns=['Recommended IND']).reset_index(drop=True)

In [116]:
# Number of rows left after dropping incomplete records.
print("row number:", len(dat_no_rat))

row number: 19662


In [53]:
# Per-class mean of the numeric columns. numeric_only=True is required because
# the frame still holds text columns (Title, Review Text, ...); without it,
# pandas >= 2.0 raises a TypeError instead of silently skipping them.
dat_no_rat.groupby('Recommended IND').mean(numeric_only=True).head()

Unnamed: 0_level_0,Clothing ID,Age,Positive Feedback Count
Recommended IND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,926.797203,42.34014,3.388252
1,920.07503,43.465407,2.488966


In [73]:
# Two free-text columns are available as features: Title and Review Text.
# The target for the first experiments is Recommended IND.

X1 = dat_no_rat.loc[:, 'Title']
X2 = dat_no_rat.loc[:, 'Review Text']
y = dat_no_rat.loc[:, 'Recommended IND']

In [80]:
# Show the first review (index label 0) to see what the raw text looks like.
X2.loc[0]

'I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c'

In [100]:
#remove punctuation and stopwords
def preprocess(review, stop_words=None):
    """Tokenize a text: strip punctuation, split on whitespace, drop stopwords.

    Parameters
    ----------
    review : str
        Raw text of one title or review.
    stop_words : collection of str, optional
        Lower-cased words to discard. When omitted, NLTK's English stopword
        list is used. It is built once and cached on the function rather than
        being rebuilt for every token.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing.
    """
    if stop_words is None:
        stop_words = getattr(preprocess, "_english_stopwords", None)
        if stop_words is None:
            # A frozenset gives constant-time membership tests.
            stop_words = frozenset(stopwords.words('english'))
            preprocess._english_stopwords = stop_words
    # Remove every ASCII punctuation character before tokenizing.
    no_punct = ''.join(ch for ch in review if ch not in string.punctuation)
    # The stopword test is case-insensitive, but tokens keep their casing.
    return [word for word in no_punct.split() if word.lower() not in stop_words]

In [102]:
# Tokens produced by preprocess() for the first review.
preprocess(X2.loc[0])

['high',
 'hopes',
 'dress',
 'really',
 'wanted',
 'work',
 'initially',
 'ordered',
 'petite',
 'small',
 'usual',
 'size',
 'found',
 'outrageously',
 'small',
 'small',
 'fact',
 'could',
 'zip',
 'reordered',
 'petite',
 'medium',
 'ok',
 'overall',
 'top',
 'half',
 'comfortable',
 'fit',
 'nicely',
 'bottom',
 'half',
 'tight',
 'layer',
 'several',
 'somewhat',
 'cheap',
 'net',
 'layers',
 'imo',
 'major',
 'design',
 'flaw',
 'net',
 'layer',
 'sewn',
 'directly',
 'zipper',
 'c']

In [107]:
# Learn the Title vocabulary for a matrix of token counts,
# using preprocess() as the analyzer.
bag1 = CountVectorizer(analyzer=preprocess)
bag1.fit(X1)

In [117]:
# Learn the Review Text vocabulary, using preprocess() as the analyzer.
bag2 = CountVectorizer(analyzer=preprocess)
bag2.fit(X2)

In [118]:
# Vocabulary size learned from each text column.
print(
    'number of words in Title',
    len(bag1.vocabulary_),
)
print(
    'number of words in Review',
    len(bag2.vocabulary_),
)

number of words in Title 4954
number of words in Review 18745


In [149]:
# Turn each text column into its token-count matrix:
# bag1 -> Title, bag2 -> Review Text.
bag1_vec, bag2_vec = bag1.transform(X1), bag2.transform(X2)

In [150]:
# Hold out 20% of the Title features for testing (fixed seed).
X1_train, X1_test, y_train, y_test = train_test_split(
    bag1_vec,
    y,
    random_state=101,
    test_size=0.2,
)

In [188]:
#learn models(Recommended IND)
# Candidate classifiers trained on the Title token counts.
# random_state is pinned on the stochastic estimators (random forest, MLP) so
# the reported errors are reproducible; 101 matches the seed of the split.
# NOTE(review): max_iter=5 stops the MLP after five iterations, which is likely
# short of convergence - confirm this cap is intentional.
models = {
    "logistic regression" : LogisticRegression(penalty = 'l2'),
    "random forest"       : RandomForestClassifier(random_state=101),
    "neural network"      : MLPClassifier(max_iter=5, alpha=0.01, random_state=101),
    "naive bayes"         : MultinomialNB()
}

for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X1_train, y_train)
    # score() is mean accuracy, so 1 - score is the misclassification rate.
    print("  training error: %f" % (1-model.score(X1_train, y_train)))
    print("  test     error: %f" % (1-model.score(X1_test, y_test)))

Fitting logistic regression...
  training error: 0.092250
  test     error: 0.120010
Fitting random forest...
  training error: 0.033251
  test     error: 0.162471
Fitting neural network...
  training error: 0.069744
  test     error: 0.109840
Fitting naive bayes...
  training error: 0.095111
  test     error: 0.130943




In [173]:
# Hold out 20% of the Review Text features for testing (fixed seed).
X2_train, X2_test, y_train, y_test = train_test_split(
    bag2_vec,
    y,
    random_state=101,
    test_size=0.2,
)

In [174]:
#learn models(Recommended IND)
# Candidate classifiers trained on the Review Text token counts.
# random_state is pinned on the stochastic estimators (random forest, MLP) so
# the reported errors are reproducible; 101 matches the seed of the split.
# NOTE(review): max_iter=5 stops the MLP after five iterations, which is likely
# short of convergence - confirm this cap is intentional.
models = {
    "logistic regression" : LogisticRegression(penalty = 'l2'),
    "random forest"       : RandomForestClassifier(random_state=101),
    "neural network"      : MLPClassifier(max_iter=5, alpha=0.01, random_state=101),
    "naive bayes"         : MultinomialNB()
}

for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X2_train, y_train)
    # score() is mean accuracy, so 1 - score is the misclassification rate.
    print("  training error: %f" % (1-model.score(X2_train, y_train)))
    print("  test     error: %f" % (1-model.score(X2_test, y_test)))

Fitting logistic regression...
  training error: 0.036048
  test     error: 0.113145
Fitting random forest...
  training error: 0.005023
  test     error: 0.143911
Fitting neural network...
  training error: 0.025494
  test     error: 0.110603
Fitting naive bayes...
  training error: 0.073876
  test     error: 0.118739




In [193]:
#function to predict Recommended IND using title
def title_predict(text):
    """Predict the Recommended IND label for a single title.

    The MLP classifier is fitted on the notebook-level ``X1_train`` /
    ``y_train`` the first time the function is called and is then reused.
    Previously a new, unseeded network was trained on every call, which was
    slow and could return different labels for the same input. Re-run this
    cell to discard the cached model after the training data changes.

    Parameters
    ----------
    text : str
        Title text to classify.

    Returns
    -------
    The predicted class label, i.e. the first element of ``model.predict``.
    """
    model = getattr(title_predict, "_model", None)
    if model is None:
        # Seeded so the fitted network is the same across kernel restarts.
        model = MLPClassifier(max_iter=5, alpha=0.01, random_state=101)
        model.fit(X1_train, y_train)
        title_predict._model = model
    # Encode with the vocabulary learned by bag1 (the Title vectorizer).
    a = bag1.transform([text])
    return model.predict(a)[0]

In [234]:
# Check: is the title at index label 0 predicted as class 0?
title_predict(X1.loc[0]) == 0



True

In [235]:
# Check: is the title at index label 1 predicted as class 1?
title_predict(X1.loc[1]) == 1



True

In [203]:
# Classify a hand-written sample with the title model.
text = "I like it"
title_predict(text=text)



1

In [199]:
#function to predict Recommended IND using review
def review_predict(text):
    """Predict the Recommended IND label for a single review text.

    The logistic-regression model is fitted on the notebook-level
    ``X2_train`` / ``y_train`` the first time the function is called and is
    then reused, instead of being retrained on every call. Re-run this cell to
    discard the cached model after the training data changes.

    Parameters
    ----------
    text : str
        Review text to classify.

    Returns
    -------
    The predicted class label, i.e. the first element of ``model.predict``.
    """
    model = getattr(review_predict, "_model", None)
    if model is None:
        model = LogisticRegression(penalty = 'l2')
        model.fit(X2_train, y_train)
        review_predict._model = model
    # Encode with the vocabulary learned by bag2 (the Review Text vectorizer).
    a = bag2.transform([text])
    return model.predict(a)[0]

In [236]:
# Check: is the review at index label 0 predicted as class 0?
review_predict(X2.loc[0]) == 0

True

In [237]:
# Check: is the review at index label 1 predicted as class 1?
review_predict(X2.loc[1]) == 1

True

In [202]:
# Classify a hand-written sample with the review model.
text = "I like it"
review_predict(text=text)

1

In [204]:
# Second target: the Rating column, taken from the frame that still has it.
y1 = dat_no_Rec.loc[:, 'Rating']

In [206]:
# Hold out 20% of the Title features, now paired with the Rating target.
X3_train, X3_test, y1_train, y1_test = train_test_split(
    bag1_vec,
    y1,
    random_state=101,
    test_size=0.2,
)


In [230]:
#learn models(Ratings)
# Candidate classifiers for Rating, trained on the Title token counts.
# random_state is pinned on the stochastic estimators (random forest, MLP) so
# the reported errors are reproducible; 101 matches the seed of the split.
# NOTE(review): max_depth=1 limits every tree to a single split and max_iter=5
# stops the MLP after five iterations - confirm both caps are intentional.
models = {
    "logistic regression" : LogisticRegression(),
    "random forest"       : RandomForestClassifier(max_depth=1, n_estimators=100, random_state=101),
    "neural network"      : MLPClassifier(max_iter=5, alpha=0.01, random_state=101),
    "naive bayes"         : MultinomialNB()
}

for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X3_train, y1_train)
    # score() is mean accuracy, so 1 - score is the misclassification rate.
    print("  training error: %f" % (1-model.score(X3_train, y1_train)))
    print("  test     error: %f" % (1-model.score(X3_test, y1_test)))

Fitting logistic regression...
  training error: 0.301354
  test     error: 0.377320
Fitting random forest...
  training error: 0.449043
  test     error: 0.442665
Fitting neural network...
  training error: 0.259012
  test     error: 0.374015
Fitting naive bayes...
  training error: 0.302880
  test     error: 0.384694




In [231]:
# Hold out 20% of the Review Text features, paired with the Rating target.
X4_train, X4_test, y1_train, y1_test = train_test_split(
    bag2_vec,
    y1,
    random_state=101,
    test_size=0.2,
)

In [232]:
#learn models(Ratings)
# Candidate classifiers for Rating, trained on the Review Text token counts.
# random_state is pinned on the stochastic estimators (random forest, MLP) so
# the reported errors are reproducible; 101 matches the seed of the split.
# NOTE(review): max_depth=1 limits every tree to a single split and max_iter=5
# stops the MLP after five iterations - confirm both caps are intentional.
models = {
    "logistic regression" : LogisticRegression(),
    "random forest"       : RandomForestClassifier(max_depth=1, n_estimators=100, random_state=101),
    "neural network"      : MLPClassifier(max_iter=5, alpha=0.01, random_state=101),
    "naive bayes"         : MultinomialNB()
}

for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X4_train, y1_train)
    # score() is mean accuracy, so 1 - score is the misclassification rate.
    print("  training error: %f" % (1-model.score(X4_train, y1_train)))
    print("  test     error: %f" % (1-model.score(X4_test, y1_test)))

Fitting logistic regression...
  training error: 0.118825
  test     error: 0.380880
Fitting random forest...
  training error: 0.449043
  test     error: 0.442665
Fitting neural network...
  training error: 0.077882
  test     error: 0.378337
Fitting naive bayes...
  training error: 0.251128
  test     error: 0.371218




In [233]:
# Make a prediction: fit logistic regression on the review/Rating training
# split and predict the rating of the first review.
text = X2.loc[0]
a = bag2.transform([text])
model = LogisticRegression(penalty='l2').fit(X4_train, y1_train)
model.predict(a)[0]

3