In [272]:
import pandas as pd
import numpy as np
import seaborn as sns
import string

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

In [400]:
# Read the review table from the project's GitHub repository
DATA_URL = 'https://raw.githubusercontent.com/fyang95/cloth_reviews_analysis/master/data/data.csv'
dat = pd.read_csv(DATA_URL)
dat.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [274]:
# Discard the 'Unnamed: 0' column
dat1 = dat.drop(columns=['Unnamed: 0'])

In [275]:
# Keep only the rows that have no missing value in any column
cleandat = dat1.dropna(axis=0, how='any')

In [276]:
# Build two frames from the cleaned data: one without Rating,
# one without Recommended IND (both with a fresh 0..n-1 index)
dat_no_rat = cleandat.drop(columns=['Rating']).reset_index(drop=True)
dat_no_Rec = cleandat.drop(columns=['Recommended IND']).reset_index(drop=True)

In [277]:
# Report the number of rows of each derived frame (dat_no_rat first, then dat_no_Rec)
for frame in (dat_no_rat, dat_no_Rec):
    print("row number:", len(frame))

row number: 19662
row number: 19662


In [278]:
dat_no_rat.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,0,0,General,Dresses,Dresses
1,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",1,0,General Petite,Bottoms,Pants
2,847,47,Flattering shirt,This shirt is very flattering to all due to th...,1,6,General,Tops,Blouses
3,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",0,4,General,Dresses,Dresses
4,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,1,1,General Petite,Tops,Knits


In [279]:
# There are two text columns (Title and Review Text);
# the target to predict is Recommended IND.
X1, X2, y = (dat_no_rat[col] for col in ('Title', 'Review Text', 'Recommended IND'))

In [280]:
# have a look on the text
X2[0]

'I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c'

In [281]:
#remove punctuation and stopwords
_STOPWORDS_CACHE = {}  # language -> frozenset of stop words, filled on first use

def preprocess(review, stop_words=None):
    """Tokenize a review: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    review : str
        Raw text of one title or review.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached for later calls.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing
        (only the stop-word comparison is case-insensitive).
    """
    if stop_words is None:
        # The list used to be re-read from the corpus and scanned linearly for
        # every single token; load it once and keep it as a set instead.
        if 'english' not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']
    # Punctuation is removed character by character *before* splitting, so
    # "it's" becomes the single token "its".
    nopunc = ''.join(ch for ch in review if ch not in string.punctuation)
    return [tok for tok in nopunc.split() if tok.lower() not in stop_words]

In [282]:
preprocess(X2[0])

['high',
 'hopes',
 'dress',
 'really',
 'wanted',
 'work',
 'initially',
 'ordered',
 'petite',
 'small',
 'usual',
 'size',
 'found',
 'outrageously',
 'small',
 'small',
 'fact',
 'could',
 'zip',
 'reordered',
 'petite',
 'medium',
 'ok',
 'overall',
 'top',
 'half',
 'comfortable',
 'fit',
 'nicely',
 'bottom',
 'half',
 'tight',
 'layer',
 'several',
 'somewhat',
 'cheap',
 'net',
 'layers',
 'imo',
 'major',
 'design',
 'flaw',
 'net',
 'layer',
 'sewn',
 'directly',
 'zipper',
 'c']

In [401]:
#tfidf_vectorizer = TfidfVectorizer(analyzer = preprocess)

#tfbag2 = tfidf_vectorizer.fit(X2)

In [283]:
# Convert the titles into a matrix of token counts: learn the Title vocabulary,
# tokenizing each title with preprocess
title_vectorizer = CountVectorizer(analyzer=preprocess)
bag1 = title_vectorizer.fit(X1)

In [284]:
# Learn the Review Text vocabulary, tokenizing each review with preprocess
review_vectorizer = CountVectorizer(analyzer=preprocess)
bag2 = review_vectorizer.fit(X2)

In [285]:
# Overview: vocabulary size learned from the titles and from the reviews
for label, vectorizer in (('number of words in Title', bag1),
                          ('number of words in Review', bag2)):
    print(label, len(vectorizer.vocabulary_))

number of words in Title 4954
number of words in Review 18745


In [286]:
# Encode every document as a vector of token counts
# (bag1 -> Title, bag2 -> Review Text)
bag1_vec, bag2_vec = bag1.transform(X1), bag2.transform(X2)

In [287]:
# Hold out 20% of the title vectors for testing (seeded, so the split is repeatable)
title_split = train_test_split(bag1_vec, y, test_size=0.2, random_state=101)
X1_train, X1_test, y_train, y_test = title_split

In [288]:
#learn models(Recommended IND) from the title vectors
# random_state is fixed on the stochastic learners (random forest, MLP) so that
# re-running the notebook reproduces the same error figures.
# NOTE(review): max_iter=5 presumably stops the MLP before it converges -- confirm
# that this early stop is intended.
models = {
    "logistic regression" : LogisticRegression(penalty = 'l2'),
    "random forest"       : RandomForestClassifier(random_state = 101),
    "neural network"      : MLPClassifier(max_iter=5,alpha=0.01,random_state = 101),
    "naive bayes"         : MultinomialNB()
}

# Fit each candidate and report 1 - score on the training and on the test split.
for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X1_train, y_train)
    print("  training error: %f" % (1-model.score(X1_train, y_train)))
    print("  test     error: %f" % (1-model.score(X1_test, y_test)))

Fitting logistic regression...
  training error: 0.092250
  test     error: 0.120010
Fitting random forest...
  training error: 0.031852
  test     error: 0.164760
Fitting neural network...
  training error: 0.070825
  test     error: 0.112382
Fitting naive bayes...
  training error: 0.095111
  test     error: 0.130943




In [289]:
# Hold out 20% of the review vectors for testing (seeded, so the split is repeatable)
review_split = train_test_split(bag2_vec, y, test_size=0.2, random_state=101)
X2_train, X2_test, y_train, y_test = review_split

In [290]:
#learn models(Recommended IND) from the review vectors
# random_state is fixed on the stochastic learners (random forest, MLP) so that
# re-running the notebook reproduces the same error figures.
# NOTE(review): max_iter=5 presumably stops the MLP before it converges -- confirm
# that this early stop is intended.
models = {
    "logistic regression" : LogisticRegression(penalty = 'l2'),
    "random forest"       : RandomForestClassifier(random_state = 101),
    "neural network"      : MLPClassifier(max_iter=5,alpha=0.01,random_state = 101),
    "naive bayes"         : MultinomialNB()
}

# Fit each candidate and report 1 - score on the training and on the test split.
for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X2_train, y_train)
    print("  training error: %f" % (1-model.score(X2_train, y_train)))
    print("  test     error: %f" % (1-model.score(X2_test, y_test)))

Fitting logistic regression...
  training error: 0.036048
  test     error: 0.113145
Fitting random forest...
  training error: 0.004005
  test     error: 0.144928
Fitting neural network...
  training error: 0.023015
  test     error: 0.111620
Fitting naive bayes...
  training error: 0.073876
  test     error: 0.118739




In [387]:
#function to predict Recommended IND using title
_rec_title_model = None  # fitted lazily on the first call, then reused

def Rec_by_title(text):
    """Predict Recommended IND for a single title string.

    The classifier is trained on (X1_train, y_train) the first time the
    function is called and then cached; later calls only vectorize `text`
    with `bag1` and predict. Re-run this cell to discard the cached model
    after the training data has changed.

    Parameters
    ----------
    text : str
        Raw title text.

    Returns
    -------
    The predicted Recommended IND label for `text`.
    """
    global _rec_title_model
    if _rec_title_model is None:
        # The MLP is stochastic: without a fixed random_state (and with a
        # refit on every call) the same title could get a different label
        # each time the function is invoked.
        clf = MLPClassifier(max_iter=5, alpha=0.01, random_state=101)
        _rec_title_model = clf.fit(X1_train, y_train)
    a = bag1.transform([text])
    return _rec_title_model.predict(a)[0]

In [390]:
#check our result
Rec_by_title(X1[0]) == 0



True

In [293]:
#check our result
Rec_by_title(X1[1]) == 1



True

In [294]:
# Try the title model on a hand-written sample text
text = "I like it"
Rec_by_title(text)



1

In [388]:
#function to predict Recommended IND using review
_rec_review_model = None  # fitted lazily on the first call, then reused

def Rec_by_review(text):
    """Predict Recommended IND for a single review string.

    The logistic regression is trained on (X2_train, y_train) the first time
    the function is called and then cached, instead of being refitted for
    every prediction. Re-run this cell to discard the cached model after the
    training data has changed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    The predicted Recommended IND label for `text`.
    """
    global _rec_review_model
    if _rec_review_model is None:
        _rec_review_model = LogisticRegression(penalty = 'l2').fit(X2_train, y_train)
    a = bag2.transform([text])
    return _rec_review_model.predict(a)[0]

In [296]:
Rec_by_review(X2[0]) ==0

True

In [297]:
Rec_by_review(X2[1])==1

True

In [298]:
# Try the review model on a hand-written sample text
text = "I like it"
Rec_by_review(text)

1

In [299]:
# Second target: the Rating column (taken from the frame that still has it)
y1 = dat_no_Rec.loc[:, 'Rating']

In [300]:
# Hold out 20% of the title vectors, this time paired with the Rating target
title_rating_split = train_test_split(bag1_vec, y1, test_size=0.2, random_state=101)
X3_train, X3_test, y1_train, y1_test = title_rating_split


In [301]:
#learn models(Ratings) from the title vectors
# random_state is fixed on the stochastic learners (random forest, MLP) so that
# re-running the notebook reproduces the same error figures.
# NOTE(review): max_depth=1 limits every tree of the forest to a single split --
# presumably too shallow for the Rating target; confirm this setting is intended.
models = {
    "logistic regression" : LogisticRegression(),
    "random forest"       : RandomForestClassifier(max_depth=1,n_estimators=100,random_state=101),
    "neural network"      : MLPClassifier(max_iter=5,alpha=0.01,random_state=101),
    "naive bayes"         : MultinomialNB()
}

# Fit each candidate and report 1 - score on the training and on the test split.
for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X3_train, y1_train)
    print("  training error: %f" % (1-model.score(X3_train, y1_train)))
    print("  test     error: %f" % (1-model.score(X3_test, y1_test)))

Fitting logistic regression...
  training error: 0.301354
  test     error: 0.377320
Fitting random forest...
  training error: 0.449043
  test     error: 0.442665
Fitting neural network...
  training error: 0.261364
  test     error: 0.377066
Fitting naive bayes...
  training error: 0.302880
  test     error: 0.384694




In [389]:
#predict Rating by title
_rat_title_model = None  # fitted lazily on the first call, then reused

def Rat_by_title(text):
    """Predict the Rating for a single title string.

    The logistic regression is trained on (X3_train, y1_train) the first time
    the function is called and then cached, instead of being refitted for
    every prediction. Re-run this cell to discard the cached model after the
    training data has changed.

    Parameters
    ----------
    text : str
        Raw title text.

    Returns
    -------
    The predicted Rating label for `text`.
    """
    global _rat_title_model
    if _rat_title_model is None:
        _rat_title_model = LogisticRegression(penalty = 'l2').fit(X3_train, y1_train)
    a = bag1.transform([text])
    return _rat_title_model.predict(a)[0]

In [303]:
Rat_by_title(X1[0])
# X1 comes from dat_no_rat (NA rows dropped, index reset), so X1[0] is the
# "Some major design flaws" title, whose Rating in dat.head() is 3 -- not the
# rating 4 of the first raw row, which has no Title and was dropped.
# The expected value here is therefore 3.

3

In [304]:
Rat_by_title(X1[1])
# This is correct

5

In [305]:
# Hold out 20% of the review vectors, paired with the Rating target
review_rating_split = train_test_split(bag2_vec, y1, test_size=0.2, random_state=101)
X4_train, X4_test, y1_train, y1_test = review_rating_split

In [306]:
#learn models(Ratings) from the review vectors
# random_state is fixed on the stochastic learners (random forest, MLP) so that
# re-running the notebook reproduces the same error figures.
# NOTE(review): max_depth=1 limits every tree of the forest to a single split --
# presumably too shallow for the Rating target; confirm this setting is intended.
models = {
    "logistic regression" : LogisticRegression(),
    "random forest"       : RandomForestClassifier(max_depth=1,n_estimators=100,random_state=101),
    "neural network"      : MLPClassifier(max_iter=5,alpha=0.01,random_state=101),
    "naive bayes"         : MultinomialNB()
}

# Fit each candidate and report 1 - score on the training and on the test split.
for name, model in models.items():
    print("Fitting %s..." % name)
    model.fit(X4_train, y1_train)
    print("  training error: %f" % (1-model.score(X4_train, y1_train)))
    print("  test     error: %f" % (1-model.score(X4_test, y1_test)))

Fitting logistic regression...
  training error: 0.118825
  test     error: 0.380880
Fitting random forest...
  training error: 0.449043
  test     error: 0.442665
Fitting neural network...
  training error: 0.079471
  test     error: 0.376049
Fitting naive bayes...
  training error: 0.251128
  test     error: 0.371218




In [391]:
#predict Rating by review
_rat_review_model = None  # fitted lazily on the first call, then reused

def Rat_by_review(text):
    """Predict the Rating for a single review string.

    The logistic regression is trained on (X4_train, y1_train) the first time
    the function is called and then cached, instead of being refitted for
    every prediction. Re-run this cell to discard the cached model after the
    training data has changed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    The predicted Rating label for `text`.
    """
    global _rat_review_model
    if _rat_review_model is None:
        _rat_review_model = LogisticRegression(penalty = 'l2').fit(X4_train, y1_train)
    a = bag2.transform([text])
    return _rat_review_model.predict(a)[0]

In [308]:
Rat_by_review(X2[0])
# X2 comes from dat_no_rat (NA rows dropped, index reset), so X2[0] is the
# "I had such high hopes for this dress..." review, whose Rating in dat.head()
# is 3 -- not the rating 4 of the first raw row, which has no Title and was
# dropped. The expected value here is therefore 3.

3

In [309]:
Rat_by_review(X2[1])
# the result is correct

5

## Summary: trying the models on sample Amazon reviews

In [372]:
# Samples randomly picked from Amazon, one (title, review, rating) triple each
title1, review1, rating1 = (
    "Five stars--just be aware these shirts shrink down a size.",
    "First thing you need to know is that these shirts absolutely fit small. I ordered larges and they shrunk to a loose medium size. If you are ordering these to fit to size, I would definitely suggest getting a larger size if you're going to put these in the drier or wash in hot water.",
    5,
)

title2, review2, rating2 = (
    "4/5 ain’t bad",
    "Unfortunately one of the shirts came with a hole/pull in the back, the rest are great, they fit true to size, and are comfortable",
    3,
)

title3, review3, rating3 = (
    "Wording not the same as advertised",
    'I ordered larger so the fit is okay, but it was supposed to say "BREZZY TO BLACK VIBES" and mine says "1.BREESV TO BLACK VIBES"',
    1,
)

### for the first item

In [373]:
Rec_by_title(title1)



1

In [374]:
Rec_by_review(review1)

1

In [375]:
Rat_by_title(title1)

5

In [377]:
Rat_by_review(review1)

5

### for the second item

In [392]:
Rec_by_title(title2)



0

In [393]:
Rec_by_review(review2)

1

In [394]:
Rat_by_title(title2)

3

In [395]:
Rat_by_review(review2)

5

### for the third item

In [396]:
Rec_by_title(title3)



0

In [397]:
Rec_by_review(review3)

1

In [398]:
Rat_by_title(title3)

3

In [399]:
Rat_by_review(review3)

5