# Graham Bachman - Final Project 
## Women's E-Commerce Clothing Reviews

Predicting Rating and Recommended IND using Review Text

Data source: https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews/

+ **Clothing ID**: Unique ID of the product
+ **Age**: Age of the reviewer
+ **Title**: Title of the review
+ **Review Text**: Body text of the review
+ **Rating**: Product rating by reviewer
+ **Recommended IND**: Whether the product is recommended or not by the reviewer
+ **Positive Feedback Count**: Number of positive feedback on the review
+ **Division Name**: Name of the division product is in
+ **Department Name**: Name of the department product is in
+ **Class Name**: Type of product

# Preprocessing

In [2]:
# Imports, data loading, and removal of the columns not used in this analysis
import pandas as pd
import numpy as np
from pprint import pprint
from time import time
import logging
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Load the raw reviews CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../Homework/Womens Clothing E-Commerce Reviews.csv', sep=',')
print("Removed columns that won't be used in the analysis.")

# Only the review text and the two target columns are needed downstream.
unused_columns = [
    'Clothing ID',
    'Age',
    'Title',
    'Positive Feedback Count',
    'Division Name',
    'Department Name',
    'Class Name',
]
df = df.drop(columns=unused_columns)
df.head()

ModuleNotFoundError: No module named 'spacy'

In [2]:
print('Check for null values')
print(df.isnull().sum())
print()

# Rows that contain at least one missing value. The share of such rows is
# computed from the data instead of a hard-coded count, so the report stays
# correct if the input file changes or the cell is re-run.
df1 = df[df.isna().any(axis=1)]
null_fraction = len(df1) / len(df)

print('Check how many entries in the dataset.')
print(round(null_fraction, 3))
print('{:.1%} is a small percentage of the total entries, so I would like to remove the null entries outright'.format(null_fraction))
print()
print('Double check the balance of distribution in the rows I will be removing.')
print(df1['Rating'].value_counts())
print(df1['Recommended IND'].value_counts())
print('Seems like the entries I will be removing are not behaving differently than the rest of the data, so safe to drop them')

# Drop every row with a missing value, then confirm no nulls remain.
df = df.dropna()
print()
print(df.isnull().sum())

Check for null values
Review Text        845
Rating               0
Recommended IND      0
dtype: int64

Check how many entries in the dataset.
0.036
3.7 is a small percentage of the total entries, so I would like to remove the null entries outright

Doube check the balance of distribution in the rows I will be removing.
5    591
4    169
3     48
1     21
2     16
Name: Rating, dtype: int64
1    774
0     71
Name: Recommended IND, dtype: int64
Seems like the entries I will be removing are not behaving differently than the rest of the data, so safe to drop them

Review Text        0
Rating             0
Recommended IND    0
dtype: int64


# Splitting the Data

In [4]:
# Features are the review strings; the target is the recommendation flag.
X = df['Review Text']
y = df['Recommended IND']

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=2020,
)

# Stand-alone TF-IDF fit on the training text (the model cells build their own vectorizers).
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

print('Training Data Shape:', X_train.shape)
print('Testing Data Shape: ', X_test.shape)

Training Data Shape: (18112,)
Testing Data Shape:  (4529,)


# Predict Recommended IND using Review Text

In [None]:
# Custom stop-word list handed to TfidfVectorizer(stop_words=...) in the model cells.
# NOTE: this rebinds `stopwords`, which the imports cell bound to nltk.corpus.stopwords.
stopwords = (
    'a about an and are as at be been but by can '
    'even ever for from get had has have he her hers his '
    'how i if in into is it its just me my of on or '
    'see seen she so than that the their there they this '
    'to was we were what when which who will with you'
).split()

In [None]:
# Logistic Regression
# TF-IDF features (custom stop words removed) feeding a logistic-regression classifier.
text_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('lr', LogisticRegression()),
])

text_lr.fit(X_train, y_train)

# Predict once and reuse the result for both metrics instead of re-running
# the whole pipeline on the test set for each one.
lr_pred = text_lr.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y_test, lr_pred))
print('F1 Score: ', metrics.f1_score(y_test, lr_pred))

In [None]:
# Naive Bayes
# TF-IDF features (custom stop words removed) feeding a multinomial naive Bayes classifier.
text_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('nb', MultinomialNB()),
])

text_nb.fit(X_train, y_train)

# Predict once and reuse the result for both metrics instead of re-running
# the whole pipeline on the test set for each one.
nb_pred = text_nb.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y_test, nb_pred))
print('F1 Score: ', metrics.f1_score(y_test, nb_pred))

In [None]:
# Linear SVC
# TF-IDF features (custom stop words removed) feeding a linear support-vector classifier.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC()),
])

text_clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics instead of re-running
# the whole pipeline on the test set for each one.
clf_pred = text_clf.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y_test, clf_pred))
print('F1 Score: ', metrics.f1_score(y_test, clf_pred))

In [None]:
# Random Forest
# TF-IDF features (custom stop words removed) feeding a random-forest classifier.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same forest and therefore the same scores.
text_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('rf', RandomForestClassifier(random_state=2020)),
])

text_rf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics instead of re-running
# the whole pipeline on the test set for each one.
rf_pred = text_rf.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y_test, rf_pred))
print('F1 Score: ', metrics.f1_score(y_test, rf_pred))

In [None]:
# Written conclusion for the Recommended IND task.
# NOTE: the figures in this message are hard-coded, not computed from the fitted
# models; update them by hand if a re-run prints different scores.
recommended_ind_conclusion = (
    'The best model is the Logistic Regression, it had the highest accuracy at 0.8907 '
    'and the best F1 Score at 0.902.\n'
    'Logistic Regression is my recommendation.'
)
print(recommended_ind_conclusion)

# Predict Rating using Review Text

In [5]:
# Target for the second task: the Rating column.
y1 = df['Rating']

# Same 80/20 settings and seed as the first split, but stratified on Rating.
# NOTE: this rebinds X_train / X_test; they now pair with y1_train / y1_test,
# not with the earlier y_train / y_test.
X_train, X_test, y1_train, y1_test = train_test_split(
    X,
    y1,
    test_size=0.20,
    stratify=y1,
    random_state=2020,
)

In [1]:
# Logistic Regression
# TF-IDF features feeding a logistic-regression classifier for Rating.
# The vectorizer uses the same custom stop-word list as the other three rating
# models so that the accuracy comparison between them is like-for-like.
text_lr_1 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('lr', LogisticRegression()),
])

text_lr_1.fit(X_train, y1_train)
lr_rating_pred = text_lr_1.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y1_test, lr_rating_pred))

NameError: name 'Pipeline' is not defined

In [None]:
# Naive Bayes
# TF-IDF (custom stop words removed) -> multinomial naive Bayes, predicting Rating.
nb_rating_steps = [
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('nb', MultinomialNB()),
]
text_nb_1 = Pipeline(nb_rating_steps)

text_nb_1.fit(X_train, y1_train)
nb_rating_pred = text_nb_1.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y1_test, nb_rating_pred))

In [None]:
# Linear SVC
# TF-IDF (custom stop words removed) -> linear support-vector classifier, predicting Rating.
svc_rating_steps = [
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC()),
]
text_clf_1 = Pipeline(svc_rating_steps)

text_clf_1.fit(X_train, y1_train)
svc_rating_pred = text_clf_1.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y1_test, svc_rating_pred))

In [None]:
# Random Forest
# TF-IDF (custom stop words removed) -> random-forest classifier, predicting Rating.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook reproduces the same forest and therefore the same accuracy.
text_rf_1 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('rf', RandomForestClassifier(random_state=2020)),
])

text_rf_1.fit(X_train, y1_train)
rf_rating_pred = text_rf_1.predict(X_test)
print('Model Accuracy: ', metrics.accuracy_score(y1_test, rf_rating_pred))

In [None]:
# Written conclusion for the Rating task.
# NOTE: the accuracy in this message is hard-coded, not computed from the fitted
# models; update it by hand if a re-run prints a different score.
rating_conclusion = (
    'The best model is the Logistic Regression, it had the highest accuracy at 0.63568.\n'
    'Logistic Regression is my recommendation.'
)
print(rating_conclusion)

# Topic Models

In [None]:
# Preview the first rows only rather than rendering the whole frame.
df.head()

In [None]:
# Removing Stop Words from the Review Text Column
# The import is aliased so it does not rebind `stopwords`, the custom list that
# the model cells pass to TfidfVectorizer(stop_words=...).
from nltk.corpus import stopwords as nltk_stopwords

stop = nltk_stopwords.words('english')
stop_lookup = set(stop)  # set membership is constant-time per token

# Lower-case and whitespace-tokenise each review, drop the stop words, then
# join the surviving tokens back into ONE STRING PER ROW.
# The vectorizers below need one document per row. The previous
# Series.to_string() / ' '.join(...) steps collapsed the whole column into a
# single string and broadcast it to every row, so every document was identical.
review_tokens = df['Review Text'].str.lower().str.split()
df['Review Text'] = review_tokens.apply(
    lambda words: ' '.join(word for word in words if word not in stop_lookup)
)

In [None]:
# Bag-of-words counts; terms found in more than 90% of the documents or in
# fewer than 3 documents are left out of the vocabulary.
cv = CountVectorizer(min_df=3, max_df=0.90)
dtm = cv.fit_transform(df['Review Text'])

# 10-topic LDA on the count matrix; fit() hands back the fitted estimator,
# which is shown as the cell output.
LDA = LatentDirichletAllocation(random_state=100, n_components=10)
LDA.fit(dtm)

In [None]:
# Build the TF-IDF vectorizer from the CountVectorizer's own parameters so both
# use the same tokenisation and document-frequency cut-offs.
shared_vectorizer_params = cv.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_vectorizer_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(df['Review Text'])

# Second 10-topic LDA, this time fitted on the TF-IDF matrix.
LDA_tfidf = LatentDirichletAllocation(random_state=0, n_components=10)
LDA_tfidf.fit(dtm_tfidf)

In [None]:
# Interactive view of the count-based LDA model, laid out with metric MDS.
lda_counts_vis = pyLDAvis.sklearn.prepare(LDA, dtm, cv, mds='mmds')
lda_counts_vis

In [None]:
# Interactive view of the TF-IDF-based LDA model.
lda_tfidf_vis = pyLDAvis.sklearn.prepare(LDA_tfidf, dtm_tfidf, tfidf_vectorizer)
lda_tfidf_vis