In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
import pandas as pd
import numpy as np
import scipy
import re
import string

import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt
from wordcloud import WordCloud, STOPWORDS

from sklearn.model_selection import train_test_split as split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer 
from nltk.stem import PorterStemmer, LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
#from nltk.corpus import stopwords
import string
from nltk.tokenize import RegexpTokenizer
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, SimpleRNN

from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore') 

from IPython.display import Image

%matplotlib inline

In [None]:
clothing = pd.read_csv("../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv", index_col=0)
print(clothing.shape)
clothing.head(5)

In [None]:
clothing.info()

In [None]:
clothing.describe()

In [None]:
#  Checking for Missing Values
clothing.isnull().values.any()

In [None]:
clothing.isnull().sum()

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize= (14,5))
sns.distplot(clothing['Age'], hist_kws=dict(edgecolor="k")).set_title("Distribution of Age")

In [None]:
sns.set(rc={'figure.figsize':(11,5)})
plt.hist(clothing.Age, bins=40)
plt.xlabel('Age')
plt.ylabel('Reviews')
plt.title('Number of Reviews per Age');

In [None]:
sns.set(rc={'figure.figsize':(11,6)})
sns.boxplot(x = 'Rating', y = 'Age', data = clothing)
plt.title('Rating Distribution per Age');

In [None]:
z = clothing.groupby(by=['Department Name'],as_index=False).count().sort_values(by='Class Name',ascending=False)

plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x=z['Department Name'],y=z['Class Name'], data=z)
plt.xlabel("Department Name")
plt.ylabel("Count")
plt.title("Counts Vs Department Name")

In [None]:
w = clothing.groupby(by=['Division Name'],as_index=False).count().sort_values(by='Class Name',ascending=False)

plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x=w['Division Name'],y=w['Class Name'], data=w)
plt.xlabel("Division Name")
plt.ylabel("Count")
plt.title("Counts Vs Division Name")

In [None]:
#  The Product Rating Distribution
plt.figure(figsize= (14,5))
ax=sns.countplot(x='Rating', data=clothing)
ax.set_title("Distribution of Ratings", fontsize=14)

# countplot draws numeric categories in ascending order, whereas
# value_counts() sorts by frequency; sort by index so that every count is
# written above the bar it actually belongs to.
x=clothing['Rating'].value_counts().sort_index()

rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    # Centre the count horizontally on the bar, slightly above its top edge.
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

In [None]:
# Bar chart of the number of reviews per rating.
# Take both the x positions and the heights from the same value_counts()
# result: pairing unique() (order of appearance) with value_counts()
# (order of frequency) puts the counts on the wrong ratings.
h = clothing["Rating"].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(h.index, h.values)
ax.set_xlabel("Rating")
ax.set_ylabel("Counts")
ax.set_title("Histogram of Ratings")
ax.grid(True)
# Keep the grid lines behind the bars on this axes ...
ax.set_axisbelow(True)
# ... and keep the notebook-wide default the original cell set.
plt.rcParams['axes.axisbelow'] = True

In [None]:
#  Number of Reviews per Product Category
plt.figure(figsize= (14,5))
ax=sns.countplot(x='Department Name', data=clothing, order = clothing['Department Name'].value_counts().index)
ax.set_title("Reviews per Department", fontsize=14)
ax.set_ylabel("# of Reviews", fontsize=12)
ax.set_xlabel("Department", fontsize=12)

x=clothing['Department Name'].value_counts()

rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

In [None]:
sns.set(rc={'figure.figsize':(11,6)})
sns.boxplot(x = 'Rating', y = 'Age', data = clothing)
plt.title('Rating Distribution per Age');

In [None]:
#  Distribution of Class
ax = plt.subplot2grid((2, 2), (1, 0), colspan=2)
# Name the column through x=/data= instead of passing the Series
# positionally: the meaning of the first positional argument differs
# between seaborn releases.
sns.countplot(x='Class Name', data=clothing, ax=ax)
# Rotate after plotting so the rotation applies to the final tick labels.
plt.xticks(rotation=45)
ax.set_title("Reviews in Each Class");

In [None]:
#  Rate of Recommendations
recommended = clothing[clothing['Recommended IND']==1]
not_recommended = clothing[clothing['Recommended IND']==0]

# Both layers are drawn on the same axes, so they must share one explicit
# category order; left to itself each countplot derives the order from its
# own subset and the overlaid bars may not line up with the tick labels.
class_order = clothing['Class Name'].value_counts().index

ax0 = plt.subplot2grid((2, 2), (1, 0), colspan=2)
sns.countplot(x='Class Name', data=recommended, order=class_order,
              color="red", alpha = 0.8, label = "Recommended", ax=ax0)
sns.countplot(x='Class Name', data=not_recommended, order=class_order,
              color="green", alpha = 0.8, label = "Not Recommended", ax=ax0)
plt.xticks(rotation=45)
ax0.set_title("Recommended Items in Each Class")
ax0.legend();

In [None]:
ax1 = plt.subplot2grid((2, 2), (1, 0), colspan=2)
ax1 = plt.xticks(rotation=45)
ax1 = sns.boxplot(x="Class Name", y="Rating", data=clothing)
ax1 = plt.title('Rating Distribution per Class')

In [None]:
# The Most Popular Item
fig = plt.figure(figsize=(14, 9))
plt.xticks(rotation=45)
plt.xlabel('Item ID')
plt.ylabel('Popularity')
plt.title("Top 50 Popular Items")
clothing['Clothing ID'].value_counts()[:50].plot(kind='bar');

In [None]:
#  Correlation Plot of Department,Division and Class Against Each Other
sns.heatmap(pd.crosstab(clothing['Class Name'], 
        clothing["Department Name"]),
            annot=True,fmt='g', cmap="Pastel2_r")
plt.title("Class Name Count Vs Department Name",fontsize=20,fontweight='bold')
plt.show()

sns.heatmap(pd.crosstab(clothing['Class Name'], clothing["Division Name"]),
            annot=True,fmt='g', cmap="Pastel1")
plt.title("Class Name Count Vs Division Name",fontsize=20,fontweight='bold')

plt.show()

sns.heatmap(pd.crosstab(clothing['Department Name'], clothing["Division Name"]),
            annot=True,fmt='g', cmap="Pastel1_r")
plt.title("Department Name Count Vs Division Name",fontsize=20,fontweight='bold')

plt.show()

In [None]:
#  The Amount of Missing Values per Feature
sns.set(rc={'figure.figsize':(11,4)})
pd.isnull(clothing).sum().plot(kind='bar')
plt.ylabel('Number of Missing Values')
plt.title('Missing Values per Feature');

In [None]:
clothing.dropna(subset=['Review Text'], inplace=True)

In [None]:
#  Building Some WordClouds
def clean_data(text):
    """Normalise a review for the word clouds.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased, and runs of whitespace are collapsed so the words
    come back separated by single spaces.
    """
    lowered = re.sub("[^a-zA-Z]", " ", text).lower()
    return " ".join(lowered.split())

In [None]:
# Stop words for the word clouds: WordCloud's built-in list plus generic
# clothing terms. Fixes relative to the previous literal:
#   * 'pants' and 'jean' were missing a comma, so Python concatenated them
#     into the single, useless entry 'pantsjean';
#   * 'white' is added to match the clothing stop-word list used later in the
#     notebook ('while' is kept so nothing that was filtered before reappears).
CLOUD_STOPWORDS = set(STOPWORDS) | {
    'skirt', 'blouse', 'dress', 'sweater', 'shirt', 'bottom', 'pant', 'pants',
    'jean', 'jeans', 'jacket', 'top', 'dresse', 'material', 'while', 'white',
    'black', 'fabric', 'color', 'order', 'wear',
}
# Original name kept as an alias. create_cloud no longer reads it, because a
# later cell rebinds `stopwords` to the nltk corpus module.
stopwords = CLOUD_STOPWORDS

def create_cloud(rating):
    """Render a word cloud for a collection of cleaned review strings.

    Args:
        rating: iterable of strings (e.g. a pandas Series of reviews); the
            items are joined with spaces into one document.

    Displays a 15x7.5 inch figure without axes; returns None.
    """
    y = ' '.join(rating)
    cloud = WordCloud(background_color='white', width=1600, height=800,
                      max_words=100, stopwords=CLOUD_STOPWORDS).generate(y)
    plt.figure(figsize=(15,7.5))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()

In [None]:
#  Rating = 1 Top Words
rating1 = clothing[clothing['Rating']==1]['Review Text'].apply(clean_data)
create_cloud(rating1)

In [None]:
#  Rating = 2 Top Words
rating2 = clothing[clothing['Rating']==2]['Review Text'].apply(clean_data)
create_cloud(rating2)

In [None]:
#  Rating = 3 Top Words
rating3 = clothing[clothing['Rating']==3]['Review Text'].apply(clean_data)
create_cloud(rating3)

In [None]:
#  Rating = 4 Top Words
rating4 = clothing[clothing['Rating']==4]['Review Text'].apply(clean_data)
# Plot the 4-star reviews computed above (the cell used to re-plot rating1).
create_cloud(rating4)

In [None]:
#  Rating = 5 Top Words
rating5 = clothing[clothing['Rating']==5]['Review Text'].apply(clean_data)
# Plot the 5-star reviews computed above (the cell used to re-plot rating1).
create_cloud(rating5)

In [None]:
clothing.loc[(clothing.Rating==1) & (clothing['Recommended IND']==1)]['Review Text'].iloc[1]

In [None]:
clothing.loc[(clothing.Rating==5) & (clothing['Recommended IND']==0)]['Review Text'].iloc[1]

In [None]:
#  Test Features - Preprocessing 
#  Dropping Punctuation
string.punctuation

In [None]:
def punctuation_removal(messy_string):
    """Return `messy_string` without any character found in string.punctuation."""
    kept_chars = []
    for ch in messy_string:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [None]:
clothing['Review Text'] = clothing['Review Text'].apply(punctuation_removal)
clothing['Review Text'].head()

In [None]:
#  The Positiveness or Negativeness of the Reviews are mostly reflected by Verbs & Adjectives
def adj_collector(review_string):
    """Reduce a review to its verbs and adjectives.

    The text is tokenised with word_tokenize and tagged with nltk.pos_tag. A
    token is kept when its tag contains 'VB' or is exactly 'JJ' (so the
    'JJR'/'JJS' tags are not matched). The kept tokens are returned joined by
    single spaces, in their original order.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(review_string))
    kept = [token for token, tag in tagged_tokens if 'VB' in tag or tag == 'JJ']
    return ' '.join(kept)

In [None]:
clothing['Review Text'] = clothing['Review Text'].apply(adj_collector)
clothing['Review Text'].head(7)

In [None]:
#  Stopwords
# Import under an alias: a plain `import stopwords` rebinds the global name
# `stopwords`, which the word-cloud cell assigns and create_cloud reads, so
# re-running any word cloud after this cell would hand WordCloud a module
# instead of a set of words.
from nltk.corpus import stopwords as nltk_stopwords
stop = nltk_stopwords.words('english')
stop.append("i'm")

In [None]:
#  Punctuation Removal of Stopwords
# The review text has already had its punctuation stripped, so the stop words
# get the same treatment before they are compared against it.
stop_words = [punctuation_removal(item) for item in stop]
print(stop_words[::12])

In [None]:
#  Adding Clothing Stopwords
#  Same as the Stopwords we defined in building the WordCloud
clothing_list =['dress', 'top','sweater','shirt', 'blouse', 'pant', 'pants',
               'skirt','material', 'white', 'black', 'bottom', 'jacket',
              'jean', 'jeans', 'fabric', 'color', 'order', 'wear', 'dresse']

In [None]:
def stopwords_removal(messy_str):
    """Tokenise `messy_str` and return its lower-cased tokens as a list,
    leaving out every token found in `stop_words` or `clothing_list`."""
    kept = []
    for token in word_tokenize(messy_str):
        lowered = token.lower()
        if lowered in stop_words or lowered in clothing_list:
            continue
        kept.append(lowered)
    return kept

In [None]:
clothing['Review Text'] = clothing['Review Text'].apply(stopwords_removal)
clothing['Review Text'].head()

In [None]:
print(clothing['Review Text'][762]) 

In [None]:
print(clothing['Review Text'][1033])

In [None]:
#  Removing All Numbers Including Size, Weight etc.
def drop_numbers(list_text):
    """Join the tokens of `list_text` that contain no digit.

    Args:
        list_text: iterable of string tokens.

    Returns:
        The tokens without any digit character, in their original order,
        joined by single spaces ('' when nothing is left).
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence,
    # which recent Python versions warn about.
    return ' '.join(token for token in list_text if not re.search(r'\d', token))

In [None]:
clothing['Review Text'] = clothing['Review Text'].apply(drop_numbers)
clothing['Review Text'].head()

In [None]:
print(clothing['Review Text'][3922])

In [None]:
print(clothing['Review Text'][762]) 

In [None]:
#  Stemming
porter = PorterStemmer()

clothing['Review Text'] = clothing['Review Text'].apply(lambda x: x.split())
clothing['Review Text'].head()

In [None]:
def stem_update(text_list):
    """Return a new list with `porter.stem` applied to each word of `text_list`."""
    return [porter.stem(token) for token in text_list]

In [None]:
clothing['Review Text'] = clothing['Review Text'].apply(stem_update)
clothing['Review Text'].head()

In [None]:
clothing['Review Text'] = clothing['Review Text'].apply(lambda x: ' '.join(x))
clothing['Review Text'].head()

In [None]:
print(clothing['Review Text'][3922])

In [None]:
print(clothing["Review Text"])

In [None]:
#  Sentiment Analysis
# Pre-Processing
SIA = SentimentIntensityAnalyzer()

# Apply Model, Variable Creation
# Score every review once and fan the result out into the four columns,
# instead of re-running the analyser separately for each column.
vader_scores = pd.DataFrame(
    clothing["Review Text"].apply(SIA.polarity_scores).tolist(),
    index=clothing.index,
)
clothing['Polarity Score'] = vader_scores['compound']
clothing['Neutral Score'] = vader_scores['neu']
clothing['Negative Score'] = vader_scores['neg']
clothing['Positive Score'] = vader_scores['pos']

# Bucket the compound score by its sign into a categorical variable
clothing['Sentiment']=''
clothing.loc[clothing['Polarity Score']>0,'Sentiment']='Positive'
clothing.loc[clothing['Polarity Score']==0,'Sentiment']='Neutral'
clothing.loc[clothing['Polarity Score']<0,'Sentiment']='Negative'

In [None]:
conditions = [
    clothing['Sentiment'] == "Positive",
    clothing['Sentiment'] == "Negative",
    clothing['Sentiment'] == "Neutral"]
choices = [1,-1,0]
clothing['label'] = np.select(conditions, choices)
clothing.head()

In [None]:
#  RNN
# Turn every pre-processed review into a fixed-length integer sequence.
samples = clothing["Review Text"].tolist()
# Length passed to pad_sequences for every review.
maxlen = 100 
# Vocabulary cap handed to the Tokenizer (num_words) and reused as the
# Embedding input dimension by the model builders.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
# word_index is only used for the report below.
word_index = tokenizer.word_index
print('Found %s Unique Tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)

In [None]:
# The networks trained below end in a single sigmoid unit and are compiled
# with binary_crossentropy, which is only defined for 0/1 targets. The
# 'label' column holds -1/0/1, so collapse it to "positive" (1) versus
# "neutral or negative" (0) for the tensor that is fed to the models; the
# three-valued column itself is left untouched in the DataFrame.
# NOTE(review): if three-way sentiment is the real goal, give the models a
# 3-unit softmax output with sparse_categorical_crossentropy instead.
labels = (clothing["label"].values > 0).astype("float32")
print('Data Tensor Shape :', data.shape)
print('Label Tensor Shape :', labels.shape)

In [None]:
# Shuffle reviews and labels with one shared permutation. The generator is
# seeded so the train/validation/test membership is the same after every
# kernel restart.
indices = np.random.default_rng(42).permutation(clothing.shape[0])
data = data[indices]
labels = labels[indices]

In [None]:
# Derive the split boundaries from the number of rows actually present in
# `data` instead of hard-coding row counts: first 50% for training, next 25%
# for validation, remainder for testing.
training_samples = int(0.50 * len(data))
# End index (exclusive) of the validation slice, not a sample count.
validation_samples = int(0.75 * len(data))

In [None]:
x_train = data[:training_samples]
y_train = labels[:training_samples]

In [None]:
x_val = data[training_samples: validation_samples] 
y_val = labels[training_samples: validation_samples]

In [None]:
x_test = data[validation_samples:]
y_test = labels[validation_samples:]

In [None]:
# x_train and x_val are slices of `data`, which was already padded to `maxlen`
# when it was built, so this second padding pass should leave them unchanged.
# NOTE(review): x_test is not passed through here; that is consistent only
# because the earlier padding already covers it - confirm and consider
# removing this cell.
x_train = pad_sequences(x_train, maxlen=maxlen)
x_val = pad_sequences(x_val, maxlen=maxlen)

In [None]:
def build_RNN():
    """Build and compile the stacked SimpleRNN classifier.

    Layers, in order: Embedding(max_words, 100) over inputs of length
    `maxlen`, SimpleRNN(32) returning the full sequence, SimpleRNN(32), and a
    single sigmoid unit. Compiled with rmsprop, binary_crossentropy and the
    'acc' metric. Reads the module-level `max_words` and `maxlen`.

    NOTE(review): a single sigmoid unit with binary_crossentropy presumably
    expects 0/1 targets - confirm the label encoding passed to fit().
    """
    layer_stack = [
        Embedding(max_words, 100, input_length=maxlen),
        SimpleRNN(32, return_sequences=True),
        SimpleRNN(32),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    return model

In [None]:
model = build_RNN()
model.summary()
history_RNN = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_data=(x_val, y_val))

model.save("RNN.h5")

In [None]:
# Plot the per-epoch training curves recorded by the RNN fit: accuracy in the
# first figure, loss in the second. The 'acc'/'val_acc' keys match the
# metrics=['acc'] name used when the model was compiled.
acc = history_RNN.history['acc']
val_acc = history_RNN.history['val_acc']
loss = history_RNN.history['loss']
val_loss = history_RNN.history['val_loss']
# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'b', label='Training Acc')
plt.plot(epochs, val_acc, 'r', label='Validation Acc')
plt.title('Training And Validation Accuracy')
plt.legend()
# Start a second figure so the loss curves are not drawn over the accuracy.
plt.figure()
plt.plot(epochs, loss, 'b', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Training And Validation Loss')
plt.legend()
plt.show()

In [None]:
model.evaluate(x_test, y_test)

In [None]:
#  Simple Embedding Deep Neural Network
def build_model():
    """Build and compile the feed-forward embedding classifier.

    Layers, in order: Embedding(max_words, 100) over inputs of length
    `maxlen`, Flatten, Dense(64, relu), Dense(32, relu), and a single sigmoid
    unit. Compiled with rmsprop, binary_crossentropy and the 'acc' metric.
    Reads the module-level `max_words` and `maxlen`.

    NOTE(review): a single sigmoid unit with binary_crossentropy presumably
    expects 0/1 targets - confirm the label encoding passed to fit().
    """
    layer_stack = [
        Embedding(max_words, 100, input_length=maxlen),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    return model

In [None]:
model = build_model()
model.summary()
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=32,
                    validation_data=(x_val, y_val))

model.save("DNN.h5")

In [None]:
# Plot the per-epoch training curves recorded by the embedding-network fit:
# accuracy in the first figure, loss in the second. The 'acc'/'val_acc' keys
# match the metrics=['acc'] name used when the model was compiled.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'b', label='Training Acc')
plt.plot(epochs, val_acc, 'r', label='Validation Acc')
plt.title('Training And Validation Accuracy')
plt.legend()
# Start a second figure so the loss curves are not drawn over the accuracy.
plt.figure()
plt.plot(epochs, loss, 'b', label='Training Loss')
plt.plot(epochs, val_loss, 'r', label='Validation Loss')
plt.title('Training And Validation Loss')
plt.legend()
plt.show()

In [None]:
model.evaluate(x_test, y_test)