In [None]:
# Importing libraries: pandas, seaborn, matplotlib, numpy and textblob

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textblob
import numpy as np
from textblob import TextBlob

In [None]:
# Load the raw reviews CSV file into a DataFrame

reviews_csv = 'Womens Clothing E-Commerce Reviews.csv'
clothingdf = pd.read_csv(reviews_csv)

In [None]:
# Build a subset of the data that keeps only the relevant columns.

# 1. Inspect the available columns first.
#    (A notebook cell only renders its last expression, so this preview is not displayed.)

clothingdf.head()

# 2. Drop the columns that are not needed for the analysis:
#    'Unnamed: 0', 'Clothing ID', 'Age', 'Recommended IND', 'Positive Feedback Count', 'Class Name'

irrelevant_cols = ['Unnamed: 0', 'Clothing ID', 'Age', 'Recommended IND', 'Positive Feedback Count', 'Class Name']
subsetdf = clothingdf.drop(columns=irrelevant_cols)

subsetdf

In [None]:
# Standardise the column headers: lower-case, spaces replaced by underscores

subsetdf.columns = subsetdf.columns.str.lower().str.replace(" ", "_", regex=False)
subsetdf


In [None]:
# EDA - exploring the subset in three steps: shape (total rows and columns),
# info (dtypes and nulls) and descriptive statistics.
# Step 1: shape, displayed as a (rows, columns) tuple.

subsetdf.shape

In [None]:
# EDA step 2: column dtypes and non-null counts
subsetdf.info()

In [None]:
# EDA step 3: descriptive statistics
subsetdf.describe()

In [None]:
# Percentage of NaN values in each column

missing_counts = subsetdf.isna().sum()
missing_counts.mul(100).div(len(subsetdf))

In [None]:
# Bar chart of the number of records per 'division_name'

division_counts = subsetdf['division_name'].value_counts()
division_counts.plot.bar();

In [None]:
# Bar chart of the number of records per 'department_name'

department_counts = subsetdf['department_name'].value_counts()
department_counts.plot.bar();

In [None]:
# Exploring the reviews: display the raw 'review_text' column

subsetdf['review_text']


In [None]:
# defining a function that converts NaN values to 0 or returns length.

def get_len(x):
    """Return the length of ``x``, or 0 when ``x`` has no length.

    Missing values such as NaN (a float) or None do not support ``len()``;
    they are counted as length 0 instead of raising.

    Parameters
    ----------
    x : object
        Value to measure, e.g. a review string or a missing-value marker.

    Returns
    -------
    int
        ``len(x)`` when it is defined, otherwise 0.
    """
    try:
        return len(x)
    except TypeError:
        # NaN, None and other unsized scalars have no len(): treat them as empty.
        return 0
    

In [None]:
# Compute the length of every review with get_len and keep it in a new 'len_rev' column

review_lengths = subsetdf['review_text'].apply(get_len)
subsetdf['len_rev'] = review_lengths
subsetdf.head()


In [None]:
# Histogram of the newly created 'len_rev' column

review_lengths = subsetdf['len_rev']
fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(review_lengths, bins=100, ax=ax);

In [None]:
# Same histogram split by 'division_name', keeping only reviews whose length
# is non-zero and strictly below 499 characters

below_cap = subsetdf['len_rev'] < 499
has_text = subsetdf['len_rev'] != 0

fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(
    data=subsetdf[below_cap & has_text],
    x='len_rev',
    hue='division_name',
    bins=100,
    ax=ax,
);

In [None]:
# Same filtered histogram (length non-zero and strictly below 499), this time
# with the counts split by 'department_name'

below_cap = subsetdf['len_rev'] < 499
has_text = subsetdf['len_rev'] != 0

fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(
    data=subsetdf[below_cap & has_text],
    x='len_rev',
    hue='department_name',
    bins=100,
    ax=ax,
);

#### subset summary

+ There are 2 columns in which we can perform Sentiment Analysis using Text Blob - 'title' and 'review_text'.
+ We also have 2 categorical columns - 'department_name' and 'division_name' - that we can use for aggregated views
+ We have one numerical column, 'rating', which goes from 1 to 5 and can be used to check our sentiment analysis scores (could help spotting sarcasm)
+ We created a new column applying feature engineering to the review_text column that indicates which reviews are more text heavy

In [None]:
# Mean rating for every department / division combination (via groupby)

rating_by_group = subsetdf.groupby(["department_name", "division_name"])["rating"]
avg_by_div_dept = rating_by_group.mean().reset_index()
avg_by_div_dept.head(10)

In [None]:
# Add a 'div_dep' label column: 'division_name' and 'department_name' joined by a space

avg_by_div_dept["div_dep"] = avg_by_div_dept['division_name'].str.cat(
    avg_by_div_dept['department_name'], sep=" "
)
avg_by_div_dept

In [None]:
# Horizontal bar chart of the average rating per division and department

avg_by_div_dept.plot.barh(x='div_dep', y='rating');

In [None]:
# Sample check: pick the review text at row position 5

text = subsetdf['review_text'].iloc[5]

In [None]:
# Wrapping the sample review text in a TextBlob object, stored as 'testimonial'

testimonial = TextBlob(text)

In [None]:
# Running sentiment analysis on the selected record (the sample review wrapped above)

testimonial.sentiment

# Observation: polarity level seems in line with the feedback from the customer

In [None]:
# Using TextBlob to break the sample review down into its sentences

testimonial.sentences

In [None]:
# Checking another review's sentiment against its actual text (row position 6):

text2 = subsetdf['review_text'].iloc[6]
testimonial2 = TextBlob(text2)
testimonial2.sentiment

In [None]:
# Sentences of the second sample review, to read alongside its sentiment score
testimonial2.sentences

# Observation: polarity level seems in line with the feedback from the customer

In [None]:
# One more sense check, on the review at row position 7:

text3 = subsetdf['review_text'].iloc[7]
testimonial3 = TextBlob(text3)
testimonial3.sentiment

In [None]:
# Sentences of the third sample review, to read alongside its sentiment score
testimonial3.sentences

In [None]:
# Polarity level indicates certain indifference towards the product, which is in line with the customers' feedback

In [None]:
# Drop every row of the subset whose 'review_text' is NaN

subsetdf = subsetdf.dropna(subset=['review_text'])

# Confirm that the new total number of rows is indeed 22641

subsetdf.info()

In [None]:
# Reset index.
# reset_index returns a new DataFrame rather than modifying in place, so the
# result must be assigned back; otherwise the gaps left by the dropped rows
# stay in the index.

subsetdf = subsetdf.reset_index(drop=True)
subsetdf

In [None]:
# Calculate sentiment polarity for each row of the filtered review data set and store it in a new column

# 1. First we define a function that returns polarity from sentiment analysis

def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# 2. Then apply 'get_polarity' to every 'review_text' and keep the result in a new 'polarity' column

review_polarity = subsetdf['review_text'].apply(get_polarity)
subsetdf['polarity'] = review_polarity
subsetdf

In [None]:
# Minimum review polarity per division and department, as a new DataFrame

polarity_by_group = subsetdf.groupby(['division_name', 'department_name'])['polarity']
df_min_pol = polarity_by_group.min().reset_index()
df_min_pol

In [None]:
# Add a 'div_dep' label ('division_name' + space + 'department_name') for the next visualisation

df_min_pol['div_dep'] = df_min_pol['division_name'].str.cat(
    df_min_pol['department_name'], sep=" "
)
df_min_pol

In [None]:
# Sort by polarity (ascending) and draw the result as a horizontal bar chart

df_min_pol = df_min_pol.sort_values('polarity')
df_min_pol.plot.barh(x='div_dep', y='polarity');

In [None]:
# Mean review polarity per division and department

polarity_by_group = subsetdf.groupby(['division_name', 'department_name'])['polarity']
df_mean_pol = polarity_by_group.mean().reset_index()

In [None]:
# Sort by polarity, then add the 'div_dep' label ('division_name' + space + 'department_name')

df_mean_pol = df_mean_pol.sort_values('polarity')
df_mean_pol['div_dep'] = df_mean_pol['division_name'].str.cat(
    df_mean_pol['department_name'], sep=" "
)

In [None]:
# Horizontal bar chart of the average polarity (rows were already sorted by polarity)

df_mean_pol.plot.barh(x='div_dep', y='polarity');