In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS
from wordcloud import ImageColorGenerator
from plotnine import ggplot, aes, geom_line
import re
from pathlib import Path  

### Function Definitions

In [14]:
# Inches conversion function
def convert_to_inches(height):
    pattern = re.compile(r"(\d+)ft (\d+)in")  # Regular expression pattern to match feet and inches
    match = pattern.match(height)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2))
        total_inches = (feet * 12) + inches
        return total_inches
    else:
        return None

In [15]:
# Text analysis functions
def processText(text):
    text = text.str.replace("(<br/>)", "")
    text = text.str.replace('(<a).*(>).*(</a>)', '')
    text = text.str.replace('(&amp)', '')
    text = text.str.replace('(&gt)', '')
    text = text.str.replace('(&lt)', '')
    text = text.str.replace('(\xa0)', ' ')
    return text

# List out reviews based on polarity
def listPolarity(df, pol, col_name, n):
    col = df.loc[df.polarity == pol, [col_name]].sample(n).values
    for c in col:
        print(c[0])

### Load Data

In [7]:
df = pd.read_json('../Data/modcloth/modcloth_final_data.json', lines=True)
df.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,


In [8]:
# Remove new, wedding, and sale from category
# Remove NAN review
df2 = df.loc[(df["category"] != "new") & (df["category"] != "wedding") & 
       (df["category"] != "sale")]
df2 = df2.loc[df["review_text"].isnull() == 0]

#### Convert height into inches

In [13]:
df2['height'] = df2['height'].astype(str)
df2['height_inches'] = df2['height'].apply(convert_to_inches)
df2.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text,height_inches
8490,159891,26.0,1,5.0,a,38.0,32.0,dresses,,5ft 4in,Isabella,just right,fit,240386,,,This dress. Is so amazing,This dress. Is so amazing. It just arrived tod...,64.0
8491,159891,,8,4.0,c,,34.0,dresses,,5ft 8in,Hillary,just right,fit,531304,,,Best dress I've purchased,Best dress I've purchased on this site! (and t...,68.0
8492,159891,,8,5.0,b,,32.0,dresses,,5ft 2in,caitortot,just right,fit,460168,,,Lucky!,I managed to get this dress in my Stylish Surp...,62.0
8493,159891,31.0,8,3.0,d,41.0,36.0,dresses,,5ft 2in,mercersg,just right,fit,439693,,,This dress looks great on,This dress looks great on. I'm a pretty curren...,62.0
8494,159891,,12,3.0,dd/e,36.0,34.0,dresses,,5ft 5in,Erin,just right,fit,479187,,,So adorable! The straps a,"So adorable! The straps are adjustable, which ...",65.0


#### Add polarity scores

In [16]:
df2['review_text'] = processText(df2['review_text'])

df2['polarity'] = df2['review_text'].map(lambda txt: TextBlob(txt).sentiment.polarity)
df2['reviewLen'] = df2['review_text'].astype(str).apply(len)
df2['wordCount'] = df2['review_text'].apply(lambda x: len(str(x).split()))

In [17]:
print("Reviews with the highest positive sentiment polarity: ")
print("-----------------------------------------------------")
listPolarity(df2, 1, 'review_text', 5)
print("")
print("Reviews with most neutral sentiment polarity: ")
print("-----------------------------------------------")
listPolarity(df2, 0, 'review_text', 5)
print("")
print("Reviews with the highest negative sentiment polarity: ")
print("------------------------------------------------------")
listPolarity(df2, -1, 'review_text', 5)

Reviews with the highest positive sentiment polarity: 
-----------------------------------------------------
Perfect dress for any day.. Nothing but compliments, if you are curvy like I am this will be should be a staple piece of your wardrobe
Best gift from a friend. perfect for me
The perfect lbd.
Perfect!
perfect for the occasion

Reviews with most neutral sentiment polarity: 
-----------------------------------------------
I'll be getting this blazer in different colors.
Need in every color!
.
If I could. I would wear it every day
It fits well and is one of my favorites

Reviews with the highest negative sentiment polarity: 
------------------------------------------------------
Hated this fabric!
terrible material.
Horrible fabric and belt
terrible material.  feels like plastic on skin.
I have this skirt in two colours. don't ask me why because I look horrible in both. Tags are still on one of them. they are SO unflattering on my shape.


In [18]:
df2.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,...,fit,user_id,shoe size,shoe width,review_summary,review_text,height_inches,polarity,reviewLen,wordCount
8490,159891,26.0,1,5.0,a,38.0,32.0,dresses,,5ft 4in,...,fit,240386,,,This dress. Is so amazing,This dress. Is so amazing. It just arrived tod...,64.0,0.3925,301,58
8491,159891,,8,4.0,c,,34.0,dresses,,5ft 8in,...,fit,531304,,,Best dress I've purchased,Best dress I've purchased on this site! (and t...,68.0,0.377778,319,63
8492,159891,,8,5.0,b,,32.0,dresses,,5ft 2in,...,fit,460168,,,Lucky!,I managed to get this dress in my Stylish Surp...,62.0,0.4125,712,138
8493,159891,31.0,8,3.0,d,41.0,36.0,dresses,,5ft 2in,...,fit,439693,,,This dress looks great on,This dress looks great on. I'm a pretty curren...,62.0,0.18625,273,57
8494,159891,,12,3.0,dd/e,36.0,34.0,dresses,,5ft 5in,...,fit,479187,,,So adorable! The straps a,"So adorable! The straps are adjustable, which ...",65.0,0.3475,166,27


In [21]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58501 entries, 8490 to 82789
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         58501 non-null  int64  
 1   waist           2097 non-null   float64
 2   size            58501 non-null  int64  
 3   quality         58454 non-null  float64
 4   cup size        54005 non-null  object 
 5   hips            39783 non-null  float64
 6   bra size        54144 non-null  float64
 7   category        58501 non-null  object 
 8   bust            8461 non-null   object 
 9   height          58501 non-null  object 
 10  user_name       58501 non-null  object 
 11  length          58477 non-null  object 
 12  fit             58501 non-null  object 
 13  user_id         58501 non-null  int64  
 14  shoe size       19941 non-null  float64
 15  shoe width      13253 non-null  object 
 16  review_summary  58501 non-null  object 
 17  review_text     58501 non-null  o

In [24]:
df2.iloc[:, 0:8]
df2.iloc[:,]

IndexingError: Too many indexers

In [48]:
col_i = list(range(0,8))
col_i.extend([11,12, 16, 17, 18, 19, 20, 21])
df3 = df2.iloc[:, col_i]
df3.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,length,fit,review_summary,review_text,height_inches,polarity,reviewLen,wordCount
8490,159891,26.0,1,5.0,a,38.0,32.0,dresses,just right,fit,This dress. Is so amazing,This dress. Is so amazing. It just arrived tod...,64.0,0.3925,301,58
8491,159891,,8,4.0,c,,34.0,dresses,just right,fit,Best dress I've purchased,Best dress I've purchased on this site! (and t...,68.0,0.377778,319,63
8492,159891,,8,5.0,b,,32.0,dresses,just right,fit,Lucky!,I managed to get this dress in my Stylish Surp...,62.0,0.4125,712,138
8493,159891,31.0,8,3.0,d,41.0,36.0,dresses,just right,fit,This dress looks great on,This dress looks great on. I'm a pretty curren...,62.0,0.18625,273,57
8494,159891,,12,3.0,dd/e,36.0,34.0,dresses,just right,fit,So adorable! The straps a,"So adorable! The straps are adjustable, which ...",65.0,0.3475,166,27


In [49]:
cols_lst = ["item_id", "category", "waist", "size", "quality", "cup size", "hips", "bra size",
           "height_inches", "length", "fit", "review_summary", "review_text", "polarity", "reviewLen", "wordCount"]

In [50]:
df3 = df3[cols_lst]

In [53]:
df3

Unnamed: 0,item_id,category,waist,size,quality,cup size,hips,bra size,height_inches,length,fit,review_summary,review_text,polarity,reviewLen,wordCount
8490,159891,dresses,26.0,1,5.0,a,38.0,32.0,64.0,just right,fit,This dress. Is so amazing,This dress. Is so amazing. It just arrived tod...,0.392500,301,58
8491,159891,dresses,,8,4.0,c,,34.0,68.0,just right,fit,Best dress I've purchased,Best dress I've purchased on this site! (and t...,0.377778,319,63
8492,159891,dresses,,8,5.0,b,,32.0,62.0,just right,fit,Lucky!,I managed to get this dress in my Stylish Surp...,0.412500,712,138
8493,159891,dresses,31.0,8,3.0,d,41.0,36.0,62.0,just right,fit,This dress looks great on,This dress looks great on. I'm a pretty curren...,0.186250,273,57
8494,159891,dresses,,12,3.0,dd/e,36.0,34.0,65.0,just right,fit,So adorable! The straps a,"So adorable! The straps are adjustable, which ...",0.347500,166,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82785,807722,outerwear,,8,4.0,b,,36.0,68.0,just right,fit,Cute jacket!,Cute jacket!,0.625000,12,2
82786,807722,outerwear,,12,5.0,ddd/f,,34.0,65.0,slightly long,small,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...,0.412143,110,24
82787,807722,outerwear,,12,5.0,dddd/g,36.0,32.0,64.0,just right,fit,I love this blazer. It is,I love this blazer. It is a great office piece...,0.516667,130,27
82788,807722,outerwear,,12,4.0,,,,63.0,just right,fit,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...,0.294063,109,19


### Change `length` values

* "just right" -> 5
* "slightly long" -> 4
* "very long" -> 3
* "slightly short" -> 2
* "very short" -> 1

In [52]:
df3["length"].unique()

array(['just right', 'slightly long', 'very long', 'slightly short',
       'very short', nan], dtype=object)

In [58]:
df3.loc[df3["length"] == "just right", "length"] = 5
df3.loc[df3["length"] == "slightly long", "length"] = 4
df3.loc[df3["length"] == "very long", "length"] = 3
df3.loc[df3["length"] == "slightly short", "length"] = 2
df3.loc[df3["length"] == "very short", "length"] = 1

#### Change `fit` values

* "fit" -> 3
* "small" -> 2
* "large" -> 1

In [66]:
df3["fit"].unique()

array([3, 2, 1])

In [59]:
df3.loc[df3["fit"] == "fit", "fit"] = 3
df3.loc[df3["fit"] == "small", "fit"] = 2
df3.loc[df3["fit"] == "large", "fit"] = 1

In [67]:
# Convert to integers
df3["fit"] = df3["fit"].astype(int)
df3["length"] = df3["length"].astype('Int32')

In [68]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58501 entries, 8490 to 82789
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         58501 non-null  int64  
 1   category        58501 non-null  object 
 2   waist           2097 non-null   float64
 3   size            58501 non-null  int64  
 4   quality         58454 non-null  float64
 5   cup size        54005 non-null  object 
 6   hips            39783 non-null  float64
 7   bra size        54144 non-null  float64
 8   height_inches   55687 non-null  float64
 9   length          58477 non-null  Int32  
 10  fit             58501 non-null  int64  
 11  review_summary  58501 non-null  object 
 12  review_text     58501 non-null  object 
 13  polarity        58501 non-null  float64
 14  reviewLen       58501 non-null  int64  
 15  wordCount       58501 non-null  int64  
dtypes: Int32(1), float64(6), int64(5), object(4)
memory usage: 7.4+ MB


In [71]:
# Save df3 as new dataset
filepath = Path('../Data/fashion.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df3.to_csv(filepath) 