In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load Dataset

In [35]:
df = pd.read_json('../Data/modcloth/modcloth_final_data.json', lines=True)
df.head()
df['category']

0              new
1              new
2              new
3              new
4              new
           ...    
82785    outerwear
82786    outerwear
82787    outerwear
82788    outerwear
82789    outerwear
Name: category, Length: 82790, dtype: object

In [23]:
# Glancing at the dataset
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         82790 non-null  int64  
 1   waist           2882 non-null   float64
 2   size            82790 non-null  int64  
 3   quality         82722 non-null  float64
 4   cup size        76535 non-null  object 
 5   hips            56064 non-null  float64
 6   bra size        76772 non-null  float64
 7   category        82790 non-null  object 
 8   bust            11854 non-null  object 
 9   height          81683 non-null  object 
 10  user_name       82790 non-null  object 
 11  length          82755 non-null  object 
 12  fit             82790 non-null  object 
 13  user_id         82790 non-null  int64  
 14  shoe size       27915 non-null  float64
 15  shoe width      18607 non-null  object 
 16  review_summary  76065 non-null  object 
 17  review_text     76065 non-null 

In [24]:
# Storing new dataset with specific columns
df["review_summary"]
df["review_text"]

0                              NaN
1                              NaN
2                              NaN
3                              NaN
4                              NaN
                   ...            
82785                 Cute jacket!
82786     It's a beautiful jacket.
82787    I love this blazer. It is
82788    I love this blazer!! I wo
82789    I love this piece. I'm re
Name: review_summary, Length: 82790, dtype: object

`review_summary` takes first certain characters from `review_text`

In [31]:
# review rows with missing review summary data
df.loc[df["review_text"].isnull() == 0]

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
6725,152702,27.0,4,4.0,b,37.0,32.0,new,,5ft 6in,avNYC,just right,small,668176,9.0,average,Too much ruching,"I liked the color, the silhouette, and the fab..."
6726,152702,26.0,4,5.0,c,36.0,34.0,new,,5ft 6in,lanwei91,slightly short,fit,320759,7.5,,Suits my body type!,From the other reviews it seems like this dres...
6727,152702,,4,3.0,a,,34.0,new,,5ft 3in,angeladevoe5678,just right,fit,144785,,,I love the design and fit,I love the design and fit of this dress! I wo...
6728,152702,25.0,4,3.0,c,35.0,32.0,new,,5ft 1in,Juli,slightly long,fit,52664,,,Beautiful Dress!,I bought this dress for work it is flattering...
6729,152702,25.0,4,5.0,c,32.0,32.0,new,,5ft 6in,lhalton,just right,fit,155439,,,This is a very profession,This is a very professional look. It is Great ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82785,807722,,8,4.0,b,,36.0,outerwear,,5ft 8in,Jennifer,just right,fit,727820,8.5,average,Cute jacket!,Cute jacket!
82786,807722,,12,5.0,ddd/f,,34.0,outerwear,,5ft 5in,Kelli,slightly long,small,197040,,,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...
82787,807722,,12,5.0,dddd/g,36.0,32.0,outerwear,,5ft 4in,elacount,just right,fit,102493,,,I love this blazer. It is,I love this blazer. It is a great office piece...
82788,807722,,12,4.0,,,,outerwear,,5ft 3in,jennaklinner,just right,fit,756491,,,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...


In [36]:
# Finding number of unique clothing items
df["item_id"].unique()
df["category"].unique()

array(['new', 'dresses', 'wedding', 'sale', 'tops', 'bottoms',
       'outerwear'], dtype=object)

In [43]:
# Get rid of new, wedding, and sale
df2 = df.loc[(df["category"] == "new") | (df["category"] == "wedding") | 
       (df["category"] == "sale")]
df2 = df2.loc[df["review_text"].isnull() == 0]

In [45]:
df2

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
6725,152702,27.0,4,4.0,b,37.0,32.0,new,,5ft 6in,avNYC,just right,small,668176,9.0,average,Too much ruching,"I liked the color, the silhouette, and the fab..."
6726,152702,26.0,4,5.0,c,36.0,34.0,new,,5ft 6in,lanwei91,slightly short,fit,320759,7.5,,Suits my body type!,From the other reviews it seems like this dres...
6727,152702,,4,3.0,a,,34.0,new,,5ft 3in,angeladevoe5678,just right,fit,144785,,,I love the design and fit,I love the design and fit of this dress! I wo...
6728,152702,25.0,4,3.0,c,35.0,32.0,new,,5ft 1in,Juli,slightly long,fit,52664,,,Beautiful Dress!,I bought this dress for work it is flattering...
6729,152702,25.0,4,5.0,c,32.0,32.0,new,,5ft 6in,lhalton,just right,fit,155439,,,This is a very profession,This is a very professional look. It is Great ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80926,800975,,15,4.0,dddd/g,,34.0,sale,,5ft 1in,quidditchraka,just right,fit,480585,,,"Great quality, awesome pr","Great quality, awesome print, and perfect fit...."
80927,800975,,15,5.0,d,,36.0,sale,,5ft 2in,Janice,just right,small,291857,,,"Beautiful, unique jacket.","Beautiful, unique jacket. Love it! However, i..."
80928,800975,,12,3.0,c,42.0,38.0,sale,34,5ft 2in,sharonvisoky,just right,fit,580708,6.0,,I bought this in January,I bought this in January and it's pretty cute....
80929,800975,,15,4.0,c,30.0,38.0,sale,,5ft 6in,Michele,just right,small,508785,,,Would have loved it but t,Would have loved it but the sleeves were just ...
