In [1]:
import pandas as pd
import numpy  as np
import re

In [2]:
# Read the raw product CSV into the working frame used by the cells below.
data = pd.read_csv(filepath_or_buffer='HM_data_raw.csv')

In [3]:
# Peek at the first five raw rows to see the columns and value formats.
data.head(n=5)

Unnamed: 0,product_id,product_category,product_name,product_price,style_id,color_id,product_colors,Fit,Composition,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,1024256,1,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",2022-03-11 21:11:24.210824
1,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",2022-03-11 21:11:24.210824
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",2022-03-11 21:11:24.210824
3,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",2022-03-11 21:11:24.210824
4,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",2022-03-11 21:11:24.210824


In [4]:
# Summarise column dtypes and non-null counts of the raw frame before cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7682 entries, 0 to 7681
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   product_id        7682 non-null   int64 
 1   product_category  7682 non-null   object
 2   product_name      7682 non-null   object
 3   product_price     7682 non-null   object
 4   style_id          7682 non-null   int64 
 5   color_id          7682 non-null   int64 
 6   product_colors    7682 non-null   object
 7   Fit               7682 non-null   object
 8   Composition       7682 non-null   object
 9   scrapy_datetime   7682 non-null   object
dtypes: int64(3), object(7)
memory usage: 600.3+ KB


### Formatting data

In [5]:
# Normalise column names to lower case. rename() silently skips keys that do
# not match a column, so listing 'Description' is harmless when it is absent.
data = data.rename(columns={'Fit': 'fit', 'Composition': 'composition', 'Description': 'description'})

# product_name: spaces -> underscores, lower case (e.g. 'Slim Jeans' -> 'slim_jeans')
data['product_name'] = data['product_name'].str.replace(' ', '_', regex=False).str.lower()

# product_price: strip the currency symbol and padding ('$ 19.99' -> 19.99)
data['product_price'] = (
    data['product_price']
    .str.replace('$', '', regex=False)
    .str.strip()
    .astype(float)
)

# product_colors: spaces and slashes -> underscores, lower case
data['product_colors'] = data['product_colors'].str.replace(r'[ /]', '_', regex=True).str.lower()

# fit: spaces -> underscores, lower case; missing values stay missing
data['fit'] = data['fit'].str.replace(' ', '_', regex=False).str.lower()

# scrapy_datetime: keep the timestamp up to whole seconds (drops the fractional
# part) and parse it. The pattern is a raw string so that \d and \s are regex
# escapes rather than invalid Python string escapes. Values that do not match
# the pattern become NaT instead of raising.
data['scrapy_datetime'] = pd.to_datetime(
    data['scrapy_datetime'].str.extract(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', expand=False),
    format='%Y-%m-%d %H:%M:%S',
)

# The 'composition' column mixes several garment parts. For the first cycle of
# CRISP, drop every ROW whose composition is labelled "Pocket lining:",
# "Pocket:", "Lining:" or "Shell:", keeping only the unlabelled compositions.
# NOTE(review): this discards the whole row, not just the label, so products
# that only have labelled compositions disappear -- confirm this is intended.
EXCLUDED_PARTS = ['Pocket lining:', 'Pocket:', 'Lining:', 'Shell:']
excluded_pattern = '|'.join(re.escape(part) for part in EXCLUDED_PARTS)
data = data[~data['composition'].str.contains(excluded_pattern, na=False)]

# Remove exact duplicate rows and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)

In [6]:
# Label the rows 0..n-1 with a plain integer index.
data.index = np.arange(len(data))

# Preview the first rows of the cleaned frame.
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,style_id,color_id,product_colors,fit,composition,scrapy_datetime
0,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,light_denim_blue_trashed,skinny_fit,"Cotton 98%, Spandex 2%",2022-03-11 21:11:24
1,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",2022-03-11 21:11:24
2,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,black_washed,skinny_fit,"Cotton 98%, Spandex 2%",2022-03-11 21:11:24
3,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,light_denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",2022-03-11 21:11:24
4,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,black_washed_out,skinny_fit,"Cotton 98%, Spandex 2%",2022-03-11 21:11:24


In [7]:
def extract_material_share(composition, material):
    """Return the share (0.0-1.0) of ``material`` in each composition string.

    Parameters
    ----------
    composition : pandas.Series of str
        Strings such as 'Cotton 98%, Spandex 2%'.
    material : str
        Material name to look for, matched case-insensitively.

    Returns
    -------
    pandas.Series of float
        Same index as ``composition``. The integer percentage that follows the
        material name divided by 100; 0.0 where the material is not listed or
        the composition is missing.
    """
    # Look the material up by NAME instead of by its position after splitting
    # on commas, so the result does not depend on the order of the materials.
    percent = composition.str.extract(
        rf'\b{re.escape(material)}\s*(\d+)\s*%', flags=re.IGNORECASE, expand=False
    )
    return pd.to_numeric(percent).fillna(0).astype(float) / 100


# Materials that get their own numeric column.
MATERIALS = ['cotton', 'polyester', 'spandex']

# One share column per material, built on data's own index so the join below
# aligns row by row regardless of how data is indexed.
df_ref = pd.DataFrame(
    {material: extract_material_share(data['composition'], material) for material in MATERIALS},
    index=data.index,
)

# Swap the free-text 'composition' column for the numeric share columns, then
# remove rows that became exact duplicates and renumber the index from 0.
data = (
    pd.concat([data.drop(columns=['composition']), df_ref], axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
# Show up to five random rows. The fixed seed makes the displayed sample
# reproducible across runs, and the min() guard avoids a ValueError when the
# frame has fewer than five rows.
data.sample(n=min(5, len(data)), random_state=42)

Unnamed: 0,product_id,product_category,product_name,product_price,style_id,color_id,product_colors,fit,scrapy_datetime,cotton,polyester,spandex
111,690449056,men_jeans_ripped,skinny_jeans,39.99,690449,56,dark_blue_trashed,skinny_fit,2022-03-11 21:11:24,0.98,0.0,0.02
6,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,dark_denim_blue_trashed,skinny_fit,2022-03-11 21:11:24,0.98,0.0,0.02
78,1013317001,men_jeans_regular,hybrid_regular_tapered_joggers,39.99,1013317,1,denim_blue,regular_fit,2022-03-11 21:11:24,0.8,0.19,0.01
93,1013317002,men_jeans_regular,hybrid_regular_tapered_joggers,39.99,1013317,2,dark_gray,regular_fit,2022-03-11 21:11:24,0.8,0.19,0.01
14,690449043,men_jeans_ripped,skinny_jeans,39.99,690449,43,gray,skinny_fit,2022-03-11 21:11:24,0.98,0.0,0.02


In [9]:
# Display the dtype of every column in the final cleaned frame.
data.dtypes

product_id                   int64
product_category            object
product_name                object
product_price              float64
style_id                     int64
color_id                     int64
product_colors              object
fit                         object
scrapy_datetime     datetime64[ns]
cotton                     float64
polyester                  float64
spandex                    float64
dtype: object