## **IMPORTS**

In [1]:
import requests
import re

import pandas as pd
import numpy  as np

from datetime import datetime
from bs4      import BeautifulSoup

## **CLEANING DATA**

In [62]:
# **===================== Import dataset =====================**
# Load the raw scraped data and lower-case the three capitalised column names.
data_raw = pd.read_csv('../datasets/data_raw_HM.csv')
data_raw = data_raw.rename(columns = {'Fit': 'fit', 'Composition': 'composition', 'Size': 'size'})


def _to_snake(text, extra_separators = ()):
    """Lower-case `text` and replace spaces (and any `extra_separators`) with '_'.

    Null values are returned unchanged so missing data stays missing.
    """
    if pd.isnull(text):
        return text
    for separator in (' ', *extra_separators):
        text = text.replace(separator, '_')
    return text.lower()


def _price_to_number(price):
    """Strip the '$' sign and surrounding blanks from a price string.

    Non-string values (nulls, already-numeric prices) pass through untouched;
    the final conversion to float happens on the whole column.
    """
    if isinstance(price, str):
        return price.replace('$', '').strip()
    return price


def _model_height_cm(size_text):
    """Return the digits of the first three-digit 'NNNcm' token in `size_text`.

    Gives NaN when the text has no such token (the original code crashed on
    `None.group`); null input is returned unchanged.
    """
    if pd.isnull(size_text):
        return size_text
    match = re.search(r'(\d{3})cm', size_text)
    return match.group(1) if match else np.nan


# product_id
data_raw['product_id'] = data_raw['product_id'].astype(int)

# product_name
data_raw['product_name'] = data_raw['product_name'].apply(_to_snake)

# product_price
data_raw['product_price'] = data_raw['product_price'].apply(_price_to_number).astype(float)

# scrapy_datetime
data_raw['scrapy_datetime'] = pd.to_datetime(data_raw['scrapy_datetime'], format = '%Y-%m-%d %H:%M:%S')

# style_code
data_raw['style_code'] = data_raw['style_code'].astype(int)

# color_name ('/' is also treated as a word separator)
data_raw['color_name'] = data_raw['color_name'].apply(_to_snake, extra_separators = ('/',))

# fit
data_raw['fit'] = data_raw['fit'].apply(_to_snake)

# Size --->>> size_model and size_number
# size_model: digits of the 'NNNcm' token, kept as a string
data_raw['size_model'] = data_raw['size'].apply(_model_height_cm)

# size_number: the 'NN/NN' token; expand = False yields a Series instead of a one-column frame
data_raw['size_number'] = data_raw['size'].str.extract(r'(\d+/\d+)', expand = False)

# ============================== Composition ============================== #
# First cycle: discard every row whose composition text carries one of the
# secondary-part labels below. Rows with a missing composition are kept (na = False).
for secondary_label in ('Pocket lining:', 'Pocket:', 'Lining:', 'Shell:'):
    is_secondary = data_raw['composition'].str.contains(secondary_label, na = False)
    data_raw = data_raw[~is_secondary]

# Collapse rows that repeat the same product attributes, keeping the last
# occurrence, then renumber the rows from 0.
dedup_keys = ['product_id', 'product_category', 'product_name', 'product_price',
              'scrapy_datetime', 'style_code', 'color_id', 'color_name', 'fit']
data_raw = data_raw.drop_duplicates(subset = dedup_keys, keep = 'last').reset_index(drop = True)

# Break composition by comma and create a new DataSet.
# Kept only for inspection: the reference columns below are matched by material
# name instead of by position, so they no longer depend on this frame's width
# or on the order in which the materials are listed.
data_composition = data_raw['composition'].str.split(',', expand = True)

# Material name as written in the raw text --> (reference column, label used when the material is absent)
composition_materials = {
    'Cotton':     ('cotton',     'Cotton 0%'),
    'Polyester':  ('polyester',  'Polyester 0%'),
    'Elastane':   ('elastane',   'Elastane 0%'),
    'Elasterell': ('elasterell', 'Elasterell-P 0%'),
}

# Create a reference DataSet - Columns --> cotton | polyester | elastane | elasterell
# Each cell holds the comma-delimited fragment that starts at the material name
# and contains at least one digit (e.g. 'Cotton 98%'), or the 0% label when the
# composition is missing or does not mention that material.
data_composition_ref = pd.DataFrame(index = data_raw.index,
                                    columns = [column for column, _ in composition_materials.values()])

for material, (column, zero_label) in composition_materials.items():
    fragment_pattern = r'(' + re.escape(material) + r'[^,]*\d[^,]*)'
    data_composition_ref[column] = (
        data_raw['composition']
        .str.extract(fragment_pattern, expand = False)
        .fillna(zero_label)
    )

# ============================== Join with Data Raw ============================== #
# Column-wise join; both frames are expected to share the same row index.
data_raw = pd.concat([data_raw, data_composition_ref], axis = 1)


def _percent_label_to_fraction(label):
    """Turn a label such as 'Cotton 98%' into the fraction 0.98.

    The first run of digits in `label` is read as a whole-number percentage.
    Null input is returned unchanged; a label without any digit yields NaN
    instead of raising on a failed regex match.
    """
    if pd.isnull(label):
        return label
    match = re.search(r'\d+', label)
    return int(match.group(0)) / 100 if match else np.nan


# format Composition data
for material_column in ['cotton', 'polyester', 'elastane', 'elasterell']:
    data_raw[material_column] = data_raw[material_column].apply(_percent_label_to_fraction)

# Drop unused columns (their content now lives in the derived size_* and material columns)
data_raw = data_raw.drop(columns = ['size', 'composition'])

# Drop duplicates
data_raw = data_raw.drop_duplicates()

In [72]:
# Preview up to ten random rows of the cleaned data. The fixed seed keeps the
# displayed sample stable across re-runs, and the min() guard avoids a
# ValueError when fewer than ten rows remain.
data_raw.sample(n = min(10, len(data_raw)), random_state = 42)

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_code,color_id,color_name,fit,size_model,size_number,cotton,polyester,elastane,elasterell
180,751994024,men_jeans_slim,slim_jeans,29.99,2021-09-30 09:51:16,751994,24,light_gray,slim_fit,,,0.98,0.0,0.02,0.0
390,720504013,men_jeans_skinny,skinny_jeans,24.99,2021-09-30 09:51:16,720504,13,light_blue,skinny_fit,,,0.73,0.26,0.01,0.0
136,730863033,men_jeans_skinny,skinny_jeans,29.99,2021-09-30 09:51:16,730863,33,graphite_gray,,,,0.0,0.0,0.0,0.0
411,751994018,men_jeans_slim,slim_jeans,29.99,2021-09-30 09:51:16,751994,18,black_denim,slim_fit,,,0.98,0.0,0.02,0.0
1,690449022,men_jeans_ripped,skinny_jeans,39.99,2021-09-30 09:51:16,690449,22,denim_blue,skinny_fit,,,0.98,0.0,0.02,0.0
109,814631006,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-30 09:51:16,814631,6,gray,slim_fit,,,0.9,0.0,0.02,0.08
207,814631002,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-30 09:51:16,814631,2,black_no_fade_black,slim_fit,,,0.9,0.0,0.02,0.08
280,730863005,men_jeans_skinny,skinny_jeans,29.99,2021-09-30 09:51:16,730863,5,dark_blue,,,,0.0,0.0,0.0,0.0
92,1004476002,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-30 09:51:16,1004476,2,light_denim_blue,slim_fit,,,0.9,0.0,0.02,0.08
16,690449043,men_jeans_ripped,skinny_jeans,39.99,2021-09-30 09:51:16,690449,43,denim_blue,skinny_fit,,,0.98,0.0,0.02,0.0
