## **IMPORTS**

In [2]:
import requests
import re

import pandas as pd
import numpy  as np

from datetime import datetime
from bs4      import BeautifulSoup

## **CLEANING DATA**

In [64]:
# **===================== Import dataset =====================**
data_raw = pd.read_csv('../datasets/data_raw_HM.csv')
data_raw = data_raw.rename(columns = {'Fit': 'fit', 'Composition': 'composition', 'Size': 'size'})

# product_id
data_raw['product_id'] = data_raw['product_id'].astype(int)

# product_name
data_raw['product_name'] = data_raw['product_name'].apply(lambda x: x.replace(' ', '_').lower())

# product_price
data_raw['product_price'] = data_raw['product_price'].apply(lambda x: x.replace('$ ', ' ')).astype(float)

# scrapy_datetime
data_raw['scrapy_datetime'] = pd.to_datetime(data_raw['scrapy_datetime'], format = '%Y-%m-%d %H:%M:%S')

# style_code
data_raw['style_code'] = data_raw['style_code'].astype(int)

# color_name
data_raw['color_name'] = data_raw['color_name'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower() if pd.notnull(x) else x)

# fit
data_raw['fit'] = data_raw['fit'].apply(lambda x: x.replace(' ', '_').lower() if pd.notnull(x) else x)

# Size --->>> size_model and size_number
# size_model
data_raw['size_model'] = data_raw['size'].apply(lambda x: re.search('\d{3}cm', x).group(0) if pd.notnull(x) else x)
data_raw['size_model'] = data_raw['size_model'].apply(lambda x: re.search('\d+', x).group(0) if pd.notnull(x) else x)

# size_number
data_raw['size_number'] = data_raw['size'].str.extract('(\d+/\\d+)')

# Remove Size
data_raw = data_raw.drop(columns = ['size'], axis = 1)

# ============================== Composition ============================== #
# For the first cycle, exclude secondary compositions
data_raw = data_raw[~data_raw['composition'].str.contains('Pocket lining:', na = False)]
data_raw = data_raw[~data_raw['composition'].str.contains('Pocket:', na = False)]
data_raw = data_raw[~data_raw['composition'].str.contains('Lining:', na = False)]
data_raw = data_raw[~data_raw['composition'].str.contains('Shell:', na = False)]
data_raw = data_raw.reset_index(drop = True)

# Break composition by comma and create a new DataSet
data_composition = data_raw['composition'].str.split(',', expand = True)

# Create a reference DataSet - Columns --> cotton | polyester | elastane | elasterell
data_composition_ref = pd.DataFrame(index = np.arange(len(data_raw)), columns = ['cotton', 'polyester', 'elastane', 'elasterell'])

# cotton
data_cotton = data_composition[0]
data_cotton.name = 'cotton'
data_composition_ref = pd.concat([data_composition_ref, data_cotton], axis = 1)
data_composition_ref = data_composition_ref.iloc[:, ~data_composition_ref.columns.duplicated(keep = 'last')]

# polyester
data_polyester = data_composition.loc[data_composition[1].str.contains('Polyester', na = True), 1]
data_polyester.name = 'polyester'
data_composition_ref = pd.concat([data_composition_ref, data_polyester], axis = 1)
data_composition_ref = data_composition_ref.iloc[:, ~data_composition_ref.columns.duplicated(keep = 'last')]

# elastane 
data_elastane = data_composition.loc[data_composition[1].str.contains('Elastane', na = True), 1]
data_elastane.name = 'elastane'
data_composition_ref = pd.concat([data_composition_ref, data_elastane], axis = 1)
data_composition_ref = data_composition_ref.iloc[:, ~data_composition_ref.columns.duplicated(keep = 'last')]

# elasterell
data_elasterell = data_composition.loc[data_composition[1].str.contains('Elasterell', na = True), 1]
data_elasterell.name = 'elasterell'
data_composition_ref = pd.concat([data_composition_ref, data_elasterell], axis = 1)
data_composition_ref = data_composition_ref.iloc[:, ~data_composition_ref.columns.duplicated(keep = 'last')]

# ============================== Join with Data Raw ============================== #
data_raw = pd.concat([data_raw, data_composition_ref], axis = 1)

# format Composition data
data_raw['cotton'] = data_raw['cotton'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data_raw['polyester'] = data_raw['polyester'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data_raw['elastane'] = data_raw['elastane'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data_raw['elasterell'] = data_raw['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)

In [65]:
data_raw.sample(5)

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_code,color_id,color_name,fit,composition,size_model,size_number,cotton,polyester,elastane,elasterell
2395,636207010,men_jeans_slim,slim_jeans,19.99,2021-09-30 09:51:16,636207,10,dark_gray_denim,slim_fit,"Cotton 89%, Polyester 10%, Elastane 1%",,,0.89,0.1,,
62,690449043,men_jeans_ripped,skinny_jeans,39.99,2021-09-30 09:51:16,690449,43,light_denim_gray_trashed,skinny_fit,"Cotton 98%, Elastane 2%",184.0,31/32,0.98,,0.02,
2609,636207015,men_jeans_slim,slim_jeans,19.99,2021-09-30 09:51:16,636207,15,dark_denim_blue,slim_fit,"Cotton 88%, Polyester 10%, Elastane 2%",,,0.88,0.1,,
1590,636207011,men_jeans_slim,slim_jeans,19.99,2021-09-30 09:51:16,636207,11,gray,slim_fit,"Cotton 89%, Polyester 10%, Elastane 1%",,,0.89,0.1,,
2300,811993031,men_jeans_regular,regular_jeans,29.99,2021-09-30 09:51:16,811993,31,denim_blue,regular_fit,"Cotton 98%, Elastane 2%",,,0.98,,0.02,


In [63]:
data_raw.shape

(2776, 16)