# Aplicação do Regex

In [1]:
import re
import pandas as pd 
import numpy as np

In [2]:
# Organizing data
# Raw column name -> analysis name, listed in the order the columns should appear.
# Selecting by this explicit list already leaves out the stray 'Unnamed: 0'
# index column, so no separate drop is needed (a drop would raise KeyError
# whenever the CSV was written without that column).
column_names = {
    'product_id_x': 'product_id',
    'product_category': 'product_category',
    'product_name': 'product_name',
    'product_price': 'product_price',
    'scrapy_datetime': 'scrapy_datetime',
    'style_id': 'style_id',
    'color_id': 'color_id',
    'color_name': 'color_name',
    'Size': 'size',
    'Fit': 'fit',
    'Composition': 'composition',
    # The "More sustainable materials" field is exposed as 'product_safety'.
    'More sustainable materials': 'product_safety',
}

data = pd.read_csv('products_hm.csv')

# Changing column names
# Renaming through a mapping ties every new name to its source column instead
# of relying on the position of entries in two parallel lists.
data = data[list(column_names)].rename(columns=column_names)

In [3]:
# Preview the first rows to confirm the column selection and renaming.
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,size,fit,composition,product_safety
0,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-02-05 11:58:59,979945,1,Denim blue,"The model is 180cm/5'11"" and wears a size 31/32",Loose fit,"Pocket lining: Polyester 65%, Cotton 35%",Recycled cotton 20%
1,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-02-05 11:58:59,979945,1,Denim blue,"The model is 180cm/5'11"" and wears a size 31/32",Loose fit,Shell: Cotton 100%,Recycled cotton 20%
2,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-02-05 11:58:59,979945,1,Black,"The model is 180cm/5'11"" and wears a size 31/32",Loose fit,"Pocket lining: Polyester 65%, Cotton 35%",Recycled cotton 20%
3,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-02-05 11:58:59,979945,1,Black,"The model is 180cm/5'11"" and wears a size 31/32",Loose fit,Shell: Cotton 100%,Recycled cotton 20%
4,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-02-05 11:58:59,979945,1,Light denim blue,,Loose fit,Shell: Cotton 100%,Recycled cotton 20%


In [4]:
# (rows, columns) of the frame after column selection.
data.shape

(1978, 12)

#### <span style="color:#1a8cff">*Types*</span>

In [5]:
# Inspect the dtype pandas inferred for each column before cleaning.
data.dtypes

product_id           int64
product_category    object
product_name        object
product_price       object
scrapy_datetime     object
style_id             int64
color_id             int64
color_name          object
size                object
fit                 object
composition         object
product_safety      object
dtype: object

#### <span style="color:#1a8cff">*Checking for NAs*</span>

In [6]:
# Count missing values per column.
data.isna().sum()

product_id             0
product_category       0
product_name           0
product_price          0
scrapy_datetime        0
style_id               0
color_id               0
color_name             0
size                 431
fit                    0
composition            0
product_safety      1383
dtype: int64

- Caso existam NAs e não estejamos usando nenhuma técnica de substituição (imputação), podemos simplesmente descartar as linhas afetadas com o comando abaixo.

`data = data.dropna(subset=['product_id'])`

- Neste caso, estaríamos removendo todas as linhas em que a coluna `product_id` é NA (aqui o comando serve apenas de exemplo, pois `product_id` não possui NAs).

In [7]:
# Cleaning cell: normalises the raw scraped columns of `data`.
# NOTE: this cell rebinds `data` and drops 'size' / 'product_safety', so it is
# not re-runnable on its own -- re-run the loading cell first.

# product_name, color_name, fit -> snake_case
# Vectorised .str methods propagate NaN instead of raising on missing values.
for text_col in ['product_name', 'color_name', 'fit']:
    data[text_col] = (
        data[text_col].str.replace(' ', '_', regex=False).str.lower()
    )

# product_price: strip the '$ ' prefix and store as float.
# regex=False is required because '$' is a regex metacharacter.
data['product_price'] = (
    data['product_price'].str.replace('$ ', '', regex=False).astype(float)
)

# scrapy_datetime
data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S')

# style_id / color_id
data['style_id'] = data['style_id'].astype(int)
data['color_id'] = data['color_id'].astype(int)

# size number: the digits immediately before 'cm' in the size text
# (e.g. "The model is 180cm/5'11\" ..." -> '180'). Rows with a missing size,
# or with no '<digits>cm' token, become NaN instead of raising.
# Raw strings keep '\d' from being parsed as an (invalid) string escape.
data['size_number'] = data['size'].str.extract(r'(\d+)cm', expand=False)

# size model: the '<digits>/<digits>' token of the size text (e.g. '31/32').
data['size_model'] = data['size'].str.extract(r'(\d+/\d+)', expand=False)

# removing the size variable, because we have already removed the information we wanted
data = data.drop(columns=['size', 'product_safety'])

# product_composition
# Excluding composition I don't want for the first cycle of CRISP:
# any row whose composition text carries one of these part prefixes.
# Rows with a missing composition are kept (na=False).
excluded_parts = ['Pocket lining:', 'Lining:', 'Shell:', 'Pocket:']
has_excluded_part = data['composition'].str.contains(
    '|'.join(re.escape(part) for part in excluded_parts), na=False
)
data = data[~has_excluded_part]

# Renumber rows 0..n-1; the previous row labels are kept in a new 'index' column.
data = data.reset_index()

In [8]:
# composition
# cotton | polyester | spandex |
#
# Build one column per fibre holding its percentage as a digit string
# (e.g. "Cotton 78%, Polyester 21%, Spandex 1%" -> '78', '21', '1').
# Each fibre is looked up by name anywhere in the composition text, so the
# result does not depend on the fibre's position in the comma-separated list
# and no per-value patching is needed. Fibres absent from a row yield NaN.
composition_patterns = {
    'cotton': r'Cotton (\d+)%',
    'polyester': r'Polyester (\d+)%',
    'spandex': r'Spandex (\d+)%',
}

# df_ref shares data's index, so the join below aligns row by row.
df_ref = pd.DataFrame({
    material: data['composition'].str.extract(pattern, expand=False)
    for material, pattern in composition_patterns.items()
})

# final join
# Dropping any previous fibre columns first keeps this cell re-runnable
# without producing duplicated column names.
data = pd.concat(
    [data.drop(columns=list(composition_patterns), errors='ignore'), df_ref],
    axis=1,
)

### Testes

In [17]:
# Spot-check 15 random rows of the cleaned frame; the fixed seed makes the
# same rows come back on every Restart & Run All.
data.sample(15, random_state=42)

Unnamed: 0,index,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,fit,composition,size_number,size_model,cotton,polyester,spandex
195,839,690449051,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,51,white,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
9,53,811993040,men_jeans_regular,regular_jeans,29.99,2022-02-05 11:58:59,811993,40,light_denim_blue,regular_fit,"Cotton 98%, Spandex 2%",,,98,,2
227,886,690449022,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,22,light_denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
295,1002,690449022,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,22,light_denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",188.0,31/30,98,,2
52,224,1013317002,men_jeans_regular,hybrid_regular_tapered_joggers,39.99,2022-02-05 11:58:59,1013317,2,dark_blue,regular_fit,"Cotton 78%, Polyester 21%, Spandex 1%",,,78,21.0,1
430,1588,690449036,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,36,black/washed,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
355,1282,690449056,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,56,denim_gray,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
281,977,690449022,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,22,black/washed,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
191,831,690449051,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,51,denim_blue,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
282,979,690449022,men_jeans_ripped,skinny_jeans,39.99,2022-02-05 11:58:59,690449,22,dark_denim_blue/trashed,skinny_fit,"Cotton 98%, Spandex 2%",187.0,32/32,98,,2
