In [95]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

In [96]:
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# Extract

In [97]:
data_folder = 'data/raw'
csv_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]
print(csv_files)

['us-shein-appliances-3987.csv', 'us-shein-automotive-4110.csv', 'us-shein-baby_and_maternity-4433.csv', 'us-shein-bags_and_luggage-4299.csv', 'us-shein-beauty_and_health-4267.csv', 'us-shein-curve-2849.csv', 'us-shein-electronics-4395.csv', 'us-shein-home_and_kitchen-3719.csv', 'us-shein-home_textile-3883.csv', 'us-shein-jewelry_and_accessories-3548.csv', 'us-shein-kids-4314.csv', 'us-shein-mens_clothes-1891.csv', 'us-shein-office_and_school_supplies-4233.csv', 'us-shein-pet_supplies-4083.csv', 'us-shein-shoes-4381.csv', 'us-shein-sports_and_outdoors-3853.csv', 'us-shein-swimwear-3761.csv', 'us-shein-tools_and_home_improvement-3903.csv', 'us-shein-toys_and_games-3577.csv', 'us-shein-underwear_and_sleepwear-4019.csv', 'us-shein-womens_clothing-4620.csv']


# Transform

columns that exist in all datasets
- `price`
- `discount`
- `goods-title-link`

### Combine

In [98]:
def transform_column_name(df: pd.DataFrame):
    df.columns = df.columns.str.strip().str.lower().str.replace('-','_').str.replace(' ','_').str.replace('__','_') 
    return df

def transform_title(df: pd.DataFrame):
    df['goods_title_link'] = df['goods_title_link'].str.replace(r'\s+', ' ', regex=True)
    
    if 'goods_title_link_jump' in df.columns and 'goods_title_link' in df.columns:
        df['goods_title_link_jump'] = df['goods_title_link_jump'].str.replace(r'\s+', ' ', regex=True)
        df['product_name'] = df['goods_title_link_jump'].fillna(df['goods_title_link'])
    else:
        df['product_name'] = df['goods_title_link']

    return df

def transform_selling_proposition(df: pd.DataFrame):
    df['selling_proposition'] = df['selling_proposition'].astype(str).str.strip()
    df['sold_number'] = df['selling_proposition'].str.extract(r'(\d+\.?\d*)')
    df['sold_number'] = df['sold_number'].astype(float)
    df.loc[df['selling_proposition'].str.contains('k', case=False, na=False), 'sold_number'] *= 1000
    return df

def transform_price(df: pd.DataFrame):
    df['price'] = df['price'].str.replace(',', '').astype(str)
    df['price'] = df['price'].str.replace('$', '')
    df['price'] = df['price'].astype(float)
    return df

def transform_discount(df: pd.DataFrame):
    df['discount'] = df['discount'].str.replace(r'[^0-9]', '', regex=True).astype(float)
    df['discount'] = df['discount'] / 100
    return df

def transform_rank_title(df: pd.DataFrame):
    df['rank_title'] = df['rank_title'].str.replace(r'[^0-9]', '', regex=True)
    return df

def rename_columns(df: pd.DataFrame):
    df.rename(columns={
        'goods_title_link_jump_href': 'product_link',
        'rank_title': 'rank_num',
        'rank_sub': 'rank_subcategory'
    }, inplace=True)
    return df

In [99]:
def transform_dataset(df: pd.DataFrame, category: str):
    df = transform_column_name(df)
    df = transform_title(df)
    df = transform_price(df)
    df = transform_discount(df)

    if 'selling_proposition' in df.columns:
        df = transform_selling_proposition(df)
    
    if 'rank_title' in df.columns:
        df = transform_rank_title(df)

    if category == "appliances":
        df['color_count'] = 1
    
    if category == "automotive":
        df['rank_num'] = np.nan
        df['rank_subcategory'] = np.nan
        df['color_count'] = 1
    
    if category == "baby_and_maternity":
        df['rank_num'] = np.nan
        df['rank_subcategory'] = np.nan
    
    if category == "home_and_kitchen":
        df['goods_title_link_jump_href'] = np.nan
        df['color_count'] = 1
    
    if category == "jewelry_and_accessories":
        df['color_count'] = 1
    
    if category == "mens_clothes":
        df['goods_title_link_jump_href'] = np.nan
    
    if category == "office_and_school_supplies":
        df['color_count'] = 1
    
    if category == "swimwear":
        df['selling_proposition'] = np.nan
        df['sold_number'] = np.nan
    
    if category == "tools_and_home_improvement":
        df['color_count'] = 1
    
    if category == "toys_and_games":
        df['goods_title_link_jump_href'] = np.nan
        df['color_count'] = 1
    
    if category == "womens_clothing":
        df['goods_title_link_jump_href'] = np.nan    
    
    df = rename_columns(df)
    df['category'] = category.replace('_', ' ')
    df = df[[
        'product_name',
        'product_link',
        'category',
        'color_count',
        'price',
        'discount',
        'rank_num',
        'rank_subcategory',
        'sold_number'
    ]]

    return df

In [100]:
all_dfs = []

for file in csv_files:
    path = os.path.join(data_folder, file)
    category = file.replace("us-shein-", "").split("-")[0]
    print(f"process: {category}")

    df = pd.read_csv(path)
    df_transform = transform_dataset(df, category)
    all_dfs.append(df_transform)

process: appliances
process: automotive
process: baby_and_maternity
process: bags_and_luggage
process: beauty_and_health
process: curve
process: electronics
process: home_and_kitchen
process: home_textile
process: jewelry_and_accessories
process: kids
process: mens_clothes
process: office_and_school_supplies
process: pet_supplies
process: shoes
process: sports_and_outdoors
process: swimwear
process: tools_and_home_improvement
process: toys_and_games
process: underwear_and_sleepwear
process: womens_clothing


In [101]:
df_combined = pd.concat(all_dfs, ignore_index=True)
df_combined.head()

Unnamed: 0,product_name,product_link,category,color_count,price,discount,rank_num,rank_subcategory,sold_number
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,appliances,1.0,2.03,0.22,1.0,in Give Gifts,
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,appliances,1.0,6.48,0.2,4.0,in Top rated in Portable Fans,
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,appliances,1.0,1.8,,,,400.0
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,appliances,1.0,0.88,0.72,,,5600.0
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,appliances,1.0,12.06,0.4,6.0,in Oral Irrigators,


In [102]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82105 entries, 0 to 82104
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_name      82091 non-null  object 
 1   product_link      664 non-null    object 
 2   category          82105 non-null  object 
 3   color_count       46814 non-null  float64
 4   price             82103 non-null  float64
 5   discount          54977 non-null  float64
 6   rank_num          14605 non-null  object 
 7   rank_subcategory  14605 non-null  object 
 8   sold_number       54364 non-null  float64
dtypes: float64(4), object(5)
memory usage: 5.6+ MB


### Drop Invalid Rows

In [103]:
reject_data = []

#### Doesn't have `price`

In [104]:
df_combined.loc[df_combined['product_name'].notna() & df_combined['price'].isna()]

Unnamed: 0,product_name,product_link,category,color_count,price,discount,rank_num,rank_subcategory,sold_number


In [105]:
df_combined.loc[df_combined['product_name'].isna() & df_combined['price'].isna()]

Unnamed: 0,product_name,product_link,category,color_count,price,discount,rank_num,rank_subcategory,sold_number
23878,,,curve,,,,,,
45583,,,mens clothes,,,,,,


In [106]:
reject_data.append(df_combined.loc[df_combined['product_name'].isna() & df_combined['price'].isna()])

#### Doesn't have `product_name`

In [107]:
df_combined.loc[df_combined['product_name'].isna()].shape

(14, 9)

In [108]:
reject_data.append(df_combined.loc[df_combined['product_name'].isna()])

In [109]:
df_reject = pd.concat(reject_data, ignore_index=True)
df_reject

Unnamed: 0,product_name,product_link,category,color_count,price,discount,rank_num,rank_subcategory,sold_number
0,,,curve,,,,,,
1,,,mens clothes,,,,,,
2,,,curve,,,,,,
3,,,mens clothes,,,,,,
4,,,toys and games,1.0,2.9,,6.0,in Kids Drawing & Painting Supplies,
5,,,toys and games,1.0,2.4,,,,600.0
6,,,toys and games,1.0,3.8,,,,1200.0
7,,,toys and games,1.0,1.2,,2.0,in Musical Instruments & Accessories,
8,,,toys and games,1.0,2.31,0.54,,,500.0
9,,,toys and games,1.0,6.2,,,,


In [110]:
df_reject.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_name      0 non-null      object 
 1   product_link      0 non-null      object 
 2   category          16 non-null     object 
 3   color_count       12 non-null     float64
 4   price             12 non-null     float64
 5   discount          5 non-null      float64
 6   rank_num          5 non-null      object 
 7   rank_subcategory  5 non-null      object 
 8   sold_number       6 non-null      float64
dtypes: float64(4), object(5)
memory usage: 1.3+ KB


In [111]:
df_clean = df_combined.loc[~df_combined.index.isin(df_reject.index)]
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82089 entries, 16 to 82104
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_name      82075 non-null  object 
 1   product_link      648 non-null    object 
 2   category          82089 non-null  object 
 3   color_count       46798 non-null  float64
 4   price             82087 non-null  float64
 5   discount          54968 non-null  float64
 6   rank_num          14595 non-null  object 
 7   rank_subcategory  14595 non-null  object 
 8   sold_number       54359 non-null  float64
dtypes: float64(4), object(5)
memory usage: 6.3+ MB


## Dimensional Model

### `dim_category`

In [112]:
df_clean['category'].unique()

array(['appliances', 'automotive', 'baby and maternity',
       'bags and luggage', 'beauty and health', 'curve', 'electronics',
       'home and kitchen', 'home textile', 'jewelry and accessories',
       'kids', 'mens clothes', 'office and school supplies',
       'pet supplies', 'shoes', 'sports and outdoors', 'swimwear',
       'tools and home improvement', 'toys and games',
       'underwear and sleepwear', 'womens clothing'], dtype=object)

In [113]:
dim_category = df_clean[['category']].drop_duplicates().reset_index(drop=True)
dim_category['category_id'] = range(1, len(dim_category) + 1)

dim_category = dim_category[['category_id', 'category']]
dim_category.head()

Unnamed: 0,category_id,category
0,1,appliances
1,2,automotive
2,3,baby and maternity
3,4,bags and luggage
4,5,beauty and health


### `dim_subcategory`

In [114]:
df_clean['rank_subcategory'].nunique()

5056

In [115]:
dim_rank_subcategory = df_clean[['rank_subcategory']].drop_duplicates().reset_index(drop=True)
dim_rank_subcategory['rank_subcategory_id'] = range(1, len(dim_rank_subcategory) + 1)

dim_rank_subcategory = dim_rank_subcategory[['rank_subcategory_id', 'rank_subcategory']]
dim_rank_subcategory.head()

Unnamed: 0,rank_subcategory_id,rank_subcategory
0,1,in Cleaning Appliance Parts
1,2,
2,3,in Laundry Appliances
3,4,in Fruit & Vegetable Tools
4,5,in Blenders


### `dim_products`

In [116]:
dim_products = df_clean.merge(dim_category, on='category', how='left') \
    .merge(dim_rank_subcategory, on='rank_subcategory', how='left')

dim_products = dim_products[[
    'product_name',
    'product_link',
    'category_id',
    'color_count',
    'price',
    'discount',
    'rank_num',
    'rank_subcategory_id',
    'sold_number'
]]

dim_products

Unnamed: 0,product_name,product_link,category_id,color_count,price,discount,rank_num,rank_subcategory_id,sold_number
0,1pc White Half-Wrap Fan Dustproof Cover For Ho...,https://us.shein.com/1pc-White-Half-Wrap-Fan-D...,1,1.0,1.86,0.28,5,1,
1,"Ear Wax Removal Tool Camera,Ear Cleaner With C...",https://us.shein.com/Ear-Wax-Removal-Tool-Came...,1,1.0,24.10,,,2,
2,"1pc Blue Lint Remover, Wool Sweater Shaver, Cl...",https://us.shein.com/1pc-Blue-Lint-Remover-Woo...,1,1.0,0.75,0.53,9,3,
3,"1pc Four Layer Slide Egg Storage Box, Refriger...",https://us.shein.com/1pc-Four-Layer-Slide-Egg-...,1,1.0,14.60,,,2,30.0
4,"New 2 In 1 Mini Irons, Mini Travel Iron, Porta...",https://us.shein.com/New-2-In-1-Mini-Irons-Min...,1,1.0,11.25,0.11,2,3,
...,...,...,...,...,...,...,...,...,...
82084,Colorful Flower Embroidered Linen Muslim Women...,,21,,35.69,,2,5057,
82085,Elegant And Stylish Patchwork Lace Cheongsam D...,,21,,42.49,,,2,10.0
82086,Split Thigh Satin Tube Prom Dress,,21,5.0,49.29,,,2,200.0
82087,DAZY Women's Solid Color Low Waist Ultra Short...,,21,6.0,9.78,0.05,10,4786,


# Load

### Data Reject

In [117]:
df_reject.to_csv(f'target/reject/data_reject_{timestamp}.csv', index=False)

### Data Accepted

In [118]:
dim_category.to_csv(f'target/accepted/dim_category_{timestamp}.csv', index=False)
dim_rank_subcategory.to_csv(f'target/accepted/dim_rank_subcategory_{timestamp}.csv', index=False)
dim_products.to_csv(f'target/accepted/dim_products_{timestamp}.csv', index=False)