In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Extract

In [2]:
df = pd.read_csv('../data/raw/us-shein-appliances-3987.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   goods-title-link--jump       40 non-null     object
 1   goods-title-link--jump href  40 non-null     object
 2   rank-title                   559 non-null    object
 3   rank-sub                     559 non-null    object
 4   price                        3986 non-null   object
 5   discount                     2459 non-null   object
 6   selling_proposition          1361 non-null   object
 7   goods-title-link             3946 non-null   object
dtypes: object(8)
memory usage: 249.3+ KB


# Transform

### Transform Column Name to Snake Case

In [3]:
df.columns = df.columns.str.strip().str.lower().str.replace('-','_').str.replace(' ','_').str.replace('__','_')
df.head()

Unnamed: 0,goods_title_link_jump,goods_title_link_jump_href,rank_title,rank_sub,price,discount,selling_proposition,goods_title_link
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,#1 Best Sellers,in Give Gifts,$2.03,-22%,,
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,#4 Best Sellers,in Top rated in Portable Fans,$6.48,-20%,,
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,,,$1.80,,400+ sold recently,
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,,,$0.88,-72%,5.6k+ sold recently,
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,#6 Best Sellers,in Oral Irrigators,$12.06,-40%,,


### Locate Row that Hasn't Product Title

In [4]:
df.loc[df['goods_title_link_jump'].isna() & df['goods_title_link'].isna()].shape

(0, 8)

### `goods_title_link_jump`
Product Title (1)

In [5]:
df['goods_title_link_jump'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: goods_title_link_jump
Non-Null Count  Dtype 
--------------  ----- 
40 non-null     object
dtypes: object(1)
memory usage: 31.3+ KB


In [6]:
df['goods_title_link_jump'].nunique()

40

In [7]:
df['goods_title_link_jump'].duplicated().sum()

np.int64(3945)

In [8]:
df['goods_title_link_jump'].isna().sum()

np.int64(3946)

In [9]:
df['goods_title_link_jump'] = df['goods_title_link_jump'].str.replace(r'\s+', ' ', regex=True)
df.head()


Unnamed: 0,goods_title_link_jump,goods_title_link_jump_href,rank_title,rank_sub,price,discount,selling_proposition,goods_title_link
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,#1 Best Sellers,in Give Gifts,$2.03,-22%,,
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,#4 Best Sellers,in Top rated in Portable Fans,$6.48,-20%,,
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,,,$1.80,,400+ sold recently,
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,,,$0.88,-72%,5.6k+ sold recently,
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,#6 Best Sellers,in Oral Irrigators,$12.06,-40%,,


### `goods_title_link_jump href`
link to product

In [10]:
df['goods_title_link_jump_href'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: goods_title_link_jump_href
Non-Null Count  Dtype 
--------------  ----- 
40 non-null     object
dtypes: object(1)
memory usage: 31.3+ KB


In [11]:
df[df['goods_title_link_jump_href'].notna() & df['goods_title_link_jump'].notna()].shape

(40, 8)

every row that has `goods_title_link_jump` is always has `goods_title_link_jump_href`

### `rank_title`
rank in subcategory

In [12]:
df['rank_title'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: rank_title
Non-Null Count  Dtype 
--------------  ----- 
559 non-null    object
dtypes: object(1)
memory usage: 31.3+ KB


In [13]:
df['rank_title'].unique()

array(['#1 Best Sellers', '#4 Best Sellers', nan, '#6 Best Sellers',
       '#10 Best Sellers', '#8 Best Sellers', '#9 Best Sellers',
       '#5 Best Sellers', '#2 Best Sellers', '#7 Best Sellers',
       '#3 Best Sellers'], dtype=object)

In [14]:
df['rank_title'] = df['rank_title'].str.replace(r'[^0-9]', '', regex=True)
df['rank_title'].unique()

array(['1', '4', nan, '6', '10', '8', '9', '5', '2', '7', '3'],
      dtype=object)

### `rank_sub`
Subcategory of the product

In [15]:
df['rank_sub'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: rank_sub
Non-Null Count  Dtype 
--------------  ----- 
559 non-null    object
dtypes: object(1)
memory usage: 31.3+ KB


In [16]:
df[df['rank_title'].notna() & df['rank_sub'].notna()].shape

(559, 8)

every `rank_title` has `rank_sub`

In [17]:
df['rank_sub'].nunique()

124

In [18]:
df[['rank_sub']].head(10)

Unnamed: 0,rank_sub
0,in Give Gifts
1,in Top rated in Portable Fans
2,
3,
4,in Oral Irrigators
5,in Refrigerators & Freezers
6,in Other Household Appliances
7,
8,in Oral Care Appliances
9,in Household Appliances


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   goods_title_link_jump       40 non-null     object
 1   goods_title_link_jump_href  40 non-null     object
 2   rank_title                  559 non-null    object
 3   rank_sub                    559 non-null    object
 4   price                       3986 non-null   object
 5   discount                    2459 non-null   object
 6   selling_proposition         1361 non-null   object
 7   goods_title_link            3946 non-null   object
dtypes: object(8)
memory usage: 249.3+ KB


### `price`
Price of the product

In [20]:
df['price'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: price
Non-Null Count  Dtype 
--------------  ----- 
3986 non-null   object
dtypes: object(1)
memory usage: 31.3+ KB


In [21]:
df['price'].value_counts()

price
$2.90     35
$38.99    30
$39.99    27
$2.70     26
$22.99    26
          ..
$8.82      1
$58.20     1
$65.45     1
$54.39     1
$61.68     1
Name: count, Length: 1512, dtype: int64

In [22]:
df.loc[~df['price'].str.match(r'^\$\d+(\.\d{1,2})?$')]

Unnamed: 0,goods_title_link_jump,goods_title_link_jump_href,rank_title,rank_sub,price,discount,selling_proposition,goods_title_link
1441,,,,,"$1,430.99",-55%,,"VEVOR Slushy Machine, 30L Slushie Machine For ..."
1959,,,,,"$1,077.99",-56%,,"VEVOR Commercial Meatball Forming Machine, 280..."
2849,,,,,"$2,254.99",-55%,,"VEVOR Commercial Ice Cream Machine, 20-25L/H Y..."
3206,,,,,"$1,174.99",-56%,,VEVOR 110V Frozen Yogurt Blending Machine 750W...
3258,,,,,"$1,398.00",,,Bosch NGM5658UC 36 Inch Stainless 500 Series G...
3401,,,,,"$2,399.99",-27%,,BOSSCARE Full Body Zero Gravity 4D Massage Cha...
3415,,,,,"$1,299.99",-57%,,"Artist Hand Massage Chair Full Body, SL Track ..."
3973,,,,,"$1,399.99",-30%,,"BOSSCARE 3D Full Body SL Track Massage Chair, ..."
3975,,,,,"$1,499.99",-32%,,"BOSSCARE 3D Full Body SL Track Massage Chair, ..."
3976,,,,,"$1,499.99",-32%,,"BOSSCARE 3D Full Body SL Track Massage Chair, ..."


In [23]:
df['price'] = df['price'].str.replace(',', '').astype(str)
df.loc[~df['price'].str.match(r'^\$\d+(\.\d{1,2})?$')].shape

(0, 8)

In [24]:
df['price'] = df['price'].str.replace('$', '')
df['price'] = df['price'].astype(float)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   goods_title_link_jump       40 non-null     object 
 1   goods_title_link_jump_href  40 non-null     object 
 2   rank_title                  559 non-null    object 
 3   rank_sub                    559 non-null    object 
 4   price                       3986 non-null   float64
 5   discount                    2459 non-null   object 
 6   selling_proposition         1361 non-null   object 
 7   goods_title_link            3946 non-null   object 
dtypes: float64(1), object(7)
memory usage: 249.3+ KB


### `discount`
Discount off the price

In [26]:
df['discount'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: discount
Non-Null Count  Dtype 
--------------  ----- 
2459 non-null   object
dtypes: object(1)
memory usage: 31.3+ KB


In [27]:
df[['discount']].head(10)

Unnamed: 0,discount
0,-22%
1,-20%
2,
3,-72%
4,-40%
5,
6,
7,
8,-41%
9,-5%


In [28]:
df['discount'] = df['discount'].str.replace(r'[^0-9]', '', regex=True).astype(float)
df['discount'] = df['discount'] / 100
df.head(10)

Unnamed: 0,goods_title_link_jump,goods_title_link_jump_href,rank_title,rank_sub,price,discount,selling_proposition,goods_title_link
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,1.0,in Give Gifts,2.03,0.22,,
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,4.0,in Top rated in Portable Fans,6.48,0.2,,
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,,,1.8,,400+ sold recently,
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,,,0.88,0.72,5.6k+ sold recently,
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,6.0,in Oral Irrigators,12.06,0.4,,
5,"1pc Ice Pop Mold, Plastic Ice Cream Mold, Froz...",https://us.shein.com/1pc-Ice-Pop-Mold-Plastic-...,10.0,in Refrigerators & Freezers,2.7,,,
6,"Mini Pocket Bluetooth Thermal Printer, Portabl...",https://us.shein.com/Mini-Pocket-Bluetooth-The...,8.0,in Other Household Appliances,3.5,,,
7,1 PC 400ML Classic Ultra-Silent Air Humidifier...,https://us.shein.com/1-PC-400ML-Classic-Ultra-...,,,4.9,,,
8,6 Modes Multi-Functional Sonic Electric Toothb...,https://us.shein.com/6-Modes-Multi-Functional-...,6.0,in Oral Care Appliances,4.28,0.41,,
9,1pc AA Battery Operated Portable Handheld Mini...,https://us.shein.com/1pc-AA-Battery-Operated-P...,9.0,in Household Appliances,2.76,0.05,,


### `selling_proposition`
how many sold recently

In [29]:
df['selling_proposition'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: selling_proposition
Non-Null Count  Dtype 
--------------  ----- 
1361 non-null   object
dtypes: object(1)
memory usage: 31.3+ KB


In [30]:
df['selling_proposition'].unique()

array([nan, '400+ sold recently', '5.6k+ sold recently',
       '50+ sold recently', '30+ sold recently', '60+ sold recently',
       '100+ sold recently', '200+ sold recently', '8.7k+ sold recently',
       '10+ sold recently', '1.6k+ sold recently', '20+ sold recently',
       '10k+ sold recently', '80+ sold recently', '300+ sold recently',
       '900+ sold recently', '600+ sold recently', '70+ sold recently',
       '40+ sold recently', '1.8k+ sold recently', '5.0k+ sold recently',
       '1.1k+ sold recently', '2.0k+ sold recently', '500+ sold recently',
       '90+ sold recently', '800+ sold recently', '2.2k+ sold recently',
       '1.5k+ sold recently', '1.2k+ sold recently',
       '1.3k+ sold recently', '3.4k+ sold recently', '700+ sold recently',
       '2.4k+ sold recently', '6.3k+ sold recently',
       '2.6k+ sold recently', '3.8k+ sold recently',
       '2.1k+ sold recently', '1.0k+ sold recently',
       '4.7k+ sold recently', '2.9k+ sold recently',
       '1.9k+ sold re

In [31]:
df['selling_proposition'] = df['selling_proposition'].astype(str).str.strip()
df['sold_number'] = df['selling_proposition'].str.extract(r'(\d+\.?\d*)')
df['sold_number'] = df['sold_number'].astype(float)
df.loc[df['selling_proposition'].str.contains('k', case=False, na=False), 'sold_number'] *= 1000

df[['selling_proposition', 'sold_number']].sample(20)

Unnamed: 0,selling_proposition,sold_number
3491,,
1151,,
511,,
1796,,
569,80+ sold recently,80.0
3161,,
1816,,
1910,20+ sold recently,20.0
375,,
273,100+ sold recently,100.0


### `goods_title_link`
Product title (2)

In [32]:
df['goods_title_link'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3986 entries, 0 to 3985
Series name: goods_title_link
Non-Null Count  Dtype 
--------------  ----- 
3946 non-null   object
dtypes: object(1)
memory usage: 31.3+ KB


In [33]:
df['goods_title_link'] = df['goods_title_link'].str.replace(r'\s+', ' ', regex=True)
df.head()

Unnamed: 0,goods_title_link_jump,goods_title_link_jump_href,rank_title,rank_sub,price,discount,selling_proposition,goods_title_link,sold_number
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,1.0,in Give Gifts,2.03,0.22,,,
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,4.0,in Top rated in Portable Fans,6.48,0.2,,,
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,,,1.8,,400+ sold recently,,400.0
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,,,0.88,0.72,5.6k+ sold recently,,5600.0
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,6.0,in Oral Irrigators,12.06,0.4,,,


### Merge Product Title

In [34]:
df['product_name'] = df['goods_title_link_jump'].fillna(df['goods_title_link'])
df.head()

Unnamed: 0,goods_title_link_jump,goods_title_link_jump_href,rank_title,rank_sub,price,discount,selling_proposition,goods_title_link,sold_number,product_name
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,1.0,in Give Gifts,2.03,0.22,,,,1pc Rechargeable Deep Tissue Muscle Handheld M...
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,4.0,in Top rated in Portable Fans,6.48,0.2,,,,1pc Portable Hanging Neck Fan
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,,,1.8,,400+ sold recently,,400.0,1pc Pink Colored Curved Eyelash Curler False E...
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,,,0.88,0.72,5.6k+ sold recently,,5600.0,1 Mini Portable Handheld Fan With 2 Aa Batteri...
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,6.0,in Oral Irrigators,12.06,0.4,,,,"Wit Water Flosser,Portable Oral Irrigator With..."


## Restructured

In [35]:
df.rename(columns={
    'goods_title_link_jump_href': 'product_link',
    'rank_title': 'rank_num',
    'rank_sub': 'rank_subcategory'
}, inplace=True)

df.columns

Index(['goods_title_link_jump', 'product_link', 'rank_num', 'rank_subcategory',
       'price', 'discount', 'selling_proposition', 'goods_title_link',
       'sold_number', 'product_name'],
      dtype='object')

In [36]:
df['color_count'] = 1
df['category'] = 'appliances'
df = df[[
    'product_name',
    'product_link',
    'category',
    'color_count',
    'price',
    'discount',
    'rank_num',
    'rank_subcategory',
    'sold_number'
]]

df.head()

Unnamed: 0,product_name,product_link,category,color_count,price,discount,rank_num,rank_subcategory,sold_number
0,1pc Rechargeable Deep Tissue Muscle Handheld M...,https://us.shein.com/1pc-Rechargeable-Deep-Tis...,appliances,1,2.03,0.22,1.0,in Give Gifts,
1,1pc Portable Hanging Neck Fan,https://us.shein.com/1pc-Portable-Hanging-Neck...,appliances,1,6.48,0.2,4.0,in Top rated in Portable Fans,
2,1pc Pink Colored Curved Eyelash Curler False E...,https://us.shein.com/1pc-Pink-Colored-Curved-E...,appliances,1,1.8,,,,400.0
3,1 Mini Portable Handheld Fan With 2 Aa Batteri...,https://us.shein.com/1-Mini-Portable-Handheld-...,appliances,1,0.88,0.72,,,5600.0
4,"Wit Water Flosser,Portable Oral Irrigator With...",https://us.shein.com/Wit-Water-Flosser-Portabl...,appliances,1,12.06,0.4,6.0,in Oral Irrigators,


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_name      3986 non-null   object 
 1   product_link      40 non-null     object 
 2   category          3986 non-null   object 
 3   color_count       3986 non-null   int64  
 4   price             3986 non-null   float64
 5   discount          2459 non-null   float64
 6   rank_num          559 non-null    object 
 7   rank_subcategory  559 non-null    object 
 8   sold_number       1361 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 280.4+ KB


# Load

In [38]:
df.to_csv(f'../data/clean/appliances.csv', index=False)