### Read The Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/retail_raw_test.csv", low_memory=False)

In [3]:
df_copy = df.copy()

In [4]:
df.head()

Unnamed: 0,order_id,order_date,customer_id,city,province,brand,quantity,item_price,product_value
0,1730350,"Dec 11, 2019",'13447,Surakarta,Jawa Tengah,BRAND_F,'24,'113000,1374.0
1,1677490,"Jul 31, 2019",'0,,,BRAND_F,'1,'1164000,1370.0
2,1704211,"Oct 18, 2019",'16128,Jakarta Pusat,DKI Jakarta,BRAND_H,'12,'747000,1679.0
3,1679695,"Aug 07, 2019",'16225,Yogyakarta,Yogyakarta,BRAND_H,'6,'590000,1708.0
4,1679080,"Aug 05, 2019",'0,,,BRAND_E,'2,'740000,1201.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       5000 non-null   int64  
 1   order_date     5000 non-null   object 
 2   customer_id    5000 non-null   object 
 3   city           3802 non-null   object 
 4   province       3802 non-null   object 
 5   brand          4995 non-null   object 
 6   quantity       5000 non-null   object 
 7   item_price     5000 non-null   object 
 8   product_value  4995 non-null   float64
dtypes: float64(1), int64(1), object(7)
memory usage: 351.7+ KB


### Change Data Type
Mengubah kolom yang memiliki tipe data tidak sesuai

In [6]:
df["customer_id"] = df["customer_id"].apply(lambda x: x.split("'")[1]).astype("int64")

In [7]:
df["quantity"] = df["quantity"].apply(lambda x: x.split("'")[1]).astype("int64")

In [8]:
df["item_price"] = df["item_price"].apply(lambda x: x.split("'")[1]).astype("int64")

In [9]:
df.dtypes

order_id           int64
order_date        object
customer_id        int64
city              object
province          object
brand             object
quantity           int64
item_price         int64
product_value    float64
dtype: object

### Transform "product_value"
transform product_value supaya bentuknya seragam dengan format PXXXX, assign ke kolom baru "product_id", dan drop kolom "product_value", jika terdapat nan gantilah dengan "unknown".

In [10]:
import math

In [11]:
def impute_product_value(val):
    if math.isnan(val):
        return "unknown"
    else:
        return "P" + '{:0>4}'.format(str(val).split('.')[0])

In [12]:
df["product_id"] = df["product_value"].apply(lambda x: impute_product_value(x))

In [13]:
df["product_value"]

0       1374.0
1       1370.0
2       1679.0
3       1708.0
4       1201.0
         ...  
4995     449.0
4996    1685.0
4997    3206.0
4998    4126.0
4999    1890.0
Name: product_value, Length: 5000, dtype: float64

In [14]:
df.drop(["product_value"], axis=1, inplace=True)

In [15]:
df.head()

Unnamed: 0,order_id,order_date,customer_id,city,province,brand,quantity,item_price,product_id
0,1730350,"Dec 11, 2019",13447,Surakarta,Jawa Tengah,BRAND_F,24,113000,P1374
1,1677490,"Jul 31, 2019",0,,,BRAND_F,1,1164000,P1370
2,1704211,"Oct 18, 2019",16128,Jakarta Pusat,DKI Jakarta,BRAND_H,12,747000,P1679
3,1679695,"Aug 07, 2019",16225,Yogyakarta,Yogyakarta,BRAND_H,6,590000,P1708
4,1679080,"Aug 05, 2019",0,,,BRAND_E,2,740000,P1201


### Transform "order_date"
trasnform order_date menjadi value dengan format YYYY-mm-dd

In [16]:
df["order_date"]

0       Dec 11, 2019
1       Jul 31, 2019
2       Oct 18, 2019
3       Aug 07, 2019
4       Aug 05, 2019
            ...     
4995    Jan 08, 2019
4996    Dec 03, 2019
4997    Nov 13, 2019
4998    Jul 03, 2019
4999    Nov 26, 2019
Name: order_date, Length: 5000, dtype: object

In [17]:
months_dict = {
    "Jan" : "01",
    "Feb" : "02",
    "Mar" : "03",
    "Apr" : "04",
    "May" : "05",
    "Jun" : "06",
    "Jul" : "07",
    "Aug" : "08",
    "Sep" : "09",
    "Oct" : "10",
    "Nov" : "11",
    "Dec" : "12",
}

In [18]:
df["order_date"] = pd.to_datetime(df["order_date"].apply(lambda x: str(x)[-4:] + "-" + months_dict[str(x)[:3]] + "-" + str(x)[4:7]))

In [41]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,brand,quantity,item_price,total_price
city/province,order_date,customer_id,customer_id,order_id,product_id,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Banda Aceh/Aceh,2019-04-17,12818,12818,1642480,P1936,BRAND_K,24,450000,10800000
Banda Aceh/Aceh,2019-11-12,12360,12360,1715116,P0758,BRAND_C,8,695000,5560000
Banda Aceh/Aceh,2019-11-12,12360,12360,1715116,P3042,BRAND_R,12,310000,3720000
Banda Aceh/Aceh,2019-12-09,12374,12374,1729036,P1660,BRAND_G,4,2795000,11180000
Bandar Lampung/Lampung,2019-01-15,12515,12515,1619257,P0628,BRAND_C,12,695000,8340000


### Missing Values
Cek data yang hilang dari tiap kolom dan kemudian isi missing value:
<li> di "brand" dengan "no_brand"
<li> cek bagaimana missing value di city & province isi dengan "unknown"

In [20]:
df.isna().sum()

order_id          0
order_date        0
customer_id       0
city           1198
province       1198
brand             5
quantity          0
item_price        0
product_id        0
dtype: int64

In [21]:
df[["city", "province"]] = df[["city", "province"]].fillna("Unknown")

In [22]:
df["brand"] = df["brand"].fillna("no_brand")

In [23]:
df.isna().sum()

order_id       0
order_date     0
customer_id    0
city           0
province       0
brand          0
quantity       0
item_price     0
product_id     0
dtype: int64

### Membuat column city/province dari gabungan city & province

In [24]:
df["city/province"] = df["city"] + "/" + df["province"]

In [25]:
df.head()

Unnamed: 0,order_id,order_date,customer_id,city,province,brand,quantity,item_price,product_id,city/province
0,1730350,2019-12-11,13447,Surakarta,Jawa Tengah,BRAND_F,24,113000,P1374,Surakarta/Jawa Tengah
1,1677490,2019-07-31,0,Unknown,Unknown,BRAND_F,1,1164000,P1370,Unknown/Unknown
2,1704211,2019-10-18,16128,Jakarta Pusat,DKI Jakarta,BRAND_H,12,747000,P1679,Jakarta Pusat/DKI Jakarta
3,1679695,2019-08-07,16225,Yogyakarta,Yogyakarta,BRAND_H,6,590000,P1708,Yogyakarta/Yogyakarta
4,1679080,2019-08-05,0,Unknown,Unknown,BRAND_E,2,740000,P1201,Unknown/Unknown


In [26]:
df.drop(columns=["city", "province"], axis=1, inplace=True)

In [27]:
df.head()

Unnamed: 0,order_id,order_date,customer_id,brand,quantity,item_price,product_id,city/province
0,1730350,2019-12-11,13447,BRAND_F,24,113000,P1374,Surakarta/Jawa Tengah
1,1677490,2019-07-31,0,BRAND_F,1,1164000,P1370,Unknown/Unknown
2,1704211,2019-10-18,16128,BRAND_H,12,747000,P1679,Jakarta Pusat/DKI Jakarta
3,1679695,2019-08-07,16225,BRAND_H,6,590000,P1708,Yogyakarta/Yogyakarta
4,1679080,2019-08-05,0,BRAND_E,2,740000,P1201,Unknown/Unknown


In [28]:
df["city/province"] = df["city/province"].replace(["Unknown/Unknown"], "Unknown")

In [29]:
df.head()

Unnamed: 0,order_id,order_date,customer_id,brand,quantity,item_price,product_id,city/province
0,1730350,2019-12-11,13447,BRAND_F,24,113000,P1374,Surakarta/Jawa Tengah
1,1677490,2019-07-31,0,BRAND_F,1,1164000,P1370,Unknown
2,1704211,2019-10-18,16128,BRAND_H,12,747000,P1679,Jakarta Pusat/DKI Jakarta
3,1679695,2019-08-07,16225,BRAND_H,6,590000,P1708,Yogyakarta/Yogyakarta
4,1679080,2019-08-05,0,BRAND_E,2,740000,P1201,Unknown


### Creating Hierachical Index
membuat index berdasarkan city_provice, order_date, customer_id, order_id, product_id (cek index)

In [30]:
df = df.set_index(["city/province", "order_date", "customer_id", "customer_id", "order_id", "product_id"])

In [31]:
df = df.sort_index()

In [32]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,brand,quantity,item_price
city/province,order_date,customer_id,customer_id,order_id,product_id,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Banda Aceh/Aceh,2019-04-17,12818,12818,1642480,P1936,BRAND_K,24,450000
Banda Aceh/Aceh,2019-11-12,12360,12360,1715116,P0758,BRAND_C,8,695000
Banda Aceh/Aceh,2019-11-12,12360,12360,1715116,P3042,BRAND_R,12,310000
Banda Aceh/Aceh,2019-12-09,12374,12374,1729036,P1660,BRAND_G,4,2795000
Bandar Lampung/Lampung,2019-01-15,12515,12515,1619257,P0628,BRAND_C,12,695000


### Create "total_price" Columns
membuat kolom "total_price" sebagai hasil perkalian quantity dengan item_price

In [33]:
df["total_price"] = df["quantity"] * df["item_price"]

In [34]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,brand,quantity,item_price,total_price
city/province,order_date,customer_id,customer_id,order_id,product_id,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Banda Aceh/Aceh,2019-04-17,12818,12818,1642480,P1936,BRAND_K,24,450000,10800000
Banda Aceh/Aceh,2019-11-12,12360,12360,1715116,P0758,BRAND_C,8,695000,5560000
Banda Aceh/Aceh,2019-11-12,12360,12360,1715116,P3042,BRAND_R,12,310000,3720000
Banda Aceh/Aceh,2019-12-09,12374,12374,1729036,P1660,BRAND_G,4,2795000,11180000
Bandar Lampung/Lampung,2019-01-15,12515,12515,1619257,P0628,BRAND_C,12,695000,8340000


### Slicing Data Jan 2019
slice data hanya untuk Jan 2019

In [42]:
idx = pd.IndexSlice

In [43]:
df_jan2019 = df.loc[idx[:, "2019-01-01":"2019-01-31"], :]


In [44]:
df_jan2019

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,brand,quantity,item_price,total_price
city/province,order_date,customer_id,customer_id,order_id,product_id,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bandar Lampung/Lampung,2019-01-15,12515,12515,1619257,P0628,BRAND_C,12,695000,8340000
Bandung/Jawa Barat,2019-01-09,16134,16134,1617055,P1597,BRAND_G,9,520000,4680000
Bandung/Jawa Barat,2019-01-10,17392,17392,1617952,P2137,BRAND_M,2,1062000,2124000
Bandung/Jawa Barat,2019-01-14,15527,15527,1618828,P3115,BRAND_S,1,1045000,1045000
Bandung/Jawa Barat,2019-01-29,13253,13253,1620289,P0099,BRAND_A,12,450000,5400000
...,...,...,...,...,...,...,...,...,...
Yogyakarta/Yogyakarta,2019-01-14,14062,14062,1618774,P2227,BRAND_M,48,1745000,83760000
Yogyakarta/Yogyakarta,2019-01-14,14298,14298,1618762,P3638,BRAND_S,48,86000,4128000
Yogyakarta/Yogyakarta,2019-01-14,15279,15279,1618495,P1533,BRAND_G,4,1045000,4180000
Yogyakarta/Yogyakarta,2019-01-14,16713,16713,1618738,P0166,BRAND_A,2,1325000,2650000


Finish. Hope you enjoy it! xoxo