# Walmart Classification Preprocessing
1. Data Load
2. Fill NaN
    - DepartmentDescription
    - FinelineNumber
    - Upc
3. DepartmentDescription Encode
4. Weekday Encode
5. Divide Upc
    - Compnay Upc
    - Product Upc
6. Company_Upc Encode
7. FinelineNumber Encode
8. Groupby VisitNumber

In [1]:
import pandas as pd
import numpy as np
import preprocessing_functions as pf
from functools import partial
from tqdm import tqdm
import slack_incomming_webhook as siw
import requests, json

## 1. Data Load

In [2]:
train = pd.read_csv("train.csv")

print(train.shape)
train.head()

(647054, 7)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113150000.0,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538820000.0,1,SHOES,8931.0
2,30,7,Friday,7410811000.0,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238404000.0,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006614000.0,2,PAINT AND ACCESSORIES,1017.0


In [3]:
test = pd.read_csv("test.csv")

print(test.shape)
test.head()

(653646, 6)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503390000.0,1,SHOES,3002.0
1,1,Friday,1707711000.0,1,DAIRY,1526.0
2,1,Friday,89470000000.0,1,DAIRY,1431.0
3,1,Friday,88491210000.0,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015000.0,1,DSD GROCERY,4408.0


## Pre-Processing

## 2. Fill NaN

### Fill in missing DepartmentDescription - train
- VisitNumber에 따른 DepartmentDescription의 최빈값으로 DepartmentDescription의 빈값 채우기
- 유추할 수 없는 191개의 값은 'UNKNOWN' 으로 대체

In [4]:
train.loc[train["VisitNumber"]==259, "DepartmentDescription"]

546        LAWN AND GARDEN
547        LAWN AND GARDEN
548                    NaN
549                    NaN
550        LAWN AND GARDEN
551        LAWN AND GARDEN
552    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

In [5]:
DD_VN_list = train[train["DepartmentDescription"].isna()]["VisitNumber"].unique()

In [6]:
for loc in tqdm(DD_VN_list): # if: 특정 VisitNumber 따른 DepartmentDescription 값이 모두 비어있는 경우 제외
    if len(train[train["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index) != 0:
        train.loc[(train["VisitNumber"] == loc)&(train["DepartmentDescription"].isna()), "DepartmentDescription"] = train[train["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index[0]

100%|██████████| 1172/1172 [00:34<00:00, 33.59it/s]


In [7]:
train.loc[train["VisitNumber"]==259, "DepartmentDescription"]

546        LAWN AND GARDEN
547        LAWN AND GARDEN
548        LAWN AND GARDEN
549        LAWN AND GARDEN
550        LAWN AND GARDEN
551        LAWN AND GARDEN
552    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

### Fill in missing DepartmentDescription - test
- VisitNumber에 따른 DepartmentDescription의 최빈값으로 DepartmentDescription의 빈값 채우기
- 유추할 수 없는 191개의 값은 'UNKNOWN' 으로 대체

In [8]:
test.loc[test["VisitNumber"]==874, "DepartmentDescription"]

2115             AUTOMOTIVE
2116             AUTOMOTIVE
2117                    NaN
2118                    NaN
2119             AUTOMOTIVE
2120    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

In [9]:
DD_VN_list_t = test[test["DepartmentDescription"].isna()]["VisitNumber"].unique()

In [10]:
for loc in tqdm(DD_VN_list_t): # if: 특정 VisitNumber 따른 DepartmentDescription 값이 모두 비어있는 경우 제외
    if len(test[test["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index) != 0:
        test.loc[(test["VisitNumber"] == loc)&(test["DepartmentDescription"].isna()), "DepartmentDescription"] = test[test["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index[0]

100%|██████████| 1141/1141 [00:32<00:00, 34.98it/s]


In [11]:
test.loc[test["VisitNumber"]==874, "DepartmentDescription"]

2115             AUTOMOTIVE
2116             AUTOMOTIVE
2117             AUTOMOTIVE
2118             AUTOMOTIVE
2119             AUTOMOTIVE
2120    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object


### Fill in missing FinelineNumber - train
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 FinelineNumber에는 DepartmentDescription이 'PHARMACY RX'일때의 FinelineNumber의 최빈값으로 채워준다.
- VisitNumber에 따른 FinelineNumber의 최빈값으로 FinelineNumber의 빈값 채우기
- 191개의 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 -9999 값으로 대체

In [12]:
train[train["DepartmentDescription"] == 'PHARMACY RX']["FinelineNumber"].value_counts()

4822.0    84
5615.0    63
1335.0     6
1336.0     1
Name: FinelineNumber, dtype: int64

In [13]:
Pharmacy_idx = train[train["DepartmentDescription"]=='PHARMACY RX'].index
number_idx = np.arange(2922)
idx_box = zip(number_idx, Pharmacy_idx)


for idx, Pha_idx in tqdm(idx_box):
    if idx % 2 == 0:
        train.loc[Pha_idx, "FinelineNumber"] = 4822.0
    else:
        train.loc[Pha_idx, "FinelineNumber"] = 5615.0
        
train[train["DepartmentDescription"] == 'PHARMACY RX'][["DepartmentDescription", "FinelineNumber"]].head()

2922it [00:09, 294.73it/s]


Unnamed: 0,DepartmentDescription,FinelineNumber
1155,PHARMACY RX,4822.0
1216,PHARMACY RX,5615.0
1373,PHARMACY RX,4822.0
1455,PHARMACY RX,5615.0
1456,PHARMACY RX,4822.0


In [14]:
train.loc[train["VisitNumber"]==259, "FinelineNumber"]

546    5141.0
547    1748.0
548       NaN
549       NaN
550    2605.0
551    2605.0
552     337.0
Name: FinelineNumber, dtype: float64

In [15]:
FN_VN_list = train[train["FinelineNumber"].isna()]["VisitNumber"].unique()

In [16]:
for loc in tqdm(FN_VN_list): # if: 특정 VisitNumber 따른 FinelineNumber 값이 모두 비어있는 경우 제외
    if len(train[train["VisitNumber"] == loc]["FinelineNumber"].value_counts().index) != 0:
        train.loc[(train["VisitNumber"] == loc)&(train["FinelineNumber"].isna()), "FinelineNumber"] = train[train["VisitNumber"] == loc]["FinelineNumber"].value_counts().index[0]

100%|██████████| 1172/1172 [00:10<00:00, 111.79it/s]


In [17]:
train.loc[train["VisitNumber"]==259, "FinelineNumber"]

546    5141.0
547    1748.0
548    2605.0
549    2605.0
550    2605.0
551    2605.0
552     337.0
Name: FinelineNumber, dtype: float64

### Fill in missing FinelineNumber - test
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 FinelineNumber에는 DepartmentDescription이 'PHARMACY RX'일때의 FinelineNumber의 최빈값으로 채워준다.
- VisitNumber에 따른 FinelineNumber의 최빈값으로 FinelineNumber의 빈값 채우기
- 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 -9999 값으로 대체

In [18]:
test[test["DepartmentDescription"] == 'PHARMACY RX']["FinelineNumber"].value_counts()

4822.0    79
5615.0    45
1335.0     2
Name: FinelineNumber, dtype: int64

In [19]:
Pharmacy_idx = test[test["DepartmentDescription"]=='PHARMACY RX'].index
number_idx = np.arange(2784)
idx_box = zip(number_idx, Pharmacy_idx)


for idx, Pha_idx in tqdm(idx_box):
    if idx % 2 == 0:
        test.loc[Pha_idx, "FinelineNumber"] = 4822.0
    else:
        test.loc[Pha_idx, "FinelineNumber"] = 5615.0
        
test[test["DepartmentDescription"] == 'PHARMACY RX'][["DepartmentDescription", "FinelineNumber"]].head()

2784it [00:09, 293.69it/s]


Unnamed: 0,DepartmentDescription,FinelineNumber
1188,PHARMACY RX,4822.0
1189,PHARMACY RX,5615.0
1190,PHARMACY RX,4822.0
1314,PHARMACY RX,5615.0
1315,PHARMACY RX,4822.0


In [20]:
test.loc[test["VisitNumber"]==874, ["FinelineNumber","ScanCount"]]

Unnamed: 0,FinelineNumber,ScanCount
2115,250.0,1
2116,9.0,1
2117,,1
2118,,-1
2119,253.0,1
2120,145.0,1


In [21]:
FN_VN_list_t = test[test["FinelineNumber"].isna()]["VisitNumber"].unique()

In [22]:
for loc in tqdm(FN_VN_list_t): # if: 특정 VisitNumber 따른 FinelineNumber 값이 모두 비어있는 경우 제외
    if len(test[test["VisitNumber"] == loc]["FinelineNumber"].value_counts().index) != 0:
        test.loc[(test["VisitNumber"] == loc)&(test["FinelineNumber"].isna()), "FinelineNumber"] = test[test["VisitNumber"] == loc]["FinelineNumber"].value_counts().index[0]

100%|██████████| 1141/1141 [00:10<00:00, 112.91it/s]


In [23]:
test.loc[test["VisitNumber"]==874, "FinelineNumber"]

2115    250.0
2116      9.0
2117    145.0
2118    145.0
2119    253.0
2120    145.0
Name: FinelineNumber, dtype: float64

### Fill in Upc - train
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 Upc에는 DepartmentDescription이 'PHARMACY RX'일때의 Upc의 최빈값으로 채워준다.(**아직 최빈값을 무엇으로 할지 정하지 못함**)
- VisitNumber에 따른 Upc의 최빈값으로 Upc의 빈값 채우기
- 191개의 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 '0000599996' 값으로 대체

In [24]:
train.loc[train["VisitNumber"]==259, "Upc"]

546    7.112176e+09
547    4.656118e+09
548             NaN
549             NaN
550    3.146256e+09
551    3.146253e+09
552    4.650073e+09
Name: Upc, dtype: float64

In [25]:
Upc_VN_list = train[train["Upc"].isna()]["VisitNumber"].unique()

In [26]:
for loc in tqdm(Upc_VN_list): # if: 특정 VisitNumber 따른 Upc 값이 모두 비어있는 경우 제외
    if len(train[train["VisitNumber"] == loc]["Upc"].value_counts().index) != 0:
        train.loc[(train["VisitNumber"] == loc)&(train["Upc"].isna()), "Upc"] = train[train["VisitNumber"] == loc]["Upc"].value_counts().index[0]

100%|██████████| 2754/2754 [00:17<00:00, 153.90it/s]


In [27]:
train.loc[train["VisitNumber"]==259, "Upc"]

546    7.112176e+09
547    4.656118e+09
548    4.656118e+09
549    4.656118e+09
550    3.146256e+09
551    3.146253e+09
552    4.650073e+09
Name: Upc, dtype: float64

### Fill in Upc - test
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 Upc에는 DepartmentDescription이 'PHARMACY RX'일때의 Upc의 최빈값으로 채워준다.(**아직 최빈값을 무엇으로 할지 정하지 못함**)
- VisitNumber에 따른 Upc의 최빈값으로 Upc의 빈값 채우기
- 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 '0000599996' 값으로 대체

In [28]:
test.loc[test["VisitNumber"]==874, "Upc"]

2115    1.284410e+09
2116    8.182000e+10
2117             NaN
2118             NaN
2119    1.284410e+09
2120    3.400001e+09
Name: Upc, dtype: float64

In [29]:
Upc_VN_list_t = test[test["Upc"].isna()]["VisitNumber"].unique()

In [30]:
for loc in tqdm(Upc_VN_list_t): # if: 특정 VisitNumber 따른 Upc 값이 모두 비어있는 경우 제외
    if len(test[test["VisitNumber"] == loc]["Upc"].value_counts().index) != 0:
        test.loc[(test["VisitNumber"] == loc)&(test["Upc"].isna()), "Upc"] = test[test["VisitNumber"] == loc]["Upc"].value_counts().index[0]

100%|██████████| 2706/2706 [00:16<00:00, 165.33it/s]


In [31]:
test.loc[test["VisitNumber"]==874, "Upc"]

2115    1.284410e+09
2116    8.182000e+10
2117    3.400001e+09
2118    3.400001e+09
2119    1.284410e+09
2120    3.400001e+09
Name: Upc, dtype: float64

###  VisitNumber에 따른 모든 DepartmentDescription, FinelineNumber, Upc의 값이 비어있는 경우

- 총 191 개의 유추 불가능한 DepartmentDescription, FinelineNumber, Upc의 값이 모두 비어있는 경우
- 기존에 train, test 데이터에 없는 값으로 각각 "UNKNOWN", -9999, '0000599996' 채운다.

In [32]:
empty_df = train[(train["DepartmentDescription"].isna())&(train["DepartmentDescription"].isna())&(train["DepartmentDescription"].isna())][["VisitNumber", "DepartmentDescription", "FinelineNumber", "Upc", "TripType", "Weekday", "ScanCount"]]

print(empty_df.shape)
empty_df.head()

(191, 7)


Unnamed: 0,VisitNumber,DepartmentDescription,FinelineNumber,Upc,TripType,Weekday,ScanCount
959,409,,,,999,Friday,-1
1134,484,,,,999,Friday,-2
1135,484,,,,999,Friday,-2
6285,2245,,,,999,Friday,-1
8524,3004,,,,999,Friday,1


In [33]:
print("191개의 빈 row들은 모두 triptype이 {}이다.".format(empty_df["TripType"].value_counts().index[0]))
empty_df["TripType"].value_counts()

191개의 빈 row들은 모두 triptype이 999이다.


999    191
Name: TripType, dtype: int64

### train - DepartmentDescription, FinelineNumber, Upc

In [34]:
train.loc[train["DepartmentDescription"].isna(), "DepartmentDescription"] = "UNKNOWN"
train[train["DepartmentDescription"].isna()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [35]:
train.loc[train["FinelineNumber"].isna(), "FinelineNumber"] = -9999.0
train[train["FinelineNumber"].isna()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [36]:
train.loc[train["Upc"].isna(), "Upc"] = 0000599996.0
train[train["Upc"].isna()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


### test - DepartmentDescription, FinelineNumber, Upc

In [37]:
test.loc[test["DepartmentDescription"].isna(), "DepartmentDescription"] = "UNKNOWN"
test[test["DepartmentDescription"].isna()]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [38]:
test.loc[test["FinelineNumber"].isna(), "FinelineNumber"] = -9999.0
test[test["FinelineNumber"].isna()]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [39]:
test.loc[test["Upc"].isna(), "Upc"] = 0000599996.0
test[test["Upc"].isna()]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


### TripType of each VisitNumber Save

In [40]:
train_TripType = train.groupby(by="VisitNumber").mean().reset_index()["TripType"]

In [41]:
train_TripType.head()

0    999.0
1     30.0
2     26.0
3      8.0
4      8.0
Name: TripType, dtype: float64

## 3. DepartmentDescription Encode
- 총 68가지의 종류
- 종류 별로 one-hot encoding
- 'MENSWEAR'와 'MENS WEAR'는 같은 카테고리로 변환

### train

In [42]:
## MENSWEAR - > MENS WEAR
train.loc[train["DepartmentDescription"] == 'MENSWEAR', "DepartmentDescription"] = 'MENS WEAR'

In [43]:
train_desc_ls = train["DepartmentDescription"].unique()
train_desc_ls = list(train_desc_ls)
train_desc_ls.sort()
len(train_desc_ls)

68

In [44]:
number = np.arange(1, 69)
train_desc_dic = zip(train_desc_ls, number)
train_desc_dict = {}
for desc, number in train_desc_dic:
    train_desc_dict[desc] = number

train_desc_dict

{'1-HR PHOTO': 1,
 'ACCESSORIES': 2,
 'AUTOMOTIVE': 3,
 'BAKERY': 4,
 'BATH AND SHOWER': 5,
 'BEAUTY': 6,
 'BEDDING': 7,
 'BOOKS AND MAGAZINES': 8,
 'BOYS WEAR': 9,
 'BRAS & SHAPEWEAR': 10,
 'CAMERAS AND SUPPLIES': 11,
 'CANDY, TOBACCO, COOKIES': 12,
 'CELEBRATION': 13,
 'COMM BREAD': 14,
 'CONCEPT STORES': 15,
 'COOK AND DINE': 16,
 'DAIRY': 17,
 'DSD GROCERY': 18,
 'ELECTRONICS': 19,
 'FABRICS AND CRAFTS': 20,
 'FINANCIAL SERVICES': 21,
 'FROZEN FOODS': 22,
 'FURNITURE': 23,
 'GIRLS WEAR, 4-6X  AND 7-14': 24,
 'GROCERY DRY GOODS': 25,
 'HARDWARE': 26,
 'HEALTH AND BEAUTY AIDS': 27,
 'HOME DECOR': 28,
 'HOME MANAGEMENT': 29,
 'HORTICULTURE AND ACCESS': 30,
 'HOUSEHOLD CHEMICALS/SUPP': 31,
 'HOUSEHOLD PAPER GOODS': 32,
 'IMPULSE MERCHANDISE': 33,
 'INFANT APPAREL': 34,
 'INFANT CONSUMABLE HARDLINES': 35,
 'JEWELRY AND SUNGLASSES': 36,
 'LADIES SOCKS': 37,
 'LADIESWEAR': 38,
 'LARGE HOUSEHOLD GOODS': 39,
 'LAWN AND GARDEN': 40,
 'LIQUOR,WINE,BEER': 41,
 'MEAT - FRESH & FROZEN': 42,
 'ME

In [45]:
train["desc_tag"] = train["DepartmentDescription"]
train["desc_tag"] = train["desc_tag"].apply(partial(pf.desc_tagger, train_desc_dict))

In [46]:
for idx in range(1, 69):
    train["desc_tag_{}".format(idx)] = train["desc_tag"] == idx

### test

In [47]:
## MENSWEAR - > MENS WEAR
test.loc[test["DepartmentDescription"] == 'MENSWEAR', "DepartmentDescription"] = 'MENS WEAR'

In [48]:
test["desc_tag"] = test["DepartmentDescription"]
test["desc_tag"] = test["desc_tag"].apply(partial(pf.desc_tagger, train_desc_dict))

In [49]:
for idx in range(1, 69):
    test["desc_tag_{}".format(idx)] = test["desc_tag"] == idx

## 4. Weekday Encode
- 월화수목금토일(1,2,3,4,5,6,7) one-hot encoding

In [50]:
train.loc[train["Weekday"] == "Monday", "Weekday_num"] = 1
train.loc[train["Weekday"] == "Tuesday", "Weekday_num"] = 2
train.loc[train["Weekday"] == "Wednesday", "Weekday_num"] = 3
train.loc[train["Weekday"] == "Thursday", "Weekday_num"] = 4
train.loc[train["Weekday"] == "Friday", "Weekday_num"] = 5
train.loc[train["Weekday"] == "Saturday", "Weekday_num"] = 6
train.loc[train["Weekday"] == "Sunday", "Weekday_num"] = 7

In [51]:
for idx in range(1, 8):
    train["Weekday_{}".format(idx)] = train["Weekday_num"] == idx

In [52]:
test.loc[test["Weekday"] == "Monday", "Weekday_num"] = 1
test.loc[test["Weekday"] == "Tuesday", "Weekday_num"] = 2
test.loc[test["Weekday"] == "Wednesday", "Weekday_num"] = 3
test.loc[test["Weekday"] == "Thursday", "Weekday_num"] = 4
test.loc[test["Weekday"] == "Friday", "Weekday_num"] = 5
test.loc[test["Weekday"] == "Saturday", "Weekday_num"] = 6
test.loc[test["Weekday"] == "Sunday", "Weekday_num"] = 7

In [53]:
for idx in range(1, 8):
    test["Weekday_{}".format(idx)] = test["Weekday_num"] == idx

## 5. Divde Upc
- 3~12자리의 여러자기 종류의 UPC를 모두 12자리로 복원 후, 필요한 부분은 company_Upc와 product_Upc로 나누어 인코딩

### train

In [54]:
train["Upc"] = train["Upc"].astype(str)

In [55]:
train["full_Upc"] = train["Upc"].apply(pf.upc_789101112_to_10)
train["full_Upc"] = train["full_Upc"].apply(pf.upc_3456_to_10)

train["company_Upc"] = train["full_Upc"].apply(pf.company_part_Upc)
train["product_Upc"] = train["full_Upc"].apply(pf.product_part_Upc) 

train[["Upc", "full_Upc", "company_Upc", "product_Upc"]].tail()

Unnamed: 0,Upc,full_Upc,company_Upc,product_Upc
647049,32390001778.0,2390001778,23900,1778
647050,7874205336.0,7874205336,78742,5336
647051,4072.0,404072,4,4072
647052,4190007664.0,4190007664,41900,7664
647053,3800059655.0,3800059655,38000,59655


### test

In [56]:
test["Upc"] = test["Upc"].astype(str)

In [57]:
test["full_Upc"] = test["Upc"].apply(pf.upc_789101112_to_10)
test["full_Upc"] = test["full_Upc"].apply(pf.upc_3456_to_10)

test["company_Upc"] = test["full_Upc"].apply(pf.company_part_Upc)
test["product_Upc"] = test["full_Upc"].apply(pf.product_part_Upc) 

test[["Upc", "full_Upc", "company_Upc", "product_Upc"]].tail()

Unnamed: 0,Upc,full_Upc,company_Upc,product_Upc
653641,66572105763.0,6572105763,65721,5763
653642,88181390024.0,8181390024,81813,90024
653643,4282557050.0,4282557050,42825,57050
653644,80469193740.0,469193740,4691,93740
653645,7871535983.0,7871535983,78715,35983


## 6. company_Upc Encode

### train

In [58]:
company_upc_dummy_train = pd.get_dummies(train["company_Upc"])

In [59]:
train = pd.concat([train, company_upc_dummy_train], axis=1)

print(train.shape)
train.head()

(647054, 5773)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,...,99804,99829,99870,99919,99923,99928,99939,99967,99988,99991
0,999,5,Friday,68113152929.0,-1,FINANCIAL SERVICES,1000.0,21,False,False,...,0,0,0,0,0,0,0,0,0,0
1,30,7,Friday,60538815980.0,1,SHOES,8931.0,62,False,False,...,0,0,0,0,0,0,0,0,0,0
2,30,7,Friday,7410811099.0,1,PERSONAL CARE,4504.0,50,False,False,...,0,0,0,0,0,0,0,0,0,0
3,26,8,Friday,2238403510.0,2,PAINT AND ACCESSORIES,3565.0,49,False,False,...,0,0,0,0,0,0,0,0,0,0
4,26,8,Friday,2006613744.0,2,PAINT AND ACCESSORIES,1017.0,49,False,False,...,0,0,0,0,0,0,0,0,0,0


### test

In [60]:
company_upc_dummy_test = pd.get_dummies(test["company_Upc"])

In [61]:
test = pd.concat([test, company_upc_dummy_test], axis=1)

print(test.shape)
test.head()

(653646, 5791)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,desc_tag_3,...,99800,99804,99829,99870,99874,99919,99923,99939,99967,99988
0,1,Friday,72503389714.0,1,SHOES,3002.0,62,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1,1,Friday,1707710732.0,1,DAIRY,1526.0,17,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,1,Friday,89470001026.0,1,DAIRY,1431.0,17,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,1,Friday,88491211470.0,1,GROCERY DRY GOODS,3555.0,25,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,2,Friday,2840015224.0,1,DSD GROCERY,4408.0,18,False,False,False,...,0,0,0,0,0,0,0,0,0,0


## 7. FinelineNumber Encode

### train

In [62]:
FlN_dummy_train = pd.get_dummies(train["FinelineNumber"])

In [63]:
train = pd.concat([train, FlN_dummy_train], axis=1)

print(train.shape)
train.head()

(647054, 10967)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,...,9964.0,9966.0,9967.0,9970.0,9971.0,9974.0,9975.0,9991.0,9997.0,9998.0
0,999,5,Friday,68113152929.0,-1,FINANCIAL SERVICES,1000.0,21,False,False,...,0,0,0,0,0,0,0,0,0,0
1,30,7,Friday,60538815980.0,1,SHOES,8931.0,62,False,False,...,0,0,0,0,0,0,0,0,0,0
2,30,7,Friday,7410811099.0,1,PERSONAL CARE,4504.0,50,False,False,...,0,0,0,0,0,0,0,0,0,0
3,26,8,Friday,2238403510.0,2,PAINT AND ACCESSORIES,3565.0,49,False,False,...,0,0,0,0,0,0,0,0,0,0
4,26,8,Friday,2006613744.0,2,PAINT AND ACCESSORIES,1017.0,49,False,False,...,0,0,0,0,0,0,0,0,0,0


### test

In [64]:
FlN_dummy_test = pd.get_dummies(test["FinelineNumber"])

In [65]:
test = pd.concat([test, FlN_dummy_test], axis=1)

print(test.shape)
test.head()

(653646, 10994)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,desc_tag_3,...,9967.0,9969.0,9970.0,9971.0,9974.0,9975.0,9991.0,9997.0,9998.0,9999.0
0,1,Friday,72503389714.0,1,SHOES,3002.0,62,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1,1,Friday,1707710732.0,1,DAIRY,1526.0,17,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,1,Friday,89470001026.0,1,DAIRY,1431.0,17,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,1,Friday,88491211470.0,1,GROCERY DRY GOODS,3555.0,25,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,2,Friday,2840015224.0,1,DSD GROCERY,4408.0,18,False,False,False,...,0,0,0,0,0,0,0,0,0,0


### Top 1000 frequent_value of company_Upc, FinelineNumber, product_Upc

In [66]:
top_company_Upc_list = list(train["company_Upc"].value_counts().index)[:1000]
top_company_Upc_list[:5]

['78742', '00004', '81131', '05388', '37000']

In [67]:
top_FinelineNumber_list = list(train["FinelineNumber"].value_counts().index)[:1000]
top_FinelineNumber_list[:5]

[5501.0, 1508.0, 135.0, 808.0, 0.0]

In [68]:
top_product_Upc_list = list(train["product_Upc"].value_counts().index)[:1000]
top_product_Upc_list[:5]

['00000', '04011', '62097', '99996', '35186']

## Feature Selection

In [74]:
feature_names = ['VisitNumber', 'ScanCount', 'desc_tag_1',
       'desc_tag_2', 'desc_tag_3', 'desc_tag_4', 'desc_tag_5', 'desc_tag_6',
       'desc_tag_7', 'desc_tag_8', 'desc_tag_9', 'desc_tag_10', 'desc_tag_11',
       'desc_tag_12', 'desc_tag_13', 'desc_tag_14', 'desc_tag_15',
       'desc_tag_16', 'desc_tag_17', 'desc_tag_18', 'desc_tag_19',
       'desc_tag_20', 'desc_tag_21', 'desc_tag_22', 'desc_tag_23',
       'desc_tag_24', 'desc_tag_25', 'desc_tag_26', 'desc_tag_27',
       'desc_tag_28', 'desc_tag_29', 'desc_tag_30', 'desc_tag_31',
       'desc_tag_32', 'desc_tag_33', 'desc_tag_34', 'desc_tag_35',
       'desc_tag_36', 'desc_tag_37', 'desc_tag_38', 'desc_tag_39',
       'desc_tag_40', 'desc_tag_41', 'desc_tag_42', 'desc_tag_43',
       'desc_tag_44', 'desc_tag_45', 'desc_tag_46', 'desc_tag_47', 
       'desc_tag_48', 'desc_tag_49', 'desc_tag_50', 'desc_tag_51', 
       'desc_tag_52', 'desc_tag_53',
       'desc_tag_54', 'desc_tag_55', 'desc_tag_56', 'desc_tag_57',
       'desc_tag_58', 'desc_tag_59', 'desc_tag_60', 'desc_tag_61',
       'desc_tag_62', 'desc_tag_63', 'desc_tag_64', 'desc_tag_65',
       'desc_tag_66', 'desc_tag_67', 'desc_tag_68',
       'Weekday_1',   'Weekday_2',   'Weekday_3',   'Weekday_4',
       'Weekday_5',   'Weekday_6',   'Weekday_7']

feature_names = feature_names + top_FinelineNumber_list + top_company_Upc_list

In [75]:
len(feature_names)

2077

In [76]:
in_train = train[feature_names]

In [77]:
in_test = test[feature_names]

## 8. Groupby VisitNumber

### train

In [78]:
train = in_train.groupby(by='VisitNumber').sum().reset_index()

In [79]:
train["TripType"] = train_TripType.astype(int)

In [80]:
print(train.shape)
train.head()

(95674, 2078)


Unnamed: 0,VisitNumber,ScanCount,desc_tag_1,desc_tag_2,desc_tag_3,desc_tag_4,desc_tag_5,desc_tag_6,desc_tag_7,desc_tag_8,...,74027,74200,38891,00650,77501,88783,87512,80225,70277,TripType
0,5,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,999
1,7,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,30
2,8,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,26
3,9,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,8
4,10,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,8


### test

In [81]:
test = in_test.groupby(by='VisitNumber').sum().reset_index()

In [82]:
print(test.shape)
test.head()

(95674, 2077)


Unnamed: 0,VisitNumber,ScanCount,desc_tag_1,desc_tag_2,desc_tag_3,desc_tag_4,desc_tag_5,desc_tag_6,desc_tag_7,desc_tag_8,...,33991,74027,74200,38891,00650,77501,88783,87512,80225,70277
0,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## X_train, y_train

In [83]:
feature_names.remove('VisitNumber')
len(feature_names)

2076

In [84]:
X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(95674, 2076)


Unnamed: 0,ScanCount,desc_tag_1,desc_tag_2,desc_tag_3,desc_tag_4,desc_tag_5,desc_tag_6,desc_tag_7,desc_tag_8,desc_tag_9,...,33991,74027,74200,38891,00650,77501,88783,87512,80225,70277
0,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
label_name = 'TripType'

y_train = train[label_name]

print(y_train.shape)
y_train[:5]

(95674,)


0    999
1     30
2     26
3      8
4      8
Name: TripType, dtype: int64

## X_test

In [86]:
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(95674, 2076)


Unnamed: 0,ScanCount,desc_tag_1,desc_tag_2,desc_tag_3,desc_tag_4,desc_tag_5,desc_tag_6,desc_tag_7,desc_tag_8,desc_tag_9,...,33991,74027,74200,38891,00650,77501,88783,87512,80225,70277
0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0


## model

In [3]:
import xgboost as xgb

model = xgb.XGBClassifier(nthread=-1, tree_method='gpu_hist', seed=8)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=8, silent=True,
       subsample=1, tree_method='gpu_hist')

In [88]:
%time model.fit(X_train, y_train)

CPU times: user 3h 51min 17s, sys: 8.49 s, total: 3h 51min 25s
Wall time: 3h 51min 29s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=-1, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=8, silent=True,
       subsample=1)

In [3]:
# s = datetime.datetime.now() 
msg = "modeling_success."
siw.send_slack(msg)

<Response [200]>


### Prediction

In [92]:
prediction_prob = model.predict_proba(X_test)

print(prediction_prob.shape)
prediction_prob

(95674, 38)


array([[1.9244675e-04, 1.2690612e-04, 7.5309019e-04, ..., 4.6125967e-03,
        5.6023977e-04, 6.5943599e-03],
       [8.9760346e-04, 5.9191114e-04, 3.5125376e-03, ..., 1.6691707e-02,
        2.1450261e-03, 1.2534158e-01],
       [1.8113913e-05, 4.6492391e-06, 7.7981873e-05, ..., 1.8883784e-05,
        1.9959027e-05, 9.9173146e-01],
       ...,
       [1.2278178e-03, 3.1514000e-04, 4.3874430e-03, ..., 9.7832258e-04,
        1.0977978e-03, 4.7401555e-02],
       [8.6032087e-05, 5.6732548e-05, 3.3666400e-04, ..., 6.2108673e-03,
        1.2108069e-02, 2.2760422e-03],
       [1.7653638e-05, 1.1641432e-05, 5.0616032e-05, ..., 1.6885896e-03,
        5.5197853e-04, 1.3914361e-03]], dtype=float32)

### Submission

In [93]:
submission_columns = ['TripType_3', 'TripType_4', 'TripType_5', 'TripType_6',
       'TripType_7', 'TripType_8', 'TripType_9', 'TripType_12', 'TripType_14',
       'TripType_15', 'TripType_18', 'TripType_19', 'TripType_20',
       'TripType_21', 'TripType_22', 'TripType_23', 'TripType_24',
       'TripType_25', 'TripType_26', 'TripType_27', 'TripType_28',
       'TripType_29', 'TripType_30', 'TripType_31', 'TripType_32',
       'TripType_33', 'TripType_34', 'TripType_35', 'TripType_36',
       'TripType_37', 'TripType_38', 'TripType_39', 'TripType_40',
       'TripType_41', 'TripType_42', 'TripType_43', 'TripType_44',
       'TripType_999']

X_submission = pd.DataFrame(prediction_prob, columns=submission_columns)

In [94]:
X_submission["VisitNumber"] = test["VisitNumber"]

In [95]:
submission_columns.insert(0, 'VisitNumber')
submission = X_submission[submission_columns]
submission.head()

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.000192,0.000127,0.000753,0.000481,0.015574,0.010427,0.003792,0.000619,0.000105,...,0.001271,0.001019,0.56267,0.038262,0.000393,0.010829,0.013616,0.004613,0.00056,0.006594
1,2,0.000898,0.000592,0.003513,0.00209,0.041765,0.051501,0.023837,0.005378,0.000491,...,0.006103,0.003995,0.017212,0.142629,0.001665,0.008781,0.030022,0.016692,0.002145,0.125342
2,3,1.8e-05,5e-06,7.8e-05,2.9e-05,0.00038,0.004831,0.001205,1e-05,4e-06,...,0.001073,1.4e-05,3.1e-05,7e-05,1.2e-05,1.8e-05,5.2e-05,1.9e-05,2e-05,0.991731
3,4,0.000963,0.000247,0.004472,0.001546,0.035504,0.269553,0.572006,0.000512,0.000292,...,0.001601,0.000769,0.001645,0.002665,0.000586,0.000948,0.003848,0.000847,0.000901,0.04111
4,6,1.8e-05,5e-06,7.8e-05,2.9e-05,0.000669,0.001622,0.002859,1e-05,4e-06,...,3e-05,1.4e-05,3.1e-05,5e-05,1.1e-05,2.3e-05,7.3e-05,1.4e-05,1.8e-05,0.993471


In [96]:
submission.to_csv("submission_xgboost_2076_columns.csv", index=False)

In [97]:
from sklearn.externals import joblib
joblib.dump(model, 'xgboost_2076_columns_joblib.pkl') 

['xgboost_2076_columns_joblib.pkl']