# Walmart Classification Preprocessing
1. Data Load
2. Fill NaN
    - DepartmentDescription
    - FinelineNumber
    - Upc
3. DepartmentDescription Encode
4. Weekday Encode
5. Divide Upc
    - Compnay Upc
    - Product Upc
6. Company_Upc Encode

In [1]:
import pandas as pd
import numpy as np
import preprocessing_functions as pf
from functools import partial
from tqdm import tqdm

## 1. Data Load

In [2]:
train = pd.read_csv("train.csv")

print(train.shape)
train.head()

(647054, 7)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113150000.0,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538820000.0,1,SHOES,8931.0
2,30,7,Friday,7410811000.0,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238404000.0,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006614000.0,2,PAINT AND ACCESSORIES,1017.0


In [3]:
test = pd.read_csv("test.csv")

print(test.shape)
test.head()

(653646, 6)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503390000.0,1,SHOES,3002.0
1,1,Friday,1707711000.0,1,DAIRY,1526.0
2,1,Friday,89470000000.0,1,DAIRY,1431.0
3,1,Friday,88491210000.0,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015000.0,1,DSD GROCERY,4408.0


## 2. Pre-Processing

### Fill NaN

### Fill in missing DepartmentDescription - train
- VisitNumber에 따른 DepartmentDescription의 최빈값으로 DepartmentDescription의 빈값 채우기
- 유추할 수 없는 191개의 값은 'UNKNOWN' 으로 대체

In [4]:
train.loc[train["VisitNumber"]==259, "DepartmentDescription"]

546        LAWN AND GARDEN
547        LAWN AND GARDEN
548                    NaN
549                    NaN
550        LAWN AND GARDEN
551        LAWN AND GARDEN
552    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

In [5]:
DD_VN_list = train[train["DepartmentDescription"].isna()]["VisitNumber"].unique()

In [6]:
for loc in tqdm(DD_VN_list): # if: 특정 VisitNumber 따른 DepartmentDescription 값이 모두 비어있는 경우 제외
    if len(train[train["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index) != 0:
        train.loc[(train["VisitNumber"] == loc)&(train["DepartmentDescription"].isna()), "DepartmentDescription"] = train[train["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index[0]

100%|██████████| 1172/1172 [00:35<00:00, 32.74it/s]


In [7]:
train.loc[train["VisitNumber"]==259, "DepartmentDescription"]

546        LAWN AND GARDEN
547        LAWN AND GARDEN
548        LAWN AND GARDEN
549        LAWN AND GARDEN
550        LAWN AND GARDEN
551        LAWN AND GARDEN
552    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

### Fill in missing DepartmentDescription - test
- VisitNumber에 따른 DepartmentDescription의 최빈값으로 DepartmentDescription의 빈값 채우기
- 유추할 수 없는 191개의 값은 'UNKNOWN' 으로 대체

In [8]:
test.loc[test["VisitNumber"]==874, "DepartmentDescription"]

2115             AUTOMOTIVE
2116             AUTOMOTIVE
2117                    NaN
2118                    NaN
2119             AUTOMOTIVE
2120    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

In [9]:
DD_VN_list_t = test[test["DepartmentDescription"].isna()]["VisitNumber"].unique()

In [10]:
for loc in tqdm(DD_VN_list_t): # if: 특정 VisitNumber 따른 DepartmentDescription 값이 모두 비어있는 경우 제외
    if len(test[test["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index) != 0:
        test.loc[(test["VisitNumber"] == loc)&(test["DepartmentDescription"].isna()), "DepartmentDescription"] = test[test["VisitNumber"] == loc]["DepartmentDescription"].value_counts().index[0]

100%|██████████| 1141/1141 [00:36<00:00, 31.27it/s]


In [11]:
test.loc[test["VisitNumber"]==874, "DepartmentDescription"]

2115             AUTOMOTIVE
2116             AUTOMOTIVE
2117             AUTOMOTIVE
2118             AUTOMOTIVE
2119             AUTOMOTIVE
2120    IMPULSE MERCHANDISE
Name: DepartmentDescription, dtype: object

### Fill in missing FinelineNumber - train
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 FinelineNumber에는 DepartmentDescription이 'PHARMACY RX'일때의 FinelineNumber의 최빈값으로 채워준다.
- VisitNumber에 따른 FinelineNumber의 최빈값으로 FinelineNumber의 빈값 채우기
- 191개의 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 -9999 값으로 대체

In [12]:
train[train["DepartmentDescription"] == 'PHARMACY RX']["FinelineNumber"].value_counts()

4822.0    84
5615.0    63
1335.0     6
1336.0     1
Name: FinelineNumber, dtype: int64

In [13]:
Pharmacy_idx = train[train["DepartmentDescription"]=='PHARMACY RX'].index
number_idx = np.arange(2922)
idx_box = zip(number_idx, Pharmacy_idx)


for idx, Pha_idx in tqdm(idx_box):
    if idx % 2 == 0:
        train.loc[Pha_idx, "FinelineNumber"] = 4822.0
    else:
        train.loc[Pha_idx, "FinelineNumber"] = 5615.0
        
train[train["DepartmentDescription"] == 'PHARMACY RX'][["DepartmentDescription", "FinelineNumber"]].head()

2922it [00:09, 292.49it/s]


Unnamed: 0,DepartmentDescription,FinelineNumber
1155,PHARMACY RX,4822.0
1216,PHARMACY RX,5615.0
1373,PHARMACY RX,4822.0
1455,PHARMACY RX,5615.0
1456,PHARMACY RX,4822.0


In [14]:
train.loc[train["VisitNumber"]==259, "FinelineNumber"]

546    5141.0
547    1748.0
548       NaN
549       NaN
550    2605.0
551    2605.0
552     337.0
Name: FinelineNumber, dtype: float64

In [15]:
FN_VN_list = train[train["FinelineNumber"].isna()]["VisitNumber"].unique()

In [16]:
for loc in tqdm(FN_VN_list): # if: 특정 VisitNumber 따른 FinelineNumber 값이 모두 비어있는 경우 제외
    if len(train[train["VisitNumber"] == loc]["FinelineNumber"].value_counts().index) != 0:
        train.loc[(train["VisitNumber"] == loc)&(train["FinelineNumber"].isna()), "FinelineNumber"] = train[train["VisitNumber"] == loc]["FinelineNumber"].value_counts().index[0]

100%|██████████| 1172/1172 [00:10<00:00, 106.93it/s]


In [17]:
train.loc[train["VisitNumber"]==259, "FinelineNumber"]

546    5141.0
547    1748.0
548    2605.0
549    2605.0
550    2605.0
551    2605.0
552     337.0
Name: FinelineNumber, dtype: float64

### Fill in missing FinelineNumber - test
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 FinelineNumber에는 DepartmentDescription이 'PHARMACY RX'일때의 FinelineNumber의 최빈값으로 채워준다.
- VisitNumber에 따른 FinelineNumber의 최빈값으로 FinelineNumber의 빈값 채우기
- 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 -9999 값으로 대체

In [18]:
test[test["DepartmentDescription"] == 'PHARMACY RX']["FinelineNumber"].value_counts()

4822.0    79
5615.0    45
1335.0     2
Name: FinelineNumber, dtype: int64

In [19]:
Pharmacy_idx = test[test["DepartmentDescription"]=='PHARMACY RX'].index
number_idx = np.arange(2784)
idx_box = zip(number_idx, Pharmacy_idx)


for idx, Pha_idx in tqdm(idx_box):
    if idx % 2 == 0:
        test.loc[Pha_idx, "FinelineNumber"] = 4822.0
    else:
        test.loc[Pha_idx, "FinelineNumber"] = 5615.0
        
test[test["DepartmentDescription"] == 'PHARMACY RX'][["DepartmentDescription", "FinelineNumber"]].head()

2784it [00:10, 271.41it/s]


Unnamed: 0,DepartmentDescription,FinelineNumber
1188,PHARMACY RX,4822.0
1189,PHARMACY RX,5615.0
1190,PHARMACY RX,4822.0
1314,PHARMACY RX,5615.0
1315,PHARMACY RX,4822.0


In [20]:
test.loc[test["VisitNumber"]==874, ["FinelineNumber","ScanCount"]]

Unnamed: 0,FinelineNumber,ScanCount
2115,250.0,1
2116,9.0,1
2117,,1
2118,,-1
2119,253.0,1
2120,145.0,1


In [21]:
FN_VN_list_t = test[test["FinelineNumber"].isna()]["VisitNumber"].unique()

In [22]:
for loc in tqdm(FN_VN_list_t): # if: 특정 VisitNumber 따른 FinelineNumber 값이 모두 비어있는 경우 제외
    if len(test[test["VisitNumber"] == loc]["FinelineNumber"].value_counts().index) != 0:
        test.loc[(test["VisitNumber"] == loc)&(test["FinelineNumber"].isna()), "FinelineNumber"] = test[test["VisitNumber"] == loc]["FinelineNumber"].value_counts().index[0]

100%|██████████| 1141/1141 [00:10<00:00, 106.57it/s]


In [23]:
test.loc[test["VisitNumber"]==874, "FinelineNumber"]

2115    250.0
2116      9.0
2117    145.0
2118    145.0
2119    253.0
2120    145.0
Name: FinelineNumber, dtype: float64

### Fill in Upc - train
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 Upc에는 DepartmentDescription이 'PHARMACY RX'일때의 Upc의 최빈값으로 채워준다.(**아직 최빈값을 무엇으로 할지 정하지 못함**)
- VisitNumber에 따른 Upc의 최빈값으로 Upc의 빈값 채우기
- 191개의 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 '0000599996' 값으로 대체

In [24]:
train.loc[train["VisitNumber"]==259, "Upc"]

546    7.112176e+09
547    4.656118e+09
548             NaN
549             NaN
550    3.146256e+09
551    3.146253e+09
552    4.650073e+09
Name: Upc, dtype: float64

In [25]:
Upc_VN_list = train[train["Upc"].isna()]["VisitNumber"].unique()

In [26]:
for loc in tqdm(Upc_VN_list): # if: 특정 VisitNumber 따른 Upc 값이 모두 비어있는 경우 제외
    if len(train[train["VisitNumber"] == loc]["Upc"].value_counts().index) != 0:
        train.loc[(train["VisitNumber"] == loc)&(train["Upc"].isna()), "Upc"] = train[train["VisitNumber"] == loc]["Upc"].value_counts().index[0]

100%|██████████| 2754/2754 [00:16<00:00, 164.09it/s]


In [27]:
train.loc[train["VisitNumber"]==259, "Upc"]

546    7.112176e+09
547    4.656118e+09
548    4.656118e+09
549    4.656118e+09
550    3.146256e+09
551    3.146253e+09
552    4.650073e+09
Name: Upc, dtype: float64

### Fill in Upc - test
- DepartmentDescription이 'PHARMACY RX'인 빈값이 들어있는 Upc에는 DepartmentDescription이 'PHARMACY RX'일때의 Upc의 최빈값으로 채워준다.(**아직 최빈값을 무엇으로 할지 정하지 못함**)
- VisitNumber에 따른 Upc의 최빈값으로 Upc의 빈값 채우기
- 유추할 수 없는 값은 기존에 있던 값과 중복되지 않는 '0000599996' 값으로 대체

In [28]:
test.loc[test["VisitNumber"]==874, "Upc"]

2115    1.284410e+09
2116    8.182000e+10
2117             NaN
2118             NaN
2119    1.284410e+09
2120    3.400001e+09
Name: Upc, dtype: float64

In [29]:
Upc_VN_list_t = test[test["Upc"].isna()]["VisitNumber"].unique()

In [30]:
for loc in tqdm(Upc_VN_list_t): # if: 특정 VisitNumber 따른 Upc 값이 모두 비어있는 경우 제외
    if len(test[test["VisitNumber"] == loc]["Upc"].value_counts().index) != 0:
        test.loc[(test["VisitNumber"] == loc)&(test["Upc"].isna()), "Upc"] = test[test["VisitNumber"] == loc]["Upc"].value_counts().index[0]

100%|██████████| 2706/2706 [00:16<00:00, 167.94it/s]


In [31]:
test.loc[test["VisitNumber"]==874, "Upc"]

2115    1.284410e+09
2116    8.182000e+10
2117    3.400001e+09
2118    3.400001e+09
2119    1.284410e+09
2120    3.400001e+09
Name: Upc, dtype: float64

###  VisitNumber에 따른 모든 DepartmentDescription, FinelineNumber, Upc의 값이 비어있는 경우

- 총 191 개의 유추 불가능한 DepartmentDescription, FinelineNumber, Upc의 값이 모두 비어있는 경우
- 기존에 train, test 데이터에 없는 값으로 각각 "UNKNOWN", -9999, '0000599996' 채운다.

In [32]:
empty_df = train[(train["DepartmentDescription"].isna())&(train["DepartmentDescription"].isna())&(train["DepartmentDescription"].isna())][["VisitNumber", "DepartmentDescription", "FinelineNumber", "Upc", "TripType", "Weekday", "ScanCount"]]

print(empty_df.shape)
empty_df.head()

(191, 7)


Unnamed: 0,VisitNumber,DepartmentDescription,FinelineNumber,Upc,TripType,Weekday,ScanCount
959,409,,,,999,Friday,-1
1134,484,,,,999,Friday,-2
1135,484,,,,999,Friday,-2
6285,2245,,,,999,Friday,-1
8524,3004,,,,999,Friday,1


In [33]:
print("191개의 빈 row들은 모두 triptype이 {}이다.".format(empty_df["TripType"].value_counts().index[0]))
empty_df["TripType"].value_counts()

191개의 빈 row들은 모두 triptype이 999이다.


999    191
Name: TripType, dtype: int64

### train - DepartmentDescription, FinelineNumber, Upc

In [34]:
train.loc[train["DepartmentDescription"].isna(), "DepartmentDescription"] = "UNKNOWN"
train[train["DepartmentDescription"].isna()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [35]:
train.loc[train["FinelineNumber"].isna(), "FinelineNumber"] = -9999.0
train[train["FinelineNumber"].isna()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [36]:
train.loc[train["Upc"].isna(), "Upc"] = 0000599996.0
train[train["Upc"].isna()]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


### test - DepartmentDescription, FinelineNumber, Upc

In [37]:
test.loc[test["DepartmentDescription"].isna(), "DepartmentDescription"] = "UNKNOWN"
test[test["DepartmentDescription"].isna()]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [38]:
test.loc[test["FinelineNumber"].isna(), "FinelineNumber"] = -9999.0
test[test["FinelineNumber"].isna()]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


In [39]:
test.loc[test["Upc"].isna(), "Upc"] = 0000599996.0
test[test["Upc"].isna()]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


### 3. DepartmentDescription Encode
- 총 68가지의 종류
- 종류 별로 one-hot encoding
- 'MENSWEAR'와 'MENS WEAR'는 같은 카테고리로 변환

### train

In [40]:
## MENSWEAR - > MENS WEAR
train.loc[train["DepartmentDescription"] == 'MENSWEAR', "DepartmentDescription"] = 'MENS WEAR'

In [41]:
train_desc_ls = train["DepartmentDescription"].unique()
train_desc_ls = list(train_desc_ls)
train_desc_ls.sort()
len(train_desc_ls)

68

In [42]:
number = np.arange(1, 69)
train_desc_dic = zip(train_desc_ls, number)
train_desc_dict = {}
for desc, number in train_desc_dic:
    train_desc_dict[desc] = number

train_desc_dict

{'1-HR PHOTO': 1,
 'ACCESSORIES': 2,
 'AUTOMOTIVE': 3,
 'BAKERY': 4,
 'BATH AND SHOWER': 5,
 'BEAUTY': 6,
 'BEDDING': 7,
 'BOOKS AND MAGAZINES': 8,
 'BOYS WEAR': 9,
 'BRAS & SHAPEWEAR': 10,
 'CAMERAS AND SUPPLIES': 11,
 'CANDY, TOBACCO, COOKIES': 12,
 'CELEBRATION': 13,
 'COMM BREAD': 14,
 'CONCEPT STORES': 15,
 'COOK AND DINE': 16,
 'DAIRY': 17,
 'DSD GROCERY': 18,
 'ELECTRONICS': 19,
 'FABRICS AND CRAFTS': 20,
 'FINANCIAL SERVICES': 21,
 'FROZEN FOODS': 22,
 'FURNITURE': 23,
 'GIRLS WEAR, 4-6X  AND 7-14': 24,
 'GROCERY DRY GOODS': 25,
 'HARDWARE': 26,
 'HEALTH AND BEAUTY AIDS': 27,
 'HOME DECOR': 28,
 'HOME MANAGEMENT': 29,
 'HORTICULTURE AND ACCESS': 30,
 'HOUSEHOLD CHEMICALS/SUPP': 31,
 'HOUSEHOLD PAPER GOODS': 32,
 'IMPULSE MERCHANDISE': 33,
 'INFANT APPAREL': 34,
 'INFANT CONSUMABLE HARDLINES': 35,
 'JEWELRY AND SUNGLASSES': 36,
 'LADIES SOCKS': 37,
 'LADIESWEAR': 38,
 'LARGE HOUSEHOLD GOODS': 39,
 'LAWN AND GARDEN': 40,
 'LIQUOR,WINE,BEER': 41,
 'MEAT - FRESH & FROZEN': 42,
 'ME

In [43]:
train["desc_tag"] = train["DepartmentDescription"]
train["desc_tag"] = train["desc_tag"].apply(partial(pf.desc_tagger, train_desc_dict))

In [44]:
for idx in range(1, 69):
    train["desc_tag_{}".format(idx)] = train["desc_tag"] == idx

### test

In [45]:
## MENSWEAR - > MENS WEAR
test.loc[test["DepartmentDescription"] == 'MENSWEAR', "DepartmentDescription"] = 'MENS WEAR'

In [46]:
test["desc_tag"] = test["DepartmentDescription"]
test["desc_tag"] = test["desc_tag"].apply(partial(pf.desc_tagger, train_desc_dict))

In [47]:
for idx in range(1, 69):
    test["desc_tag_{}".format(idx)] = test["desc_tag"] == idx

### 4. Weekday Encode
- 월화수목금토일(1,2,3,4,5,6,7) one-hot encoding

In [48]:
train.loc[train["Weekday"] == "Monday", "Weekday_num"] = 1
train.loc[train["Weekday"] == "Tuesday", "Weekday_num"] = 2
train.loc[train["Weekday"] == "Wednesday", "Weekday_num"] = 3
train.loc[train["Weekday"] == "Thursday", "Weekday_num"] = 4
train.loc[train["Weekday"] == "Friday", "Weekday_num"] = 5
train.loc[train["Weekday"] == "Saturday", "Weekday_num"] = 6
train.loc[train["Weekday"] == "Sunday", "Weekday_num"] = 7

In [49]:
for idx in range(1, 8):
    train["Weekday_{}".format(idx)] = train["Weekday_num"] == idx

In [50]:
test.loc[test["Weekday"] == "Monday", "Weekday_num"] = 1
test.loc[test["Weekday"] == "Tuesday", "Weekday_num"] = 2
test.loc[test["Weekday"] == "Wednesday", "Weekday_num"] = 3
test.loc[test["Weekday"] == "Thursday", "Weekday_num"] = 4
test.loc[test["Weekday"] == "Friday", "Weekday_num"] = 5
test.loc[test["Weekday"] == "Saturday", "Weekday_num"] = 6
test.loc[test["Weekday"] == "Sunday", "Weekday_num"] = 7

In [51]:
for idx in range(1, 8):
    test["Weekday_{}".format(idx)] = test["Weekday_num"] == idx

### 5. Divde Upc
- 3~12자리의 여러자기 종류의 UPC를 모두 12자리로 복원 후, 필요한 부분은 company_Upc와 product_Upc로 나누어 인코딩

### train

In [52]:
train["Upc"] = train["Upc"].astype(str)

In [53]:
train["full_Upc"] = train["Upc"].apply(pf.upc_789101112_to_10)
train["full_Upc"] = train["full_Upc"].apply(pf.upc_3456_to_10)

train["company_Upc"] = train["full_Upc"].apply(pf.company_part_Upc)
train["product_Upc"] = train["full_Upc"].apply(pf.product_part_Upc) 

train[["Upc", "full_Upc", "company_Upc", "product_Upc"]].tail()

Unnamed: 0,Upc,full_Upc,company_Upc,product_Upc
647049,32390001778.0,2390001778,23900,1778
647050,7874205336.0,7874205336,78742,5336
647051,4072.0,404072,4,4072
647052,4190007664.0,4190007664,41900,7664
647053,3800059655.0,3800059655,38000,59655


### test

In [54]:
test["Upc"] = test["Upc"].astype(str)

In [55]:
test["full_Upc"] = test["Upc"].apply(pf.upc_789101112_to_10)
test["full_Upc"] = test["full_Upc"].apply(pf.upc_3456_to_10)

test["company_Upc"] = test["full_Upc"].apply(pf.company_part_Upc)
test["product_Upc"] = test["full_Upc"].apply(pf.product_part_Upc) 

test[["Upc", "full_Upc", "company_Upc", "product_Upc"]].tail()

Unnamed: 0,Upc,full_Upc,company_Upc,product_Upc
653641,66572105763.0,6572105763,65721,5763
653642,88181390024.0,8181390024,81813,90024
653643,4282557050.0,4282557050,42825,57050
653644,80469193740.0,469193740,4691,93740
653645,7871535983.0,7871535983,78715,35983


### 6. company_Upc Encode

### train

In [56]:
company_upc_dummy_train = pd.get_dummies(train["company_Upc"])

In [57]:
train = pd.concat([train, company_upc_dummy_train], axis=1)

print(train.shape)
train.head()

(647054, 5773)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,...,99804,99829,99870,99919,99923,99928,99939,99967,99988,99991
0,999,5,Friday,68113152929.0,-1,FINANCIAL SERVICES,1000.0,21,False,False,...,0,0,0,0,0,0,0,0,0,0
1,30,7,Friday,60538815980.0,1,SHOES,8931.0,62,False,False,...,0,0,0,0,0,0,0,0,0,0
2,30,7,Friday,7410811099.0,1,PERSONAL CARE,4504.0,50,False,False,...,0,0,0,0,0,0,0,0,0,0
3,26,8,Friday,2238403510.0,2,PAINT AND ACCESSORIES,3565.0,49,False,False,...,0,0,0,0,0,0,0,0,0,0
4,26,8,Friday,2006613744.0,2,PAINT AND ACCESSORIES,1017.0,49,False,False,...,0,0,0,0,0,0,0,0,0,0


### test

In [58]:
company_upc_dummy_test = pd.get_dummies(test["company_Upc"])

In [59]:
test = pd.concat([test, company_upc_dummy_test], axis=1)

print(test.shape)
test.head()

(653646, 5791)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,desc_tag_3,...,99800,99804,99829,99870,99874,99919,99923,99939,99967,99988
0,1,Friday,72503389714.0,1,SHOES,3002.0,62,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1,1,Friday,1707710732.0,1,DAIRY,1526.0,17,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,1,Friday,89470001026.0,1,DAIRY,1431.0,17,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,1,Friday,88491211470.0,1,GROCERY DRY GOODS,3555.0,25,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,2,Friday,2840015224.0,1,DSD GROCERY,4408.0,18,False,False,False,...,0,0,0,0,0,0,0,0,0,0


### Featrue Selection
- train data에는 있으나 test data에는 없는 382개의 company_Upc는 제외
- Encode된 object feature 제외

In [60]:
feature_columns = list(train.columns)

In [61]:
remove_list = ["Weekday", "Upc", "DepartmentDescription", "company_Upc", "product_Upc", "TripType", "VisitNumber",\
               '00337', '00373', '00680', '01593', '01769', '02213', '02369', '02443', '02651',\
               '03001', '05144', '05168', '05244', '05260', '05265', '05502', '05584', '05609',\
               '05770', '05778', '05801', '05838', '05848', '05849', '06147', '06162', '06163',\
               '06176', '06248', '06301', '06931', '06960', '08311', '08323', '08709', '08811',\
               '08831', '08855', '08874', '08931', '08957', '09048', '09332', '09716', '09752',\
               '09901', '09991', '10136', '10273', '10687', '10754', '10864', '10879', '11215',\
               '11233', '11283', '11865', '12103', '12260', '12317', '12429', '12715', '13258',\
               '13658', '13958', '14949', '15095', '15222', '15225', '15583', '15849', '16125',\
               '16162', '16169', '16698', '16737', '16796', '17123', '17411', '17478', '18176',\
               '18268', '18575', '18655', '18664', '18771', '18929', '19231', '19275', '19357',\
               '19548', '19565', '19903', '20213', '20578', '20616', '20642', '21105', '21382',\
               '22053', '22174', '23332', '23634', '23922', '24021', '24416', '25004', '25436',\
               '25675', '26362', '27187', '27557', '27782', '27800', '27969', '28500', '28586',\
               '28610', '28765', '28766', '28821', '28839', '29054', '29094', '29133', '29169',\
               '29189', '29225', '29409', '29839', '29988', '30175', '30206', '30306', '30775',\
               '30828', '30918', '31299', '31901', '32009', '32051', '32228', '32244', '32386',\
               '33005', '33051', '33158', '33197', '33799', '34098', '34357', '34556', '34573',\
               '34575', '34656', '34886', '35787', '36332', '37049', '37461', '38462', '38597',\
               '39214', '40410', '40595', '41278', '41343', '41404', '41580', '41636', '41642',\
               '41740', '42281', '42283', '42322', '42693', '42699', '42853', '44306', '45095',\
               '45334', '45849', '45986', '46226', '46447', '47163', '47194', '47282', '47298',\
               '47356', '47700', '47832', '48081', '48485', '48587', '48985', '49004', '49183',\
               '49207', '49568', '49833', '50008', '50015', '50086', '50146', '50214', '50254',\
               '50264', '50274', '50385', '50393', '50394', '50411', '50413', '50423', '50561',\
               '50592', '50624', '50778', '50946', '51057', '51218', '51299', '51319', '51515',\
               '51664', '51667', '51884', '52276', '52435', '52530', '52865', '53276', '53427',\
               '53575', '53740', '53770', '53838', '54392', '54918', '55114', '55142', '55230',\
               '55280', '55431', '55583', '55601', '55759', '55824', '56551', '56879', '57184',\
               '57294', '57798', '57867', '58099', '58297', '58328', '58445', '58622', '58648',\
               '58859', '58962', '59230', '59410', '59751', '61308', '61524', '62825', '63300',\
               '67208', '69044', '69328', '69381', '69540', '70000', '70275', '70313', '70844',\
               '71132', '71391', '71436', '71653', '71757', '72457', '72508', '72824', '73077',\
               '73226', '74011', '74213', '74928', '75307', '75376', '75380', '75386', '76670',\
               '76762', '77245', '77548', '77726', '78372', '78905', '78931', '79085', '79631',\
               '79633', '79942', '79943', '79945', '80153', '80292', '80518', '80993', '81034',\
               '81241', '81306', '81483', '82306', '82801', '82904', '83318', '84530', '85267',\
               '85810', '86069', '86106', '86831', '86867', '87012', '87038', '87064', '87118',\
               '88247', '88303', '88908', '88985', '89013', '89033', '89156', '89176', '89419',\
               '89423', '89518', '89601', '90064', '90288', '90333', '90342', '90577', '90586',\
               '90615', '90669', '90794', '90930', '91001', '91051', '91546', '91549', '91946',\
               '92997', '93164', '93169', '93222', '93765', '95359', '95684', '96287', '97629',\
               '97712', '97812', '97818', '97833', '97923', '98328', '99157', '99274', '99464',\
               '99606', '99675', '99928', '99991']

for rm_value in remove_list:
    feature_columns.remove(rm_value)

In [62]:
len(feature_columns)

5384

##### Encoding, Divide 된 주요 컬럼

## X_train, y_train

In [63]:
X_train = train[feature_columns]

print(X_train.shape)
X_train.head()

(647054, 5384)


Unnamed: 0,ScanCount,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,desc_tag_3,desc_tag_4,desc_tag_5,desc_tag_6,desc_tag_7,...,99792,99800,99804,99829,99870,99919,99923,99939,99967,99988
0,-1,1000.0,21,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1,1,8931.0,62,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,1,4504.0,50,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,2,3565.0,49,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,2,1017.0,49,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0


In [64]:
label_name = 'TripType'

y_train = train[label_name]

print(y_train.shape)
y_train[:5]

(647054,)


0    999
1     30
2     30
3     26
4     26
Name: TripType, dtype: int64

## X_test

In [65]:
X_test = test[feature_columns]

print(X_test.shape)
X_test.head()

(653646, 5384)


Unnamed: 0,ScanCount,FinelineNumber,desc_tag,desc_tag_1,desc_tag_2,desc_tag_3,desc_tag_4,desc_tag_5,desc_tag_6,desc_tag_7,...,99792,99800,99804,99829,99870,99919,99923,99939,99967,99988
0,1,3002.0,62,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
1,1,1526.0,17,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
2,1,1431.0,17,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,1,3555.0,25,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,1,4408.0,18,False,False,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,0
