# Walmart Classification Preprocessing
1. Data Load
2. Fill NaN
    - DepartmentDescription
    - FinelineNumber
    - Upc
3. DepartmentDescription Encode
4. Weekday Encode
5. Divide Upc
    - Company Upc
    - Product Upc

In [1]:
import pandas as pd
import numpy as np
import preprocessing_functions as pf
from functools import partial

## 1. Data Load

In [2]:
# Read the training data and preview its dimensions and first rows.
train_csv = "train.csv"
train = pd.read_csv(train_csv)

print(train.shape)
train.head(5)

(647054, 7)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113150000.0,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538820000.0,1,SHOES,8931.0
2,30,7,Friday,7410811000.0,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238404000.0,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006614000.0,2,PAINT AND ACCESSORIES,1017.0


In [3]:
# Read the test data and preview its dimensions and first rows.
test_csv = "test.csv"
test = pd.read_csv(test_csv)

print(test.shape)
test.head(5)

(653646, 6)


Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503390000.0,1,SHOES,3002.0
1,1,Friday,1707711000.0,1,DAIRY,1526.0
2,1,Friday,89470000000.0,1,DAIRY,1431.0
3,1,Friday,88491210000.0,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015000.0,1,DSD GROCERY,4408.0


## 2. Pre-Processing

### Fill NaN
- 아직 모든 NaN값에 대해 채울 값을 정하지 못하였기 때문에, 정해진 값을 제외하고 함수를 사용하기 위해서 모두 최빈값을 넣어줌

#### Fill in missing DepartmentDescription

In [4]:
# Show the single most frequent DepartmentDescription together with its count.
train["DepartmentDescription"].value_counts().iloc[:1]

GROCERY DRY GOODS    70402
Name: DepartmentDescription, dtype: int64

In [5]:
# Replace missing departments with "GROCERY DRY GOODS", then list any rows
# that are still missing (an empty frame means the fill is complete).
train["DepartmentDescription"] = train["DepartmentDescription"].fillna("GROCERY DRY GOODS")
still_missing_dept = train["DepartmentDescription"].isna()
train[still_missing_dept]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


#### Fill in missing FinelineNumber
- DepartmentDescription이 'PHARMACY RX'인 행 중 FinelineNumber가 빈값인 경우, 'PHARMACY RX'의 FinelineNumber 최빈값 상위 두 개(4822, 5615)를 번갈아 채워준다.

In [6]:
# Frequency table of FinelineNumber restricted to PHARMACY RX rows.
is_pharmacy = train["DepartmentDescription"] == 'PHARMACY RX'
train.loc[is_pharmacy, "FinelineNumber"].value_counts()

4822.0    84
5615.0    63
1335.0     6
1336.0     1
Name: FinelineNumber, dtype: int64

In [7]:
# Fill the *missing* FinelineNumber values of PHARMACY RX rows, alternating
# between 4822.0 and 5615.0 (the two most frequent pharmacy values).
# Rows that already carry a FinelineNumber are left untouched, and the number
# of rows to fill is derived from the data instead of being hard-coded.
pharmacy_rows = train["DepartmentDescription"] == 'PHARMACY RX'
Pharmacy_idx = train[pharmacy_rows].index  # index labels of all PHARMACY RX rows
pharmacy_missing = pharmacy_rows & train["FinelineNumber"].isna()

# Even positions (0, 2, 4, ...) receive 4822.0, odd positions receive 5615.0.
fill_positions = np.arange(pharmacy_missing.sum())
alternating_fill = np.where(fill_positions % 2 == 0, 4822.0, 5615.0)
train.loc[pharmacy_missing, "FinelineNumber"] = alternating_fill

train.loc[pharmacy_rows, ["DepartmentDescription", "FinelineNumber"]].head()

Unnamed: 0,DepartmentDescription,FinelineNumber
1155,PHARMACY RX,4822.0
1216,PHARMACY RX,5615.0
1373,PHARMACY RX,4822.0
1455,PHARMACY RX,5615.0
1456,PHARMACY RX,4822.0


In [8]:
# Show the single most frequent FinelineNumber together with its count.
train["FinelineNumber"].value_counts().iloc[:1]

5501.0    8244
Name: FinelineNumber, dtype: int64

In [9]:
# Replace the remaining missing FinelineNumber values with 5501.0, then list
# any rows that are still missing (an empty frame means the fill is complete).
train["FinelineNumber"] = train["FinelineNumber"].fillna(5501.0)
still_missing_fineline = train["FinelineNumber"].isna()
train[still_missing_fineline]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


#### Fill in Upc
- DepartmentDescription이 'PHARMACY RX'인 행의 빈 Upc는 'PHARMACY RX'일 때의 Upc 최빈값으로 채울 예정이다. (**아직 최빈값을 무엇으로 할지 정하지 못해, 현재는 전체 Upc의 최빈값 4011.0으로 임시로 채움**)

In [10]:
# Show the single most frequent Upc together with its count.
train["Upc"].value_counts().iloc[:1]

4011.0    7657
Name: Upc, dtype: int64

In [11]:
# Replace missing Upc values with 4011.0, then list any rows that are still
# missing (an empty frame means the fill is complete).
train["Upc"] = train["Upc"].fillna(4011.0)
still_missing_upc = train["Upc"].isna()
train[still_missing_upc]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber


### 3. DepartmentDescription Encode
- 총 68가지의 종류
- 종류 별로 one-hot encoding

In [12]:
# Alphabetically sorted list of the distinct department names, and how many there are.
train_desc_ls = sorted(train["DepartmentDescription"].unique())
len(train_desc_ls)

68

In [13]:
# Map every department name to a 1-based integer tag, following the order of
# train_desc_ls. enumerate() sizes the tag range from the list itself, so no
# department is dropped if the number of categories changes, and no loop
# variable rebinds a helper array.
train_desc_dict = {desc: tag for tag, desc in enumerate(train_desc_ls, start=1)}

train_desc_dict

{'1-HR PHOTO': 1,
 'ACCESSORIES': 2,
 'AUTOMOTIVE': 3,
 'BAKERY': 4,
 'BATH AND SHOWER': 5,
 'BEAUTY': 6,
 'BEDDING': 7,
 'BOOKS AND MAGAZINES': 8,
 'BOYS WEAR': 9,
 'BRAS & SHAPEWEAR': 10,
 'CAMERAS AND SUPPLIES': 11,
 'CANDY, TOBACCO, COOKIES': 12,
 'CELEBRATION': 13,
 'COMM BREAD': 14,
 'CONCEPT STORES': 15,
 'COOK AND DINE': 16,
 'DAIRY': 17,
 'DSD GROCERY': 18,
 'ELECTRONICS': 19,
 'FABRICS AND CRAFTS': 20,
 'FINANCIAL SERVICES': 21,
 'FROZEN FOODS': 22,
 'FURNITURE': 23,
 'GIRLS WEAR, 4-6X  AND 7-14': 24,
 'GROCERY DRY GOODS': 25,
 'HARDWARE': 26,
 'HEALTH AND BEAUTY AIDS': 27,
 'HOME DECOR': 28,
 'HOME MANAGEMENT': 29,
 'HORTICULTURE AND ACCESS': 30,
 'HOUSEHOLD CHEMICALS/SUPP': 31,
 'HOUSEHOLD PAPER GOODS': 32,
 'IMPULSE MERCHANDISE': 33,
 'INFANT APPAREL': 34,
 'INFANT CONSUMABLE HARDLINES': 35,
 'JEWELRY AND SUNGLASSES': 36,
 'LADIES SOCKS': 37,
 'LADIESWEAR': 38,
 'LARGE HOUSEHOLD GOODS': 39,
 'LAWN AND GARDEN': 40,
 'LIQUOR,WINE,BEER': 41,
 'MEAT - FRESH & FROZEN': 42,
 'ME

In [14]:
# Build the desc_tag column by applying pf.desc_tagger to each department name,
# with train_desc_dict bound as the helper's first argument.
tag_department = partial(pf.desc_tagger, train_desc_dict)
train["desc_tag"] = train["DepartmentDescription"].apply(tag_department)

In [15]:
# One boolean indicator column per department tag (desc_tag_1, desc_tag_2, ...).
# The tags are taken from train_desc_dict rather than a hard-coded range, so
# the indicator columns always match the departments that were actually encoded.
for idx in train_desc_dict.values():
    train["desc_tag_{}".format(idx)] = train["desc_tag"] == idx

### 4. Weekday Encode
- 월화수목금토일(1,2,3,4,5,6,7) one-hot encoding

In [16]:
# Encode weekday names as numbers, Monday=1 through Sunday=7.
# Float codes keep Weekday_num a float column; names not in the table map to NaN.
weekday_codes = {
    "Monday": 1.0,
    "Tuesday": 2.0,
    "Wednesday": 3.0,
    "Thursday": 4.0,
    "Friday": 5.0,
    "Saturday": 6.0,
    "Sunday": 7.0,
}
train["Weekday_num"] = train["Weekday"].map(weekday_codes)

In [17]:
# One boolean indicator column per weekday number (Weekday_1 ... Weekday_7).
for idx in range(1, 8):
    column_name = "Weekday_{}".format(idx)
    train[column_name] = train["Weekday_num"].eq(idx)

### 5. Divide Upc
- 총 8자리 초과 Upc는 12자리로 복원
- 총 7자리 미만 Upc는 10자리로 복원
- 복원된 12자리 Upc중 앞뒤 한자리를 제외하고 (1)-5-5-(1) company_Upc, product_Upc로 나눔
- 복원된 10자리 Upc는 5-5 company_Upc, product_Upc로 나눔

In [18]:
# Cast Upc to strings in preparation for the Upc restoration step.
train["Upc"] = train["Upc"].astype(str)

In [19]:
# Run each Upc string through the three pf restoration helpers in order and
# store the result as full_Upc.
restored_upc = train["Upc"]
for restore_step in (pf.upc_345_to_10, pf.upc_78_to_12, pf.upc_91011_to_12):
    restored_upc = restored_upc.apply(restore_step)
train["full_Upc"] = restored_upc

# Derive the company and product parts from the restored Upc.
train["company_Upc"] = train["full_Upc"].apply(pf.company_part_Upc)
train["product_Upc"] = train["full_Upc"].apply(pf.product_part_Upc)

train[["Upc", "full_Upc", "company_Upc", "product_Upc"]].tail()

Unnamed: 0,Upc,full_Upc,company_Upc,product_Upc
647049,32390001778.0,32390001778.0,32390,177
647050,7874205336.0,7874205336.0,7874,20533
647051,4072.0,10000000.0,0,10000
647052,4190007664.0,4190007664.0,4190,766
647053,3800059655.0,3800059655.0,3800,5965


##### Encoding, Divide 된 주요 컬럼

In [20]:
# Show the source columns side by side with their encoded / split counterparts.
summary_columns = [
    "Upc",
    "full_Upc",
    "company_Upc",
    "product_Upc",
    "DepartmentDescription",
    "desc_tag",
    "Weekday",
    "Weekday_num",
]
train.loc[:, summary_columns]

Unnamed: 0,Upc,full_Upc,company_Upc,product_Upc,DepartmentDescription,desc_tag,Weekday,Weekday_num
0,68113152929.0,068113152929.0,68113,15292,FINANCIAL SERVICES,21,Friday,5.0
1,60538815980.0,060538815980.0,60538,81598,SHOES,63,Friday,5.0
2,7410811099.0,007410811099.0,07410,81109,PERSONAL CARE,51,Friday,5.0
3,2238403510.0,002238403510.0,02238,40351,PAINT AND ACCESSORIES,50,Friday,5.0
4,2006613744.0,002006613744.0,02006,61374,PAINT AND ACCESSORIES,50,Friday,5.0
5,2006618783.0,002006618783.0,02006,61878,PAINT AND ACCESSORIES,50,Friday,5.0
6,2006613743.0,002006613743.0,02006,61374,PAINT AND ACCESSORIES,50,Friday,5.0
7,7004802737.0,007004802737.0,07004,80273,PAINT AND ACCESSORIES,50,Friday,5.0
8,2238495318.0,002238495318.0,02238,49531,PAINT AND ACCESSORIES,50,Friday,5.0
9,2238400200.0,002238400200.0,02238,40020,PAINT AND ACCESSORIES,50,Friday,5.0


## To do list
- DepartmentDescription, FinelineNumber, Upc 마저 빈값 채우기
- company_Upc Encode, product_Upc 사용 방법 정하기
- 빈 값을 다 채우고 나면 클러스터링(kmeans), 시각화 시도해보기