## Decision Tree Example

### 라이브러리 선언

In [272]:
import pandas as pd
import numpy as np

#모델 라이브러리 선언
#tree모델은 설명이 가능한 구조 <-> 앙상블모델
from sklearn import datasets, tree

#모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error #평균의 차이
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score 

In [2]:
#CSV 파일을 읽어 DataFrame 변수에 저장하기
featureData = pd.read_csv("./dataset/feature_regression_example.csv")

In [3]:
featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


### 2. 타입 통합 / 특성 숫자컬럼 타입 변환

In [4]:
# 데이터 타입 확인
featureData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [5]:
# 데이터 정제1. 홀리데이 및 프로모션 여부 컬럼에 대해서 Y->1, N->0타입을 숫자형태로 변경
featureData['HO_YN'] = np.where(featureData.HOLIDAY == 'Y', 1, 0)
featureData["PRO_YN"] = np.where(featureData.PROMOTION == 'Y', 1, 0)

#### 다른풀이 방식1(get_dummies)

In [6]:
pd.concat([featureData, pd.get_dummies(prefix="PROMO", data = featureData.PROMOTION)], axis=1).head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PROMO_N,PROMO_Y
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,0,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1,0,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1,0,1


#### 다른풀이 방식2(sklearn) : 카테고리별 데이터가 너무 많을 때 사용시 유용

In [7]:
from sklearn.preprocessing import LabelEncoder
le_pro = LabelEncoder()
le_holy = LabelEncoder()

In [8]:
# labelencoder가 알아서 y는 1, n은 0으로 인코딩해줌
featureData["HOLY_ENCO"] = le_pro.fit_transform(featureData.PROMOTION)

In [9]:
featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,HOLY_ENCO
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1,1


In [10]:
# 바꾼것을 다시 돌리고 싶다면
le_pro.inverse_transform(featureData["HOLY_ENCO"])

  if diff:


array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'N'], dtype=object)

#### 다른풀이 방식3(dictionary mapping) : 내가 바꾸고자 하는 데이터를 원하는 값으로 매핑시켜줌

In [11]:
binarymap = {"Y":1, "N":0}

featureData["PRO_DICT"] = featureData.PROMOTION.map(binarymap)

In [12]:
featureData.head(10)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,HOLY_ENCO,PRO_DICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,1,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,1,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1,1,1
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201506,2015,6,867,N,4,Y,0.208155,0,1,1,1
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201507,2015,7,1187,N,4,Y,0.208155,0,1,1,1
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201508,2015,8,970,Y,1,Y,0.208155,1,1,1,1
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201509,2015,9,542,N,4,Y,0.208155,0,1,1,1
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201510,2015,10,350,N,4,N,0.0,0,0,0,0


### 3. 특성선정 / 데이터 분리

In [13]:
# 상관관계 확인
corrdf = featureData.corr()

In [74]:
corrdf

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN,HOLY_ENCO,PRO_DICT
YEARWEEK,1.0,0.961051,0.213022,0.037392,-0.030681,0.30032,0.009395,0.108551,0.108551,0.108551
YEAR,0.961051,1.0,-0.065302,-0.048803,0.067443,0.208435,-0.070803,0.085606,0.085606,0.085606
WEEK,0.213022,-0.065302,1.0,0.307541,-0.349205,0.347462,0.284231,0.089293,0.089293,0.089293
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772,0.514813,0.630081,0.630081,0.630081
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991,-0.974902,-0.386926,-0.386926,-0.386926
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0,0.496585,0.903477,0.903477,0.903477
HO_YN,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,1.0,0.378861,0.378861,0.378861
PRO_YN,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,0.378861,1.0,1.0,1.0
HOLY_ENCO,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,0.378861,1.0,1.0,1.0
PRO_DICT,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,0.378861,1.0,1.0,1.0


In [14]:
# 상관관계 계수 비율 선정 
# : 여러가지 모델 중에서 정확도가 가장높은 설정값으로 셋팅하는게 원칙이지만 그렇게되면 너무 복잡해져서
# 우선 임의로 지정
corrstd = 0.5

In [91]:
# 내가 원하는 feature 값들을 가져올 수 있게됨
features = list(corrdf[(abs(corrdf.QTY) > corrstd)&(abs(corrdf.QTY) < 1)].index)

In [92]:
features

['HCLUS', 'PRO_PERCENT', 'HO_YN', 'PRO_YN', 'HOLY_ENCO', 'PRO_DICT']

In [45]:
# 데이터 분리 : YWARWEEK 정렬 후 인덱스 80% 시점으로 지정해야 함
# 8:2로 나누기 위해 데이터의 길이 확인
indexStd = round(len(featureData)*0.8)

yearweekStd = featureData.iloc[0:indexStd,:].YEARWEEK.max()

# 내가 예측하고자 하는 값을 label로
label = ['QTY']

In [46]:
yearweekStd

201633

In [47]:
# 위에서 설정한 분리기준 년도로 traningData와 testData 분리
trainingData_features = featureData[featureData.YEARWEEK <= yearweekStd][features]
trainingData_label = featureData[featureData.YEARWEEK <= yearweekStd][label]
testData_features = featureData[featureData.YEARWEEK > yearweekStd][features]
testData_label = featureData[featureData.YEARWEEK > yearweekStd]

In [49]:
testData_label

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,HOLY_ENCO,PRO_DICT
86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,1,1
87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,1,1
88,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,1,1
89,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1,1,1
90,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1,1,1
91,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1,1,1
92,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1,1,1
93,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010,Y,2,Y,0.280258,1,1,1,1
94,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900,N,4,Y,0.280258,0,1,1,1
95,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201643,2016,43,141,N,4,N,0.0,0,0,0,0


### 4. 모델 적용 및 학습 / 예측

#### 4-1. Decision Tree 모델 적용

In [50]:
# Define Model depth = 100-200
model_method = tree.DecisionTreeRegressor(random_state=1)

In [51]:
# Learning(Feature & Label)
model = model_method.fit(trainingData_features, trainingData_label)

In [52]:
model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

#### 4-1-2. Decsion Tree 모델 예측

In [53]:
# 예측
predict = model.predict(testData_features)

#### 4-1-3. 데이터 정리

In [55]:
# 데이터프레임으로 변환
predictData = pd.DataFrame(predict)

In [56]:
# 컬럼명 설정
predictData.columns = ["PREDICT"]

In [66]:
# 데이터 합치기
testData_label = testData_label.reset_index(drop=True)
finalResult = pd.concat([testData_label, predictData], axis=1)

In [67]:
finalResult.head(10)

Unnamed: 0,index,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,HOLY_ENCO,PRO_DICT,PREDICT
0,86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,1,1,1434.6
1,87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,1,1,1434.6
2,88,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,1,1,1434.6
3,89,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1,1,1,1434.6
4,90,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1,1,1,1434.6
5,91,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1,1,1,1708.75
6,92,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1,1,1,1708.75
7,93,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010,Y,2,Y,0.280258,1,1,1,1,2620.428571
8,94,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900,N,4,Y,0.280258,0,1,1,1,1708.75
9,95,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201643,2016,43,141,N,4,N,0.0,0,0,0,0,336.727273


#### 4-1-4. 결과 검증

In [59]:
# MAE
mean_absolute_error(finalResult['QTY'], finalResult['PREDICT'])

455.88382807668523

In [60]:
# RMSE(MSE)
mean_squared_error(finalResult['QTY'], finalResult['PREDICT'])

361337.6680780306

In [61]:
# R2
r2_score(finalResult['QTY'], finalResult["PREDICT"])

0.3837928130534113

#### 4-2. Linear Regresiion 모델 적용

In [93]:
from sklearn import datasets, linear_model

In [94]:
# Define Model
model_method2 = linear_model.LinearRegression()

In [95]:
# Learning (Feature & Label)
model2 = model_method2.fit(trainingData_features,trainingData_label)

#### 4-2-1. Linear Regression 모델 예측

In [123]:
predict2 = model2.predict(testData_features)

#### 4-2-2. Linear Regression 모델 데이터 정리

In [98]:
# 데이터프레임으로 변환
predict2 = pd.DataFrame(predict2)

In [103]:
# 컬럼명 주입
predict2.columns = ["PREDICT2"]

In [104]:
# 데이터합치기
## testData_label 인덱스 reset은 위에서 했으므로 생략
finalResult2 = pd.concat([testData_label, predict2], axis=1)

#### 4-2-3. Linear Regresiion 결과 검증

In [107]:
# MAE
mean_absolute_error(finalResult2['QTY'], finalResult2['PREDICT2'])

609.4472138981575

In [108]:
# RMSE(MSE)
mean_squared_error(finalResult2['QTY'], finalResult2['PREDICT2'])

581236.9130656859

In [109]:
# R2
r2_score(finalResult2['QTY'], finalResult2["PREDICT2"])

0.008787638845390466

#### 4-3. Random Forest 모델 적용

In [121]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
model_method3 = RandomForestRegressor(n_estimators=100, random_state = 1)

# Train the model on training data
model_method3.fit(trainingData_features,trainingData_label)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

#### 4-3-1. Random Forest 모델 예측

In [126]:
predict3 = model_method3.predict(testData_features)

In [127]:
predict3

array([1463.58473882, 1463.58473882, 1463.58473882, 1474.06164358,
       1474.06164358, 1696.11693254, 1696.11693254, 2491.21467471,
       1696.11693254,  343.78099059, 2164.52718723, 2164.52718723,
       2164.52718723, 2164.52718723, 2164.52718723, 1669.86056025,
       1669.86056025, 1669.86056025, 1821.50983478,  343.78099059,
        343.78099059])

#### 4-3-2. 모델 검증

In [128]:
# 데이터프레임으로 변환
predict3 = pd.DataFrame(predict3)

In [129]:
# 컬럼명 주입
predict3.columns = ["PREDICT3"]

In [130]:
# 데이터합치기
## testData_label 인덱스 reset은 위에서 했으므로 생략
finalResult3 = pd.concat([testData_label, predict3], axis=1)

In [131]:
finalResult3

Unnamed: 0,index,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,HOLY_ENCO,PRO_DICT,PREDICT3
0,86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,1,1,1463.584739
1,87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,1,1,1463.584739
2,88,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,1,1,1463.584739
3,89,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1,1,1,1474.061644
4,90,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1,1,1,1474.061644
5,91,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1,1,1,1696.116933
6,92,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1,1,1,1696.116933
7,93,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010,Y,2,Y,0.280258,1,1,1,1,2491.214675
8,94,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900,N,4,Y,0.280258,0,1,1,1,1696.116933
9,95,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201643,2016,43,141,N,4,N,0.0,0,0,0,0,343.780991


## 실습1. cars.csv 파일의 sales를 예측

In [132]:
carsData = pd.read_csv("./dataset/cars.csv")

In [133]:
carsData.head()

Unnamed: 0,age,gender,miles,debt,income,sales
0,28,0,23,0,4099,620
1,26,0,27,0,2677,1792
2,30,1,58,41576,6215,27754
3,26,1,25,43172,7626,28256
4,20,1,17,6979,8071,4438


### 1. 데이터타입 재정의 

In [135]:
carsData.dtypes

# 모두 int형이므로 재정의 필요없을 것 같음

age       int64
gender    int64
miles     int64
debt      int64
income    int64
sales     int64
dtype: object

### 2. 특성선정, 데이터분리

In [140]:
# 상관관계 확인
carscorrdf = carsData.corr()

In [141]:
carscorrdf

Unnamed: 0,age,gender,miles,debt,income,sales
age,1.0,-0.000702,0.232399,0.218896,0.239644,0.352609
gender,-0.000702,1.0,-0.031355,-0.033181,-0.034317,-0.03635
miles,0.232399,-0.031355,1.0,0.544791,0.422141,0.636676
debt,0.218896,-0.033181,0.544791,1.0,0.49179,0.835541
income,0.239644,-0.034317,0.422141,0.49179,1.0,0.674685
sales,0.352609,-0.03635,0.636676,0.835541,0.674685,1.0


In [145]:
# 상관관계 계수 비율 선정 
corrstd = 0.5

# feature 선정
carsFeature = list(carscorrdf[(abs(carscorrdf.sales) > corrstd)&(abs(carscorrdf.sales) < 1)].index)

# label 선정
carsLabel = ['sales']

In [143]:
carsFeature

['miles', 'debt', 'income']

In [157]:
# 데이터 분리
# train_test_split 함수를 활용하여 feature/label 데이터 자동 분리 7:3

# 라이브러리 선언
from sklearn.model_selection import train_test_split

featureData = carsData[carsFeature]
labelData = carsData[carsLabel]

feature_train, feature_test, label_train, label_test = train_test_split(featureData, labelData, test_size = 0.3, random_state=1)

### 3. 모델선정 및 적용 / 예측

In [158]:
# Define Model depth = 100-200
cars_model_method = tree.DecisionTreeRegressor(random_state=1)

In [159]:
# Learning(Feature & Label)
cars_model = model_method.fit(feature_train, label_train)

In [160]:
cars_model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [161]:
# 예측
cars_predict = model.predict(feature_test)

### 4. 데이터 정리

In [165]:
cars_predict = pd.DataFrame(cars_predict)
cars_predict.columns = ["PREDICT"]

In [167]:
# 데이터 concat
label_test = label_test.reset_index(drop=True)
cars_finalResult = pd.concat([label_test, cars_predict], axis=1)

In [168]:
cars_finalResult.head()

Unnamed: 0,sales,PREDICT
0,5265,15823.0
1,23894,22508.0
2,9319,9533.0
3,13722,13410.0
4,22145,21080.0


## 실습2. kopo_decision_tree_all_new.csv 파일의 판매량 예측 생성 모델

In [394]:
featureData = pd.read_csv("./dataset/kopo_decision_tree_all_new.csv")

In [395]:
featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,A01,PG01,P01,ITEM001,201538,2015,38,1,N,4,N,0.0
1,A01,PG01,P01,ITEM001,201548,2015,48,1,Y,0,N,0.0
2,A01,PG01,P01,ITEM001,201549,2015,49,2,Y,0,N,0.0
3,A01,PG01,P01,ITEM002,201526,2015,26,1,Y,1,N,0.0
4,A01,PG01,P01,ITEM002,201532,2015,32,1,N,4,N,0.0


In [396]:
featureData["YEARWEEK"].count()

32415

In [397]:
len(featureData)

32415

### 1. 타입 통합 및 특성컬럼 숫자로 타입 변환

In [352]:
featureData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [398]:
# YEARWEEK, YEARK, WEEK, QTY int타입이므로 타입 변환 생략

# HOLIDAY, PROMOTION 숫자로 타입 변환
binarymap = {"Y":1, "N":0}

featureData["PRO_DICT"] = featureData.PROMOTION.map(binarymap)
featureData["HOLI_DICT"] = featureData.HOLIDAY.map(binarymap)

# featureData_new = pd.concat([featureData, pd.get_dummies(prefix="PROMO", data = featureData.PROMOTION)], axis=1)
# featureData = pd.concat([featureData, pd.get_dummies(prefix="HOLI", data = featureData.HOLIDAY)], axis=1).head()

In [399]:
featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_DICT,HOLI_DICT
0,A01,PG01,P01,ITEM001,201538,2015,38,1,N,4,N,0.0,0,0
1,A01,PG01,P01,ITEM001,201548,2015,48,1,Y,0,N,0.0,0,1
2,A01,PG01,P01,ITEM001,201549,2015,49,2,Y,0,N,0.0,0,1
3,A01,PG01,P01,ITEM002,201526,2015,26,1,Y,1,N,0.0,0,1
4,A01,PG01,P01,ITEM002,201532,2015,32,1,N,4,N,0.0,0,0


### 2. Group Data 생성
#### ITEM 종류가 한개가 아니라 여러개일때는 Group Data 적용

In [400]:
# ITEM 종류가 한개가 아니라 여러개일때는 ? 
# 지역, 상품, 아이템별로 그룹을 두고 count
# 아이템별로 영향을 받는 feature가 다르기떄문에 그룹별로 영향을 많이 끼치는 feature를 선정하여 예측이 필요함
# <문제점>
# 하지만 아이템별로 corr이 없는 항목이 있을 수 있음. 이러한 error를 처리해야함
# 각각 ITEM으로 나누다보니까 ITEM별로 갯수가 적은 데이터가 있을 수 있음. 이런 error도 처리해줘야함
# => Max길이를 아에 선언해서 해결해볼 수 있음(내가 원하는 데이터 양으로 선언)
groupKey = ["REGIONID", "PRODUCTGROUP", "PRODUCT", "ITEM"]

groupData = featureData.groupby(groupKey)

In [371]:
groupData

<pandas.core.groupby.DataFrameGroupBy object at 0x000000000F4536D8>

In [364]:
onegroup = groupData.get_group(list(groupData.groups)[8])

In [365]:
eachgroup = onegroup.reset_index()

In [366]:
# 상관관계 산출
corrdf = eachgroup.corr()

In [367]:
features = list(corrdf[(abs(corrdf.QTY > 0.5)) & (corrdf.QTY != 1)].index)

In [368]:
features

['PRO_PERCENT']

In [369]:
label = ['QTY']

### 3. 특성 선정 및 데이터 분리

In [221]:
stdIndex = round(len(eachgroup)*0.8)

In [223]:
stdIndex

71

In [248]:
stdYearweek = eachgroup.loc[stdIndex,["YEARWEEK"]].values

In [249]:
stdYearweek

array([201619], dtype=object)

In [238]:
traingData_features = eachgroup[eachgroup.YEARWEEK <= stdYearweek][features]
traingData_label = eachgroup[eachgroup.YEARWEEK <= stdYearweek][label]
testData_features = eachgroup[eachgroup.YEARWEEK > stdYearweek][features]
testData_label = eachgroup[eachgroup.YEARWEEK > stdYearweek][label]
testData_label2 = eachgroup[eachgroup.YEARWEEK > stdYearweek]

### 4. 모델 선언 및 학습

In [235]:
model_method= tree.DecisionTreeRegressor()

In [236]:
model = model.fit(traingData_features, traingData_label)

In [239]:
final_predict = model.predict(testData_features)

In [241]:
final_predict = pd.DataFrame(final_predict)

In [243]:
final_predict.columns = ["PREDICT"]

In [None]:
# 함수화 시킬때, 지역, 그룹, 아이템별로 카운트수를 옆에 붙인다.

In [421]:
#그룹바이 필터  -- 146주가 다포함된 데이터만 추출
#그룹바이 끝날때마다 그룹이 되어 인덱스가 변하기때문에 reset_index를 항상해줘야함
featureData=featureData.groupby(by = groupKey).filter(lambda x: x["YEARWEEK"].count()==146).reset_index(drop=True)

In [410]:
a= featureData[featureData["YEARWEEK"] > 201700]

In [406]:
# # 함수화
# from sklearn import datasets, tree

# def miFcst(onegroup):

#     onegroup.reset_index(drop=True)

#     ### 1. 상관관계를 산출 후 feature와 label을 정의한다.
#     corrdf = eachgroup.corr()
#     features = list(corrdf[(abs(corrdf.QTY > 0.5)) & (corrdf.QTY != 1)].index)

#     print(features)

#     label = ["QTY"]
    
#     ### 2. 데이터 분리
    
#     stdIndex = round(len(eachgroup)*0.8)

#     stdYearweek = eachgroup.loc[stdIndex,["YEARWEEK"]].values[0]
    
#     print(stdYearweek)
    
#     trainData = onegroup[onegroup["YEARWEEK"] < stdYearweek].reset_index(drop = True)
#     testData = onegroup[onegroup["YEARWEEK"] >= stdYearweek].reset_index(drop = True)
#     testData['YEARWEEK'] = testData['YEARWEEK']
# #     train_feature = trainData[ ["WEEK","HOLIDAY",'PRO_PERCENT']]
#     #train_feature = trainData[ ["YEARWEEK","HOLIDAY",'PRO_PERCENT']]
#     #train_feature = trainData[ ["HOLIDAY",'PRO_PERCENT']]
# #     train_label = trainData[['QTY']]
# #     test_feature = testData[ ["WEEK","HOLIDAY",'PRO_PERCENT']]
#     #test_feature = testData[ ["YEARWEEK","HOLIDAY",'PRO_PERCENT']]
#     #test_feature = testData[ ["HOLIDAY",'PRO_PERCENT']]
    

#     traingData_features = onegroup[onegroup.YEARWEEK <= stdYearweek][features]
#     traingData_label = onegroup[onegroup.YEARWEEK <= stdYearweek][label]
#     testData_features = onegroup[onegroup.YEARWEEK > stdYearweek][features]
#     testData_label = onegroup[onegroup.YEARWEEK > stdYearweek]
# #     testData_label2 = eachgroup[eachgroup.YEARWEEK > stdYearweek]

#     model_method= tree.DecisionTreeRegressor()
#     model = model_method.fit(traingData_features, testData_label)
#     predict = model.predict(testData_features)
#     final_predict = pd.DataFrame(data = predict, columns=["PREDICT"])

#     return final_predict

In [414]:
def sub_function(data):
    #그룹바이 다음에 바로 apply(sub_function)이 들어갈거기때문에 맨첫줄에 reset_index 
    data.reset_index(inplace=True,drop=True)
    #####################train,test 데이터 분리###################################
    trainData = data[data["YEARWEEK"].astype(int)<201700].reset_index(drop = True)
    testData = data[data["YEARWEEK"].astype(int)>=201700].reset_index(drop = True)
    testData['YEARWEEK'] = testData['YEARWEEK']
    train_feature = trainData[ ["WEEK","HOLI_DICT",'PRO_DICT']]
    #train_feature = trainData[ ["YEARWEEK","HOLIDAY",'PRO_PERCENT']]
    #train_feature = trainData[ ["HOLIDAY",'PRO_PERCENT']]
    train_label = trainData[['QTY']]
    test_feature = testData[ ["WEEK","HOLI_DICT",'PRO_DICT']]
    #test_feature = testData[ ["YEARWEEK","HOLIDAY",'PRO_PERCENT']]
    #test_feature = testData[ ["HOLIDAY",'PRO_PERCENT']]
    #################linearRegression 모델 적용 ##########################
    lm = linear_model.LinearRegression()
    model = lm.fit(train_feature, train_label)
    predict = lm.predict(test_feature)
    predictDF = pd.DataFrame(data = predict , columns=["PREDICT"])
    testData["PREDICT"]=predictDF
    
    return testData

In [417]:
final_data = featureData.groupby(groupKey).apply(sub_function).reset_index(drop=True)