In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [2]:
data = pd.read_csv('train.csv',  encoding='CP949')
data.head()

Unnamed: 0,custid,gender,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,아침_구매건수,저녁_구매건수,점심_구매건수,주구매코너
0,18,0,680100,15,45340,1.7,9,10,26.7,100.0,,9.0,6.0,,,6.0,9.0,화장품
1,21,0,353450,9,39272,1.2,6,8,11.1,37.5,,5.0,4.0,,3.0,,6.0,영캐주얼
2,23,0,5671400,36,157539,2.8,22,16,5.6,37.5,7.0,17.0,12.0,,10.0,11.0,15.0,장신구
3,26,0,1964000,28,70143,1.4,15,14,39.3,28.6,5.0,8.0,15.0,,,18.0,10.0,화장품
4,35,0,885000,5,177000,6.0,5,2,0.0,100.0,,,5.0,,,5.0,,피혁A


In [3]:
data.dtypes

custid         int64
gender         int64
총구매액           int64
구매건수           int64
평균구매가격         int64
평균할부개월수      float64
구매브랜드종류        int64
내점일수           int64
수입상품_구매비율    float64
주말방문비율       float64
가을_구매건수      float64
겨울_구매건수      float64
봄_구매건수       float64
여름_구매건수      float64
아침_구매건수      float64
저녁_구매건수      float64
점심_구매건수      float64
주구매코너         object
dtype: object

In [4]:
data.isnull().sum()

custid          0
gender          0
총구매액            0
구매건수            0
평균구매가격          0
평균할부개월수         0
구매브랜드종류         0
내점일수            0
수입상품_구매비율       0
주말방문비율          0
가을_구매건수       945
겨울_구매건수       897
봄_구매건수        689
여름_구매건수      1022
아침_구매건수      1442
저녁_구매건수       967
점심_구매건수       145
주구매코너           0
dtype: int64

In [5]:
data['주구매코너'].value_counts()

일반식품        1306
화장품         1206
유니캐주얼        410
유아동복         383
스포츠          301
영캐주얼         246
캐릭터캐주얼       188
수입명품         136
섬유            96
트래디셔널캐주얼      91
니트단품          91
정장셔츠          90
엘레강스캐주얼       78
피혁A           75
장신구           68
피혁B           56
디자이너부띠끄       51
문화완구          38
조리욕실          28
가전            18
기타바이어         12
도자기크리스탈       11
침구수예           8
가구             7
타운모피           6
Name: 주구매코너, dtype: int64

유일한 ojbect 데이터인 주구매 코너에 대해서 데이터 정제
-> 코너별로 구매건수를 구한다
-> 비슷한 카테고리별로 묶는다(식품, 의류, 생필품, 가구 , 문화, 악세사리 , 화장품)


In [6]:
data1 = data.copy()

In [7]:
data1.head()

Unnamed: 0,custid,gender,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,아침_구매건수,저녁_구매건수,점심_구매건수,주구매코너
0,18,0,680100,15,45340,1.7,9,10,26.7,100.0,,9.0,6.0,,,6.0,9.0,화장품
1,21,0,353450,9,39272,1.2,6,8,11.1,37.5,,5.0,4.0,,3.0,,6.0,영캐주얼
2,23,0,5671400,36,157539,2.8,22,16,5.6,37.5,7.0,17.0,12.0,,10.0,11.0,15.0,장신구
3,26,0,1964000,28,70143,1.4,15,14,39.3,28.6,5.0,8.0,15.0,,,18.0,10.0,화장품
4,35,0,885000,5,177000,6.0,5,2,0.0,100.0,,,5.0,,,5.0,,피혁A


In [8]:
data_sum = data1.groupby('주구매코너')['구매건수'].sum()

In [9]:
data_sum.head()

주구매코너
가구          100
가전          315
기타바이어       149
니트단품       2542
도자기크리스탈     261
Name: 구매건수, dtype: int64

In [10]:
data_sum = data_sum.reset_index()
data_sum.columns = ['주구매코너', '코너별구매건']

In [11]:
data_sum

Unnamed: 0,주구매코너,코너별구매건
0,가구,100
1,가전,315
2,기타바이어,149
3,니트단품,2542
4,도자기크리스탈,261
5,디자이너부띠끄,2219
6,문화완구,848
7,섬유,1627
8,수입명품,4846
9,스포츠,10664


In [12]:
data_sum.head(2)

Unnamed: 0,주구매코너,코너별구매건
0,가구,100
1,가전,315


In [13]:
data_merge = data1.merge(data_sum, how='left',on = '주구매코너')
data_merge

Unnamed: 0,custid,gender,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,아침_구매건수,저녁_구매건수,점심_구매건수,주구매코너,코너별구매건
0,18,0,680100,15,45340,1.7,9,10,26.7,100.0,,9.0,6.0,,,6.0,9.0,화장품,25740
1,21,0,353450,9,39272,1.2,6,8,11.1,37.5,,5.0,4.0,,3.0,,6.0,영캐주얼,8285
2,23,0,5671400,36,157539,2.8,22,16,5.6,37.5,7.0,17.0,12.0,,10.0,11.0,15.0,장신구,1165
3,26,0,1964000,28,70143,1.4,15,14,39.3,28.6,5.0,8.0,15.0,,,18.0,10.0,화장품,25740
4,35,0,885000,5,177000,6.0,5,2,0.0,100.0,,,5.0,,,5.0,,피혁A,880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,49952,1,6666517,93,71683,1.8,27,39,10.8,43.6,13.0,33.0,14.0,33.0,8.0,41.0,44.0,일반식품,76174
4996,49954,0,3112100,17,183065,3.6,10,13,29.4,38.5,4.0,6.0,6.0,1.0,3.0,,14.0,디자이너부띠끄,2219
4997,49957,1,5852482,37,158175,2.6,14,21,40.5,19.0,12.0,6.0,6.0,13.0,16.0,,21.0,화장품,25740
4998,49975,1,654498,13,50346,2.5,10,7,7.7,14.3,6.0,4.0,2.0,1.0,,2.0,11.0,피혁A,880


In [14]:
features = []
f = data1.groupby('custid')['구매건수'].agg([('코너별구매건수','sum')]).reset_index()
features.append(f)
f

Unnamed: 0,custid,코너별구매건수
0,18,15
1,21,9
2,23,36
3,26,28
4,35,5
...,...,...
4995,49952,93
4996,49954,17
4997,49957,37
4998,49975,13


코너별로 구매건수 확인하여 파생변수 생성

In [15]:
f = data1.groupby('custid')['총구매액'].agg([('코너별구매액', 'sum')]).reset_index()
features.append(f)
f

Unnamed: 0,custid,코너별구매액
0,18,680100
1,21,353450
2,23,5671400
3,26,1964000
4,35,885000
...,...,...
4995,49952,6666517
4996,49954,3112100
4997,49957,5852482
4998,49975,654498


코너별로 구매액 확인하여 파생변수 생성

In [16]:
f = data1.groupby('custid')['총구매액'].agg([('평균구매가격', 'mean')]).reset_index()
features.append(f)
f

Unnamed: 0,custid,평균구매가격
0,18,680100
1,21,353450
2,23,5671400
3,26,1964000
4,35,885000
...,...,...
4995,49952,6666517
4996,49954,3112100
4997,49957,5852482
4998,49975,654498


In [17]:
features

[      custid  코너별구매건수
 0         18       15
 1         21        9
 2         23       36
 3         26       28
 4         35        5
 ...      ...      ...
 4995   49952       93
 4996   49954       17
 4997   49957       37
 4998   49975       13
 4999   49993       32
 
 [5000 rows x 2 columns],
       custid   코너별구매액
 0         18   680100
 1         21   353450
 2         23  5671400
 3         26  1964000
 4         35   885000
 ...      ...      ...
 4995   49952  6666517
 4996   49954  3112100
 4997   49957  5852482
 4998   49975   654498
 4999   49993  2554723
 
 [5000 rows x 2 columns],
       custid   평균구매가격
 0         18   680100
 1         21   353450
 2         23  5671400
 3         26  1964000
 4         35   885000
 ...      ...      ...
 4995   49952  6666517
 4996   49954  3112100
 4997   49957  5852482
 4998   49975   654498
 4999   49993  2554723
 
 [5000 rows x 2 columns]]

In [18]:
data_sum['주구매코너']

0           가구
1           가전
2        기타바이어
3         니트단품
4      도자기크리스탈
5      디자이너부띠끄
6         문화완구
7           섬유
8         수입명품
9          스포츠
10     엘레강스캐주얼
11        영캐주얼
12       유니캐주얼
13        유아동복
14        일반식품
15         장신구
16        정장셔츠
17        조리욕실
18        침구수예
19      캐릭터캐주얼
20        타운모피
21    트래디셔널캐주얼
22         피혁A
23         피혁B
24         화장품
Name: 주구매코너, dtype: object

In [19]:
data_train = pd.DataFrame({'custid' : data1.custid.unique()})
data_train

Unnamed: 0,custid
0,18
1,21
2,23
3,26
4,35
...,...
4995,49952
4996,49954
4997,49957
4998,49975


In [20]:
data1.loc[(data1['주구매코너'] == '수입명품'),'코너묶음'] ='명품'

In [21]:
data1.loc[(data1['주구매코너'] == '기타바이어') |
         (data1['주구매코너'] == '문화완구'),'코너묶음'] = '기타'


In [22]:
data1.loc[(data1['주구매코너'] == '영캐주얼') |
         (data1['주구매코너'] == '유니캐주얼')|
         (data1['주구매코너'] == '유아동복')|
         (data1['주구매코너'] == '캐릭터캐주얼')|
         (data1['주구매코너'] == '트래디셔널캐주얼'),'코너묶음'] ='아동'

In [23]:
data1.loc[(data1['주구매코너'] == '정장셔츠')|
         (data1['주구매코너'] == '타운모피')|
         (data1['주구매코너'] == '디자이너부띠끄')|
         (data1['주구매코너'] == '니트단품')|
         (data1['주구매코너'] == '엘레강스캐주얼')|
         (data1['주구매코너'] == '스포츠'),'코너묶음'] = '성인'

In [24]:
data1.loc[(data1['주구매코너'] == '가구')|
         (data1['주구매코너'] == '가전')|
         (data1['주구매코너'] == '도자기크리스탈')|
         (data1['주구매코너'] == '조리욕실')|
         (data1['주구매코너'] == '침구수예'), '코너묶음'] = '집'


In [25]:
data1.loc[(data1['주구매코너'] == '피혁A')|
           (data1['주구매코너'] == '피혁B')|
           (data1['주구매코너'] == '섬유'), '코너묶음'] ='의류'

In [26]:
data1.loc[(data1['주구매코너'] == '장신구')|
         (data1['주구매코너'] == '화장품'), '코너묶음'] = '악세사리'


In [27]:
data1.loc[(data1['주구매코너'] == '일반식품'), '코너묶음'] ='식품'

In [28]:
data1.head(20)

Unnamed: 0,custid,gender,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,아침_구매건수,저녁_구매건수,점심_구매건수,주구매코너,코너묶음
0,18,0,680100,15,45340,1.7,9,10,26.7,100.0,,9.0,6.0,,,6.0,9.0,화장품,악세사리
1,21,0,353450,9,39272,1.2,6,8,11.1,37.5,,5.0,4.0,,3.0,,6.0,영캐주얼,아동
2,23,0,5671400,36,157539,2.8,22,16,5.6,37.5,7.0,17.0,12.0,,10.0,11.0,15.0,장신구,악세사리
3,26,0,1964000,28,70143,1.4,15,14,39.3,28.6,5.0,8.0,15.0,,,18.0,10.0,화장품,악세사리
4,35,0,885000,5,177000,6.0,5,2,0.0,100.0,,,5.0,,,5.0,,피혁A,의류
5,50,0,3580200,36,99450,2.4,27,20,0.0,50.0,7.0,7.0,8.0,14.0,1.0,11.0,24.0,영캐주얼,아동
6,69,0,816300,14,58307,1.7,10,10,7.1,40.0,4.0,4.0,6.0,,,3.0,11.0,유니캐주얼,아동
7,82,0,752200,5,150440,2.6,5,3,60.0,100.0,,,5.0,,,,5.0,화장품,악세사리
8,92,0,1145400,13,88108,1.9,7,9,23.1,22.2,1.0,,2.0,10.0,1.0,8.0,4.0,화장품,악세사리
9,100,0,442160,16,27635,1.0,10,6,25.0,33.3,,5.0,11.0,,,3.0,13.0,화장품,악세사리


In [29]:
data_train

Unnamed: 0,custid
0,18
1,21
2,23
3,26
4,35
...,...
4995,49952
4996,49954
4997,49957
4998,49975


In [30]:
for f in features:
    data_train = pd.merge(data_train, f, how='left')
    

In [31]:
data1.head()

Unnamed: 0,custid,gender,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,아침_구매건수,저녁_구매건수,점심_구매건수,주구매코너,코너묶음
0,18,0,680100,15,45340,1.7,9,10,26.7,100.0,,9.0,6.0,,,6.0,9.0,화장품,악세사리
1,21,0,353450,9,39272,1.2,6,8,11.1,37.5,,5.0,4.0,,3.0,,6.0,영캐주얼,아동
2,23,0,5671400,36,157539,2.8,22,16,5.6,37.5,7.0,17.0,12.0,,10.0,11.0,15.0,장신구,악세사리
3,26,0,1964000,28,70143,1.4,15,14,39.3,28.6,5.0,8.0,15.0,,,18.0,10.0,화장품,악세사리
4,35,0,885000,5,177000,6.0,5,2,0.0,100.0,,,5.0,,,5.0,,피혁A,의류


In [32]:
data1_drop = data1.dropna(subset=['가을_구매건수','겨울_구매건수','봄_구매건수','여름_구매건수','아침_구매건수','저녁_구매건수','점심_구매건수' ])
data1_drop.shape

(2235, 19)

In [33]:
data1_drop.isnull().sum()

custid       0
gender       0
총구매액         0
구매건수         0
평균구매가격       0
평균할부개월수      0
구매브랜드종류      0
내점일수         0
수입상품_구매비율    0
주말방문비율       0
가을_구매건수      0
겨울_구매건수      0
봄_구매건수       0
여름_구매건수      0
아침_구매건수      0
저녁_구매건수      0
점심_구매건수      0
주구매코너        0
코너묶음         0
dtype: int64

In [34]:
con = ['가을_구매건수','겨울_구매건수','봄_구매건수','여름_구매건수','아침_구매건수','저녁_구매건수','점심_구매건수' ]

In [35]:
from sklearn.impute import SimpleImputer
imputer_con = SimpleImputer(strategy="most_frequent")  
imputer_con.fit(data1[con])

SimpleImputer(strategy='most_frequent')

In [36]:
x = imputer_con.transform(data1[con])
x

array([[ 1.,  9.,  6., ...,  1.,  6.,  9.],
       [ 1.,  5.,  4., ...,  3.,  1.,  6.],
       [ 7., 17., 12., ..., 10., 11., 15.],
       ...,
       [12.,  6.,  6., ..., 16.,  1., 21.],
       [ 6.,  4.,  2., ...,  1.,  2., 11.],
       [ 6.,  5., 10., ...,  8.,  5., 19.]])

In [37]:
data1[con] = x
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   custid     5000 non-null   int64  
 1   gender     5000 non-null   int64  
 2   총구매액       5000 non-null   int64  
 3   구매건수       5000 non-null   int64  
 4   평균구매가격     5000 non-null   int64  
 5   평균할부개월수    5000 non-null   float64
 6   구매브랜드종류    5000 non-null   int64  
 7   내점일수       5000 non-null   int64  
 8   수입상품_구매비율  5000 non-null   float64
 9   주말방문비율     5000 non-null   float64
 10  가을_구매건수    5000 non-null   float64
 11  겨울_구매건수    5000 non-null   float64
 12  봄_구매건수     5000 non-null   float64
 13  여름_구매건수    5000 non-null   float64
 14  아침_구매건수    5000 non-null   float64
 15  저녁_구매건수    5000 non-null   float64
 16  점심_구매건수    5000 non-null   float64
 17  주구매코너      5000 non-null   object 
 18  코너묶음       5000 non-null   object 
dtypes: float64(10), int64(7), object(2)
memory usage

In [38]:
obj = ['주구매코너','코너묶음']

In [39]:
data1[obj] = data1[obj].apply(lambda x: x.astype('category').cat.codes) 

In [40]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   custid     5000 non-null   int64  
 1   gender     5000 non-null   int64  
 2   총구매액       5000 non-null   int64  
 3   구매건수       5000 non-null   int64  
 4   평균구매가격     5000 non-null   int64  
 5   평균할부개월수    5000 non-null   float64
 6   구매브랜드종류    5000 non-null   int64  
 7   내점일수       5000 non-null   int64  
 8   수입상품_구매비율  5000 non-null   float64
 9   주말방문비율     5000 non-null   float64
 10  가을_구매건수    5000 non-null   float64
 11  겨울_구매건수    5000 non-null   float64
 12  봄_구매건수     5000 non-null   float64
 13  여름_구매건수    5000 non-null   float64
 14  아침_구매건수    5000 non-null   float64
 15  저녁_구매건수    5000 non-null   float64
 16  점심_구매건수    5000 non-null   float64
 17  주구매코너      5000 non-null   int8   
 18  코너묶음       5000 non-null   int8   
dtypes: float64(10), int64(7), int8(2)
memory usage: 

In [41]:
dt0 = data1.groupby(['주구매코너'])['구매건수'].agg(sum)   
df0 = dt0.to_frame().reset_index()
df0.columns = ['주구매코너', '코너별구매건']
#df0.head(3)
data1_ = pd.merge(data1, df0, on='주구매코너')
# dataP_.head()
print(data1_[(data1_['주구매코너']==1)].groupby(['gender'])['총구매액'].agg(sum))
print(data1_[(data1_['주구매코너']==1)].groupby(['gender'])['구매건수'].agg(sum))
data1_.head()

gender
0    31111117
1    29039091
Name: 총구매액, dtype: int64
gender
0    214
1    101
Name: 구매건수, dtype: int64


Unnamed: 0,custid,gender,총구매액,구매건수,평균구매가격,평균할부개월수,구매브랜드종류,내점일수,수입상품_구매비율,주말방문비율,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수,아침_구매건수,저녁_구매건수,점심_구매건수,주구매코너,코너묶음,코너별구매건
0,18,0,680100,15,45340,1.7,9,10,26.7,100.0,1.0,9.0,6.0,1.0,1.0,6.0,9.0,24,5,25740
1,26,0,1964000,28,70143,1.4,15,14,39.3,28.6,5.0,8.0,15.0,1.0,1.0,18.0,10.0,24,5,25740
2,82,0,752200,5,150440,2.6,5,3,60.0,100.0,1.0,1.0,5.0,1.0,1.0,1.0,5.0,24,5,25740
3,92,0,1145400,13,88108,1.9,7,9,23.1,22.2,1.0,1.0,2.0,10.0,1.0,8.0,4.0,24,5,25740
4,100,0,442160,16,27635,1.0,10,6,25.0,33.3,1.0,5.0,11.0,1.0,1.0,3.0,13.0,24,5,25740


In [42]:
def createFeatures(df_p):
    df_p['아침구매액'] = np.round(df_p['총구매액']*(df_p['아침_구매건수'] /df_p['구매건수'] ), 3)
    df_p['점심구매액'] = np.round(df_p['총구매액']*(df_p['점심_구매건수'] /df_p['구매건수'] ), 3)
    df_p['저녁구매액'] = np.round(df_p['총구매액']*(df_p['저녁_구매건수'] /df_p['구매건수'] ), 3)  
    df_p['봄구매액']   = np.round(df_p['총구매액']*(df_p['봄_구매건수'] /df_p['구매건수'] ), 3) 
    df_p['여름구매액'] = np.round(df_p['총구매액']*(df_p['여름_구매건수'] /df_p['구매건수'] ), 3) 
    df_p['가을구매액'] = np.round(df_p['총구매액']*(df_p['가을_구매건수'] /df_p['구매건수'] ), 3)  
    df_p['겨울구매액'] = np.round(df_p['총구매액']*(df_p['겨울_구매건수'] /df_p['구매건수'] ), 3)  
    df_p['내점구매율'] = np.round(df_p['구매건수'] /df_p['내점일수'], 3)

def dropFeatures(df_p):
    drop_cols = ['아침_구매건수', '점심_구매건수', '저녁_구매건수', '총구매액', '구매건수',
         '봄_구매건수', '여름_구매건수', '가을_구매건수', '겨울_구매건수', '내점일수'] 
    
    return df_p.drop(drop_cols, axis=1)

createFeatures(data1_)
dataPP = dropFeatures(data1_) 
dataPP[(dataPP['주구매코너']==2)].head()

Unnamed: 0,custid,gender,평균구매가격,평균할부개월수,구매브랜드종류,수입상품_구매비율,주말방문비율,주구매코너,코너묶음,코너별구매건,아침구매액,점심구매액,저녁구매액,봄구매액,여름구매액,가을구매액,겨울구매액,내점구매율
4123,363,0,90960,3.0,4,20.0,25.0,2,0,149,90960.0,90960.0,363840.0,454800.0,90960.0,90960.0,90960.0,1.25
4124,4822,0,248049,2.3,10,10.5,33.3,2,0,149,992196.632,3224639.053,496098.316,744147.474,992196.632,1736344.105,1240245.789,1.583
4125,11617,1,124767,3.8,8,25.0,62.5,2,0,149,124766.667,499066.667,873366.667,249533.333,499066.667,748600.0,124766.667,1.5
4126,12979,0,79250,1.7,10,0.0,33.3,2,0,149,554750.0,396250.0,79250.0,158500.0,79250.0,792500.0,79250.0,4.0
4127,16414,0,31625,1.0,6,12.5,14.3,2,0,149,63250.0,189750.0,31625.0,94875.0,158125.0,31625.0,31625.0,1.143


In [43]:
dataPP['내점구매율']

0       1.500
1       2.000
2       1.667
3       1.444
4       2.667
        ...  
4995    1.917
4996    1.850
4997    1.571
4998    2.449
4999    1.308
Name: 내점구매율, Length: 5000, dtype: float64

In [44]:
dfX = dataPP.drop(['custid','gender'], axis=1) 

In [45]:
dfy = dataPP['gender']

dataPP를 사용하여 기본학습, 엔지니어링 후 학습

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    dfX, dfy, random_state=0)

In [47]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
pred_tree = tree.predict(X_test)

In [48]:
X_train_fs= X_train.values

In [49]:
y_train_fs = y_train.values

In [50]:
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import accuracy_score

# K-Fold교차 검증
kf = KFold(n_splits=10, shuffle=True)
# 모델의 정밀도를 보존하기 위한 준비
scores = []
# 데이터를 셔플하고, 훈련데이터와 테스트 데이터를 분활
for train_id, test_id in kf.split(X_train_fs):
    # 훈련 데이터를 사용한 모델을 작성
    x = X_train_fs[train_id]
    y = y_train_fs[train_id]
    clf = tree.DecisionTreeClassifier()
    clf.fit(x,y)
    # 테스트 데이터에 모델을 적용
    pred_y = clf.predict(X_train_fs[test_id])
    # 모델의 정밀도를 계산하고 보존
    score = accuracy_score(y_train_fs[test_id], pred_y)
    scores.append(score)

# 모델의 평균정밀도, 표준편차를 확인
scores = np.array(scores)
print(scores.mean(), scores.std())

0.5874666666666666 0.026346072867802425


In [51]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

# 재현률, 적합률의 확인
#print(confusion_matrix(Y[test_id], pred_y))
print(recall_score(y_train_fs[test_id], pred_y))
print(precision_score(y_train_fs[test_id], pred_y))

0.3153153153153153
0.3017241379310345


In [52]:
from sklearn.model_selection import GridSearchCV

# 패러메터의 범위설정
params = {
    'criterion': ['entropy'],    
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [10, 20, 30, 40, 50],
}

# 그리드 서치의 실행조건을 설정
clf_gs = GridSearchCV(tree.DecisionTreeClassifier(), params, 
                      cv=KFold(n_splits=10, shuffle=True), scoring='accuracy')

# 그리드 서치 실행
clf_gs.fit(X_train_fs, y_train_fs)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_depth': [2, 4, 6, 8, 10],
                         'min_samples_leaf': [10, 20, 30, 40, 50]},
             scoring='accuracy')

In [53]:
print(clf_gs.best_score_)
print(clf_gs.best_params_)

0.7069333333333333
{'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 10}


In [54]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [55]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], clf_gs.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)


0.5939803439803439

In [64]:
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import plot_importance
#import pan
import numpy as np

In [65]:
X_train_fs

array([[8.49310000e+04, 2.50000000e+00, 8.00000000e+00, ...,
        6.79446154e+05, 4.24653846e+05, 2.16700000e+00],
       [2.27900000e+04, 2.30000000e+00, 1.10000000e+01, ...,
        1.13950000e+05, 2.50690000e+05, 1.53800000e+00],
       [1.15436000e+05, 2.50000000e+00, 3.00000000e+00, ...,
        2.30872000e+05, 1.15436000e+05, 1.33300000e+00],
       ...,
       [8.81860000e+04, 1.60000000e+00, 2.30000000e+01, ...,
        3.52744000e+05, 2.64558000e+05, 2.05900000e+00],
       [2.91950000e+04, 1.00000000e+00, 5.00000000e+00, ...,
        2.91953330e+04, 2.91953330e+04, 1.20000000e+00],
       [4.44500000e+04, 1.70000000e+00, 3.00000000e+00, ...,
        2.22250000e+05, 4.44500000e+04, 3.00000000e+00]])

In [66]:
X_test_fs=X_test.values

In [67]:
dtrain = xgb.DMatrix(data=X_train_fs, label = y_train)
dtest = xgb.DMatrix(data=X_test_fs, label=y_test)

In [68]:
params = {'max_depth':3,
          'eta':0.1,
          'objective':'binary:logistic',
          'eval_metric':'logloss',
          'early_stoppings':100,
          'learning_rate' : 0.01,
          'n_estimators' : 100
         }

num_rounds = 400

In [69]:
# train 데이터 세트는 'train', evaluation(test) 데이터 세트는 'eval'로 명기
wlist = [(dtrain, 'train'),(dtest, 'eval')]
# 하이퍼 파라미터와 early stoppinig 파라미터를 train() 함수의 파라미터로 전달
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds, evals=wlist)

Parameters: { early_stoppings, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-logloss:0.69113	eval-logloss:0.69152
[1]	train-logloss:0.68915	eval-logloss:0.68992
[2]	train-logloss:0.68721	eval-logloss:0.68837
[3]	train-logloss:0.68531	eval-logloss:0.68684
[4]	train-logloss:0.68344	eval-logloss:0.68535
[5]	train-logloss:0.68161	eval-logloss:0.68390
[6]	train-logloss:0.67982	eval-logloss:0.68248
[7]	train-logloss:0.67806	eval-logloss:0.68108
[8]	train-logloss:0.67634	eval-logloss:0.67972
[9]	train-logloss:0.67464	eval-logloss:0.67839
[10]	train-logloss:0.67297	eval-logloss:0.67710
[11]	train-logloss:0.67134	eval-logloss:0.67585
[12]	train-logloss:0.66974	eval-logloss:0.67461
[13]	train-logloss:0.66817	eval-logloss:0.67342
[14]	train-logloss:0.66663	eval-l

[162]	train-logloss:0.58060	eval-logloss:0.61774
[163]	train-logloss:0.58038	eval-logloss:0.61762
[164]	train-logloss:0.58014	eval-logloss:0.61752
[165]	train-logloss:0.57997	eval-logloss:0.61750
[166]	train-logloss:0.57974	eval-logloss:0.61743
[167]	train-logloss:0.57957	eval-logloss:0.61738
[168]	train-logloss:0.57935	eval-logloss:0.61729
[169]	train-logloss:0.57919	eval-logloss:0.61724
[170]	train-logloss:0.57895	eval-logloss:0.61713
[171]	train-logloss:0.57879	eval-logloss:0.61712
[172]	train-logloss:0.57858	eval-logloss:0.61706
[173]	train-logloss:0.57835	eval-logloss:0.61696
[174]	train-logloss:0.57817	eval-logloss:0.61692
[175]	train-logloss:0.57801	eval-logloss:0.61688
[176]	train-logloss:0.57780	eval-logloss:0.61679
[177]	train-logloss:0.57760	eval-logloss:0.61674
[178]	train-logloss:0.57743	eval-logloss:0.61672
[179]	train-logloss:0.57723	eval-logloss:0.61666
[180]	train-logloss:0.57703	eval-logloss:0.61659
[181]	train-logloss:0.57683	eval-logloss:0.61654
[182]	train-logloss:

[330]	train-logloss:0.55990	eval-logloss:0.61593
[331]	train-logloss:0.55979	eval-logloss:0.61597
[332]	train-logloss:0.55974	eval-logloss:0.61598
[333]	train-logloss:0.55961	eval-logloss:0.61598
[334]	train-logloss:0.55951	eval-logloss:0.61598
[335]	train-logloss:0.55943	eval-logloss:0.61596
[336]	train-logloss:0.55939	eval-logloss:0.61598
[337]	train-logloss:0.55929	eval-logloss:0.61600
[338]	train-logloss:0.55920	eval-logloss:0.61600
[339]	train-logloss:0.55916	eval-logloss:0.61603
[340]	train-logloss:0.55906	eval-logloss:0.61602
[341]	train-logloss:0.55897	eval-logloss:0.61605
[342]	train-logloss:0.55888	eval-logloss:0.61603
[343]	train-logloss:0.55878	eval-logloss:0.61603
[344]	train-logloss:0.55869	eval-logloss:0.61604
[345]	train-logloss:0.55861	eval-logloss:0.61604
[346]	train-logloss:0.55851	eval-logloss:0.61603
[347]	train-logloss:0.55843	eval-logloss:0.61604
[348]	train-logloss:0.55838	eval-logloss:0.61605
[349]	train-logloss:0.55828	eval-logloss:0.61605
[350]	train-logloss:

In [70]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
     eval_metric = "logloss",
    nthread=4,
    seed=42)
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'roc_auc',
n_jobs = 10,
cv = 10,
verbose=True
)

In [71]:
grid_search.fit(X_train_fs,y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   11.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   58.4s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  2.9min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  5.0min




[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed:  6.4min finished


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=42,
                                     subsample=No

In [72]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.6198736200399699
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 60}


In [73]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], grid_search.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)


0.7027368277368278

단순 표본으로의 예측률

-> 스케일링후의 예측률

In [74]:
dataPS = dataPP.copy()

In [75]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(dataPS)

MinMaxScaler()

In [76]:
x = scaler.transform(dataPS)
dataPS = pd.DataFrame(x , columns = dataPS.columns)


In [77]:
dataPS.describe()

Unnamed: 0,custid,gender,평균구매가격,평균할부개월수,구매브랜드종류,수입상품_구매비율,주말방문비율,주구매코너,코너묶음,코너별구매건,아침구매액,점심구매액,저녁구매액,봄구매액,여름구매액,가을구매액,겨울구매액,내점구매율
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.503902,0.3042,0.175572,0.10174,0.123074,0.179899,0.385719,0.646208,0.537029,0.388932,0.033597,0.038559,0.04005,0.041252,0.034203,0.050831,0.060285,0.15252
std,0.287677,0.460114,0.079175,0.081173,0.109642,0.194836,0.245098,0.247343,0.184499,0.379004,0.047028,0.04827,0.047161,0.058861,0.043935,0.056867,0.057713,0.113202
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.256483,0.0,0.127769,0.044444,0.045113,0.05,0.222,0.5,0.428571,0.107827,0.013322,0.015055,0.016474,0.009945,0.012267,0.022553,0.029184,0.073682
50%,0.506793,0.0,0.155188,0.088889,0.090226,0.125,0.3485,0.583333,0.571429,0.337214,0.017891,0.024362,0.024283,0.02184,0.020205,0.032283,0.040884,0.133273
75%,0.751701,1.0,0.197174,0.133333,0.172932,0.25,0.5,0.958333,0.714286,1.0,0.033346,0.043662,0.044095,0.049298,0.039604,0.056197,0.068046,0.206591
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [78]:
dfX = dataPS.drop(['custid','gender'], axis=1) 
dfy = dataPS['gender']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(
    dfX, dfy, random_state=0)

In [80]:
X_train_fs= X_train.values

In [81]:
y_train_fs = y_train.values

In [82]:
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
#pred_tree = tree.predict(X_test_fs)

In [83]:
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import accuracy_score

# K-Fold교차 검증
kf = KFold(n_splits=10, shuffle=True)
# 모델의 정밀도를 보존하기 위한 준비
scores = []
# 데이터를 셔플하고, 훈련데이터와 테스트 데이터를 분활
for train_id, test_id in kf.split(X_train_fs):
    # 훈련 데이터를 사용한 모델을 작성
    x = X_train_fs[train_id]
    y = y_train_fs[train_id]
    clf = tree.DecisionTreeClassifier()
    clf.fit(x,y)
    # 테스트 데이터에 모델을 적용
    pred_y = clf.predict(X_train_fs[test_id])
    # 모델의 정밀도를 계산하고 보존
    score = accuracy_score(y_train_fs[test_id], pred_y)
    scores.append(score)

# 모델의 평균정밀도, 표준편차를 확인
scores = np.array(scores)
print(scores.mean(), scores.std())

0.5888000000000001 0.021226398658274553


In [84]:
print(recall_score(y_train_fs[test_id], pred_y))
print(precision_score(y_train_fs[test_id], pred_y))

0.3669724770642202
0.35398230088495575


In [85]:
params = {
    'criterion': ['entropy'],    
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [10, 20, 30, 40, 50],
}

# 그리드 서치의 실행조건을 설정
clf_gs = GridSearchCV(tree.DecisionTreeClassifier(), params, 
                      cv=KFold(n_splits=10, shuffle=True), scoring='accuracy')

# 그리드 서치 실행
clf_gs.fit(X_train_fs, y_train_fs)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_depth': [2, 4, 6, 8, 10],
                         'min_samples_leaf': [10, 20, 30, 40, 50]},
             scoring='accuracy')

In [86]:
print(clf_gs.best_score_)
print(clf_gs.best_params_)

0.7069333333333333
{'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 10}


In [87]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], clf_gs.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)


0.5700317307029041

In [88]:
dtrain = xgb.DMatrix(data=X_train_fs, label = y_train)
dtest = xgb.DMatrix(data=X_test_fs, label=y_test)

In [89]:
params = {'max_depth':3,
          'eta':0.1,
          'objective':'binary:logistic',
          'eval_metric':'logloss',
          'early_stoppings':100,
          'learning_rate' : 0.01,
          'n_estimators' : 100
         }

num_rounds = 400

In [90]:
# train 데이터 세트는 'train', evaluation(test) 데이터 세트는 'eval'로 명기
wlist = [(dtrain, 'train'),(dtest, 'eval')]
# 하이퍼 파라미터와 early stoppinig 파라미터를 train() 함수의 파라미터로 전달
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds, evals=wlist)

Parameters: { early_stoppings, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-logloss:0.69113	eval-logloss:0.69319
[1]	train-logloss:0.68915	eval-logloss:0.69322
[2]	train-logloss:0.68721	eval-logloss:0.69326
[3]	train-logloss:0.68531	eval-logloss:0.69330
[4]	train-logloss:0.68344	eval-logloss:0.69333
[5]	train-logloss:0.68161	eval-logloss:0.69337
[6]	train-logloss:0.67982	eval-logloss:0.69341
[7]	train-logloss:0.67806	eval-logloss:0.69344
[8]	train-logloss:0.67634	eval-logloss:0.69348
[9]	train-logloss:0.67464	eval-logloss:0.69351
[10]	train-logloss:0.67297	eval-logloss:0.69355
[11]	train-logloss:0.67134	eval-logloss:0.69358
[12]	train-logloss:0.66974	eval-logloss:0.69362
[13]	train-logloss:0.66817	eval-logloss:0.69365
[14]	train-logloss:0.66663	eval-l

[162]	train-logloss:0.58060	eval-logloss:0.65922
[163]	train-logloss:0.58038	eval-logloss:0.65904
[164]	train-logloss:0.58014	eval-logloss:0.65887
[165]	train-logloss:0.57997	eval-logloss:0.65785
[166]	train-logloss:0.57974	eval-logloss:0.65769
[167]	train-logloss:0.57957	eval-logloss:0.65767
[168]	train-logloss:0.57935	eval-logloss:0.65751
[169]	train-logloss:0.57919	eval-logloss:0.65749
[170]	train-logloss:0.57895	eval-logloss:0.65722
[171]	train-logloss:0.57879	eval-logloss:0.65623
[172]	train-logloss:0.57858	eval-logloss:0.65609
[173]	train-logloss:0.57835	eval-logloss:0.65583
[174]	train-logloss:0.57817	eval-logloss:0.65583
[175]	train-logloss:0.57801	eval-logloss:0.65581
[176]	train-logloss:0.57780	eval-logloss:0.65567
[177]	train-logloss:0.57760	eval-logloss:0.65553
[178]	train-logloss:0.57743	eval-logloss:0.65589
[179]	train-logloss:0.57723	eval-logloss:0.65565
[180]	train-logloss:0.57703	eval-logloss:0.65554
[181]	train-logloss:0.57683	eval-logloss:0.65530
[182]	train-logloss:

[330]	train-logloss:0.55990	eval-logloss:0.64484
[331]	train-logloss:0.55979	eval-logloss:0.64452
[332]	train-logloss:0.55974	eval-logloss:0.64453
[333]	train-logloss:0.55961	eval-logloss:0.64458
[334]	train-logloss:0.55951	eval-logloss:0.64375
[335]	train-logloss:0.55943	eval-logloss:0.64360
[336]	train-logloss:0.55939	eval-logloss:0.64361
[337]	train-logloss:0.55929	eval-logloss:0.64358
[338]	train-logloss:0.55920	eval-logloss:0.64419
[339]	train-logloss:0.55916	eval-logloss:0.64419
[340]	train-logloss:0.55906	eval-logloss:0.64389
[341]	train-logloss:0.55897	eval-logloss:0.64388
[342]	train-logloss:0.55888	eval-logloss:0.64373
[343]	train-logloss:0.55878	eval-logloss:0.64294
[344]	train-logloss:0.55869	eval-logloss:0.64294
[345]	train-logloss:0.55861	eval-logloss:0.64337
[346]	train-logloss:0.55851	eval-logloss:0.64340
[347]	train-logloss:0.55843	eval-logloss:0.64372
[348]	train-logloss:0.55838	eval-logloss:0.64373
[349]	train-logloss:0.55828	eval-logloss:0.64294
[350]	train-logloss:

In [91]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
     eval_metric = "logloss",
    nthread=4,
    seed=42)
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'roc_auc',
n_jobs = 10,
cv = 10,
verbose=True
)

In [92]:
grid_search.fit(X_train_fs,y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    8.9s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   34.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  2.9min
[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed:  3.9min finished


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None

In [93]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.6198736200399699
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 60}


In [94]:
fpr, tpr, _ = roc_curve(y_train[test_id], grid_search.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)

0.7129750982961993

스케일링 후 예측률

-> 스케일링 + KBest 셀렉션 후 예측률

In [95]:
dataSS = dataPS.copy()

In [96]:
dataSS.head()

Unnamed: 0,custid,gender,평균구매가격,평균할부개월수,구매브랜드종류,수입상품_구매비율,주말방문비율,주구매코너,코너묶음,코너별구매건,아침구매액,점심구매액,저녁구매액,봄구매액,여름구매액,가을구매액,겨울구매액,내점구매율
0,0.0,0.0,0.118288,0.077778,0.06015,0.267,1.0,1.0,0.714286,0.337214,0.011695,0.015525,0.022815,0.015213,0.009767,0.019271,0.041942,0.090909
1,0.00016,0.0,0.143117,0.044444,0.105263,0.393,0.286,1.0,0.714286,0.337214,0.012629,0.019533,0.062933,0.047305,0.010487,0.031474,0.048938,0.181818
2,0.001281,0.0,0.223499,0.177778,0.030075,0.6,1.0,1.0,0.714286,0.337214,0.015652,0.020227,0.01789,0.034966,0.012817,0.023471,0.030169,0.121273
3,0.001481,0.0,0.161101,0.1,0.045113,0.231,0.222,1.0,0.714286,0.337214,0.013305,0.014765,0.040345,0.01127,0.034016,0.02098,0.027321,0.080727
4,0.001641,0.0,0.100564,0.0,0.067669,0.25,0.333,1.0,0.714286,0.337214,0.011029,0.014858,0.015155,0.016527,0.009253,0.018564,0.029609,0.303091


In [97]:
from sklearn.feature_selection import SelectKBest

print(X_train_fs.shape)
X_train_ss = SelectKBest(k=5).fit_transform(X_train_fs, y_train_fs) # 5개를 선택해 fit, transform을 동시에 해야함.
X_train_ss.shape


(3750, 16)


(3750, 5)

In [98]:
kf = KFold(n_splits=10, shuffle=True)
# 모델의 정밀도를 보존하기 위한 준비
scores = []
# 데이터를 셔플하고, 훈련데이터와 테스트 데이터를 분활
for train_id, test_id in kf.split(X_train_ss):
    # 훈련 데이터를 사용한 모델을 작성
    x = X_train_ss[train_id]
    y = y_train_fs[train_id]
    clf = tree.DecisionTreeClassifier()
    clf.fit(x,y)
    # 테스트 데이터에 모델을 적용
    pred_y = clf.predict(X_train_ss[test_id])
    # 모델의 정밀도를 계산하고 보존
    score = accuracy_score(y_train_fs[test_id], pred_y)
    scores.append(score)

# 모델의 평균정밀도, 표준편차를 확인
scores = np.array(scores)
print(scores.mean(), scores.std())

0.6008 0.012510617712789232


In [99]:
params = {
    'criterion': ['entropy'],    
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [10, 20, 30, 40, 50],
}

# 그리드 서치의 실행조건을 설정
clf_gs = GridSearchCV(tree.DecisionTreeClassifier(), params, 
                      cv=KFold(n_splits=10, shuffle=True), scoring='accuracy')

# 그리드 서치 실행
clf_gs.fit(X_train_ss, y_train_fs)
# 가장 높은 정밀도와 패러메터의 조합을 표시
print(clf_gs.best_score_)
print(clf_gs.best_params_)

0.7088000000000001
{'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 50}


In [100]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], clf_gs.predict_proba(X_train_ss[test_id])[:,1])
auc(fpr, tpr)


0.5695962248084019

In [101]:
dtrain = xgb.DMatrix(data=X_train_fs, label = y_train)
dtest = xgb.DMatrix(data=X_test_fs, label=y_test)

In [102]:
params = {'max_depth':3,
          'eta':0.1,
          'objective':'binary:logistic',
          'eval_metric':'logloss',
          'early_stoppings':100,
          'learning_rate' : 0.01,
          'n_estimators' : 100
         }

num_rounds = 400

In [103]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
     eval_metric = "logloss",
    nthread=4,
    seed=42)
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'roc_auc',
n_jobs = 10,
cv = 10,
verbose=True
)

In [104]:
grid_search.fit(X_train_fs,y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    8.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   34.1s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  1.5min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  2.9min
[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed:  3.9min finished


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None

In [118]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.6198736200399699
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 60}


In [119]:
fpr, tpr, _ = roc_curve(y_train[test_id], grid_search.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)

0.6886437908496732

KBest는 제외

RandomForest 셀렉트 진행

In [78]:
dataS = dataPS.copy()

In [79]:
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier

select = SelectFromModel(RandomForestClassifier(), threshold=None) # threshold=None 스스로 알아서 처리
# select 는 전처리기

In [80]:
X_train_fs = select.fit(X_train, y_train).transform(X_train)
# 변수의 중요도가 나온다.
print("X_train.shape: {}, X_train_fs.shape: {}".format(
    X_train.shape, X_train_fs.shape))

X_train.shape: (3750, 16), X_train_fs.shape: (3750, 11)


In [81]:
kf = KFold(n_splits=10, shuffle=True)
# 모델의 정밀도를 보존하기 위한 준비
scores = []
# 데이터를 셔플하고, 훈련데이터와 테스트 데이터를 분활
for train_id, test_id in kf.split(X_train_fs):
    # 훈련 데이터를 사용한 모델을 작성
    x = X_train_fs[train_id]
    y = y_train_fs[train_id]
    clf = tree.DecisionTreeClassifier()
    clf.fit(x,y)
    # 테스트 데이터에 모델을 적용
    pred_y = clf.predict(X_train_fs[test_id])
    # 모델의 정밀도를 계산하고 보존
    score = accuracy_score(y_train_fs[test_id], pred_y)
    scores.append(score)

# 모델의 평균정밀도, 표준편차를 확인
scores = np.array(scores)
print(scores.mean(), scores.std())

0.5954666666666666 0.0264538171998758


In [82]:
print(recall_score(y_train_fs[test_id], pred_y))
print(precision_score(y_train_fs[test_id], pred_y))

0.31
0.28440366972477066


In [83]:
params = {
    'criterion': ['entropy'],    
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [10, 20, 30, 40, 50],
}

# 그리드 서치의 실행조건을 설정
clf_gs = GridSearchCV(tree.DecisionTreeClassifier(), params, 
                      cv=KFold(n_splits=10, shuffle=True), scoring='accuracy')

# 그리드 서치 실행
clf_gs.fit(X_train_fs, y_train_fs)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_depth': [2, 4, 6, 8, 10],
                         'min_samples_leaf': [10, 20, 30, 40, 50]},
             scoring='accuracy')

In [84]:
print(clf_gs.best_score_)
print(clf_gs.best_params_)

0.7074666666666667
{'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 10}


In [85]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], clf_gs.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)


0.5893272727272727

In [86]:
#단순 스케일링 auc점수가 높다

In [93]:
#폴리 + 스케일링 + 셀렉션 

In [94]:
from sklearn.preprocessing import PolynomialFeatures

In [88]:
dfX = dataPS.drop(['custid','gender'], axis=1) 
dfy = dataPS['gender']

In [98]:
X_train, X_test, y_train, y_test = train_test_split(
    dfX, dfy, random_state=0)

In [109]:
poly = PolynomialFeatures(2)

In [112]:
X_train_fs = poly.fit_transform (X_train_fs)

In [113]:
from sklearn.feature_selection import SelectKBest

print(X_train_fs.shape)
X_train_new= SelectKBest(k=5).fit_transform(X_train_fs, y_train_fs)
X_train_new.shape

(3750, 153)


  f = msb / msw


(3750, 5)

In [123]:
y_train_fs = y_train.values

In [127]:
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True)
# 모델의 정밀도를 보존하기 위한 준비
scores = []
# 데이터를 셔플하고, 훈련데이터와 테스트 데이터를 분활
for train_id, test_id in kf.split(X_train_new):
    # 훈련 데이터를 사용한 모델을 작성
    x = X_train_new[train_id]
    y = y_train_fs[train_id]
    clf = tree.DecisionTreeClassifier()
    clf.fit(x,y)
    # 테스트 데이터에 모델을 적용
    pred_y = clf.predict(X_train_new[test_id])
    # 모델의 정밀도를 계산하고 보존
    score = accuracy_score(y_train_fs[test_id], pred_y)
    scores.append(score)

# 모델의 평균정밀도, 표준편차를 확인
scores = np.array(scores)
print(scores.mean(), scores.std())

0.5946666666666667 0.02804440922695446


In [129]:
params = {
    'criterion': ['entropy'],    
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [10, 20, 30, 40, 50],
}

# 그리드 서치의 실행조건을 설정
clf_gs = GridSearchCV(tree.DecisionTreeClassifier(), params, 
                      cv=KFold(n_splits=10, shuffle=True), scoring='accuracy')

# 그리드 서치 실행
clf_gs.fit(X_train_new, y_train_fs)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy'],
                         'max_depth': [2, 4, 6, 8, 10],
                         'min_samples_leaf': [10, 20, 30, 40, 50]},
             scoring='accuracy')

In [130]:
print(clf_gs.best_score_)
print(clf_gs.best_params_)

0.7069333333333333
{'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 40}


In [131]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], clf_gs.predict_proba(X_train_new[test_id])[:,1])
auc(fpr, tpr)


0.5976153482402216

In [120]:
dtrain = xgb.DMatrix(data=X_train_fs, label = y_train)
dtest = xgb.DMatrix(data=X_test_fs, label=y_test)

In [121]:
params = {'max_depth':3,
          'eta':0.1,
          'objective':'binary:logistic',
          'eval_metric':'logloss',
          'early_stoppings':100,
          'learning_rate' : 0.01,
          'n_estimators' : 100
         }

num_rounds = 400

In [122]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
     eval_metric = "logloss",
    nthread=4,
    seed=42)
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'roc_auc',
n_jobs = 10,
cv = 10,
verbose=True
)

In [123]:
grid_search.fit(X_train_fs,y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   13.2s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   49.8s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  2.3min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  4.7min




[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed:  6.1min finished


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=42,
                                     subsample=No

In [125]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.6198736200399699
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 60}


In [126]:
fpr, tpr, _ = roc_curve(y_train_fs[test_id], grid_search.predict_proba(X_train_fs[test_id])[:,1])
auc(fpr, tpr)

0.6886437908496732