In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# Work around broken glyph rendering: turn off the Unicode minus sign on axes
mpl.rcParams['axes.unicode_minus'] = False

# Use the NanumGothic font family for plot text
plt.rcParams["font.family"] = 'NanumGothic'

## 나무구조 생성 및 저장
from sklearn.tree import export_graphviz
##그래프 비즈 나무구조 시각화
import graphviz

##데이터 분할
from sklearn.model_selection import train_test_split

#예측 회귀
from sklearn.ensemble import GradientBoostingRegressor

##최적 모델 파라미터 탐색
from sklearn.model_selection import GridSearchCV

import os
import pydot
from IPython.display import Image, display

In [4]:
## Load the raw data set (the CSV file is EUC-KR encoded)
df_raw = pd.read_csv(
    "./체질검사.csv",
    encoding='euc-kr',
)
# Preview the first rows
df_raw.head()

Unnamed: 0,FAT,AGE,WEIGHT,HEIGHT,NECK,CHEST,ABDOMEN,HIP,THIGH,KNEE,ANKLE,BICEPS,FOREARM,WRIST,GENDER
0,35.2,46,363.15 lb,72.25 inch,51.2,136.2,148.1,147.7,87.3,49.1,29.6,45.0,29.0,21.4,남성
1,11.8,27,168 lb,71.25 inch,38.1,93.0,79.1,94.5,57.3,36.2,24.5,29.0,30.0,18.8,남성
2,22.2,69,177.75 lb,68.5 inch,38.7,102.0,95.0,98.3,55.0,38.3,21.8,30.8,25.7,18.8,남성
3,10.6,57,147.75 lb,65.75 inch,35.2,99.6,86.4,90.1,53.0,35.0,21.3,31.7,27.3,16.9,여성
4,47.5,51,219 lb,64 inch,41.2,119.8,122.1,112.8,62.5,36.9,23.6,34.7,29.1,18.4,여성


In [5]:
## Reshape the raw data
## WEIGHT and HEIGHT hold a number followed by a unit (e.g. "168 lb"):
## split each on whitespace into the value and a "<NAME>_UNIT" column,
## then cast the value to float64.
for measure in ('WEIGHT', 'HEIGHT'):
    df_raw[[measure, measure + '_UNIT']] = df_raw[measure].str.split(expand=True)
    df_raw[measure] = df_raw[measure].astype('float64')

# Show the dtypes while the unit columns are still present
df_raw.info()

# The unit columns are not needed for modelling, so drop them
df_raw = df_raw.drop(columns=['WEIGHT_UNIT', 'HEIGHT_UNIT'])

# Dummy-encode the categorical columns; drop_first=True drops the first level of each
df_raw_dummy = pd.get_dummies(df_raw, drop_first=True)
df_raw_dummy.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   FAT          252 non-null    float64
 1   AGE          252 non-null    int64  
 2   WEIGHT       252 non-null    float64
 3   HEIGHT       252 non-null    float64
 4   NECK         252 non-null    float64
 5   CHEST        252 non-null    float64
 6   ABDOMEN      252 non-null    float64
 7   HIP          252 non-null    float64
 8   THIGH        252 non-null    float64
 9   KNEE         252 non-null    float64
 10  ANKLE        252 non-null    float64
 11  BICEPS       252 non-null    float64
 12  FOREARM      252 non-null    float64
 13  WRIST        252 non-null    float64
 14  GENDER       252 non-null    object 
 15  WEIGHT_UNIT  252 non-null    object 
 16  HEIGHT_UNIT  252 non-null    object 
dtypes: float64(13), int64(1), object(3)
memory usage: 33.6+ KB


Unnamed: 0,FAT,AGE,WEIGHT,HEIGHT,NECK,CHEST,ABDOMEN,HIP,THIGH,KNEE,ANKLE,BICEPS,FOREARM,WRIST,GENDER_여성
0,35.2,46,363.15,72.25,51.2,136.2,148.1,147.7,87.3,49.1,29.6,45.0,29.0,21.4,0
1,11.8,27,168.0,71.25,38.1,93.0,79.1,94.5,57.3,36.2,24.5,29.0,30.0,18.8,0
2,22.2,69,177.75,68.5,38.7,102.0,95.0,98.3,55.0,38.3,21.8,30.8,25.7,18.8,0
3,10.6,57,147.75,65.75,35.2,99.6,86.4,90.1,53.0,35.0,21.3,31.7,27.3,16.9,1
4,47.5,51,219.0,64.0,41.2,119.8,122.1,112.8,62.5,36.9,23.6,34.7,29.1,18.4,1


In [6]:
## Assign variable roles: FAT is the target, every other column is a feature
df_raw_y = df_raw_dummy['FAT']
df_raw_x = df_raw_dummy.drop(columns='FAT')

In [7]:
## Split the data: 30% held out for testing, fixed seed for reproducibility
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(
    df_raw_x, df_raw_y, test_size=0.3, random_state=1234)

# Label each shape with the split it belongs to
# (all four lines were previously printed as "train_data_size")
print(f"train_x_size: {df_train_x.shape}")
print(f"test_x_size: {df_test_x.shape}")
print(f"train_y_size: {df_train_y.shape}")
print(f"test_y_size: {df_test_y.shape}")


train_data_size(176, 14)
train_data_size(76, 14)
train_data_size(176,)
train_data_size(76,)


In [9]:
# Baseline model: only random_state is set, every other parameter keeps its default.
# fit() returns the estimator itself, so construction and fitting can be chained.
gb_uncustomized = GradientBoostingRegressor(random_state=1234).fit(df_train_x, df_train_y)

# Score on the training split
print(f"score on trainnig set{gb_uncustomized.score(df_train_x,df_train_y)}")

# Score on the held-out test split
print(f"score on test set{gb_uncustomized.score(df_test_x,df_test_y)}")

score on trainnig set0.9795671047777067
score on test set0.5826975018333711


In [10]:
# Show the parameters of the baseline model (deep=True is the default)
gb_uncustomized.get_params(deep=True)

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 1234,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Parameter tuning: effect of the number of trees (n_estimators)

# Scores on the train and test splits, one entry per candidate value
train_score = []
test_score = []

# n_estimators: candidate tree counts 1..19
para_n_tree = list(range(1, 20))

for v_n_estimator in para_n_tree:
    # n_estimators is a GradientBoostingRegressor parameter. The previous
    # DecisionTreeRegressor was never imported and does not accept n_estimators.
    gb = GradientBoostingRegressor(random_state=1234,
                                   n_estimators=v_n_estimator)

    gb.fit(df_train_x, df_train_y)
    train_score.append(gb.score(df_train_x, df_train_y))
    test_score.append(gb.score(df_test_x, df_test_y))


# Collect the results in one frame (same columns and order as before)
df_score_n = pd.DataFrame({
    'Estimator': para_n_tree,
    'TrainScore': train_score,
    'TestScore': test_score,
})