In [1]:
#정재환
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.widgets import Button
from matplotlib import font_manager, rc
import numpy as np
import csv
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# 한글 폰트 적용 - 한글깨짐 방지
font_name = font_manager.FontProperties(fname='c:/Windows/Fonts/malgun.ttf').get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

In [3]:
# csv 파일 읽어와서 데이터프레임에 저장
df_source = pd.read_csv('NHIS_OPEN_GJ_2017_3.csv', encoding='cp949')
df_source = df_source.drop_duplicates() # 중복값 제거
df_source = df_source.dropna() # 결측값 제거
df_source

Unnamed: 0.1,Unnamed: 0,번호,성별,연령,신장,체중,허리둘레,수축기혈압,이완기혈압,식전혈당,총콜레스테롤,흡연상태,음주여부
0,0,1,1,40,170,75,90.0,120.0,80.0,99.0,193.0,1.0,1.0
1,1,2,1,35,180,80,89.0,130.0,82.0,106.0,228.0,3.0,0.0
2,2,3,1,45,165,75,91.0,120.0,70.0,98.0,136.0,1.0,0.0
3,3,4,1,55,175,80,91.0,145.0,87.0,95.0,201.0,1.0,0.0
4,4,5,1,55,165,60,80.0,138.0,82.0,101.0,199.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199772,199772,199996,1,45,170,100,107.0,135.0,88.0,112.0,247.0,3.0,1.0
199773,199773,199997,2,30,155,45,63.0,107.0,61.0,83.0,151.0,1.0,1.0
199774,199774,199998,1,55,160,70,91.0,100.0,76.0,100.0,222.0,1.0,0.0
199775,199775,199999,1,40,170,75,88.2,147.0,89.0,81.0,125.0,3.0,1.0


In [4]:
x = df_source.drop(['성별', '번호', 'Unnamed: 0'], axis=1) # 성별특성을 제외한 모든 특성의 데이터
y = df_source['성별']

In [5]:
# 훈련, 테스트 셋 분리
# test_size=0.3으로 설정하여 훈련:테스트 비율을 7:3으로
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# n_estinators매개변수로 군집수를 100으로 설정
xgb = XGBClassifier(n_estinators=100) # xgboost 모델 생성
xgb = xgb.fit(x_train, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)



Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9326
훈련 셋 정확도: 0.9409


In [6]:
# 테스트 셋 에서는 93%, 훈련 셋 에서는 94%의 정확도가 나타난다

In [7]:
# 각 특성별 확률값(중요도) 출력
feature_name_list = x_train.columns
df_feature_importance = xgb.feature_importances_
for i,v in enumerate(df_feature_importance):
    print('(%2d) Feature : %s, %5f' %(i, feature_name_list[i], v))

( 0) Feature : 연령, 0.033661
( 1) Feature : 신장, 0.565439
( 2) Feature : 체중, 0.016018
( 3) Feature : 허리둘레, 0.026227
( 4) Feature : 수축기혈압, 0.005974
( 5) Feature : 이완기혈압, 0.004733
( 6) Feature : 식전혈당, 0.003906
( 7) Feature : 총콜레스테롤, 0.005177
( 8) Feature : 흡연상태, 0.312890
( 9) Feature : 음주여부, 0.025975


In [8]:
#각 특성을 확률값이 큰 순서로 정렬
df_importance = pd.DataFrame({'feature' : x_train.columns, 'importance' : xgb.feature_importances_})
df_importance = df_importance.sort_values(by='importance', ascending=False)
df_importance.head(10)

Unnamed: 0,feature,importance
1,신장,0.565439
8,흡연상태,0.31289
0,연령,0.033661
3,허리둘레,0.026227
9,음주여부,0.025975
2,체중,0.016018
4,수축기혈압,0.005974
7,총콜레스테롤,0.005177
5,이완기혈압,0.004733
6,식전혈당,0.003906


In [9]:
# 신장, 흡연상태, 연령 등의 순서로 높은 확률값을 나타낸다

In [10]:
del xgb, x_train, y_train, x_test, y_test

In [11]:
# 이번에는 n_estinators를 2로 설정하여 실행

In [12]:
# 훈련, 테스트 셋 분리
# test_size=0.3으로 설정하여 훈련:테스트 비율을 7:3으로
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# n_estinators매개변수로 군집수를 2으로 설정
xgb = XGBClassifier(n_estinators=2) # xgboost 모델 생성
xgb = xgb.fit(x_train, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)



Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9326
훈련 셋 정확도: 0.9409


In [13]:
#기존에 n_estinators를 100으로 설정했을 때와 결과가 거의 같다

In [14]:
del xgb, x_train, y_train, x_test, y_test

In [15]:
#이번에는 데이터를 표준화 하고 xgboost 알고리즘을 실행해보자

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# 표준화 전처리 적용
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaler = scaler.transform(x_train)
x_train_scaler = pd.DataFrame(x_train_scaler, columns = x_train.columns) # 표준화를 적용한 이후 다시 컬럼명을 지정해야 한다 

del scaler
scaler = StandardScaler()
scaler.fit(x_test)
x_test_scaler = scaler.transform(x_test)
x_test_scaler = pd.DataFrame(x_test_scaler, columns = x_test.columns)

In [17]:
# n_estinators매개변수로 군집수를 100으로 설정
xgb = XGBClassifier(n_estinators=100) # xgboost 모델 생성
xgb = xgb.fit(x_train_scaler, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test_scaler, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train_scaler, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)



Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9325
훈련 셋 정확도: 0.9409


In [18]:
#표준화를 했을 떄의 결과는 이전과 거의 차이가 없다

In [19]:
del xgb, x_train, y_train, x_test, y_test

In [20]:
#이번에는 노드의 최대깊이를 3으로 설정했들 때의 결과를 확인해보자

In [21]:
# 훈련, 테스트 셋 분리
# test_size=0.3으로 설정하여 훈련:테스트 비율을 7:3으로
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# n_estinators매개변수로 군집수를 100으로 설정
#노드의 최대 깊이(max_depth)를 3으로 설정
xgb = XGBClassifier(n_estinators=100, max_depth=3) # xgboost 모델 생성
xgb = xgb.fit(x_train, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)

Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9320
훈련 셋 정확도: 0.9333


In [23]:
# 테스트셋 정확도가 93%로 기존에 비해 1% 정도 감소하였다

In [24]:
# 각 특성별 확률값(중요도) 출력
feature_name_list = x_train.columns
df_feature_importance = xgb.feature_importances_
for i,v in enumerate(df_feature_importance):
    print('(%2d) Feature : %s, %5f' %(i, feature_name_list[i], v))
#각 특성을 확률값이 큰 순서로 정렬
df_importance = pd.DataFrame({'feature' : x_train.columns, 'importance' : xgb.feature_importances_})
df_importance = df_importance.sort_values(by='importance', ascending=False)
df_importance.head(10)

( 0) Feature : 연령, 0.039096
( 1) Feature : 신장, 0.474767
( 2) Feature : 체중, 0.024714
( 3) Feature : 허리둘레, 0.069104
( 4) Feature : 수축기혈압, 0.012250
( 5) Feature : 이완기혈압, 0.006712
( 6) Feature : 식전혈당, 0.004806
( 7) Feature : 총콜레스테롤, 0.007326
( 8) Feature : 흡연상태, 0.322111
( 9) Feature : 음주여부, 0.039112


Unnamed: 0,feature,importance
1,신장,0.474767
8,흡연상태,0.322111
3,허리둘레,0.069104
9,음주여부,0.039112
0,연령,0.039096
2,체중,0.024714
4,수축기혈압,0.01225
7,총콜레스테롤,0.007326
5,이완기혈압,0.006712
6,식전혈당,0.004806


In [25]:
#기존에 비해 허리둘레 요소의 확률값이 더 증가하였다

In [26]:
del xgb, x_train, y_train, x_test, y_test

In [27]:
#이번에는 노드의 최대깊이를 10으로 설정했들 때의 결과를 확인해보자

In [28]:
# 훈련, 테스트 셋 분리
# test_size=0.3으로 설정하여 훈련:테스트 비율을 7:3으로
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# n_estinators매개변수로 군집수를 100으로 설정
#노드의 최대 깊이(max_depth)를 10으로 설정
xgb = XGBClassifier(n_estinators=100, max_depth=10) # xgboost 모델 생성
xgb = xgb.fit(x_train, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)



Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9283
훈련 셋 정확도: 0.9721


In [29]:
# 기존에 비해 테스트 셋 정확도는 감소하고 훈련셋 정확도는 증가하였다

In [30]:
del xgb, x_train, y_train, x_test, y_test

In [31]:
# 이번에는 learning_rate를 설정하여 가중치 적용 비율을 설정한다
# learning_rate값이 작을수록 가중치를 적게 반영한다

In [32]:
# 훈련, 테스트 셋 분리
# test_size=0.3으로 설정하여 훈련:테스트 비율을 7:3으로
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# n_estinators매개변수로 군집수를 100으로 설정
#learning_rate를 0.1로 설정
xgb = XGBClassifier(n_estinators=100, learning_rate=0.1) # xgboost 모델 생성
xgb = xgb.fit(x_train, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)

Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9323
훈련 셋 정확도: 0.9354


In [33]:
# 각 특성별 확률값(중요도) 출력
feature_name_list = x_train.columns
df_feature_importance = xgb.feature_importances_
for i,v in enumerate(df_feature_importance):
    print('(%2d) Feature : %s, %5f' %(i, feature_name_list[i], v))
#각 특성을 확률값이 큰 순서로 정렬
df_importance = pd.DataFrame({'feature' : x_train.columns, 'importance' : xgb.feature_importances_})
df_importance = df_importance.sort_values(by='importance', ascending=False)
df_importance.head(10)

( 0) Feature : 연령, 0.034149
( 1) Feature : 신장, 0.502413
( 2) Feature : 체중, 0.021256
( 3) Feature : 허리둘레, 0.037459
( 4) Feature : 수축기혈압, 0.007768
( 5) Feature : 이완기혈압, 0.006149
( 6) Feature : 식전혈당, 0.004257
( 7) Feature : 총콜레스테롤, 0.005850
( 8) Feature : 흡연상태, 0.357791
( 9) Feature : 음주여부, 0.022907


Unnamed: 0,feature,importance
1,신장,0.502413
8,흡연상태,0.357791
3,허리둘레,0.037459
0,연령,0.034149
9,음주여부,0.022907
2,체중,0.021256
4,수축기혈압,0.007768
5,이완기혈압,0.006149
7,총콜레스테롤,0.00585
6,식전혈당,0.004257


In [34]:
del xgb, x_train, y_train, x_test, y_test

In [35]:
#이번에는 learning_rate를 0.7로 설정하여 가중치를 더 많이 반영해보자

In [36]:
# 훈련, 테스트 셋 분리
# test_size=0.3으로 설정하여 훈련:테스트 비율을 7:3으로
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# n_estinators매개변수로 군집수를 100으로 설정
#learning_rate를 0.7로 설정
xgb = XGBClassifier(n_estinators=100, learning_rate=0.7) # xgboost 모델 생성
xgb = xgb.fit(x_train, y_train)

# 테스트 셋 정확도 
accuracy = float(xgb.score(x_test, y_test))
print('테스트 셋 정확도: %.4f' %accuracy)

# 훈련 셋 정확도 
accuracy = float(xgb.score(x_train, y_train))
print('훈련 셋 정확도: %.4f' %accuracy)



Parameters: { n_estinators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


테스트 셋 정확도: 0.9285
훈련 셋 정확도: 0.9509


In [37]:
#기존에 비해 테스트셋 정확도는 1% 감소하고 훈련셋 정확도는 2% 증가하였다

In [38]:
# 각 특성별 확률값(중요도) 출력
feature_name_list = x_train.columns
df_feature_importance = xgb.feature_importances_
for i,v in enumerate(df_feature_importance):
    print('(%2d) Feature : %s, %5f' %(i, feature_name_list[i], v))
#각 특성을 확률값이 큰 순서로 정렬
df_importance = pd.DataFrame({'feature' : x_train.columns, 'importance' : xgb.feature_importances_})
df_importance = df_importance.sort_values(by='importance', ascending=False)
df_importance.head(10)

( 0) Feature : 연령, 0.034807
( 1) Feature : 신장, 0.552673
( 2) Feature : 체중, 0.013454
( 3) Feature : 허리둘레, 0.022037
( 4) Feature : 수축기혈압, 0.006062
( 5) Feature : 이완기혈압, 0.005380
( 6) Feature : 식전혈당, 0.004979
( 7) Feature : 총콜레스테롤, 0.005748
( 8) Feature : 흡연상태, 0.331559
( 9) Feature : 음주여부, 0.023301


Unnamed: 0,feature,importance
1,신장,0.552673
8,흡연상태,0.331559
0,연령,0.034807
9,음주여부,0.023301
3,허리둘레,0.022037
2,체중,0.013454
4,수축기혈압,0.006062
7,총콜레스테롤,0.005748
5,이완기혈압,0.00538
6,식전혈당,0.004979


In [39]:
# 기존에 비해 연령 등의 요소 확률값이 증가하였다

In [40]:
del xgb, x_train, y_train, x_test, y_test

In [42]:
del df_source