In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import numpy  as np
import seaborn as sns
%matplotlib inline

path = './data/'
pd.options.display.max_rows = 150
pd.options.display.max_columns = 350
plt.rc('font',family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

pd.options.display.max_rows = 999

In [2]:
sido = pd.read_csv(path+'sido_imp_10.csv', encoding='cp949')

## 데이터모양확인 및 split

In [3]:
sido.shape

(17, 173)

In [4]:
sido_y = sido.iloc[:,-7:]
sido_x = sido.iloc[:,3:-7]
sido_name = sido.iloc[:,:3]

In [5]:
sido_y.shape, sido_x.shape,sido_name.shape

((17, 7), (17, 163), (17, 3))

## 표준화 (MinmaxScaler)
* 다른 스케일러 robust_scale(), maxabs_scale(), scale(), 
* minmaxscaler() 범위조정(0,1)
* 로 만들어진 데이터셋도 시도해봐야한다.

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
#독립변수에 일괄적용
sido_x_scaled = scaler.fit_transform(sido_x)
#sigungu_x_scaled = scaler.fit_transform(sigungu_x)

In [7]:
#scale결과 array를 데이터프레임으로 만들고 열이름을 새로 지정해준다.
#sigungu_x_scaled = pd.DataFrame(data = sigungu_x_scaled)
#sigungu_x_scaled.columns = sigungu_x.columns
sido_x_scaled = pd.DataFrame(data = sido_x_scaled)
sido_x_scaled.columns = sido_x.columns

In [8]:
sido_x_scaled.shape#, sigungu_x_scaled.shape

(17, 163)

## Feature Importance_RFC

In [9]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [10]:
#importance
rf = RandomForestClassifier(n_estimators=10).fit(sido_x_scaled, sido.감염병)
feature_importance_rf = pd.DataFrame(data=np.c_[sido_x_scaled.columns.values,rf.feature_importances_],
                                     columns=['feature','FI_RFC'])
feature_importance_rf
#feature_importance_rf.sort_values('FI',ascending=False,inplace=True)

Unnamed: 0,feature,FI_RFC
0,1인가구수,0.0
1,5대범죄 발생건수,0.0
2,가스공급설비 면적,0.0
3,가스사고발생건수,0.0
4,가해(타살) 사망자수,0.0
5,감염병 발생건수,0.0
6,감염병 사망자수,0.0
7,건강보험급여실적,0.0
8,건설업 업체수,0.0
9,건설업 종사자수,0.0


## Feature Importance_XGB

In [11]:
#importance... xgboost
xg = XGBClassifier(max_depth=10, n_estimators=200, learning_rate=0.01).fit(sido_x_scaled, sido.화재)
feature_importance_xg = pd.DataFrame(data=np.c_[sido_x_scaled.columns.values,xg.feature_importances_],
                                     columns=['feature','FI_XGB'])
#feature_importance_rf.sort_values('FI',ascending=False,inplace=True)
feature_importance_xg 

Unnamed: 0,feature,FI_XGB
0,1인가구수,0.0349947
1,5대범죄 발생건수,0.0
2,가스공급설비 면적,0.0
3,가스사고발생건수,0.0
4,가해(타살) 사망자수,0.0
5,감염병 발생건수,0.0
6,감염병 사망자수,0.0
7,건강보험급여실적,0.0
8,건설업 업체수,0.0
9,건설업 종사자수,0.0250437


## Feature Importance_LGBM

In [12]:
import copy
#lgbm은 아스키 코드값만 입력받기 때문에 한글을 다 숫자로 바뀌주기
lgbm_sido = b = copy.deepcopy(sido_x_scaled) 
lgbm_sido.columns = list(range(163))

In [13]:
lgbm= LGBMClassifier(n_estimators=200,num_boost_round =500,learning_rate =0.01).fit(lgbm_sido, sido.화재)
feature_importance_lg = pd.DataFrame(data=np.c_[lgbm_sido.columns.values,lgbm.feature_importances_],
                                     columns=['feature','FI_LGB'])
#feature_importance_rf.sort_values('FI',ascending=False,inplace=True)
feature_importance_lg



Unnamed: 0,feature,FI_LGB
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


# RFE

In [14]:
#importance
from sklearn.feature_selection import RFE
select = RFE(RandomForestClassifier(n_estimators=10),n_features_to_select =10)###################
select.fit(sido_x_scaled, sido.감염병)
feature_importance_rfe = pd.DataFrame(data=np.c_[sido_x_scaled.columns.values,select.get_support()],
                                     columns=['feature','RFE'])
#feature_importance_rfe.sort_values('RFE',ascending=False,inplace=True)
feature_importance_rfe

Unnamed: 0,feature,RFE
0,1인가구수,False
1,5대범죄 발생건수,False
2,가스공급설비 면적,False
3,가스사고발생건수,False
4,가해(타살) 사망자수,False
5,감염병 발생건수,False
6,감염병 사망자수,False
7,건강보험급여실적,False
8,건설업 업체수,False
9,건설업 종사자수,False


# Logistic Regression
regression coefficient

In [15]:
from sklearn.linear_model import LinearRegression
model_fire = LinearRegression().fit(sido_x_scaled, sido.감염병)

In [16]:
#coefficient
feature_importance_lr = pd.DataFrame(np.c_[sido_x_scaled.columns.values,model_fire.coef_.ravel()],
                                    columns=['feature','lr'])
#feature_importance_lr.sort_values('lr', ascending=False, inplace=True)
feature_importance_lr

Unnamed: 0,feature,lr
0,1인가구수,-0.0574369
1,5대범죄 발생건수,-0.0111413
2,가스공급설비 면적,-0.0284417
3,가스사고발생건수,-0.00989687
4,가해(타살) 사망자수,-0.0884516
5,감염병 발생건수,0.0467156
6,감염병 사망자수,0.0302418
7,건강보험급여실적,0.069519
8,건설업 업체수,-0.0646215
9,건설업 종사자수,-0.0980002


# SelectKBest

In [17]:
from sklearn.feature_selection import SelectKBest, chi2
k_best = SelectKBest(chi2, k=10)#####################
k_best.fit(sido_x_scaled, sido.감염병)
feature_importance_k = pd.DataFrame(np.c_[sido_x_scaled.columns.values,k_best.get_support()],
                                    columns=['feature','Kbest'])
#feature_importance_k.sort_values('Kbest', ascending=False, inplace=True)

feature_importance_k

Unnamed: 0,feature,Kbest
0,1인가구수,False
1,5대범죄 발생건수,False
2,가스공급설비 면적,False
3,가스사고발생건수,False
4,가해(타살) 사망자수,False
5,감염병 발생건수,False
6,감염병 사망자수,True
7,건강보험급여실적,False
8,건설업 업체수,False
9,건설업 종사자수,False


# corr

In [18]:
sumdf = pd.concat([sido_x_scaled, sido.감염병], axis=1)
corr_df = pd.DataFrame(sumdf.corr(method='pearson')['감염병']).reset_index()
corr_df.sort_values(by='감염병', ascending=False).drop(163, axis=0)
corr_df.rename(columns={'감염병':'corr','index':'feature'},inplace=True)
corr_df

Unnamed: 0,feature,corr
0,1인가구수,0.156977
1,5대범죄 발생건수,-0.41072
2,가스공급설비 면적,-0.209224
3,가스사고발생건수,-0.190529
4,가해(타살) 사망자수,-0.191561
5,감염병 발생건수,0.218697
6,감염병 사망자수,0.571542
7,건강보험급여실적,0.512053
8,건설업 업체수,0.131918
9,건설업 종사자수,-0.027906


# 합치기

In [19]:
df=feature_importance_lr.merge(corr_df).merge(feature_importance_rf)
df = df.merge(feature_importance_xg).merge(feature_importance_rfe).merge(feature_importance_k)
#df.sort_values('Kbest', ascending=False, inplace=True)

In [20]:
a = df.loc[:,'lr'].apply(lambda x : abs(x))
b = df.loc[:,'corr'].apply(lambda x : abs(x))
df['lr_abs'] = a
df['corr_abs'] = b
df

Unnamed: 0,feature,lr,corr,FI_RFC,FI_XGB,RFE,Kbest,lr_abs,corr_abs
0,1인가구수,-0.0574369,0.156977,0.0,0.0349947,False,False,0.057437,0.156977
1,5대범죄 발생건수,-0.0111413,-0.41072,0.0,0.0,False,False,0.011141,0.41072
2,가스공급설비 면적,-0.0284417,-0.209224,0.0,0.0,False,False,0.028442,0.209224
3,가스사고발생건수,-0.00989687,-0.190529,0.0,0.0,False,False,0.009897,0.190529
4,가해(타살) 사망자수,-0.0884516,-0.191561,0.0,0.0,False,False,0.088452,0.191561
5,감염병 발생건수,0.0467156,0.218697,0.0,0.0,False,False,0.046716,0.218697
6,감염병 사망자수,0.0302418,0.571542,0.0,0.0,False,True,0.030242,0.571542
7,건강보험급여실적,0.069519,0.512053,0.0,0.0,False,False,0.069519,0.512053
8,건설업 업체수,-0.0646215,0.131918,0.0,0.0,False,False,0.064622,0.131918
9,건설업 종사자수,-0.0980002,-0.027906,0.0,0.0250437,False,False,0.098,0.027906


In [21]:
df.to_csv(path + '변수중요도_감염병.csv',encoding='cp949',index=False)