In [2]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import numpy  as np
import seaborn as sns
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
path = './data/'
pd.options.display.max_rows = 150
pd.options.display.max_columns = 350
plt.rc('font',family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

In [3]:
sido = pd.read_csv(path+'sido_imp_10.csv', encoding='cp949')

## 데이터모양확인 및 split

In [4]:
sido.shape

((17, 173), (226, 311))

In [5]:
sido_y = sido.iloc[:,-7:]
sido_x = sido.iloc[:,3:-7]
sido_name = sido.iloc[:,:3]

In [6]:
sido_y.shape, sido_x.shape,sido_name.shape

((17, 7), (17, 163), (17, 3))

## 표준화 (MinmaxScaler)
* 다른 스케일러 robust_scale(), maxabs_scale(), scale(), 
* minmaxscaler() 범위조정(0,1)
* 로 만들어진 데이터셋도 시도해봐야한다.

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
#독립변수에 일괄적용
sido_x_scaled = scaler.fit_transform(sido_x)
#sigungu_x_scaled = scaler.fit_transform(sigungu_x)

In [9]:
#scale결과 array를 데이터프레임으로 만들고 열이름을 새로 지정해준다.
#sigungu_x_scaled = pd.DataFrame(data = sigungu_x_scaled)
#sigungu_x_scaled.columns = sigungu_x.columns
sido_x_scaled = pd.DataFrame(data = sido_x_scaled)
sido_x_scaled.columns = sido_x.columns

In [10]:
sido_x_scaled.shape#, sigungu_x_scaled.shape

(17, 163)

## Feature Importance

In [11]:
#importance
rf = RandomForestClassifier(n_estimators=10).fit(sido_x_scaled, sido.화재)
feature_importance_rf = pd.DataFrame(data=np.c_[sido_x_scaled.columns.values,rf.feature_importances_],
                                     columns=['feature','FI'])
#feature_importance_rf.sort_values('FI',ascending=False,inplace=True)
feature_importance_rf

Unnamed: 0,feature,FI
0,1인가구수,0.0
1,5대범죄 발생건수,0.0
2,가스공급설비 면적,0.0
3,가스사고발생건수,0.0377778
4,가해(타살) 사망자수,0.0303571
5,감염병 발생건수,0.0
6,감염병 사망자수,0.0
7,건강보험급여실적,0.0
8,건설업 업체수,0.0
9,건설업 종사자수,0.0


# RFE

In [12]:
#importance
from sklearn.feature_selection import RFE
select = RFE(RandomForestClassifier(n_estimators=10),n_features_to_select =20)
select.fit(sido_x_scaled, sido.화재)
feature_importance_rfe = pd.DataFrame(data=np.c_[sido_x_scaled.columns.values,select.get_support()],
                                     columns=['feature','RFE'])
#feature_importance_rfe.sort_values('RFE',ascending=False,inplace=True)
feature_importance_rfe

Unnamed: 0,feature,RFE
0,1인가구수,False
1,5대범죄 발생건수,False
2,가스공급설비 면적,False
3,가스사고발생건수,False
4,가해(타살) 사망자수,False
5,감염병 발생건수,False
6,감염병 사망자수,False
7,건강보험급여실적,False
8,건설업 업체수,False
9,건설업 종사자수,False


# Logistic Regression
regression coefficient

In [13]:
from sklearn.linear_model import LinearRegression
model_fire = LinearRegression().fit(sido_x_scaled, sido.화재)

In [14]:
#coefficient
feature_importance_lr = pd.DataFrame(np.c_[sido_x_scaled.columns.values,model_fire.coef_.ravel()],
                                    columns=['feature','lr'])
#feature_importance_lr.sort_values('lr', ascending=False, inplace=True)
feature_importance_lr

Unnamed: 0,feature,lr
0,1인가구수,-0.0773959
1,5대범죄 발생건수,-0.0195941
2,가스공급설비 면적,-0.0584446
3,가스사고발생건수,-0.119737
4,가해(타살) 사망자수,0.0345255
5,감염병 발생건수,0.082391
6,감염병 사망자수,-0.0548124
7,건강보험급여실적,0.0688554
8,건설업 업체수,0.0185448
9,건설업 종사자수,0.0560857


# SelectKBest

In [15]:
from sklearn.feature_selection import SelectKBest, chi2
k_best = SelectKBest(chi2, k=20)
k_best.fit(sido_x_scaled, sido.화재)
feature_importance_k = pd.DataFrame(np.c_[sido_x_scaled.columns.values,k_best.get_support()],
                                    columns=['feature','Kbest'])
#feature_importance_k.sort_values('Kbest', ascending=False, inplace=True)

feature_importance_k

Unnamed: 0,feature,Kbest
0,1인가구수,False
1,5대범죄 발생건수,False
2,가스공급설비 면적,False
3,가스사고발생건수,False
4,가해(타살) 사망자수,False
5,감염병 발생건수,False
6,감염병 사망자수,False
7,건강보험급여실적,False
8,건설업 업체수,False
9,건설업 종사자수,False


# corr

In [16]:
sumdf = pd.concat([sido_x_scaled, sido.화재], axis=1)
corr_df = pd.DataFrame(sumdf.corr(method='pearson')['화재']).reset_index()
corr_df.sort_values(by='화재', ascending=False).drop(163, axis=0)
corr_df.rename(columns={'화재':'corr','index':'feature'},inplace=True)
corr_df

Unnamed: 0,feature,corr
0,1인가구수,-0.035845
1,5대범죄 발생건수,-0.347625
2,가스공급설비 면적,-0.208023
3,가스사고발생건수,-0.008319
4,가해(타살) 사망자수,0.247751
5,감염병 발생건수,0.070277
6,감염병 사망자수,0.267476
7,건강보험급여실적,0.246755
8,건설업 업체수,0.375513
9,건설업 종사자수,0.31735


# 합치기

In [17]:
df=feature_importance_lr.merge(corr_df)
#df.sort_values('Kbest', ascending=False, inplace=True)
df.lr

0      -0.0773959
1      -0.0195941
2      -0.0584446
3       -0.119737
4       0.0345255
5        0.082391
6      -0.0548124
7       0.0688554
8       0.0185448
9       0.0560857
10     0.00743036
11      -0.076968
12    -0.00165689
13     -0.0626012
14      0.0206104
15        0.13323
16     -0.0369738
17      0.0966486
18      0.0928122
19      0.0414846
20      -0.156596
21     -0.0129422
22     -0.0158849
23     -0.0462149
24     -0.0111664
25      0.0191682
26       0.061695
27      -0.029259
28     -0.0459587
29      0.0941884
30     -0.0277158
31      0.0231094
32     -0.0295357
33      -0.185677
34      -0.016518
35     -0.0244037
36     -0.0178874
37     -0.0265955
38     -0.0363147
39     -0.0445279
40      0.0420896
41     -0.0155639
42       0.047469
43      0.0533028
44     -0.0436467
45      0.0160578
46      0.0933674
47     -0.0455755
48     0.00657941
49       0.123923
50      -0.055112
51     0.00704207
52      -0.120504
53     -0.0762297
54      0.0108032
55      0.

In [18]:
df=feature_importance_rf.merge(feature_importance_rfe).merge(feature_importance_k)
#df.sort_values(['Kbest'], ascending=False, inplace=True)
df
df.loc[(df.RFE==True)|(df.Kbest==True)].sort_values(['RFE'],ascending=False)

Unnamed: 0,feature,FI,RFE,Kbest
124,제조업 종사자수,0.0,True,False
141,총 사업체수,0.0,True,False
130,주민등록인구(여자),0.0696429,True,False
142,총전입자수,0.0,True,False
123,제조업 업체수,0.0,True,False
122,제방면적 비율,0.0,True,False
120,전기화재발생건수,0.0,True,True
143,추락 사망자수,0.0,True,True
144,특수의료장비수,0.0,True,False
145,폐기물 처리시설수,0.0,True,True


In [19]:
# sns.pairplot(sumdf, hue='화재')
# plt.savefig(path+'corr2.png',transparent=True)

In [20]:
# plt.figure(figsize=(100,100))
# sns.heatmap(data = sido_x_scaled.corr(method='pearson'),
#             annot=True, fmt = '.2f', linewidths=.1, cmap='Reds')
# plt.savefig(path+'corr.png',transparent=True)