In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

In [2]:
def plot_feature_importance(model, X_train, figsize=(12, 6)):
    """Show a horizontal bar chart of a fitted model's feature importances.

    Importances are rescaled so that the largest one equals 100, and the
    bars are drawn in ascending order of importance.

    Parameters
    ----------
    model
        Object exposing a ``feature_importances_`` array.
    X_train
        Object whose ``columns`` attribute supplies the bar labels; it is
        indexed positionally with the importance ranking.
    figsize
        Figure size forwarded to matplotlib.
    """
    sns.set_style('darkgrid')

    raw_importance = model.feature_importances_
    relative_importance = 100.0 * (raw_importance / raw_importance.max())
    ranking = np.argsort(relative_importance)
    # Offset by 0.5 so each bar is centred between integer grid positions.
    bar_centers = np.arange(ranking.shape[0]) + .5

    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(bar_centers, relative_importance[ranking], align='center')
    ax.set_yticks(bar_centers)
    ax.set_yticklabels(X_train.columns[ranking])
    ax.set_xlabel('Relative Importance')
    ax.set_title('Variable Importance')
    plt.show()

In [9]:
from geoband.API import *

# Fetch every competition file first. GetCompasData presumably writes each
# file into the working directory, since the files are read back by relative
# path right below -- TODO confirm against the geoband API.
for _csv_name in ('PJT002_train.csv', 'PJT002_test.csv',
                  'PJT002_validation.csv', 'PJT002_submission.csv'):
    GetCompasData(_csv_name)

# Load the four tables: training, validation, test and submission template.
train, val, test, sub = map(
    pd.read_csv,
    ('PJT002_train.csv', 'PJT002_validation.csv',
     'PJT002_test.csv', 'PJT002_submission.csv'),
)

In [10]:
# Keep only rows where `bldng_ar` is strictly smaller than `ttl_ar`.
# Rows with a missing value in either column compare False and are dropped too.
train = train.loc[train['bldng_ar'].lt(train['ttl_ar'])]
val = val.loc[val['bldng_ar'].lt(val['ttl_ar'])]

In [11]:
# Columns removed from the train/validation feature matrices. The list is
# defined once so both splits are guaranteed to drop exactly the same columns.
# Monthly 2014 electricity / gas usage columns (ele_/gas_engry_us_201401..12).
ENERGY_COLS = ['{}_engry_us_2014{:02d}'.format(kind, month)
               for kind in ('ele', 'gas') for month in range(1, 13)]
DROP_COLS = ENERGY_COLS + [
    'fr_yn', 'id', 'trgt_crtr',
    'fr_fghtng_fclt_spcl_css_5_yn', 'fr_fghtng_fclt_spcl_css_6_yn', 'us_yn',
    'dngrs_thng_yn', 'slf_fr_brgd_yn', 'blk_dngrs_thng_mnfctr_yn', 'cltrl_hrtg_yn',
    'lw_13101010', 'lw_13101110', 'lw_13101210', 'lw_13101211', 'lw_13101310', 'lw_13101410',
    'lw_13111010', 'lw_13111110', 'lw_13121010', 'lw_13121011',
    'lw_13131010', 'lw_13131110', 'lw_13141010', 'lw_13141011',
    'dt_of_fr', 'cctv_in_100m', 'fr_wthr_fclt_in_100m',
]

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts.
X_train = train.drop(columns=DROP_COLS)
y_train = train['fr_yn']
X_val = val.drop(columns=DROP_COLS)
y_val = val['fr_yn']

# NOTE(review): only `dt_of_fr` is removed from `test`, so every other column
# in DROP_COLS that exists in `test` survives and reappears in `all_data` after
# the concat (as all-NaN for the train/validation rows). Confirm whether `test`
# should drop the same columns as the other splits.
test = test.drop(columns=['dt_of_fr'])

In [12]:
# Stack train, validation and test rows (in that order) into one frame so the
# imputation and encoding steps treat all splits identically. The original
# row labels are kept, so index values may repeat across splits.
all_data = pd.concat(objs=(X_train, X_val, test), axis=0, sort=False)

In [13]:
# Per-column replacement for missing values: a placeholder / fixed category
# for text columns and a fixed number for numeric ones.
_fill_values = {
    'bldng_us_clssfctn': "None",
    'bldng_us': "None",
    'bldng_archtctr': "None",
    'bldng_ar_prc': 0,
    'lnd_us_sttn_nm': "단독",
    'rd_sd_nm': "세로한면(가)",
    'rgnl_ar_nm2': "지정되지않음",
    'rgnl_ar_nm': "계획관리지역",
    'hm_cnt': 0,
    'ttl_dwn_flr': 0,
    'ttl_grnd_flr': 0,
    'wnd_drctn': 0,
    'fr_mn_cnt': 176,
    'wnd_spd': 0,
    'hmdt': 98,
}
for _column, _filler in _fill_values.items():
    all_data[_column] = all_data[_column].fillna(_filler)

In [8]:
# Exploration only: mean fire label per quartile of `tbc_rtl_str_dstnc`.
# The original note on this cell said it must be removed before an actual
# scoring run, because it re-encoded `train['fr_yn']` in place. The summary is
# now computed on a throwaway frame, so `train` is left untouched and the cell
# can be run (or re-run) safely at any point.
_dstnc_eda = pd.DataFrame({
    'tbc_rtl_str_dstnc2': pd.qcut(train['tbc_rtl_str_dstnc'], 4),
    'fr_yn': train['fr_yn'].map({'Y': 1, 'N': 0}),
})
_dstnc_eda.groupby(['tbc_rtl_str_dstnc2'], as_index=False).mean().sort_values(by='tbc_rtl_str_dstnc2', ascending=True)

Unnamed: 0,tbc_rtl_str_dstnc2,fr_yn
0,"(-0.001, 260.0]",0.223563
1,"(260.0, 534.0]",0.188437
2,"(534.0, 1852.0]",0.230568
3,"(1852.0, 23228.0]",0.209877


In [14]:
# Replace `tbc_rtl_str_dstnc` with a bin code 0-3 using the cut points
# 260 / 534 / 1852. All masks are built from the untouched column before any
# value is overwritten; missing values match no mask and stay missing.
# NOTE: the column is overwritten in place, so running this cell twice would
# collapse every value into bin 0.
_dstnc = all_data['tbc_rtl_str_dstnc']
_bin_masks = [
    _dstnc <= 260,
    (_dstnc > 260) & (_dstnc <= 534),
    (_dstnc > 534) & (_dstnc <= 1852),
    _dstnc > 1852,
]
for _code, _mask in enumerate(_bin_masks):
    all_data.loc[_mask, 'tbc_rtl_str_dstnc'] = _code

In [15]:
# Binary flag: 1 when the building use is one of the listed categories, else 0.
# The raw text column is removed once the flag exists.
list1 = ['공동주택','단독주택','제1종근린생활시설','제2종근린생활시설','근린생활시설']
_in_list1 = all_data['bldng_us'].isin(list1)
all_data['bldng_us2'] = _in_list1.astype('int64')
all_data = all_data.drop(columns=['bldng_us'])

In [16]:
# Three-way code for the structure type: 1 for the first group, 2 for the
# second group, 0 for everything else. The raw text column is then removed.
list1 = ['통나무구조','일반목구조','목구조','기타구조']
list2 = ['기타조적구조','블록구조','석구조','벽돌구조','조적구조']
_group_code = {name: 1 for name in list1}
_group_code.update({name: 2 for name in list2 if name not in list1})
all_data['bldng_archtctr2'] = (
    all_data['bldng_archtctr']
    .map(_group_code)   # values outside both groups become NaN ...
    .fillna(0)          # ... and fall back to code 0
    .astype('int64')
)
all_data = all_data.drop(columns=['bldng_archtctr'])

In [17]:
# `rgnl_ar_nm2` is not used as a feature; remove it from the combined frame.
all_data = all_data.drop(columns=['rgnl_ar_nm2'])

In [18]:
# Integer-encode every remaining object (string) column over the combined
# frame so that all three splits share one code mapping per column.
# Missing values receive the code -1.
categorical_cols = all_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    all_data[col] = all_data[col].astype('category').cat.codes

# Undo the concat by position: rows are ordered train, validation, test.
_n_train = len(train)
_n_test = len(test)
X_train = all_data.iloc[:_n_train]
X_val = all_data.iloc[_n_train:-_n_test]
test = all_data.iloc[-_n_test:]



In [19]:
# Any value still missing after the targeted imputation becomes the sentinel -1.
X_train, X_val, test = (
    _frame.fillna(-1) for _frame in (X_train, X_val, test)
)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: 500-tree random forest with a fixed seed; n_jobs=-1 lets
# scikit-learn use every available core. `fit` returns the estimator itself.
model = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Validation F1, treating the label 'Y' as the positive class.
y_pred = model.predict(X_val)
f1_score(y_true=y_val, y_pred=y_pred, pos_label='Y')

0.5796344647519581

In [21]:
# Predict labels for the test rows with the fitted random forest; `y` is
# written into the submission frame in the next cell.
y=model.predict(test)

In [22]:
# Store the test predictions in the submission's `fr_yn` column (assigned by
# position) and preview the first rows.
sub = sub.assign(fr_yn=y)
sub.head()

Unnamed: 0,fr_yn
0,N
1,N
2,N
3,N
4,N


In [None]:
# Write the submission to 'baseline_rf.csv'; index=False leaves the row index
# out of the file.
sub.to_csv('baseline_rf.csv',index=False)