In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
ROOT_PATH = r'./open/'

train = pd.read_csv(f'{ROOT_PATH}train.csv', encoding='cp949')
test = pd.read_csv(f'{ROOT_PATH}test.csv', encoding='cp949')
ss = pd.read_csv(f'{ROOT_PATH}sample_submission.csv', encoding='cp949')

In [3]:
def making_val_table(df):
    '''입력받은 데이터셋의 유효성 검증을 위한 요약 테이블'''
    # dtypes 
    df_dtypes = df.dtypes
    
    # nunique
    df_nunique = df.nunique()
    
    # null_values
    df_nan = df.isna().sum()
    
    val_table = pd.concat([df_dtypes, df_nunique, df_nan], axis=1)
    val_table.columns = ['dtype', 'nunique', 'nan']
        
    return val_table.reset_index()

In [4]:
test[['사망자수', '중상자수', '경상자수', '부상자수', 'ECLO']] = 0

In [5]:
# 기상상태, 시군구 데이터의 unique 값이 각각 7vs6, 199vs 192로 다르다
train_info = making_val_table(train)
test_info = making_val_table(test)

pd.merge(left=train_info, right=test_info, on='index', 
         how='left', suffixes=('_train','_test')).set_index('index')

Unnamed: 0_level_0,dtype_train,nunique_train,nan_train,dtype_test,nunique_test,nan_test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID,object,39609,0,object,10963.0,0.0
사고일시,object,18057,0,object,5548.0,0.0
요일,object,7,0,object,7.0,0.0
기상상태,object,6,0,object,5.0,0.0
시군구,object,199,0,object,192.0,0.0
도로형태,object,11,0,object,11.0,0.0
노면상태,object,6,0,object,6.0,0.0
사고유형,object,3,0,object,3.0,0.0
사고유형 - 세부분류,object,14,0,,,
법규위반,object,11,0,,,


In [6]:
def remove_outliers(df, column=None):
    if column is not None:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        df_without_outliers = df[(df[column] >= Q1 - 1.5*IQR) & (df[column] <= Q3 + 1.5*IQR)]
    else:
        df_without_outliers = df
    return df_without_outliers

In [7]:
# target 변수 지정
#y_train = train_without_outliers['ECLO']

# train, test 데이터간 컬럼 동기화 
cols = test.columns
train_total = train[cols]
test_total = test

In [8]:
train_without_outliers = remove_outliers(train_total, 'ECLO')
#train_without_outliers = remove_outliers(train_total)

In [9]:
# 기상상태, 시군구 데이터의 unique 값이 각각 7vs6, 199vs 192로 다르다
train_info = making_val_table(train_without_outliers)
test_info = making_val_table(test_total)

pd.merge(left=train_info, right=test_info, on='index', 
         how='left', suffixes=('_train','_test')).set_index('index')

Unnamed: 0_level_0,dtype_train,nunique_train,nan_train,dtype_test,nunique_test,nan_test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID,object,37485,0,object,10963,0
사고일시,object,17618,0,object,5548,0
요일,object,7,0,object,7,0
기상상태,object,6,0,object,5,0
시군구,object,199,0,object,192,0
도로형태,object,11,0,object,11,0
노면상태,object,6,0,object,6,0
사고유형,object,3,0,object,3,0
사망자수,int64,2,0,int64,1,0
중상자수,int64,3,0,int64,1,0


In [10]:
y_train = train_without_outliers[['사망자수', '중상자수', '경상자수', '부상자수', 'ECLO']]
X_train = train_without_outliers[['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형']]

X_test = test_total[['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형']]

In [11]:
def feat_eng(df):
    # datetime 변환
    df['사고일시'] = pd.to_datetime(df['사고일시'])
    
    # 월,일,시 컬럼 생성
    df['월'] = df['사고일시'].dt.month
    df['일'] = df['사고일시'].dt.day
    df['시'] = df['사고일시'].dt.hour
    
    # 불필요 컬럼 제거
    subs = ['ID','사고일시','기상상태','시군구']
    df = df.drop(subs, axis=1)
    
    # one-hot encoding 실시
    df = pd.get_dummies(df)
    
    return df

In [12]:
X_train_eng = feat_eng(X_train)
X_test_eng = feat_eng(X_test)

In [13]:
# X_train_eng.drop('일', axis=1, inplace=True)
# X_test_eng.drop('일', axis=1, inplace=True)

In [14]:
print(f'X_train 데이터 shape : {X_train_eng.shape}')
print(f'y_train 데이터 shape : {y_train.shape}')
print('-'*40)
print(f'X_test 데이터 shape : {X_test_eng.shape}')

X_train 데이터 shape : (37485, 30)
y_train 데이터 shape : (37485, 5)
----------------------------------------
X_test 데이터 shape : (10963, 30)


In [15]:
X_train_eng.head()

Unnamed: 0,월,일,시,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,요일_화요일,...,도로형태_주차장 - 주차장,노면상태_건조,노면상태_기타,노면상태_서리/결빙,노면상태_적설,노면상태_젖음/습기,노면상태_침수,사고유형_차대사람,사고유형_차대차,사고유형_차량단독
0,1,1,0,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,True,False,False
1,1,1,0,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,True,False,False
2,1,1,1,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,True,False,False
3,1,1,2,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,False,True,False
4,1,1,4,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,False,True,False


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train_eng, y_train , test_size=0.2, random_state=42)

In [17]:
# 사망자수 예측

y_train1 = y_train['사망자수'].copy()
y_val1 = y_val['사망자수'].copy()

In [18]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs = 6,
                           random_state=42)
rf.fit(X_train, y_train1)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

In [19]:
from sklearn.metrics import mean_squared_log_error

rmsle_train = np.sqrt(mean_squared_log_error(y_train1, y_train_pred))
rmsle_val = np.sqrt(mean_squared_log_error(y_val1, y_val_pred))

In [20]:
print('train rmsle :', rmsle_train)
print('val rmsle :', rmsle_val)

train rmsle : 0.023712823149750893
val rmsle : 0.05518409259483811


In [21]:
y_pred_val = pd.DataFrame()
y_pred_test = pd.DataFrame()

In [22]:
y_pred_val['사망자수'] = rf.predict(X_val)
y_pred_test['사망자수'] = rf.predict(X_test_eng)

In [23]:
#X_train['사망자수'] = y_train['사망자수']
X_train['사망자수'] = rf.predict(X_train)

X_val['사망자수'] = rf.predict(X_val)

X_test_eng['사망자수'] =  rf.predict(X_test_eng)

In [24]:
# 중상자수 예측

y_train2 = y_train['중상자수'].copy()
y_val2 = y_val['중상자수'].copy()

In [25]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs = 6,
                           random_state=42)
rf.fit(X_train, y_train2)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

In [26]:
from sklearn.metrics import mean_squared_log_error

rmsle_train = np.sqrt(mean_squared_log_error(y_train2, y_train_pred))
rmsle_val = np.sqrt(mean_squared_log_error(y_val2, y_val_pred))

In [27]:
print('train rmsle :', rmsle_train)
print('val rmsle :', rmsle_val)

train rmsle : 0.14916128597006087
val rmsle : 0.32616127173400367


In [28]:
y_pred_val['중상자수'] = rf.predict(X_val)
y_pred_test['중상자수'] = rf.predict(X_test_eng)

In [29]:
#X_train['중상자수'] = y_train['중상자수']
X_train['중상자수'] = rf.predict(X_train)

X_val['중상자수'] = rf.predict(X_val)

X_test_eng['중상자수'] =  rf.predict(X_test_eng)

In [30]:
# 경상자수 예측

y_train3 = y_train['경상자수'].copy()
y_val3 = y_val['경상자수'].copy()

In [31]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs = 6,
                           random_state=42)
rf.fit(X_train, y_train3)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

In [32]:
from sklearn.metrics import mean_squared_log_error

rmsle_train = np.sqrt(mean_squared_log_error(y_train3, y_train_pred))
rmsle_val = np.sqrt(mean_squared_log_error(y_val3, y_val_pred))

In [33]:
print('train rmsle :', rmsle_train)
print('val rmsle :', rmsle_val)

train rmsle : 0.18326170209513826
val rmsle : 0.4412764376270027


In [34]:
y_pred_val['경상자수'] = rf.predict(X_val)
y_pred_test['경상자수'] = rf.predict(X_test_eng)

In [35]:
#X_train['경상자수'] = y_train['경상자수']
X_train['경상자수'] = rf.predict(X_train)

X_val['경상자수'] = rf.predict(X_val)

X_test_eng['경상자수'] =  rf.predict(X_test_eng)

In [36]:
# 부상자수 예측

y_train4 = y_train['부상자수'].copy()
y_val4 = y_val['부상자수'].copy()

In [37]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs = 6,
                           random_state=42)
rf.fit(X_train, y_train4)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

In [38]:
from sklearn.metrics import mean_squared_log_error

rmsle_train = np.sqrt(mean_squared_log_error(y_train4, y_train_pred))
rmsle_val = np.sqrt(mean_squared_log_error(y_val4, y_val_pred))

In [39]:
print('train rmsle :', rmsle_train)
print('val rmsle :', rmsle_val)

train rmsle : 0.10390372621144091
val rmsle : 0.28344310643659737


In [40]:
y_pred_val['부상자수'] = rf.predict(X_val)
y_pred_test['부상자수'] = rf.predict(X_test_eng)

In [41]:
y_pred_val['ECLO'] = y_pred_val['사망자수']*10 + y_pred_val['중상자수']*5 + y_pred_val['경상자수']*3 + y_pred_val['부상자수']*1
y_pred_test['ECLO'] = y_pred_test['사망자수']*10 + y_pred_test['중상자수']*5 + y_pred_test['경상자수']*3 + y_pred_test['부상자수']*1

In [42]:
rmsle_val = np.sqrt(mean_squared_log_error(y_val['ECLO'], y_pred_val['ECLO']))

In [43]:
print('val rmsle :', rmsle_val)

val rmsle : 0.4242532557851512


In [44]:
y_pred_test.head()

Unnamed: 0,사망자수,중상자수,경상자수,부상자수,ECLO
0,0.11,0.172,0.966667,0.02,4.88
1,0.0,0.138667,0.77,0.01,3.013333
2,0.0,0.35,1.094333,0.187,5.22
3,0.0,0.413333,0.863,0.130333,4.786
4,0.0,0.315,1.218476,0.225,5.455429


In [45]:
# y_pred = rf.predict(X_test_eng)
# y_pred

In [46]:
ss['ECLO'] = y_pred_test['ECLO']
ss.head()

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.88
1,ACCIDENT_39610,3.013333
2,ACCIDENT_39611,5.22
3,ACCIDENT_39612,4.786
4,ACCIDENT_39613,5.455429


In [47]:
OUTPUT_PATH = './output/'
ss.to_csv(f'{OUTPUT_PATH}submission_5.csv', index=False)