In [1]:
from prophet import Prophet

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import random
from copy import deepcopy
import math

In [2]:
train_df=pd.read_csv('./data/train.csv')
test_df=pd.read_csv('./data/test.csv')

In [3]:
df=train_df[['date','rental']].copy()
df.columns=['ds','y']
m = Prophet()
m.daily_seasonality=True
m.weekly_seasonality=True
m.yearly_seasonality=True
m.fit(df)
future=m.make_future_dataframe(365)
forecast = m.predict(future)

Initial log joint probability = -73.9282
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       1699.14   5.17586e-06       69.4858      0.7698      0.7698      133   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     110       1699.17   0.000410967       93.1272   3.446e-06       0.001      181  LS failed, Hessian reset 
     150       1699.19   3.83621e-05       77.7678   4.552e-07       0.001      266  LS failed, Hessian reset 
     177       1699.19    1.5188e-07       69.0376      0.2292           1      304   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance


In [4]:
train_df['rental']=train_df['rental']/forecast[:-365]['trend']

In [5]:
train_df['date']=pd.to_datetime(train_df['date'])
train_df['year']=train_df['date'].dt.year
train_df['month']=train_df['date'].dt.month
train_df['day']=train_df['date'].dt.day
train_df = train_df[['date', 'precipitation', 'temp_mean', 'temp_highest', 'temp_lowest',
       'PM10', 'PM2.5', 'humidity', 'sunshine_sum', 'sunshine_rate',
       'wind_mean', 'wind_max', 'year', 'month', 'day', 'rental']]

In [6]:
# 옵션에 따라, 선택한 데이터프레임의 컬럼의 결측값을 채워주는 함수
# option = 0, min, max, mean, median
def cal(my_df, column_name, option):
    df=my_df.copy()
    df['date']=pd.to_datetime(df['date'])
    df['month']=df['date'].dt.month
    nn_df = df.iloc[df[column_name].dropna().index]
    month_df_list=[nn_df[nn_df['month']==i] for i in range(1,13)]
    
    if option == 0:
        replace_list=[0 for month_df in month_df_list]
    elif option =='min':
        replace_list=[month_df[column_name].min() for month_df in month_df_list]
    elif option =='max':
        replace_list=[month_df[column_name].max() for month_df in month_df_list]
    elif option == 'mean':
        replace_list=[month_df[column_name].mean() for month_df in month_df_list]
    elif option == 'median':
        replace_list=[month_df[column_name].median() for month_df in month_df_list]
    
    replace_list = ['']+ replace_list
    
    result=[]
    for i in range(my_df.shape[0]):
        n=my_df.iloc[i][column_name]
        if math.isnan(n):
            result.append(replace_list[df.loc[i]['month']])
        else:
            result.append(n)
    my_df[column_name] = result

In [7]:
null_feature = ['precipitation','PM10','PM2.5','sunshine_sum']
best_option_group = ['min' , 'max' , 'median' , 'min']

In [8]:
for column_name, option in zip(null_feature, best_option_group):
    cal(train_df, column_name, option)

In [9]:
random_state_list=[random.randint(0, 100) for _ in range(10)]
random_state_list

[98, 19, 31, 100, 74, 60, 100, 18, 92, 80]

In [10]:
X,y = train_df.iloc[:,1:-1].to_numpy(), train_df.iloc[:,-1].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state=123456)
for i, state in enumerate(random_state_list): 
    rf = RandomForestRegressor(oob_score=True, random_state=state)
    rf.fit(X_train, y_train)
    score=rf.score(X_test, y_test)
    print(f'{i+1}번째')
    print(f'Score : {score}')
    print(f'Oob_Score : {rf.oob_score_}')

1번째
Score : 0.9051765214418668
Oob_Score : 0.915483623744452
2번째
Score : 0.9056748908810417
Oob_Score : 0.9139581984199489
3번째
Score : 0.9056155186719743
Oob_Score : 0.913500172432686
4번째
Score : 0.905986611913236
Oob_Score : 0.914692024740952
5번째
Score : 0.9047391613538618
Oob_Score : 0.9161476837128741
6번째
Score : 0.9058925282505335
Oob_Score : 0.9158925810045763
7번째
Score : 0.905986611913236
Oob_Score : 0.914692024740952
8번째
Score : 0.9055982715905576
Oob_Score : 0.9146505780875567
9번째
Score : 0.9050981406055559
Oob_Score : 0.9125181571846351
10번째
Score : 0.9069977710748123
Oob_Score : 0.912468628174645


In [12]:
print(*sorted(zip(train_df.iloc[:,1:-1].columns ,rf.feature_importances_), key=lambda x : x[1], reverse=True), sep='\n')

('temp_highest', 0.5322309626586224)
('year', 0.12294065387876452)
('precipitation', 0.10075859565327158)
('humidity', 0.06813271619735152)
('temp_mean', 0.047766880332681366)
('sunshine_rate', 0.03649684120954235)
('temp_lowest', 0.01980265613211493)
('sunshine_sum', 0.019625495341337888)
('month', 0.014715043402780917)
('PM10', 0.01016343683441076)
('PM2.5', 0.008615190492803602)
('day', 0.007567235420294856)
('wind_max', 0.006165331680895891)
('wind_mean', 0.005018960765127282)
