In [1]:
from prophet import Prophet

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import random
from copy import deepcopy
import math
from itertools import product

In [2]:
train_df=pd.read_csv('./data/train.csv')
test_df=pd.read_csv('./data/test.csv')

In [3]:
train_df_backup = deepcopy(train_df)
test_df_backup = deepcopy(test_df)

In [4]:
random_state_list=[random.randint(0, 100) for _ in range(10)]
random_state_list

[73, 98, 28, 7, 29, 44, 44, 84, 90, 4]

# All Null = 0

In [5]:
train_df = deepcopy(train_df_backup)
train_df.fillna(0, inplace=True)
X,y = train_df.iloc[:,1:-1].to_numpy(), train_df.iloc[:,-1].to_numpy()
X.shape, y.shape

((1095, 11), (1095,))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state=123456)
for i, state in enumerate(random_state_list): 
    rf = RandomForestRegressor(oob_score=True, random_state=state)
    rf.fit(X_train, y_train)
    score=rf.score(X_test, y_test)
    print(f'{i+1}번째')
    print(f'Score : {score}')
    print(f'Oob_Score : {rf.oob_score_}')

1번째
Score : 0.7118678906920503
Oob_Score : 0.7168796720696826
2번째
Score : 0.7040531708641079
Oob_Score : 0.7207707278719083
3번째
Score : 0.7095627184699234
Oob_Score : 0.724328400580201
4번째
Score : 0.7055263003547211
Oob_Score : 0.7205111770824671
5번째
Score : 0.705387615788355
Oob_Score : 0.7279123921940758
6번째
Score : 0.7077059482574044
Oob_Score : 0.7270143149286589
7번째
Score : 0.7077059482574044
Oob_Score : 0.7270143149286589
8번째
Score : 0.7128741855105762
Oob_Score : 0.7214537041366476
9번째
Score : 0.7162016844170175
Oob_Score : 0.72253599150032
10번째
Score : 0.7099208577769747
Oob_Score : 0.7192214262418741


# Null =?

In [7]:
# 옵션에 따라, 선택한 데이터프레임의 컬럼의 결측값을 채워주는 함수
# option = 0, min, max, mean, median
def cal(my_df, column_name, option):
    df=my_df.copy()
    df['date']=pd.to_datetime(df['date'])
    df['month']=df['date'].dt.month
    nn_df = df.iloc[df[column_name].dropna().index]
    month_df_list=[nn_df[nn_df['month']==i] for i in range(1,13)]
    
    if option == 0:
        replace_list=[0 for month_df in month_df_list]
    elif option =='min':
        replace_list=[month_df[column_name].min() for month_df in month_df_list]
    elif option =='max':
        replace_list=[month_df[column_name].max() for month_df in month_df_list]
    elif option == 'mean':
        replace_list=[month_df[column_name].mean() for month_df in month_df_list]
    elif option == 'median':
        replace_list=[month_df[column_name].median() for month_df in month_df_list]
    
    replace_list = ['']+ replace_list
    
    result=[]
    for i in range(my_df.shape[0]):
        n=my_df.iloc[i][column_name]
        if math.isnan(n):
            result.append(replace_list[df.loc[i]['month']])
        else:
            result.append(n)
    my_df[column_name] = result

In [8]:
null_feature = ['precipitation','PM10','PM2.5','sunshine_sum']

In [9]:
options = [0, 'min', 'max', 'mean', 'median']

In [16]:
model_list = []
score_list = []
for i, option_group in enumerate(product(options, repeat=len(null_feature))):
    train_df = deepcopy(train_df_backup)
    for column_name, option in zip(null_feature, option_group):
        cal(train_df, column_name, option)
    X,y = train_df.iloc[:,1:-1].to_numpy(), train_df.iloc[:,-1].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2, random_state=123456)
    rf = RandomForestRegressor(oob_score=True, random_state=state)
    rf.fit(X_train, y_train)
    model_list.append(rf)
    score_list.append(rf.score(X_test, y_test))
    print(i, 'done')

0 done
1 done
2 done
3 done
4 done
5 done
6 done
7 done
8 done
9 done
10 done
11 done
12 done
13 done
14 done
15 done
16 done
17 done
18 done
19 done
20 done
21 done
22 done
23 done
24 done
25 done
26 done
27 done
28 done
29 done
30 done
31 done
32 done
33 done
34 done
35 done
36 done
37 done
38 done
39 done
40 done
41 done
42 done
43 done
44 done
45 done
46 done
47 done
48 done
49 done
50 done
51 done
52 done
53 done
54 done
55 done
56 done
57 done
58 done
59 done
60 done
61 done
62 done
63 done
64 done
65 done
66 done
67 done
68 done
69 done
70 done
71 done
72 done
73 done
74 done
75 done
76 done
77 done
78 done
79 done
80 done
81 done
82 done
83 done
84 done
85 done
86 done
87 done
88 done
89 done
90 done
91 done
92 done
93 done
94 done
95 done
96 done
97 done
98 done
99 done
100 done
101 done
102 done
103 done
104 done
105 done
106 done
107 done
108 done
109 done
110 done
111 done
112 done
113 done
114 done
115 done
116 done
117 done
118 done
119 done
120 done
121 done
122 done
123

In [42]:
my_option_groups = list(product(options, repeat=len(null_feature)))
result_df=pd.DataFrame([[score, *my_option_groups[index]] for score, index in sorted(zip(score_list,range(625)), reverse=True)])
result_df.columns=['score', *null_feature]

In [43]:
result_df.head(60)

Unnamed: 0,score,precipitation,PM10,PM2.5,sunshine_sum
0,0.717834,min,max,median,min
1,0.717834,min,max,median,0
2,0.717834,0,max,median,min
3,0.717834,0,max,median,0
4,0.71783,min,max,median,median
5,0.71783,0,max,median,median
6,0.717686,min,max,median,mean
7,0.717686,0,max,median,mean
8,0.717336,min,max,min,median
9,0.717336,0,max,min,median
