In [1]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

development = pd.read_csv("./../../DSL_Winter_Project_2024/development.csv")

outlier_column_index=[0, 7, 12, 15, 16, 17]
columns_to_drop=[]

for index in outlier_column_index:
    columns_to_drop.append('pmax[%s]' % index)
    columns_to_drop.append('negpmax[%s]' % index)
    columns_to_drop.append('tmax[%s]' % index)
    columns_to_drop.append('area[%s]' % index)
    columns_to_drop.append('rms[%s]' % index)

df=development.drop(columns=columns_to_drop)

train_datasets = []
test_datasets=[]

for i in range(0, len(df), 100):
    subset1 = df.iloc[i:i+100].head(5)
    train_datasets.append(subset1)

    subset2 = df.iloc[i:i+100].tail(1)
    test_datasets.append(subset2)

train_df=pd.concat(train_datasets, ignore_index=False)
test_df=pd.concat(test_datasets, ignore_index=False)

X_train=train_df.drop(columns=['x', 'y'])
y_train=train_df.loc[:,['x', 'y']]

X_test=test_df.drop(columns=['x', 'y'])
y_test=test_df.loc[:,['x', 'y']]

In [None]:
n_estimators_range=[490,495,500,505,510]
max_features_range=['auto','sqrt','log2']
max_depth_range=[140,145,150,155,160]
max_depth_range.append(None)
min_samples_split_range=[2,5,10,20,30]
min_samples_leaf_range=[1, 2, 5, 10, 20]

rfr_hp_range={'n_estimators':n_estimators_range,
              'max_features':max_features_range,
              'max_depth':max_depth_range,
              'min_samples_split':min_samples_split_range,
              'min_samples_leaf':min_samples_leaf_range
             }
print(rfr_hp_range)

rfr_base=RandomForestRegressor()
rfr_random=RandomizedSearchCV(estimator=rfr_base,
                              param_distributions=rfr_hp_range,
                              n_iter=200,
                              n_jobs=-1,
                              cv=5,
                              verbose=1,
                              random_state=42)
rfr_random.fit(X_train,y_train)

best_hp_now=rfr_random.best_params_
print(best_hp_now)

In [3]:
n_estimators_range=[480,485,490,495,499]
max_features_range=['sqrt']
max_depth_range=[45,47,50,52,55]
max_depth_range.append(None)
min_samples_split_range=[2,5,10,20,30]
min_samples_leaf_range=[1, 2, 5, 10, 20]

rfr_hp_range={'n_estimators':n_estimators_range,
              'max_features':max_features_range,
              'max_depth':max_depth_range,
              'min_samples_split':min_samples_split_range,
              'min_samples_leaf':min_samples_leaf_range
             }
print(rfr_hp_range)

rfr_base=RandomForestRegressor()
rfr_random=RandomizedSearchCV(estimator=rfr_base,
                              param_distributions=rfr_hp_range,
                              n_iter=200,
                              n_jobs=-1,
                              cv=10,
                              verbose=1,
                              random_state=42)
rfr_random.fit(X_train,y_train)

best_hp_now=rfr_random.best_params_
print(best_hp_now)

{'n_estimators': [480, 485, 490, 495, 499], 'max_features': ['sqrt'], 'max_depth': [45, 47, 50, 52, 55, None], 'min_samples_split': [2, 5, 10, 20, 30], 'min_samples_leaf': [1, 2, 5, 10, 20]}
Fitting 10 folds for each of 200 candidates, totalling 2000 fits




{'n_estimators': 495, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 45}


In [4]:
n_estimators_range=[493,495,497]
max_features_range=['sqrt']
max_depth_range=[43,45,47]
max_depth_range.append(None)
min_samples_split_range=[2,5,10]
min_samples_leaf_range=[1,2,5,10]

rfr_hp_range={'n_estimators':n_estimators_range,
              'max_features':max_features_range,
              'max_depth':max_depth_range,
              'min_samples_split':min_samples_split_range,
              'min_samples_leaf':min_samples_leaf_range
             }
print(rfr_hp_range)

rfr_base=RandomForestRegressor()
rfr_random=RandomizedSearchCV(estimator=rfr_base,
                              param_distributions=rfr_hp_range,
                              n_iter=200,
                              n_jobs=-1,
                              cv=10,
                              verbose=1,
                              random_state=42)
rfr_random.fit(X_train,y_train)

best_hp_now=rfr_random.best_params_
print(best_hp_now)

{'n_estimators': [493, 495, 497], 'max_features': ['sqrt'], 'max_depth': [43, 45, 47, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 5, 10]}
Fitting 10 folds for each of 144 candidates, totalling 1440 fits




{'n_estimators': 495, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 43}


In [None]:
n_estimators_range=[800, 1000, 1200]
max_features_range=['sqrt']
max_depth_range=[150, 200]
max_depth_range.append(None)
min_samples_split_range=[2,5,10,20,50]
min_samples_leaf_range=[1, 2, 5, 10, 20]

rfr_hp_range={'n_estimators':n_estimators_range,
              'max_features':max_features_range,
              'max_depth':max_depth_range,
              'min_samples_split':min_samples_split_range,
              'min_samples_leaf':min_samples_leaf_range
             }
print(rfr_hp_range)

rfr_base=RandomForestRegressor()
rfr_random=RandomizedSearchCV(estimator=rfr_base,
                              param_distributions=rfr_hp_range,
                              n_iter=200,
                              n_jobs=-1,
                              cv=10,
                              verbose=1,
                              random_state=42)
rfr_random.fit(X_train,y_train)

best_hp_now=rfr_random.best_params_
print(best_hp_now)