In [None]:
%reload_ext autoreload
%autoreload 2

import itertools

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from preprocessing_train_test import *
pd.set_option('display.max_columns', 100)  # show up to 100 columns when displaying a DataFrame
pd.set_option('display.max_rows', 100)   # show up to 100 rows when displaying a DataFrame

In [None]:
# Paths (relative to the notebook's working directory) of the train/test CSV files.
file_path_train = "../data/backup/merge_auxiliary_data_train.csv"
file_path_test = "../data/backup/merge_auxiliary_data_test.csv"

# Read both files in one go: train first, then test.
train_gdf, test_gdf = map(pd.read_csv, (file_path_train, file_path_test))

# Report (rows, columns) of each frame as a quick sanity check.
for caption, frame in (("train dataset shape: ", train_gdf), ("test dataset shape: ", test_gdf)):
    print(caption, frame.shape)

In [None]:
# print(train_gdf.groupby(['storey_range'])['resale_price'].mean().index)
# train_gdf[train_gdf['storey_range']=='31 to 33'].index.size

In [None]:
# train_gdf["street_name"] = train_gdf["street_name"].str.lower()
# train_gdf.groupby(['street_name']).count().sort_values(by='month', ascending=True).head(20)

In [None]:
# Grid search over random-forest hyperparameters with 5-fold cross-validation.
# `preprocess_train_test` is called separately for every fold, with that fold's
# training rows and validation rows.
#
# Result: `scores` maps (max_depth, max_feature, min_samples_split, min_samples_leaf)
# to a tuple (mae_test, mae_valid, mse_test, mse_valid). Every element is a list with
# one value per fold, in fold order.
# NOTE: despite their names, `mae_test` / `mse_test` hold the errors measured on the
# TRAINING part of each fold; `*_valid` hold the errors on the held-out part.
max_depths = [10, 15, 18, 20]
max_features = [0.4, 0.6, 0.8, 1.0]
# scikit-learn interprets float values here as a fraction of the number of training samples.
min_samples_splits = [2, 0.00001]
min_samples_leafs = [1, 0.00001]

# One seed for the fold assignment and for every forest, so that differences between
# combinations come from the hyperparameters and not from bootstrap/feature-sampling noise.
cv_seed = 2958

# Same iteration order as four nested loops (max_depth outermost, min_samples_leaf innermost).
param_grid = list(itertools.product(max_depths, max_features, min_samples_splits, min_samples_leafs))
scores = {params: ([], [], [], []) for params in param_grid}

# Folds are the OUTER loop: preprocessing does not depend on the hyperparameters, so it
# runs once per fold (5 times) instead of once per (combination, fold) pair.
split = KFold(n_splits=5, shuffle=True, random_state=cv_seed).split(train_gdf)
for idx, (train_idx, valid_idx) in enumerate(split):
    print('fold {}'.format(idx + 1))
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[valid_idx])

    x_train, y_train = train_df.drop(columns=["resale_price"], errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=["resale_price"], errors='ignore'), test_df["resale_price"]

    for params in param_grid:
        max_depth, max_feature, min_samples_split, min_samples_leaf = params
        print('start training with max_depth: {}, max_feature: {}, min_samples_split: {}, min_samples_leaf: {}'.format(max_depth, max_feature, min_samples_split, min_samples_leaf))

        regressor = RandomForestRegressor(
            n_estimators=10,
            max_depth=max_depth,
            max_features=max_feature,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            n_jobs=3,
            random_state=cv_seed,
        ).fit(x_train, y_train)
        y_train_predict = regressor.predict(x_train)
        y_valid_predict = regressor.predict(x_valid)

        # Append this fold's metrics to the per-combination lists stored in `scores`.
        mae_test, mae_valid, mse_test, mse_valid = scores[params]
        mae_test.append(mean_absolute_error(y_train, y_train_predict))
        mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
        mse_test.append(mean_squared_error(y_train, y_train_predict))
        mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

# Per-combination summary: the key followed by its four per-fold metric lists.
for params, fold_scores in scores.items():
    print(params, *fold_scores)

In [None]:
# Select the hyperparameter combination with the lowest mean validation MAE and the one
# with the lowest mean validation MSE (means are taken over the folds stored in `scores`).
# Start from +inf rather than a hand-picked large number, so a combination is always
# selected regardless of the scale of the target; with an empty `scores` the keys stay None.
min_mae_key, min_mae = None, float('inf')
min_mse_key, min_mse = None, float('inf')
for key, (_, mae_valid, _, mse_valid) in scores.items():
    mean_mae_valid = np.mean(mae_valid)
    mean_mse_valid = np.mean(mse_valid)
    # Strict '<' keeps the first combination encountered when two means tie.
    if mean_mae_valid < min_mae:
        min_mae_key, min_mae = key, mean_mae_valid
    if mean_mse_valid < min_mse:
        min_mse_key, min_mse = key, mean_mse_valid
print(min_mae_key, min_mae)
print(min_mse_key, min_mse)

In [None]:
# Stand-alone 5-fold cross-validation of one hyperparameter combination.
# NOTE: despite their names, `mae_test` / `mse_test` collect the errors measured on the
# TRAINING part of each fold; `*_valid` collect the errors on the held-out part.

# Seed both the fold assignment and the forest so that re-running the cell reproduces
# the same numbers (same seed as the grid-search cell; change it for an independent split).
cv_seed = 2958

mae_test, mae_valid, mse_test, mse_valid = [], [], [], []
split = KFold(n_splits=5, shuffle=True, random_state=cv_seed).split(train_gdf)
for idx, (train_idx, valid_idx) in enumerate(split):
    print('fold {}'.format(idx + 1))
    # `preprocess_train_test` is called per fold with that fold's train/validation rows.
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[valid_idx])

    x_train, y_train = train_df.drop(columns=["resale_price"], errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=["resale_price"], errors='ignore'), test_df["resale_price"]

    regressor = RandomForestRegressor(
        n_estimators=10,
        max_depth=20,
        max_features=0.6,
        # Floats are interpreted by scikit-learn as a fraction of the training samples.
        min_samples_split=0.00001,
        min_samples_leaf=0.00001,
        n_jobs=3,
        random_state=cv_seed,
    ).fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

# Per-fold lists: training MAE, validation MAE, training MSE, validation MSE.
print(mae_test, mae_valid, mse_test, mse_valid)

In [None]:
# Fit the random forest with the chosen hyperparameters on the full training set.
# Rebind instead of mutating in place; errors='ignore' makes the drop a no-op when the
# column is already absent, so the cell can be re-run safely.
test_gdf = test_gdf.drop(columns=["opening_year"], errors='ignore')
train_df, test_df = preprocess_train_test(train_gdf, test_gdf)

x_train, y_train = train_df.drop(columns=["resale_price"], errors='ignore'), train_df["resale_price"]

regressor = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    max_features=0.6,
    # Floats are interpreted by scikit-learn as a fraction of the training samples.
    min_samples_split=0.00001,
    min_samples_leaf=0.00001,
    n_jobs=3,
    # Fixed seed so the final model is reproducible across runs.
    random_state=2958,
).fit(x_train, y_train)
y_train_predict = regressor.predict(x_train)

# Every row of `train_df` was used for fitting, so only the (optimistic) training
# error can be reported here: MAE first, then MSE.
print(mean_absolute_error(y_train, y_train_predict))
print(mean_squared_error(y_train, y_train_predict))