In [None]:
%reload_ext autoreload
%autoreload 2

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
# NOTE(review): star import from a project-local module. `pd` and
# `preprocess_train_test`, which are used below, are presumably provided by it
# because no visible cell imports them by name — confirm.
from preprocessing_train_test import *
pd.set_option('display.max_columns', 100)  # show up to 100 columns
pd.set_option('display.max_rows', 100)   # show up to 100 rows

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning raised after this point (this also hides
# deprecation and convergence warnings from the libraries used below).
warnings.filterwarnings("ignore")


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV



In [None]:
# Locations of the merged training / test CSV files, relative to the notebook.
file_path_train = "../data/backup/merge_auxiliary_data_train.csv"
file_path_test = "../data/backup/merge_auxiliary_data_test.csv"

# Read both files with the default pandas CSV settings.
train_gdf, test_gdf = map(pd.read_csv, (file_path_train, file_path_test))

# Report (rows, columns) of each frame as a quick sanity check.
print("train dataset shape: ", train_gdf.shape)
print("test dataset shape: ", test_gdf.shape)

In [None]:
# `opening_year` has lots of NaN values, so it is removed from both frames.
# TODO: find out why so many values are missing.
# The names are rebound to new frames instead of mutating in place, so no other
# reference to the loaded data is silently changed; errors='ignore' keeps the
# cell safe to re-run once the column is already gone.
train_gdf = train_gdf.drop(columns=["opening_year"], errors='ignore')
test_gdf = test_gdf.drop(columns=["opening_year"], errors='ignore')

# Columns removed from the feature matrix before modelling: the target
# (`resale_price`) and the other columns listed below.
# NOTE(review): "type_commerical" looks misspelled; it is kept verbatim because
# it has to match the column name in the data — confirm.
drop_columns = [
    "resale_price",
    "town",
    "block",
    "flat_type",
    "street_name",
    "storey_range",
    "flat_model",
    "eco_category",
    "lease_commence_date",
    "elevation",
    "subzone",
    "planning_area",
    "region",
    "mrt_name",
    "mrt_type",
    "codes",
    "codes_name",
    "type_commerical",
]


In [None]:
# Display the first rows of the training frame as a visual check.
train_gdf.head()

In [None]:
# Get one fold of training and validation data out of a shuffled 5-fold split.
# A fixed random_state pins the shuffle, so the same rows land in the fold on
# every run and the scores computed from it are reproducible.
split = KFold(n_splits=5, shuffle=True, random_state=42).split(train_gdf)
train_idx, test_idx = next(split)

# preprocess_train_test is a project helper; it receives the training part and
# the held-out part of the fold and returns the processed pair.
train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])

# Features are everything not listed in drop_columns; the target is cast to int
# (any fractional part of resale_price is truncated by the cast).
x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"].astype(int)
x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"].astype(int)

In [None]:
# Print the shape of each feature matrix / target vector of the current fold.
for label, data in (
    ("x_train_shape", x_train),
    ("y_train_shape", y_train),
    ("x_valid_shape", x_valid),
    ("y_valid_shape", y_valid),
):
    print(label, data.shape)

In [None]:
# Display the first target values of the training part of the fold.
y_train.head()

# XGBoost

In [None]:
# Hyper-parameter grid for the search. Every list holds a single value, so the
# search evaluates exactly one configuration (with 5-fold CV).
param_grid = dict(
    max_depth=[5],
    learning_rate=[0.1],
    n_estimators=[1000],
)

# Gradient-boosted regressor trained with a squared-error objective.
model = xgb.XGBRegressor(objective='reg:squarederror')

# Candidates are ranked by negative MAE; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)


In [None]:
# Run the grid search on the training part of the fold, then report the
# parameter combination it selected.
grid_search.fit(x_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

In [None]:
# Predict on the held-out fold with the fitted search object and report the
# mean absolute error.
# NOTE(review): the message says "testing set", but the data scored here is the
# validation fold (x_valid / y_valid) taken from train_gdf, not test_gdf.
y_pred = grid_search.predict(x_valid)
mae = mean_absolute_error(y_valid, y_pred)
print("MAE on testing set: ", mae)

# Linear Regression

In [None]:
# Empty metric accumulators; nothing is appended to them in this cell.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# Only the first fold is needed here, to obtain one preprocessed
# train/validation pair for inspection. A fixed random_state makes the shuffled
# split — and therefore the previewed rows — identical on every run.
split = KFold(n_splits=5, shuffle=True, random_state=42).split(train_gdf)
train_idx, test_idx = next(split)
train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale the data features with min-max scaling
# Fit a MinMaxScaler on every column of the preprocessed training fold and
# rebuild a DataFrame that keeps the original column names.
# NOTE(review): this rescales all columns of train_df (including
# `resale_price` if it is still present) and overwrites train_df — confirm
# that this is only meant as a visual preview.
scaler = MinMaxScaler()
train_df = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
train_df

In [None]:
# 5-fold cross-validation of an ordinary least-squares model on min-max scaled
# features.
# NOTE: despite their names, mae_test / mse_test collect the errors measured on
# the TRAINING part of each fold; mae_valid / mse_valid hold the held-out
# errors. The names are kept because later cells read them.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds, and hence the averaged scores,
# reproducible from one run to the next.
split = KFold(n_splits=5, shuffle=True, random_state=42).split(train_gdf)
for train_idx, test_idx in split:
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])

    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    # The scaler is fitted on the training part only and then applied to the
    # held-out part, so no statistics leak from validation into training.
    scaler = MinMaxScaler()
    x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
    x_valid = pd.DataFrame(scaler.transform(x_valid), columns=x_valid.columns)

    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

In [None]:
def getAVG(list_num):
    """Return the arithmetic mean of the numbers in ``list_num``.

    ``list_num`` must be a sized collection of numbers (it is passed to both
    ``sum`` and ``len``). An empty collection raises ``ZeroDivisionError``.
    """
    count = len(list_num)
    total = sum(list_num)
    return total / count

In [None]:
# Average each metric over the folds, in the order:
# mae_test, mae_valid, mse_test, mse_valid.
result = list(map(getAVG, (mae_test, mae_valid, mse_test, mse_valid)))
result

# Lasso Model

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for a Lasso grid search (currently disabled).
# NOTE(review): `normalize` is no longer accepted by recent scikit-learn
# linear models — drop it from the grid before re-enabling the search.
# param_grid = {
#     'alpha': [0.01, 0.1, 1.0, 10.0],
#     'fit_intercept': [True, False],
#     'normalize': [True, False],
#     'max_iter': [100, 500, 1000],
# }

# Create a Lasso model with fixed hyper-parameters (no search is performed).
model = Lasso(alpha=1.0, fit_intercept=True, max_iter = 500)

# Disabled: grid search over param_grid using 5-fold cross-validation.
# grid_search = GridSearchCV(model, param_grid, cv=5)

# Fit the Lasso model directly. x_train / y_train are whatever the previously
# executed cell left behind (when run top to bottom: the last fold of the
# linear-regression loop, with min-max scaled features).
model.fit(x_train, y_train)

# Disabled: print the best hyper-parameters found by the grid search.
# print(grid_search.best_params_)

In [None]:
# 5-fold cross-validation of a Lasso model on the unscaled features.
# NOTE: despite their names, mae_test / mse_test collect the errors measured on
# the TRAINING part of each fold; mae_valid / mse_valid hold the held-out
# errors. The names are kept because later cells read them.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds, and hence the averaged scores,
# reproducible from one run to the next.
split = KFold(n_splits=5, shuffle=True, random_state=42).split(train_gdf)
for train_idx, test_idx in split:
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])
    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    # NOTE(review): features are not rescaled here, unlike the linear-regression
    # loop; the L1 penalty is sensitive to feature scale — confirm this is intended.
    regressor = Lasso(alpha=1.0, fit_intercept=True, max_iter=500)
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

In [None]:
# Average each Lasso metric over the folds, in the order:
# mae_test, mae_valid, mse_test, mse_valid.
result = list(map(getAVG, (mae_test, mae_valid, mse_test, mse_valid)))
result

# Ridge Model

In [None]:
# param_grid = {
#     'alpha': [0.01, 0.1, 1.0, 10.0],
#     'fit_intercept': [True, False],
#     'normalize': [True, False],
#     'max_iter': [100, 500, 1000],
# }

# # Create a Lasso model
# model = Lasso()

# # Perform a grid search over the hyperparameter grid using 5-fold cross-validation
# grid_search = GridSearchCV(model, param_grid, cv=2)

# # Fit the grid search to the training data
# grid_search.fit(x_train, y_train)

# # Print the best hyperparameters
# print(grid_search.best_params_)

In [None]:
# 5-fold cross-validation of a Ridge model on the unscaled features.
# NOTE: despite their names, mae_test / mse_test collect the errors measured on
# the TRAINING part of each fold; mae_valid / mse_valid hold the held-out
# errors. The names are kept because the aggregation below reads them.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds, and hence the averaged scores,
# reproducible from one run to the next.
split = KFold(n_splits=5, shuffle=True, random_state=42).split(train_gdf)
for train_idx, test_idx in split:
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])
    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    # Parameters recorded from an earlier search:
    # {'alpha': 0.01, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False}
    # `normalize` is not passed: the keyword was removed from Ridge in
    # scikit-learn 1.2 (passing it raises TypeError), and omitting it matches
    # the recorded normalize=False because no normalisation is applied.
    regressor = Ridge(alpha=0.01, fit_intercept=True, max_iter=1000)
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

# Average each metric over the folds, in the order:
# mae_test, mae_valid, mse_test, mse_valid.
result = [getAVG(i) for i in [mae_test, mae_valid, mse_test, mse_valid]]
result

In [None]:
# scores = {}
# for alpha in param_grid['alpha']:
#     for fit_intercept in param_grid['fit_intercept']:
#         for normalize in param_grid['normalize']:
#             for max_iter in param_grid['max_iter']:
#                 print('start training with alpha: {}, fit_intercept: {}, normalize: {}, max_iter: {}'.format(v, fit_intercept, normalize, max_iter))
#                 mae_test, mae_valid, mse_test, mse_valid = [], [], [], []
#                 split = KFold(n_splits=5, shuffle=True).split(train_gdf)
#                 for idx, (train_idx, test_idx) in enumerate(split):
#                     print('fold {}'.format(idx + 1))
#                     train_df, test_df = preprocess_train_test(train_gdf.iloc[list(train_idx)], train_gdf.iloc[list(test_idx)])

#                     x_train, y_train = train_df.drop(columns=["resale_price"], errors='ignore'), train_df["resale_price"]
#                     x_valid, y_valid = test_df.drop(columns=["resale_price"], errors='ignore'), test_df["resale_price"]

#                     regressor = Ridge(alpha = alpha, fit_intercept = fit_intercept, normalize = normalize, max_iter = max_iter).fit(x_train, y_train)
#                     y_train_predict = regressor.predict(x_train)
#                     y_valid_predict = regressor.predict(x_valid)

#                     mae_test.append(mean_absolute_error(y_train, y_train_predict))
#                     mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
#                     mse_test.append(mean_squared_error(y_train, y_train_predict))
#                     mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

#                 result = [getAVG(i) for i in [mae_test, mae_valid, mse_test, mse_valid]]
#                 scores[(max_depth, max_feature, min_samples_split, min_samples_leaf)] = result


In [None]:
# Imports used by this cell. StandardScaler is imported explicitly because no
# visible cell imports it by name, so the cell would otherwise depend on the
# star import at the top of the notebook happening to export it.
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# 5-fold cross-validation of a default XGBRegressor on standardized features.
# NOTE: despite their names, mae_test / mse_test collect the errors measured on
# the TRAINING part of each fold; mae_valid / mse_valid hold the held-out
# errors. The names are kept because the next cell reads them.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds, and hence the averaged scores,
# reproducible from one run to the next.
split = KFold(n_splits=5, shuffle=True, random_state=42).split(train_gdf)
for train_idx, test_idx in split:
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])

    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    # Standardize the features: the scaler is fitted on the training part only
    # and then applied to the held-out part, so no statistics leak across.
    scaler = StandardScaler()
    x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
    x_valid = pd.DataFrame(scaler.transform(x_valid), columns=x_valid.columns)

    regressor = XGBRegressor()
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))


In [None]:
# Average each XGBoost metric over the folds, in the order:
# mae_test, mae_valid, mse_test, mse_valid.
result = list(map(getAVG, (mae_test, mae_valid, mse_test, mse_valid)))
result