In [None]:
%reload_ext autoreload
%autoreload 2

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from preprocessing_train_test import *
# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_columns', 100)  # show up to 100 columns
pd.set_option('display.max_rows', 100)   # show up to 100 rows

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler


import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV



In [None]:
# Locations of the merged train / test CSV files, relative to the notebook.
file_path_train = "../data/backup/merge_auxiliary_data_train.csv"
file_path_test = "../data/backup/merge_auxiliary_data_test.csv"

# Read both splits into DataFrames in one pass.
train_gdf, test_gdf = (pd.read_csv(path) for path in (file_path_train, file_path_test))

# Report the (rows, columns) shape of each split as a quick sanity check.
for label, frame in (("train", train_gdf), ("test", test_gdf)):
    print(f"{label} dataset shape: ", frame.shape)

In [None]:
# Remove "opening_year" from both splits in place. errors='ignore' turns the
# call into a no-op when the column is absent, so re-running the cell is safe.
for frame in (train_gdf, test_gdf):
    frame.drop(columns=["opening_year"], inplace=True, errors='ignore')

# Columns stripped from the feature matrix before fitting (the modelling cells
# pass this list to DataFrame.drop with errors='ignore').
drop_columns = [
    # prediction target — read separately as y
    "resale_price",
    # remaining columns excluded from the features
    "town", "block", "flat_type", "street_name", "storey_range", "flat_model",
    "eco_category", "lease_commence_date", "elevation",
    "subzone", "planning_area", "region",
    "mrt_name", "mrt_type", "codes", "codes_name",
    # spelling kept exactly as written; presumably matches the column name in the data — TODO confirm
    "type_commerical",
]


In [None]:
# Preview the first rows of the training frame (bare last expression -> rich display).
train_gdf.head()

In [None]:
def getAVG(list_num):
    """Return the arithmetic mean of a sized collection of numbers.

    Parameters
    ----------
    list_num : collection of numbers
        Must support both ``sum()`` and ``len()`` (e.g. a list or tuple).

    Returns
    -------
    The sum of the elements divided by their count.

    Raises
    ------
    ZeroDivisionError
        If ``list_num`` is empty.
    """
    total = sum(list_num)
    count = len(list_num)
    return total / count

# Linear Regression

In [None]:
# 5-fold cross-validation of an ordinary least-squares baseline on min-max
# scaled features.
#
# NOTE: despite their names, mae_test / mse_test collect the scores measured on
# the *training* folds; mae_valid / mse_valid collect the scores on the held-out
# fold. The names are kept because the summary cell that follows reads them.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds — and therefore the reported
# scores — reproducible across kernel restarts.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in kfold.split(train_gdf):
    # Preprocess the training and held-out parts of this fold together via the
    # project helper.
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])

    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    # Min-max scale the features. The scaler is fitted on the training fold
    # only and then applied to the held-out fold, so no validation statistics
    # leak into training.
    scaler = MinMaxScaler()
    x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
    x_valid = pd.DataFrame(scaler.transform(x_valid), columns=x_valid.columns)

    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    # Record training-fold and held-out-fold errors for this split.
    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

In [None]:
# Average every metric over the folds. Resulting order:
# [MAE on training folds, MAE on validation folds,
#  MSE on training folds, MSE on validation folds]
result = list(map(getAVG, (mae_test, mae_valid, mse_test, mse_valid)))
result

# Lasso

In [None]:
# 5-fold cross-validation of a Lasso (L1-regularised) linear model.
#
# NOTE: despite their names, mae_test / mse_test collect the scores measured on
# the *training* folds; mae_valid / mse_valid collect the scores on the held-out
# fold. The names are kept because the summary cell that follows reads them.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds — and therefore the reported
# scores — reproducible across kernel restarts.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in kfold.split(train_gdf):
    # Preprocess the training and held-out parts of this fold together via the
    # project helper.
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])

    # NOTE(review): unlike the linear-regression cell, the features are not
    # rescaled here; an L1 penalty is sensitive to feature scale — confirm this
    # is intended.
    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    regressor = Lasso(alpha=1.0, fit_intercept=True, max_iter=500)
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    # Record training-fold and held-out-fold errors for this split.
    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

In [None]:
# Average every metric over the folds. Resulting order:
# [MAE on training folds, MAE on validation folds,
#  MSE on training folds, MSE on validation folds]
result = list(map(getAVG, (mae_test, mae_valid, mse_test, mse_valid)))
result

# Ridge Model

In [None]:
# Hyperparameter grid for the Ridge model evaluated in the following cell.
# 'normalize' is deliberately not part of the grid: the option was deprecated
# in scikit-learn 1.0 and removed in 1.2, so listing it makes the search fail
# on current releases.
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'max_iter': [100, 500, 1000],
}

# Search over Ridge — the estimator of this section, whose parameters are
# hard-coded in the following cell — rather than Lasso.
model = Ridge()

# Exhaustive search over the grid using 2-fold cross-validation (cv=2).
grid_search = GridSearchCV(model, param_grid, cv=2)

# NOTE(review): x_train / y_train are not defined in this cell; they are
# whatever the most recently executed cross-validation cell left behind (the
# training part of its last fold). Run that cell first.
grid_search.fit(x_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

In [None]:
# 5-fold cross-validation of a Ridge (L2-regularised) linear model.
#
# NOTE: despite their names, mae_test / mse_test collect the scores measured on
# the *training* folds; mae_valid / mse_valid collect the scores on the held-out
# fold.
mae_test, mae_valid, mse_test, mse_valid = [], [], [], []

# A fixed random_state makes the shuffled folds — and therefore the reported
# scores — reproducible across kernel restarts.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, test_idx in kfold.split(train_gdf):
    # Preprocess the training and held-out parts of this fold together via the
    # project helper.
    train_df, test_df = preprocess_train_test(train_gdf.iloc[train_idx], train_gdf.iloc[test_idx])
    x_train, y_train = train_df.drop(columns=drop_columns, errors='ignore'), train_df["resale_price"]
    x_valid, y_valid = test_df.drop(columns=drop_columns, errors='ignore'), test_df["resale_price"]

    # Recorded parameters (presumably the best_params_ printed by the grid search):
    # {'alpha': 0.01, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False}
    # normalize is not passed: the keyword was removed from Ridge in
    # scikit-learn 1.2 (passing it raises TypeError), and False was its default
    # before removal, so omitting it yields the same model on older releases.
    regressor = Ridge(alpha=0.01, fit_intercept=True, max_iter=1000)
    regressor.fit(x_train, y_train)
    y_train_predict = regressor.predict(x_train)
    y_valid_predict = regressor.predict(x_valid)

    # Record training-fold and held-out-fold errors for this split.
    mae_test.append(mean_absolute_error(y_train, y_train_predict))
    mae_valid.append(mean_absolute_error(y_valid, y_valid_predict))
    mse_test.append(mean_squared_error(y_train, y_train_predict))
    mse_valid.append(mean_squared_error(y_valid, y_valid_predict))

# Fold-averaged scores, ordered
# [MAE on training folds, MAE on validation folds,
#  MSE on training folds, MSE on validation folds].
result = [getAVG(i) for i in [mae_test, mae_valid, mse_test, mse_valid]]
result