In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression ,  Lasso
from sklearn.model_selection import cross_val_score, KFold,  GridSearchCV 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from util.preprocess import preprocess
from util.geo_data import add_geo_data

In [2]:
df_original = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
outpath = "../result/"

In [7]:
df_processed = preprocess(df_original)
df_processed = add_geo_data(df_processed)

In [9]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162691 entries, 0 to 162690
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   MONTH                     162691 non-null  object 
 1   TOWN                      162691 non-null  object 
 2   FLAT_TYPE                 162691 non-null  object 
 3   BLOCK                     162691 non-null  object 
 4   STREET                    162691 non-null  object 
 5   FLOOR_RANGE               162691 non-null  object 
 6   FLOOR_AREA_SQM            162691 non-null  float64
 7   FLAT_MODEL                162691 non-null  object 
 8   LEASE_COMMENCE_DATA       162691 non-null  int64  
 9   RESALE_PRICE              162691 non-null  float64
 10  RESALE_YEAR               162691 non-null  int64  
 11  RESALE_MONTH              162691 non-null  int64  
 12  FLAT_AGE                  162691 non-null  int64  
 13  FLOOR_AVG                 162691 non-null  i

In [10]:
# remove BLOCK and STREET as the result from LASSO

target_col = 'RESALE_PRICE'
categorical_features = ['TOWN', 'FLAT_TYPE',  'FLAT_MODEL']
numerical_features = ['FLOOR_AREA_SQM', 'RESALE_YEAR', 'RESALE_MONTH', 
                      'FLAT_AGE', 'FLOOR_AVG', 'DIST_TO_NEAREST_MRT_M', 
                      'DIST_TO_NEAREST_HAWKER_M', 'DIST_TO_NEAREST_SHOP_M']

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

### simple linear regression > no log transform, with new features

In [12]:
# 5. Prepare data for the model and apply LOG transformation to the target
X = df_processed.drop('RESALE_PRICE', axis=1)
# Target variable is log-transformed due to right skewness
y = (df_processed['RESALE_PRICE']) 

# 6. Build the full model pipeline (using Linear Regression)
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

In [13]:
print("\nValidating model performance using 5-fold Cross-Validation...")

# Define CV method
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Calculate CV scores for required metrics
rmse_scores = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv))
mae_scores = -cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv)
r2_scores = cross_val_score(model, X, y, scoring='r2', cv=cv)


Validating model performance using 5-fold Cross-Validation...


 Convert log-error metrics back to original price space for better interpretability
-  Note: This is an approximation since back-transforming MAE/RMSE from log space is complex.
 We will present the log-space metrics as standard practice for log-transformed targets.

In [31]:
#### print("\nCross-Validation Results (Metrics are calculated on Price):")

# Primary Metric: RMSE
print(f"Primary Metric (RMSE): {rmse_scores.mean():.4f} (Avg across 5 folds)")

# Supporting Metrics: MAE and R-squared
print(f"Supporting Metric (MAE): {mae_scores.mean():.4f} (Avg across 5 folds)")
print(f"Supporting Metric (R-squared): {r2_scores.mean():.4f} (Avg across 5 folds)")

Primary Metric (RMSE): 62586.6689 (Avg across 5 folds)
Supporting Metric (MAE): 47767.3367 (Avg across 5 folds)
Supporting Metric (R-squared): 0.8833 (Avg across 5 folds)


In [16]:
#final model 
model.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
# predict test data
new_data_processed = preprocess(df_test)
new_data_processed = add_geo_data(new_data_processed)

# Make the prediction (this predicts log(price))
predicted_price = model.predict(new_data_processed)

In [20]:
predicted_price

array([516344.15664988, 630517.15257654, 510193.35675123, ...,
       568922.62291056, 634856.28407728, 399709.10639684], shape=(50000,))

In [21]:
predicted_df = pd.DataFrame(data={'predicted price': predicted_price}, index=df_test.index)

In [22]:
predicted_df.reset_index().rename(columns={"index":"id","predicted price":"Predicted"}) \
    .to_csv(outpath+"submission_baseline2_normal.csv",header=True, index=False)

## Next we try to use a LASSO regression, with grid search on its parameters

In [23]:
# Build the initial pipeline (Lasso needs to be named 'regressor' to match the param_grid)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', Lasso(random_state=42))])

# Define the parameter grid for alpha tuning
# We will search over a range of common alpha values for Lasso
param_grid = {
    'regressor__alpha': [ 0.001, 0.01, 0.1, 1.0, 10.0, 100]
}

# Define CV method (5-fold)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Set up GridSearchCV
# We use 'neg_mean_squared_error' because GridSearchCV maximizes the score, 
# and minimizing MSE is equivalent to maximizing negative MSE.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1, # Use all available cores
    verbose=1
)

In [24]:
grid_search.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'regressor__alpha': [0.001, 0.01, ...]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.01
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [25]:
best_model = grid_search.best_estimator_
best_alpha = grid_search.best_params_['regressor__alpha']
best_rmse_cv = np.sqrt(-grid_search.best_score_) # Convert negative MSE back to RMSE
print(f"\nOptimal Lasso Alpha found: {best_alpha}")


Optimal Lasso Alpha found: 0.01


In [26]:
mae_scores = -cross_val_score(best_model, X, y, scoring='neg_mean_absolute_error', cv=cv)
r2_scores = cross_val_score(best_model, X, y, scoring='r2', cv=cv)

print("\nCross-Validation Results (Metrics are calculated on Log-Transformed Price using optimal Lasso):")

# Primary Metric: RMSE
print(f"Primary Metric (RMSE): {best_rmse_cv:.4f} (from GridSearchCV)")
# Supporting Metrics: MAE and R-squared
print(f"Supporting Metric (MAE): {mae_scores.mean():.4f} (Avg across 5 folds)")
print(f"Supporting Metric (R-squared): {r2_scores.mean():.4f} (Avg across 5 folds)")

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(



Cross-Validation Results (Metrics are calculated on Log-Transformed Price using optimal Lasso):
Primary Metric (RMSE): 62586.7330 (from GridSearchCV)
Supporting Metric (MAE): 47767.3367 (Avg across 5 folds)
Supporting Metric (R-squared): 0.8833 (Avg across 5 folds)


  model = cd_fast.sparse_enet_coordinate_descent(


In [27]:
# Make the prediction (this predicts log(price))
predicted_price = best_model.predict(new_data_processed)

In [28]:

predicted_df = pd.DataFrame(data={'predicted price': predicted_price}, index=df_test.index)
predicted_df.reset_index().rename(columns={"index":"id","predicted price":"Predicted"}) \
    .to_csv(outpath+"submission_lasso_2.csv",header=True, index=False)

In [29]:
print("\n--- Feature Coefficients (Lasso Feature Selection) ---")

# 1. Get feature names from the preprocessor's transformers
preprocessor = best_model.named_steps['preprocessor']

# Get names of all transformed features: numerical (scaled) + categorical (one-hot encoded)
# Note: get_feature_names_out() automatically includes prefixes like 'cat__TOWN_'
cat_feature_names = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
all_feature_names = numerical_features + cat_feature_names

# 2. Get the coefficients from the best Lasso model
lasso_coefficients = best_model.named_steps['regressor'].coef_

# 3. Create a DataFrame to inspect coefficients
coeff_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient': lasso_coefficients
}).sort_values(by='Coefficient', key=abs, ascending=False)

print(f"\nTotal features used: {len(coeff_df)}")
# Use a small tolerance (1e-6) for "zero" due to floating point arithmetic
zero_coeffs_count = (coeff_df['Coefficient'].abs() < 1e-6).sum()
print(f"Features whose coefficient was set to zero by Lasso: {zero_coeffs_count}")


--- Feature Coefficients (Lasso Feature Selection) ---

Total features used: 62
Features whose coefficient was set to zero by Lasso: 1


In [30]:
print(coeff_df[:50])
# 1st round: we observe that BLOCK and STREET are both irrelevant
# 2nd round: we can potentially group flat model types that are set to 0 with to "others" to reduce noise

                              Feature    Coefficient
59                 FLAT_MODEL_terrace  361031.519366
61                 FLAT_MODEL_type s2  257994.657879
14                   TOWN_bukit timah  257709.921997
60                 FLAT_MODEL_type s1  218388.788402
23                 TOWN_marine parade  175121.778341
47     FLAT_MODEL_improved maisonette  159084.769438
10                        TOWN_bishan  149935.555459
26                    TOWN_queenstown  144161.368559
12                   TOWN_bukit merah  143704.385619
55  FLAT_MODEL_premium apartment loft  132414.444721
40         FLAT_TYPE_multi generation  120318.026924
15                  TOWN_central area  116363.418162
31                     TOWN_toa payoh  109871.981187
22               TOWN_kallang/whampoa  107668.398389
0                      FLOOR_AREA_SQM  104733.700771
18                       TOWN_geylang  102102.509859
27                     TOWN_sembawang  -98107.491278
45                    FLAT_MODEL_dbss   94431.