# Simple linear regression

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

In [6]:
# Load the prepared training set; the cells below read the 'log_price_z' target from this frame
aparts_reduced = pd.read_csv('Datasets/aparts_train_ready.csv')

In [7]:
# Prepare data
# Drop both target columns plus 'Unnamed: 0' (if present) so the cross-validated model
# sees the same feature set as the train/test evaluation cells further down.
# errors='ignore' makes the drop a no-op for any column that is absent.
X_reduced = aparts_reduced.drop(columns=['price_z', 'log_price_z', 'Unnamed: 0'], errors='ignore')
y_reduced = aparts_reduced['log_price_z']

# Fit linear regression with 5-fold cross-validation
lr = LinearRegression()
cv_scores = cross_val_score(lr, X_reduced, y_reduced, cv=5, scoring='neg_root_mean_squared_error')

# Report average RMSE across folds
# (the scorer returns negated RMSE, hence the sign flip)
rmse_mean = -np.mean(cv_scores)
rmse_mean

np.float64(0.10597872131802363)

In [None]:
# Final evaluation of the plain linear model on the held-out test set.
#
# RMSE is computed with sklearn's `root_mean_squared_error`, already imported in the
# notebook's import cell. The former local re-definition called `mean_squared_error`,
# which is not imported until a later cell, so this cell raised NameError on a fresh
# kernel (Restart & Run All).

# Load test data
aparts_test = pd.read_csv('Datasets/aparts_test_ready.csv')  # make sure this file is up-to-date

# 1. Prepare training data
X_train = aparts_reduced.drop(columns=['price_z', 'log_price_z', 'Unnamed: 0'], errors='ignore')
y_train_log = aparts_reduced['log_price_z']

# 2. Prepare test data
X_test = aparts_test.drop(columns=['price_z', 'log_price_z', 'Unnamed: 0'], errors='ignore')
y_test_log = aparts_test['log_price_z']
y_test_price = aparts_test['price_z']

# Select the test features by the training columns so both frames have the same
# features in the same order; raises KeyError if the test file lacks a training column.
X_test = X_test[X_train.columns]

# 3. Train final model on full training data
lr = LinearRegression()
lr.fit(X_train, y_train_log)

# 4. Predict on test set
y_pred_log = lr.predict(X_test)
y_pred_price = np.expm1(y_pred_log)  # expm1(x) = exp(x) - 1

# 5. Evaluate
rmse_log = root_mean_squared_error(y_test_log, y_pred_log)
rmse_real = root_mean_squared_error(y_test_price, y_pred_price)

# 6. Report results
print("✅ Test RMSE (log scale):", round(rmse_log, 4))
print("✅ Test RMSE (real price scale):", round(rmse_real, 2), "currency units")

✅ Test RMSE (log scale): 0.106
✅ Test RMSE (real price scale): 96970.44 currency units


# Lasso regression

In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Return the RMSE of y_pred against y_true (square root of mean_squared_error)."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# --- Data ---------------------------------------------------------------
# Held-out test set
aparts_test = pd.read_csv('Datasets/aparts_test_ready.csv')

# Training target and features (both price columns and 'Unnamed: 0' are kept out of the features)
y_train_log = aparts_reduced['log_price_z']
X_train = aparts_reduced.drop(
    columns=['price_z', 'log_price_z', 'Unnamed: 0'],
    errors='ignore',
)

# Test targets: the log column and the raw price column
y_test_price = aparts_test['price_z']
y_test_log = aparts_test['log_price_z']

# Test features, selected by the training columns so the feature set and order match
# (a training column missing from the test file raises KeyError)
X_test = aparts_test.drop(
    columns=['price_z', 'log_price_z', 'Unnamed: 0'],
    errors='ignore',
)[X_train.columns]

# --- Model --------------------------------------------------------------
# Lasso with a fixed regularisation strength; fit() returns the estimator itself
lasso = Lasso(alpha=0.001, max_iter=10000, random_state=42).fit(X_train, y_train_log)

# --- Predictions --------------------------------------------------------
y_pred_log_lasso = lasso.predict(X_test)
y_pred_price_lasso = np.expm1(y_pred_log_lasso)  # expm1(x) = exp(x) - 1

# --- Metrics ------------------------------------------------------------
rmse_log_lasso = root_mean_squared_error(y_test_log, y_pred_log_lasso)
rmse_real_lasso = root_mean_squared_error(y_test_price, y_pred_price_lasso)

# --- Report -------------------------------------------------------------
print("✅ Lasso Test RMSE (log scale):", round(rmse_log_lasso, 4))
print("✅ Lasso Test RMSE (real price scale):", round(rmse_real_lasso, 2), "currency units")


✅ Lasso Test RMSE (log scale): 0.1084
✅ Lasso Test RMSE (real price scale): 97857.75 currency units


Now, let's run Lasso again, but with hyperparameter tuning.

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Return the RMSE of y_pred against y_true (square root of mean_squared_error)."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# --- Data ---------------------------------------------------------------
# Held-out test set
aparts_test = pd.read_csv('Datasets/aparts_test_ready.csv')

# Training target and features (both price columns and 'Unnamed: 0' are kept out of the features)
y_train_log = aparts_reduced['log_price_z']
X_train = aparts_reduced.drop(
    columns=['price_z', 'log_price_z', 'Unnamed: 0'],
    errors='ignore',
)

# Test targets: the log column and the raw price column
y_test_price = aparts_test['price_z']
y_test_log = aparts_test['log_price_z']

# Test features reindexed onto the training columns: extra test columns are dropped,
# training columns missing from the test file are created and filled with 0
X_test = aparts_test.drop(
    columns=['price_z', 'log_price_z', 'Unnamed: 0'],
    errors='ignore',
).reindex(columns=X_train.columns, fill_value=0)

# --- Model --------------------------------------------------------------
# LassoCV picks alpha by 5-fold cross-validation; fit() returns the estimator itself
lasso_cv = LassoCV(cv=5, random_state=42, max_iter=10000).fit(X_train, y_train_log)
best_alpha = lasso_cv.alpha_

# --- Predictions --------------------------------------------------------
y_pred_log = lasso_cv.predict(X_test)
y_pred_price = np.expm1(y_pred_log)  # expm1(x) = exp(x) - 1

# --- Metrics ------------------------------------------------------------
rmse_log = root_mean_squared_error(y_test_log, y_pred_log)
rmse_real = root_mean_squared_error(y_test_price, y_pred_price)

# --- Report -------------------------------------------------------------
print(f"✅ Best alpha: {best_alpha}")
print("✅ LassoCV Test RMSE (log scale):", round(rmse_log, 4))
print("✅ LassoCV Test RMSE (real price scale):", round(rmse_real, 2), "currency units")

✅ Best alpha: 0.0004889832730135739
✅ LassoCV Test RMSE (log scale): 0.1069
✅ LassoCV Test RMSE (real price scale): 97073.66 currency units


## Random Forest

In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Return the RMSE of y_pred against y_true (square root of mean_squared_error)."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# --- Data ---------------------------------------------------------------
# Held-out test set
aparts_test = pd.read_csv('Datasets/aparts_test_ready.csv')

# Training target and features (both price columns and 'Unnamed: 0' are kept out of the features)
y_train_log = aparts_reduced['log_price_z']
X_train = aparts_reduced.drop(
    columns=['price_z', 'log_price_z', 'Unnamed: 0'],
    errors='ignore',
)

# Test targets: the log column and the raw price column
y_test_price = aparts_test['price_z']
y_test_log = aparts_test['log_price_z']

# Test features reindexed onto the training columns: extra test columns are dropped,
# training columns missing from the test file are created and filled with 0
X_test = aparts_test.drop(
    columns=['price_z', 'log_price_z', 'Unnamed: 0'],
    errors='ignore',
).reindex(columns=X_train.columns, fill_value=0)

# --- Model --------------------------------------------------------------
# 100 trees with no depth limit, fixed seed, single worker
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=1)
rf.fit(X_train, y_train_log)

# --- Predictions --------------------------------------------------------
y_pred_log = rf.predict(X_test)
y_pred_price = np.expm1(y_pred_log)  # expm1(x) = exp(x) - 1

# --- Metrics ------------------------------------------------------------
rmse_log = root_mean_squared_error(y_test_log, y_pred_log)
rmse_real = root_mean_squared_error(y_test_price, y_pred_price)

# --- Report -------------------------------------------------------------
print("✅ Random Forest RMSE (log scale):", round(rmse_log, 4))
print("✅ Random Forest RMSE (real price scale):", round(rmse_real, 2), "currency units")


✅ Random Forest RMSE (log scale): 0.1083
✅ Random Forest RMSE (real price scale): 97378.98 currency units
