# Develop two models using only the real estate transaction data:


## One to estimate property sale prices.
## Another to estimate property rental prices. (15 points)


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# Load the dataset
data = pd.read_csv("Cleaned Datasets\\Rents & Transactions\\rents_annual.csv")

# Preprocess the data
data['Annual Amount'] = data['Annual Amount'] / data['Count']  # Normalize by count
data['Property Size (sq.m)'] = data['Property Size (sq.m)'] / data['Count']  # Normalize by count
data['Contract Amount'] = data['Contract Amount'] / data['Count']  # Normalize by count
data = data.drop(columns=['Count'])  # Drop the Count column

# Encode categorical variables
data = pd.get_dummies(data, columns=['Version', 'Area', 'Property Type', 'Property Sub Type', 'Usage', 'Is Free Hold?'], drop_first=True)

# Split the data into features (X) and target (y)
X = data.drop(columns=['Annual Amount', 'Nearest Metro', 'Nearest Mall', 'Nearest Landmark'])
y = data['Annual Amount']

# Scale numerical features
scaler = StandardScaler()
X[['Property Size (sq.m)', 'Contract Amount']] = scaler.fit_transform(X[['Property Size (sq.m)', 'Contract Amount']])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor with hyperparameter tuning
model = RandomForestRegressor(random_state=42, verbose=1)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Feature importance
feature_importances = best_model.feature_importances_
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance}")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   39.2s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   43.3s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   42.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   42.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   41.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   41.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  2.8min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   41.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  2.8min
[Parallel(n_jobs=1)]: Do

Mean Absolute Error: 1459660061.4849474
R-squared: -0.04320165179370283
Year: 0.01562542423480566
Contract Amount: 0.5444920016893242
Property Size (sq.m): 0.010726734289925283
Version_Renewed: 2.002047153588772e-11
Area_Al Baraha: 2.0807215887078657e-13
Area_Al Barsha First: 4.097718756314643e-12
Area_Al Barsha Second: 3.293543962942823e-13
Area_Al Barsha South Fifth: 1.6555055447144952e-12
Area_Al Barsha South Fourth: 2.2815723468653647e-14
Area_Al Barsha Third: 5.803656832071482e-14
Area_Al Barshaa South First: 5.257816376169218e-15
Area_Al Barshaa South Second: 1.3897242883978881e-12
Area_Al Barshaa South Third: 1.889100808264146e-13
Area_Al Buteen: 1.092232849604863e-13
Area_Al Dhagaya: 1.1169071822564735e-13
Area_Al Garhoud: 2.236045738945551e-11
Area_Al Goze First: 1.0678196419688115e-11
Area_Al Goze Fourth: 2.6284576858573925e-14
Area_Al Goze Industrial First: 1.1011842085008944e-11
Area_Al Goze Industrial Fourth: 7.949920282241982e-14
Area_Al Goze Industrial Second: 2.13076621

In [None]:
# Select top features based on importance
top_features = ['Area_Burj Khalifa', 'Area_Trade Center Second', 'Property Sub Type_Shop', 'Property Sub Type_Office', 'Is Free Hold?_Non Free Hold']
X_top = X[top_features]

# Split into training and testing sets
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model_top = RandomForestRegressor(random_state=42, verbose=1)
model_top.fit(X_train_top, y_train)

# Make predictions
y_pred_top = model_top.predict(X_test_top)

# Evaluate the model
mae_top = mean_absolute_error(y_test, y_pred_top)
r2_top = r2_score(y_test, y_pred_top)
print(f"Mean Absolute Error (Top Features): {mae_top}")
print(f"R-squared (Top Features): {r2_top}")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.6s


Mean Absolute Error (Top Features): 1355561195.9214294
R-squared (Top Features): -0.03976242232548066


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [22]:
from xgboost import XGBRegressor

# Train an XGBoost Regressor with increased verbosity
xgb_model = XGBRegressor(random_state=42, n_estimators=100, max_depth=5, learning_rate=0.1, verbosity=3)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"Mean Absolute Error (XGBoost): {mae_xgb}")
print(f"R-squared (XGBoost): {r2_xgb}")

[08:22:32] AllReduce: 0.00193s, 1 calls @ 1930us

[08:22:32] MakeCuts: 0.003031s, 1 calls @ 3031us

[08:22:32] DEBUG: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-06abd128ca6c1688d-1\xgboost\xgboost-ci-windows\src\gbm\gbtree.cc:130: Using tree method: 0
[08:22:32] Configure: 0.001781s, 1 calls @ 1781us

[08:22:32] EvalOneIter: 0.000653s, 100 calls @ 653us

[08:22:32] GetGradient: 0.001874s, 100 calls @ 1874us

[08:22:32] PredictRaw: 0.000104s, 100 calls @ 104us

[08:22:32] UpdateOneIter: 0.26712s, 100 calls @ 267120us

[08:22:32] BoostNewTrees: 0.261465s, 100 calls @ 261465us

[08:22:32] CommitModel: 6.2e-05s, 100 calls @ 62us

[08:22:32] BuildHistogram: 0.030349s, 400 calls @ 30349us

[08:22:32] EvaluateSplits: 0.014708s, 500 calls @ 14708us

[08:22:32] InitData: 0.006045s, 100 calls @ 6045us

[08:22:32] InitRoot: 0.190601s, 100 calls @ 190601us

[08:22:32] LeafPartition: 1.6e-05s, 100 calls @ 16us

[08:22:32] UpdatePosition: 0.013339s, 500 calls @ 13339us

[08:

# Create two additional models incorporating both real estate data and supplementary datasets:
## One to estimate property sale prices.
## Another to estimate property rental prices. (15 points)


# Evaluate the performance of all four models using metrics such as RMSE, MSE, and RÂ². Analyze whether the inclusion of additional data enhances model accuracy. (10 points)

You should at least train your model on these 9 features but you are free to add more:
 - Property Type or/and Property Sub Type
 - Number of rooms/bedrooms
 - Property Size (sq.m)
 - Area/Neighborhood
 - Nearest Metro
 - Nearest Mall
 - Nearest Landmark
 - Usage (commercial/residential)
 - Average prices the previous month/week (for the same kind of property)