# Basic Supervised Learning model
Learn the supervised learning model basics

In [2]:
# Install the project libraries
!pip install pandas
!pip install -U scikit-learn
!pip install -U matplotlib
!pip install scikit-optimize
!pip install catboost
!pip install pycaret notebook

Collecting matplotlib
  Using cached matplotlib-3.8.4-cp310-cp310-win_amd64.whl.metadata (5.9 kB)
Using cached matplotlib-3.8.4-cp310-cp310-win_amd64.whl (7.7 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.7.5
    Uninstalling matplotlib-3.7.5:
      Successfully uninstalled matplotlib-3.7.5
Successfully installed matplotlib-3.8.4


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pycaret 3.3.0 requires matplotlib<3.8.0, but you have matplotlib 3.8.4 which is incompatible.


Collecting matplotlib<3.8.0 (from pycaret)
  Using cached matplotlib-3.7.5-cp310-cp310-win_amd64.whl.metadata (5.8 kB)
Using cached matplotlib-3.7.5-cp310-cp310-win_amd64.whl (7.5 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.8.4
    Uninstalling matplotlib-3.8.4:
      Successfully uninstalled matplotlib-3.8.4
Successfully installed matplotlib-3.7.5


## Medellin properties price prediction
Using the Medellin properties dataset 2023, predict the property price.

You can download the dataset from: https://www.kaggle.com/datasets/cesaregr/medelln-properties

In [3]:
# Import python libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

### 1. Frame the problem
The goal es predict the Medellin properties price using the properties features: geolocation, area, # bathrooms, # rooms, etc.

### 2. Selec the Performance Measure
In order to identify how well our model is doing, we can calculate the difference between the real and predicted value. The performance measure to use is Mean Squared Error (MSE). Later we will understand how it work

### 3. Download and read the Data
The data is in the file named 'medellin_properties.csv'

### 4. Take a look at the dataset - Analyze the dataset
Review the dataset, understand what it cotains, check data quality, understand the data limitations, the risks, the non-relevant data, etc.

In [4]:
# Read medellin_properties file
properties_filepath = 'medellin_properties.csv'
properties_df = pd.read_csv(properties_filepath)
properties_df

Unnamed: 0,neighbourhood,latitude,longitude,property_type,price,rooms,baths,area,administration_price,age,garages,stratum
0,Suramerica,6.186203,-75.599437,Apartamento,435000000,3.0,2.0,83.0,354400.0,1.0,1.0,4
1,Escobero,6.162800,-75.573519,Apartamento,680000000,3.0,4.0,124.0,480000.0,2.0,2.0,4
2,Castropol,6.216140,-75.566970,Apartamento,900000000,3.0,3.0,111.0,813000.0,1.0,2.0,6
3,Toledo,6.162762,-75.639307,Casa,650000000,3.0,2.0,127.0,0.0,2.0,1.0,4
4,La pilarica,6.247638,-75.565815,Apartamento,320000000,3.0,2.0,72.0,250000.0,,2.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
9923,Loma de los Bernal,6.213318,-75.607806,Apartamento,685000000,3.0,3.0,94.0,380000.0,1.0,2.0,5
9924,Conjunto residencial san jose,6.157310,-75.578221,Apartamento,582000000,3.0,2.0,88.0,0.0,2.0,1.0,4
9925,Centro,6.244732,-75.560750,Apartamento,320000000,3.0,2.0,59.0,0.0,,,4
9926,La america,6.254452,-75.611331,Apartamento,450000000,4.0,3.0,147.0,0.0,3.0,1.0,3


### 5. Prepare Data for Machine Learning Algorithms - Generate Train and Test set
Apply the transformations needed to prepare the data to train the model. Remove irrelevant features, handle outliers and anomalies, handle missing values, scale features, etc.

In [5]:
# Remove propertyType = Proyecto. It's not into the current scope
properties_df=properties_df.loc[properties_df['property_type']!='Proyecto']

In [6]:
# Split features and target
y = properties_df['price']
properties_df.drop(columns='price', inplace=True)
X = properties_df.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  properties_df.drop(columns='price', inplace=True)


In [7]:
# Split dataset
train_set, test_set, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [8]:
# Copy the original test dataset to review the results later
test_o_set = test_set.copy()

In [9]:
# Remove irrelevant features
train_set.drop(columns=['neighbourhood'], inplace=True)
test_set.drop(columns=['neighbourhood'], inplace=True)
train_set

Unnamed: 0,latitude,longitude,property_type,rooms,baths,area,administration_price,age,garages,stratum
1795,6.240919,-75.556469,Apartamento,3.0,2.0,138.0,0.0,1.0,,3
5912,6.269560,-75.608969,Apartamento,3.0,2.0,71.0,290000.0,2.0,1.0,4
3606,6.282073,-75.615990,Apartamento,4.0,1.0,46.0,13000.0,4.0,1.0,3
8407,6.263200,-75.545278,Casa,3.0,2.0,118.0,0.0,3.0,,2
8222,6.146531,-75.609541,Apartamento,3.0,2.0,63.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...
5743,6.151145,-75.636623,Casa,3.0,2.0,98.0,284000.0,3.0,1.0,4
5197,6.181666,-75.579754,Casa,3.0,2.0,90.0,0.0,,,5
5397,6.180376,-75.559996,Apartamento,3.0,2.0,67.0,250000.0,2.0,1.0,5
861,6.268205,-75.559055,Apartamento,2.0,1.0,57.8,0.0,2.0,,3


In [10]:
# Replace the value in a specific column. Join Apartaestudio and Apartamento
train_set['property_type'] = train_set['property_type'].replace('Apartaestudio', 'Apartamento')
test_set['property_type'] = test_set['property_type'].replace('Apartaestudio', 'Apartamento')
# Transform the propery type to binary
train_set['property_type'] = train_set['property_type'].replace('Apartamento', 1)
train_set['property_type'] = train_set['property_type'].replace('Casa', 0)
test_set['property_type'] = test_set['property_type'].replace('Apartamento', 1)
test_set['property_type'] = test_set['property_type'].replace('Casa', 0)

In [11]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# NOTE: We will use RandomForestRegressor as estimato because there are some features highly skew
# and the BayesRidge stimator is more sensitive to outliers and data scale
imp = IterativeImputer(estimator=RandomForestRegressor()) 
# fit on the dataset 
imp.fit(train_set) 
# transform the dataset 
X_train_trans = imp.transform(train_set)
X_test_trans = imp.transform(test_set)
# Transfor to the dataset
train_set = pd.DataFrame(X_train_trans, columns=train_set.columns, index=train_set.index)
test_set = pd.DataFrame(X_test_trans, columns=test_set.columns, index=test_set.index)

In [12]:
from sklearn.ensemble import IsolationForest

# Remove outliers
iso_forest = IsolationForest(n_estimators=200, contamination=0.12, random_state=42)
iso_forest.fit(train_set)
train_set['outlier'] = iso_forest.predict(train_set)
test_set['outlier'] = iso_forest.predict(test_set)
# Remove global outliers
train_set = train_set[train_set['outlier'] != -1]
test_set = test_set[test_set['outlier'] != -1]
# Remove the outlier column
train_set.drop(columns='outlier', inplace=True)
test_set.drop(columns='outlier', inplace=True)
# Filter valid prices
y_train = y_train.loc[train_set.index]
y_test = y_test.loc[test_set.index]

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Apply min-max scaler
scaler = MinMaxScaler()
scaler_model = scaler.fit(train_set)
train_scaled = scaler_model.transform(train_set)
test_scaled = scaler_model.transform(test_set)
train_set = pd.DataFrame(train_scaled, columns=train_set.columns, index=train_set.index)
test_set = pd.DataFrame(test_scaled, columns=test_set.columns, index=test_set.index)

In [14]:
y_train = y_train/1000000
y_test = y_test/1000000

### 6. Train the model
Select a model and use the training dataset to train the model. In following sessions we will improve the model (fine-tune). 

The ML model selected is Random Tree algorithm.

In [15]:
train_set

Unnamed: 0,latitude,longitude,property_type,rooms,baths,area,administration_price,age,garages,stratum
1795,0.891680,0.193745,1.0,0.105263,0.166667,0.253704,0.000000,0.2500,0.070000,0.500000
5912,0.892331,0.183180,1.0,0.105263,0.166667,0.129630,0.000957,0.5000,0.000000,0.666667
3606,0.892615,0.181767,1.0,0.157895,0.000000,0.083333,0.000043,1.0000,0.000000,0.500000
8407,0.892186,0.195997,0.0,0.105263,0.166667,0.216667,0.000000,0.7500,0.020000,0.333333
8222,0.889536,0.183065,1.0,0.105263,0.166667,0.114815,0.000000,0.0000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...
5743,0.889641,0.177615,0.0,0.105263,0.166667,0.179630,0.000937,0.7500,0.000000,0.666667
5197,0.890334,0.189059,0.0,0.105263,0.166667,0.164815,0.000000,0.5275,0.073333,0.833333
5397,0.890305,0.193035,1.0,0.105263,0.166667,0.122222,0.000825,0.5000,0.000000,0.833333
861,0.892300,0.193225,1.0,0.052632,0.000000,0.105185,0.000000,0.5000,0.013333,0.500000


In [16]:
y_train

1795    309.0
5912    420.0
3606    120.0
8407    190.0
8222    350.0
        ...  
5743    385.0
5197    375.0
5397    454.0
861     235.0
7281    455.0
Name: price, Length: 7853, dtype: float64

##### ML Algorithms
* Linear Regression  
* Logistic Regression  
* Support Vector Machines (SVMs)  
* Decision Trees and Random Forests  
* Neural networks2

To understand random forest:

https://www.youtube.com/watch?v=g9c66TUylZ4

https://www.youtube.com/watch?v=J4Wdy0Wc_xQ&t=91s

In [17]:
# Initialize the RandomForestRegressor
regr = RandomForestRegressor(n_estimators=100, random_state=42, criterion='squared_error')
# Fit the model on the training data
regr.fit(train_set, y_train)

### 7. Evaluate Model performance
Check how well the mode is doing using the performance metric (cost function). Mean Squared Error (MSE) in this case

In [18]:
# Make predictions on the test data
y_pred = regr.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

MSE: 587611559.71


In [19]:
# Initialize the RandomForestRegressor (n_estimators=10)
regr = RandomForestRegressor(n_estimators=10, random_state=42, criterion='squared_error')
# Fit the model on the training data
regr.fit(train_set, y_train)

In [20]:
# Make predictions on the test data
y_pred = regr.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

MSE: 619931925.32


In [21]:
# Previous MSE
# MSE: 163'247.086'315.270'635.520.00

# Current  MSE
# MSE: 1.128'063.980.17

### 8. Fine-tune the model
Choose the best hyper parameters to get the best model performance

In [22]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [5, 10, 30, 40, 50, 100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'min_samples_split': [2, 3, 5, 10]  # Minimum number of samples required to split an internal node
}

# Initialize Random Forest Regressor algorithm 
regr = RandomForestRegressor(random_state=42, criterion='squared_error')

# Initialize GridSearchCV with the RandomForest model and the parameter grid
grid_search = GridSearchCV(estimator=regr, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(train_set, y_train)

In [23]:
# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score (negative mean squared error): {grid_search.best_score_}")

# Use the best estimator to make predictions on the test set
best_model = grid_search.best_estimator_
# Make predictions on the test data
y_pred = best_model.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 40}
Best score (negative mean squared error): -555983562.8190459
MSE: 702527800.64


In [24]:
# Previous MSE
# MSE: 151'073.261'725.553'917.952.00

# Current MSE
# MSE: 223'316.391.03

### 9. Fine-tune the model - Experimentation
Choose the best hyper parameters to get the best model performance. Using other techniques. BayesSearchCV

In [25]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Define the parameter bayesian to search over
param_space = {
    'n_estimators': Integer(10, 100),  # Number of trees in the forest
    'max_depth': Integer(1, 50),       # Maximum depth of the tree
    'min_samples_split': Real(0.01, 1.0, 'uniform'),  # Minimum number of samples required to split an internal node
}

# Initialize Random Forest Regressor algorithm 
regr = RandomForestRegressor(random_state=42, criterion='squared_error')

optimizer = BayesSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    search_spaces=param_space,
    n_iter=25,  # Number of parameter settings that are sampled
    cv=5,       # 5-fold cross-validation
    random_state=42
)

# Fit the BayesSearchCV to the training data
optimizer.fit(train_set, y_train)

In [26]:
# Print the best parameters and the best score
print(f"Best parameters: {optimizer.best_params_}")
print(f"Best score (negative mean squared error): {optimizer.best_score_}")

# Use the best estimator to make predictions on the test set
best_model = optimizer.best_estimator_
# Make predictions on the test data
y_pred = best_model.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

Best parameters: OrderedDict([('max_depth', 23), ('min_samples_split', 0.9195352964526833), ('n_estimators', 19)])
Best score (negative mean squared error): -1.1267886241582403
MSE: 50602784.03


In [27]:
# Previous MSE
# MSE: 223'316.391.03

# Current MSE
# MSE: 49'355.045.36

Using an a different algorithm. CatBoost
https://catboost.ai/en/docs/

In [28]:
from catboost import CatBoostRegressor

# Define the parameter bayesian to search over
param_space = {
    'learning_rate': Real(0.01, 0.3),
    'depth': Integer(3, 16),
    'iterations': Integer(10, 100),
    'l2_leaf_reg': Real(1, 10)
}
# Initalize the model
catboost = CatBoostRegressor(verbose=1, random_state=42)

optimizer = BayesSearchCV(
    estimator=catboost,
    search_spaces=param_space,
    n_iter=30,  # Number of parameter settings that are sampled
    cv=4,       # 5-fold cross-validation
    random_state=42
)

optimizer.fit(train_set, y_train)

0:	learn: 25801.9318959	total: 140ms	remaining: 10.3s
1:	learn: 25731.5270491	total: 142ms	remaining: 5.18s
2:	learn: 25653.3029278	total: 146ms	remaining: 3.49s
3:	learn: 25577.2670042	total: 150ms	remaining: 2.66s
4:	learn: 25501.8100310	total: 154ms	remaining: 2.16s
5:	learn: 25359.9565719	total: 159ms	remaining: 1.83s
6:	learn: 25284.3700083	total: 161ms	remaining: 1.57s
7:	learn: 25213.4952686	total: 164ms	remaining: 1.37s
8:	learn: 25155.2129942	total: 169ms	remaining: 1.24s
9:	learn: 25068.0029706	total: 174ms	remaining: 1.13s
10:	learn: 24983.0105259	total: 178ms	remaining: 1.03s
11:	learn: 24832.8525704	total: 181ms	remaining: 952ms
12:	learn: 24770.7976475	total: 186ms	remaining: 889ms
13:	learn: 24685.9130297	total: 192ms	remaining: 836ms
14:	learn: 24618.6739887	total: 197ms	remaining: 787ms
15:	learn: 24550.6949195	total: 202ms	remaining: 744ms
16:	learn: 24406.2364593	total: 207ms	remaining: 706ms
17:	learn: 24341.9316000	total: 212ms	remaining: 672ms
18:	learn: 24279.088

In [29]:
# Print the best parameters and the best score
print(f"Best parameters: {optimizer.best_params_}")
print(f"Best score (negative mean squared error): {optimizer.best_score_}")

# Use the best estimator to make predictions on the test set
best_model = optimizer.best_estimator_
# Make predictions on the test data
y_pred = best_model.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

Best parameters: OrderedDict([('depth', 16), ('iterations', 60), ('l2_leaf_reg', 9.738967273796947), ('learning_rate', 0.01)])
Best score (negative mean squared error): -0.007921650068484637
MSE: 50487851.20


In [30]:
# Previous MSE
# MSE: 49'355.045.36

# Current MSE
# MSE: 50'487.851.20

In [33]:
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae:.2f}')

MAE: 771.66


### 9. Review results
Review the final results after apply fine-tuning

In [32]:
# Get the valid result
test_o_set = test_o_set.loc[test_set.index] 
test_o_set['price'] = y_test
test_o_set['pred_price'] = y_pred
test_o_set.head(30)

Unnamed: 0,neighbourhood,latitude,longitude,property_type,rooms,baths,area,administration_price,age,garages,stratum,price,pred_price
6565,López de Mesa,6.254569,-75.59928,Apartamento,3.0,1.0,78.0,0.0,3.0,1.0,3,250.0,1145.225893
9296,Calasanz,6.270774,-75.600117,Apartamento,3.0,2.0,55.0,190000.0,1.0,,4,310.0,1025.44541
361,Vegas de la doctora,6.144721,-75.610881,Apartamento,2.0,2.0,115.0,460000.0,1.0,1.0,4,560.0,888.554048
1263,El Poblado,6.203481,-75.571659,Apartamento,2.0,2.0,87.0,600000.0,2.0,1.0,6,730.0,992.545345
3043,sabaneta aves marias,6.148975,-75.617243,Apartamento,3.0,2.0,71.0,234000.0,1.0,2.0,4,395.0,834.971955
9513,suramerica,6.160509,-75.628399,Casa,3.0,4.0,244.0,665000.0,3.0,2.0,5,1300.0,1343.396403
1157,Asomadera no.2,6.226906,-75.561853,Apartamento,2.0,1.0,42.0,170000.0,,,4,265.0,995.33882
2875,Buenos Aires,6.237129,-75.556054,Apartamento,3.0,2.0,54.0,210000.0,2.0,1.0,4,240.0,1024.96297
7352,Belén,6.236265,-75.59537,Apartamento,3.0,2.0,101.0,336000.0,3.0,1.0,5,580.0,916.932841
6107,Envigado,6.23934,-75.570055,Apartamento,4.0,2.0,117.0,0.0,4.0,,3,620.0,920.919325


### 10. Use AutoML
Use autoML to automate the supervised regression model training