# Basic Supervised Learning model
Learn the supervised learning model basics

In [22]:
# Install the project libraries
!pip install pandas
!pip install -U scikit-learn
!pip install -U matplotlib



## Medellin properties price prediction
Using the Medellin properties dataset 2023, predict the property price.

You can download the dataset from: https://www.kaggle.com/datasets/cesaregr/medelln-properties

In [23]:
# Import python libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

### 1. Frame the problem
The goal es predict the Medellin properties price using the properties features: geolocation, area, # bathrooms, # rooms, etc.

### 2. Selec the Performance Measure
In order to identify how well our model is doing, we can calculate the difference between the real and predicted value. The performance measure to use is Mean Squared Error (MSE). Later we will understand how it work

### 3. Download and read the Data
The data is in the file named 'medellin_properties.csv'

### 4. Take a look at the dataset - Analyze the dataset
Review the dataset, understand what it cotains, check data quality, understand the data limitations, the risks, the non-relevant data, etc.

In [24]:
# Read medellin_properties file
properties_filepath = 'medellin_properties.csv'
properties_df = pd.read_csv(properties_filepath)
properties_df

Unnamed: 0,neighbourhood,latitude,longitude,property_type,price,rooms,baths,area,administration_price,age,garages,stratum
0,Suramerica,6.186203,-75.599437,Apartamento,435000000,3.0,2.0,83.0,354400.0,1.0,1.0,4
1,Escobero,6.162800,-75.573519,Apartamento,680000000,3.0,4.0,124.0,480000.0,2.0,2.0,4
2,Castropol,6.216140,-75.566970,Apartamento,900000000,3.0,3.0,111.0,813000.0,1.0,2.0,6
3,Toledo,6.162762,-75.639307,Casa,650000000,3.0,2.0,127.0,0.0,2.0,1.0,4
4,La pilarica,6.247638,-75.565815,Apartamento,320000000,3.0,2.0,72.0,250000.0,,2.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
9923,Loma de los Bernal,6.213318,-75.607806,Apartamento,685000000,3.0,3.0,94.0,380000.0,1.0,2.0,5
9924,Conjunto residencial san jose,6.157310,-75.578221,Apartamento,582000000,3.0,2.0,88.0,0.0,2.0,1.0,4
9925,Centro,6.244732,-75.560750,Apartamento,320000000,3.0,2.0,59.0,0.0,,,4
9926,La america,6.254452,-75.611331,Apartamento,450000000,4.0,3.0,147.0,0.0,3.0,1.0,3


### 5. Prepare Data for Machine Learning Algorithms - Generate Train and Test set
Apply the transformations needed to prepare the data to train the model. Remove irrelevant features, handle outliers and anomalies, handle missing values, scale features, etc.

In [25]:
# Remove propertyType = Proyecto. It's not into the current scope
properties_df=properties_df.loc[properties_df['property_type']!='Proyecto']

In [26]:
# Split features and target
y = properties_df['price']
properties_df.drop(columns='price', inplace=True)
X = properties_df.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  properties_df.drop(columns='price', inplace=True)


In [27]:
# Split dataset
train_set, test_set, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [28]:
# Remove irrelevant features
train_set.drop(columns=['neighbourhood'], inplace=True)
test_set.drop(columns=['neighbourhood'], inplace=True)
train_set

Unnamed: 0,latitude,longitude,property_type,rooms,baths,area,administration_price,age,garages,stratum
1795,6.240919,-75.556469,Apartamento,3.0,2.0,138.0,0.0,1.0,,3
5912,6.269560,-75.608969,Apartamento,3.0,2.0,71.0,290000.0,2.0,1.0,4
3606,6.282073,-75.615990,Apartamento,4.0,1.0,46.0,13000.0,4.0,1.0,3
8407,6.263200,-75.545278,Casa,3.0,2.0,118.0,0.0,3.0,,2
8222,6.146531,-75.609541,Apartamento,3.0,2.0,63.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...
5743,6.151145,-75.636623,Casa,3.0,2.0,98.0,284000.0,3.0,1.0,4
5197,6.181666,-75.579754,Casa,3.0,2.0,90.0,0.0,,,5
5397,6.180376,-75.559996,Apartamento,3.0,2.0,67.0,250000.0,2.0,1.0,5
861,6.268205,-75.559055,Apartamento,2.0,1.0,57.8,0.0,2.0,,3


In [29]:
# Replace the value in a specific column. Join Apartaestudio and Apartamento
train_set['property_type'] = train_set['property_type'].replace('Apartaestudio', 'Apartamento')
test_set['property_type'] = test_set['property_type'].replace('Apartaestudio', 'Apartamento')
# Transform the propery type to binary
train_set['property_type'] = train_set['property_type'].replace('Apartamento', 1)
train_set['property_type'] = train_set['property_type'].replace('Casa', 0)
test_set['property_type'] = test_set['property_type'].replace('Apartamento', 1)
test_set['property_type'] = test_set['property_type'].replace('Casa', 0)

  train_set['property_type'] = train_set['property_type'].replace('Casa', 0)
  test_set['property_type'] = test_set['property_type'].replace('Casa', 0)


In [30]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer

# NOTE: We will use RandomForestRegressor as estimato because there are some features highly skew
# and the BayesRidge stimator is more sensitive to outliers and data scale
imp = IterativeImputer(estimator=RandomForestRegressor()) 
# fit on the dataset 
imp.fit(train_set) 
# transform the dataset 
X_train_trans = imp.transform(train_set)
X_test_trans = imp.transform(test_set)
# Transfor to the dataset
train_set = pd.DataFrame(X_train_trans, columns=train_set.columns, index=train_set.index)
test_set = pd.DataFrame(X_test_trans, columns=test_set.columns, index=test_set.index)

In [31]:
from sklearn.preprocessing import MinMaxScaler

# Apply min-max scaler
scaler = MinMaxScaler()
scaler_model = scaler.fit(train_set)
train_scaled = scaler_model.transform(train_set)
test_scaled = scaler_model.transform(test_set)
train_set = pd.DataFrame(train_scaled, columns=train_set.columns, index=train_set.index)
test_set = pd.DataFrame(test_scaled, columns=test_set.columns, index=test_set.index)

### 6. Train the model
Select a model and use the training dataset to train the model. In following sessions we will improve the model (fine-tune). 

The ML model selected is Random Tree algorithm.

In [32]:
train_set

Unnamed: 0,latitude,longitude,property_type,rooms,baths,area,administration_price,age,garages,stratum
1795,0.468351,0.191869,1.0,0.105263,0.111111,0.000010,0.000000,0.250,0.007778,0.500000
5912,0.468681,0.191442,1.0,0.105263,0.111111,0.000005,0.000195,0.500,0.000000,0.666667
3606,0.468824,0.191385,1.0,0.157895,0.000000,0.000003,0.000009,1.000,0.000000,0.500000
8407,0.468607,0.191960,0.0,0.105263,0.111111,0.000008,0.000000,0.750,0.012222,0.333333
8222,0.467265,0.191438,1.0,0.105263,0.111111,0.000004,0.000000,0.000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...
5743,0.467318,0.191218,0.0,0.105263,0.111111,0.000007,0.000191,0.750,0.000000,0.666667
5197,0.467669,0.191680,0.0,0.105263,0.111111,0.000006,0.000000,0.545,0.022222,0.833333
5397,0.467654,0.191840,1.0,0.105263,0.111111,0.000005,0.000168,0.500,0.000000,0.833333
861,0.468665,0.191848,1.0,0.052632,0.000000,0.000004,0.000000,0.500,0.002222,0.500000


In [33]:
y_train

1795    309000000
5912    420000000
3606    120000000
8407    190000000
8222    350000000
          ...    
5743    385000000
5197    375000000
5397    454000000
861     235000000
7281    455000000
Name: price, Length: 8924, dtype: int64

##### ML Algorithms
* Linear Regression  
* Logistic Regression  
* Support Vector Machines (SVMs)  
* Decision Trees and Random Forests  
* Neural networks2

To understand random forest:

https://www.youtube.com/watch?v=g9c66TUylZ4

https://www.youtube.com/watch?v=J4Wdy0Wc_xQ&t=91s

In [34]:
# Initialize the RandomForestRegressor
regr = RandomForestRegressor(n_estimators=100, random_state=42, criterion='squared_error')
# Fit the model on the training data
regr.fit(train_set, y_train)

### 7. Evaluate Model performance
Check how well the mode is doing using the performance metric (cost function). Mean Squared Error (MSE) in this case

In [35]:
# Make predictions on the test data
y_pred = regr.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

MSE: 339760902441353740288.00


In [36]:
# Initialize the RandomForestRegressor (n_estimators=10)
regr = RandomForestRegressor(n_estimators=10, random_state=42, criterion='squared_error')
# Fit the model on the training data
regr.fit(train_set, y_train)

In [37]:
# Make predictions on the test data
y_pred = regr.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

MSE: 166745090840181899264.00


In [38]:
# Previous MSE
# MSE: 306'773.058'127.162'900.480.00

# Current MSE
# MSE: 163'247.086'315.270'635.520.00

### 8. Fine-tune the model
Choose the best hyper parameters to get the best model performance

In [39]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [5, 10, 30, 40, 50, 100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'min_samples_split': [2, 3, 5, 10]  # Minimum number of samples required to split an internal node
}

# Initialize Random Forest Regressor algorithm 
regr = RandomForestRegressor(random_state=42, criterion='squared_error')

# Initialize GridSearchCV with the RandomForest model and the parameter grid
grid_search = GridSearchCV(estimator=regr, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(train_set, y_train)

Fitting 3 folds for each of 224 candidates, totalling 672 fits


In [40]:
# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score (negative mean squared error): {grid_search.best_score_}")

# Use the best estimator to make predictions on the test set
best_model = grid_search.best_estimator_
# Make predictions on the test data
y_pred = best_model.predict(test_set)
# Evaluate the model using the root mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse:.2f}')

Best parameters: {'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 5}
Best score (negative mean squared error): -1.5474782560888053e+21
MSE: 151073261725553917952.00


In [41]:
# Previous MSE
# MSE: 163'247.086'315.270'635.520.00

# Current MSE
# MSE: 151'073.261'725.553'917.952.00