In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [3]:
# Feature selection (X, y) & Split into xtrain,xtest,ytrain,ytest
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2, random_state=1)

In [4]:
print("Random Forest")
model2 = RandomForestRegressor()
model2.fit(xtrain,ytrain)
print("score:", model2.score(xtest,ytest) * 100)
pred = model2.predict(X)
print("mse:",mean_squared_error(y,pred))
print("mae:",mean_absolute_error(y,pred))

Random Forest
score: 72.2000872758184
mse: 853468338.947107
mae: 15242.034697425517


In [5]:
forest_score = cross_val_score(model2,X,y,cv=6)
print(forest_score, f"average: {forest_score.mean():.2f}",f"std :{forest_score.std():.2f}")

[0.80800374 0.69295761 0.71921476 0.73268795 0.49499625 0.61987999] average: 0.68 std :0.10


grid search

In [None]:
# we are going to create a dictionary with all the parameer and their value options
RandomForestRegressor?

In [7]:
params = {
    'n_estimators' : list(range(100,501,200)),
    'criterion': ["squared_error", "absolute_error", "poisson"],
    'max_depth': list(range(5,51,25)),
}
params

{'n_estimators': [100, 300, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 30]}

In [13]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=1)

In [14]:
grid.fit(X,y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 30],
                         'n_estimators': [100, 300, 500]},
             verbose=1)

In [15]:
gf = pd.DataFrame(grid.cv_results_)

In [16]:
gf.sort_values(by='rank_test_score',inplace=True)
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,8.689362,0.95643,0.397502,0.082658,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823299,0.766252,0.616774,0.735441,0.087083,1
7,4.045128,0.086454,0.121934,0.004893,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823564,0.76783,0.612315,0.73457,0.089392,2
6,1.397044,0.065017,0.049309,0.005249,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.822951,0.766717,0.613703,0.734457,0.088418,3
1,1.765599,0.107407,0.124352,0.005258,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.803017,0.767456,0.615974,0.728816,0.081101,4
2,2.833396,0.014929,0.200746,0.002997,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.79972,0.766281,0.618943,0.728315,0.078533,5
0,0.724251,0.015426,0.04964,0.007582,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.798553,0.769067,0.609192,0.725604,0.083191,6
10,10.129194,0.629709,0.174901,0.016053,absolute_error,30,300,"{'criterion': 'absolute_error', 'max_depth': 3...",0.771046,0.757191,0.580327,0.702855,0.086825,7
11,11.350387,1.156914,0.267639,0.023958,absolute_error,30,500,"{'criterion': 'absolute_error', 'max_depth': 3...",0.768954,0.758401,0.576321,0.701225,0.088426,8
3,0.745074,0.022811,0.054303,0.003398,squared_error,30,100,"{'criterion': 'squared_error', 'max_depth': 30...",0.76189,0.754497,0.585547,0.700645,0.081442,9
5,3.58426,0.070385,0.24522,0.003325,squared_error,30,500,"{'criterion': 'squared_error', 'max_depth': 30...",0.757499,0.754625,0.581439,0.697854,0.082327,10


In [17]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5, n_estimators=500)

In [19]:
from joblib import dump

In [21]:
dump(grid.best_estimator_,"house_pricing_model_73.pkl")

['house_pricing_model_73.pkl']