In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from joblib import dump
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate

In [27]:
url = "https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12
3,SACRAMENTO,Residential,2,1,852,130904.95
4,SACRAMENTO,Residential,2,1,797,120266.19


In [28]:
df.describe(include='object')

Unnamed: 0,City,Type
count,814,814
unique,36,3
top,SACRAMENTO,Residential
freq,424,759


In [29]:
df.isna().sum()

City          0
Type          0
Beds          0
Baths         0
SquareFeet    0
Price         0
dtype: int64

In [30]:
cityenc = OneHotEncoder(drop='first')
city_dummies = cityenc.fit_transform(df[['City']]).toarray()

res_type = OneHotEncoder(drop='first')
res_dummies= res_type.fit_transform(df[['Type']]).toarray()

In [31]:
df.drop(columns=['City', 'Type'], inplace=True)
df = pd.concat([df, pd.DataFrame(city_dummies), pd.DataFrame(res_dummies)], axis=1)
df.head()

Unnamed: 0,Beds,Baths,SquareFeet,Price,0,1,2,3,4,5,...,27,28,29,30,31,32,33,34,0.1,1.1
0,2,1,836,138159.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,1,1167,167541.46,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1,796,119095.12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,1,852,130904.95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,797,120266.19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [32]:
X = df.drop(columns=['Price'])
y = df['Price']

In [33]:
X.columns = X.columns.astype(str) # to avoid warning in sklearn

In [34]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

training

In [35]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print('Decision Tree Regression:')
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Decision Tree Regression:
MSE: 1391244765.1299412
MAE: 15979.365419222904
R2: 0.8339267538698355


In [51]:
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print('Random Forest Regression:')
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Random Forest Regression:
MSE: 1096926151.4429364
MAE: 14629.210256451444
R2: 0.8690596426300427


In [55]:
for i in range(5,50,5):
    model2 = RandomForestRegressor(max_depth=i)
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    print(f'Random Forest Regression with max_depth:{i}')
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))
    print('-'*25)

Random Forest Regression with max_depth:5
MSE: 1406829769.5012488
MAE: 20550.910649352827
R2: 0.8320663678818573
-------------------------
Random Forest Regression with max_depth:10
MSE: 1102318353.305545
MAE: 15133.507563544752
R2: 0.8684159741041604
-------------------------
Random Forest Regression with max_depth:15
MSE: 1119831365.4007397
MAE: 14774.859858959659
R2: 0.8663254413373441
-------------------------
Random Forest Regression with max_depth:20
MSE: 1162536728.1402733
MAE: 14551.738472601874
R2: 0.8612276911821741
-------------------------
Random Forest Regression with max_depth:25
MSE: 1146274457.365065
MAE: 14797.752276424304
R2: 0.8631689226353141
-------------------------
Random Forest Regression with max_depth:30
MSE: 1052051203.7832563
MAE: 14318.517502507544
R2: 0.8744163766962213
-------------------------
Random Forest Regression with max_depth:35
MSE: 1021711458.1453509
MAE: 14120.305466014212
R2: 0.8780380399514142
-------------------------
Random Forest Regressio

In [56]:
from sklearn.model_selection import learning_curve

lc_results = learning_curve(model2, X,y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10))

In [64]:
test_scores_df = pd.DataFrame(lc_results[2])
test_scores_df['train_record'] = lc_results[0]
test_scores_df['mean_test_score'] = test_scores_df.loc[:,[0,1,2,3,4]].mean(axis=1)
test_scores_df

Unnamed: 0,0,1,2,3,4,train_record,mean_test_score
0,0.610713,0.422173,0.215781,0.53118,0.366874,65,0.429344
1,0.917929,0.752648,0.690265,0.643614,0.689383,130,0.738768
2,0.91992,0.76896,0.721897,0.639863,0.710451,195,0.752218
3,0.925698,0.77558,0.730618,0.674362,0.712564,260,0.763765
4,0.937956,0.806764,0.778273,0.774948,0.721435,325,0.803875
5,0.935172,0.786139,0.774582,0.75755,0.722952,390,0.795279
6,0.933376,0.789491,0.78137,0.754735,0.720234,455,0.795841
7,0.939182,0.851465,0.78796,0.81082,0.730666,520,0.824019
8,0.945882,0.865811,0.829158,0.826532,0.734863,585,0.840449
9,0.946106,0.870668,0.834998,0.863805,0.726039,651,0.848323


In [63]:
px.line(test_scores_df, x='train_record', y=[0,1,2,3,4], title='Learning Curve')