In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from joblib import dump
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate

In [52]:
url = "https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12
3,SACRAMENTO,Residential,2,1,852,130904.95
4,SACRAMENTO,Residential,2,1,797,120266.19


In [53]:
df.SquareFeet.describe()

count     814.000000
mean     1591.146192
std       663.841930
min       484.000000
25%      1144.000000
50%      1418.500000
75%      1851.000000
max      5822.000000
Name: SquareFeet, dtype: float64

In [54]:
cities = df.City.unique().tolist()

In [55]:
resident_types = df.Type.unique().tolist()

In [56]:
df.describe(include='object')

Unnamed: 0,City,Type
count,814,814
unique,36,3
top,SACRAMENTO,Residential
freq,424,759


In [57]:
df.isna().sum()

City          0
Type          0
Beds          0
Baths         0
SquareFeet    0
Price         0
dtype: int64

In [58]:
cityenc = OneHotEncoder(drop='first')
city_dummies = cityenc.fit_transform(df[['City']]).toarray()

res_type = OneHotEncoder(drop='first')
res_dummies= res_type.fit_transform(df[['Type']]).toarray()

In [59]:
df.drop(columns=['City', 'Type'], inplace=True)
df = pd.concat([df, pd.DataFrame(city_dummies), pd.DataFrame(res_dummies)], axis=1)
df.head()

Unnamed: 0,Beds,Baths,SquareFeet,Price,0,1,2,3,4,5,...,27,28,29,30,31,32,33,34,0.1,1.1
0,2,1,836,138159.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,1,1167,167541.46,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1,796,119095.12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,1,852,130904.95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,797,120266.19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [60]:
X = df.drop(columns=['Price'])
y = df['Price']

In [61]:
X

Unnamed: 0,Beds,Baths,SquareFeet,0,1,2,3,4,5,6,...,27,28,29,30,31,32,33,34,0.1,1.1
0,2,1,836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,1,1167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1,796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,1,852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,4,3,2280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
810,3,2,1477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
811,3,2,1216,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
812,4,2,1685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [62]:
X.columns = X.columns.astype(str) # to avoid warning in sklearn

In [63]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

training

In [64]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print('Decision Tree Regression:')
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Decision Tree Regression:
MSE: 1358519873.9468045
MAE: 15859.148670756647
R2: 0.8378331325634023


In [65]:
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print('Random Forest Regression:')
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Random Forest Regression:
MSE: 1008303854.7450464
MAE: 14054.132012328362
R2: 0.8796385090243788


In [66]:
for i in range(5,50,5):
    model2 = RandomForestRegressor(max_depth=i)
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    print(f'Random Forest Regression with max_depth:{i}')
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))
    print('-'*25)

Random Forest Regression with max_depth:5
MSE: 1388942525.583195
MAE: 20340.24437624583
R2: 0.8342015727978058
-------------------------
Random Forest Regression with max_depth:10
MSE: 1085028251.262127
MAE: 15266.24951960915
R2: 0.8704798980406534
-------------------------
Random Forest Regression with max_depth:15
MSE: 1162000689.5868878
MAE: 14528.439712962758
R2: 0.8612916782424263
-------------------------
Random Forest Regression with max_depth:20
MSE: 1220391441.4541736
MAE: 14861.13648316933
R2: 0.8543215591450326
-------------------------
Random Forest Regression with max_depth:25
MSE: 1135208793.270782
MAE: 14814.304361820024
R2: 0.8644898338098133
-------------------------
Random Forest Regression with max_depth:30
MSE: 1133401893.13755
MAE: 14790.065013623522
R2: 0.864705523944346
-------------------------
Random Forest Regression with max_depth:35
MSE: 1108204127.4900546
MAE: 14510.115780514154
R2: 0.8677133877230219
-------------------------
Random Forest Regression with 

In [67]:
from sklearn.model_selection import learning_curve

lc_results = learning_curve(model2, X,y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10))

In [68]:
test_scores_df = pd.DataFrame(lc_results[2])
test_scores_df['train_record'] = lc_results[0]
test_scores_df['mean_test_score'] = test_scores_df.loc[:,[0,1,2,3,4]].mean(axis=1)
test_scores_df

Unnamed: 0,0,1,2,3,4,train_record,mean_test_score
0,0.589874,0.421071,0.21813,0.537548,0.356925,65,0.42471
1,0.924403,0.750446,0.69335,0.649388,0.690759,130,0.741669
2,0.921638,0.770958,0.719173,0.642019,0.705215,195,0.7518
3,0.922014,0.774388,0.72767,0.676981,0.71235,260,0.762681
4,0.937977,0.806006,0.778323,0.765294,0.721868,325,0.801894
5,0.934475,0.789678,0.77525,0.747495,0.724799,390,0.794339
6,0.929775,0.781741,0.780002,0.755064,0.724238,455,0.794164
7,0.938669,0.8475,0.789388,0.835226,0.736226,520,0.829402
8,0.949332,0.860808,0.831659,0.824201,0.731936,585,0.839587
9,0.946159,0.875218,0.8337,0.852638,0.731427,651,0.847829


In [69]:
px.line(test_scores_df, x='train_record', y=[0,1,2,3,4], title='Learning Curve')

In [70]:
model_dict= {
    'city_encoder': cityenc,
    'resident_type_encoder': res_type,
    'scaler': scaler,
    'model': model2,
    'cities': cities,
    'resident_types': resident_types,
    'description':'''
    Model trained on the house_pricing dataset.
    cities = get from cities list
    resident_types = get from resident_types list
    ''',
    'author': 'Digipodium',
}
dump(model_dict,'../../models/house_pricing_model.pk') 

['../../models/house_pricing_model.pk']