In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from joblib import dump
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate

In [31]:
url = "https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12
3,SACRAMENTO,Residential,2,1,852,130904.95
4,SACRAMENTO,Residential,2,1,797,120266.19


In [32]:
cities = df.City.unique().tolist()

In [33]:
resident_types = df.Type.unique().tolist()

In [34]:
df.describe(include='object')

Unnamed: 0,City,Type
count,814,814
unique,36,3
top,SACRAMENTO,Residential
freq,424,759


In [35]:
df.isna().sum()

City          0
Type          0
Beds          0
Baths         0
SquareFeet    0
Price         0
dtype: int64

In [36]:
cityenc = OneHotEncoder(drop='first')
city_dummies = cityenc.fit_transform(df[['City']]).toarray()

res_type = OneHotEncoder(drop='first')
res_dummies= res_type.fit_transform(df[['Type']]).toarray()

In [37]:
df.drop(columns=['City', 'Type'], inplace=True)
df = pd.concat([df, pd.DataFrame(city_dummies), pd.DataFrame(res_dummies)], axis=1)
df.head()

Unnamed: 0,Beds,Baths,SquareFeet,Price,0,1,2,3,4,5,...,27,28,29,30,31,32,33,34,0.1,1.1
0,2,1,836,138159.85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,1,1167,167541.46,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1,796,119095.12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,1,852,130904.95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,797,120266.19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [38]:
X = df.drop(columns=['Price'])
y = df['Price']

In [39]:
X.columns = X.columns.astype(str) # to avoid warning in sklearn

In [40]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

training

In [41]:
model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print('Decision Tree Regression:')
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Decision Tree Regression:
MSE: 1361790873.9947445
MAE: 15912.943456032723
R2: 0.8374426724447598


In [42]:
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print('Random Forest Regression:')
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))

Random Forest Regression:
MSE: 1066425481.0456202
MAE: 14344.469198461371
R2: 0.8727005155152359


In [43]:
for i in range(5,50,5):
    model2 = RandomForestRegressor(max_depth=i)
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    print(f'Random Forest Regression with max_depth:{i}')
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('R2:', r2_score(y_test, y_pred))
    print('-'*25)

Random Forest Regression with max_depth:5
MSE: 1353048917.8664181
MAE: 20395.41913575043
R2: 0.8384862020005552
-------------------------
Random Forest Regression with max_depth:10
MSE: 1065131417.8820237
MAE: 14958.120762975175
R2: 0.8728549881685477
-------------------------
Random Forest Regression with max_depth:15
MSE: 1194359311.1911614
MAE: 14685.669326706658
R2: 0.8574290212428741
-------------------------
Random Forest Regression with max_depth:20
MSE: 1122517492.130177
MAE: 14601.673001868015
R2: 0.8660047976974502
-------------------------
Random Forest Regression with max_depth:25
MSE: 1133808243.888516
MAE: 14499.833143618587
R2: 0.8646570177504888
-------------------------
Random Forest Regression with max_depth:30
MSE: 1177352181.3284855
MAE: 14798.918409377726
R2: 0.8594591667172313
-------------------------
Random Forest Regression with max_depth:35
MSE: 1167704860.036769
MAE: 14879.964368886931
R2: 0.8606107699458886
-------------------------
Random Forest Regression 

In [44]:
from sklearn.model_selection import learning_curve

lc_results = learning_curve(model2, X,y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10))

In [45]:
test_scores_df = pd.DataFrame(lc_results[2])
test_scores_df['train_record'] = lc_results[0]
test_scores_df['mean_test_score'] = test_scores_df.loc[:,[0,1,2,3,4]].mean(axis=1)
test_scores_df

Unnamed: 0,0,1,2,3,4,train_record,mean_test_score
0,0.574203,0.424158,0.199494,0.533002,0.355187,65,0.417209
1,0.922038,0.751408,0.691774,0.639597,0.694844,130,0.739932
2,0.917037,0.769236,0.721995,0.64153,0.70666,195,0.751292
3,0.922713,0.778378,0.727308,0.675698,0.711331,260,0.763086
4,0.938676,0.803073,0.777884,0.745542,0.72365,325,0.797765
5,0.933339,0.775735,0.776162,0.756645,0.72498,390,0.793372
6,0.933706,0.784522,0.778773,0.746509,0.720828,455,0.792868
7,0.934165,0.857204,0.792096,0.793602,0.730104,520,0.821435
8,0.946911,0.858607,0.833542,0.794976,0.730411,585,0.832889
9,0.946826,0.877045,0.835354,0.808748,0.726222,651,0.838839


In [46]:
px.line(test_scores_df, x='train_record', y=[0,1,2,3,4], title='Learning Curve')

In [47]:
model_dict= {
    'city_encoder': cityenc,
    'resident_type_encoder': res_type,
    'scaler': scaler,
    'model': model2,
    'cities': cities,
    'resident_types': resident_types,
    'description':'''
    Model trained on the house_pricing dataset.
    cities = get from cities list
    resident_types = get from resident_types list
    ''',
    'author': 'Digipodium',
}
dump(model_dict,'../../models/house_pricing_model.pk') 

['../../models/house_pricing_model.pk']