# Property Price Prediction Project

# Problem Objective :

The project aims at building a model of housing prices to predict median house values in California using the provided dataset. This model should learn from the data and be able to predict the median housing price in any district, given all the other metrics.

Districts or block groups are the smallest geographical units for which the US Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). There are 20,640 districts in the project dataset.

In [9]:
import IPython
IPython.display.Image('https://i-media.vyaparify.com/vcards/blogs/95898/Buysell1.jpg')

<IPython.core.display.Image object>

# Step 1: Import Libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score #root_mean_squared_error
from sklearn.datasets import fetch_california_housing

warnings.filterwarnings('ignore')
print('Modules Loaded Successfully!!')

Modules Loaded Successfully!!


# Step 2 Load Data Create DF

In [11]:
# Internet Required For This Code to Run
data_dict = fetch_california_housing()

data_dict.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [12]:
print(data_dict['feature_names'])

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [13]:
print(data_dict['target_names'])

['MedHouseVal']


In [14]:
df = pd.DataFrame(data_dict['data'],
                  columns = data_dict['feature_names'])

df['MedHouseVal'] = data_dict['target']


print('Data Loaded Successfully!!')

Data Loaded Successfully!!


# Step 3: Understanding Data using EDA

In [15]:
# Shape
df.shape

(20640, 9)

In [16]:
# info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [17]:
# Checking Null values
df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [18]:
print(data_dict['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [19]:
# hundreds of thousands of dollars ($100,000) 1 lakh Dollars
# average number of household members
df.sample()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
880,5.0,18.0,5.669413,1.035015,2641.0,2.719876,37.56,-121.99,2.697


In [20]:
# All data Must be in Numerical, Dataset contains all values in numerical
# We can proceed this for Analysis

In [21]:
# Checking data Distribution

sns.pairplot(data = df,corner=True)
plt.show()

In [22]:
# Checking data Distribution

plt.title('Features vs target Corr')
sns.heatmap(df.corr().round(2), annot = True,cmap = 'mako')
# cmap = color_map ( -1 to 1)
plt.show()

In [23]:
# Data Describe()
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [24]:
plt.figure(figsize = (15,12))
for i,j in enumerate(df.columns):
  plt.subplot(3,3, i+1)
  plt.hist(df[j],color = 'blue', alpha = 0.3)
  plt.title(j+' Analysis')
plt.show()

In [25]:
# Distribution of target Value MedHousevalue vs MedInc

sns.jointplot(data = df, x = 'MedHouseVal', y = 'MedInc')
plt.show()

In [26]:
# sns.regplot(data = df, x = 'MedHouseVal', y = 'MedInc')
# plt.show()

In [27]:
df['MedHouseVal'].describe()
# 75 % of price less than 2.64 lakh dollars

count    20640.000000
mean         2.068558
std          1.153956
min          0.149990
25%          1.196000
50%          1.797000
75%          2.647250
max          5.000010
Name: MedHouseVal, dtype: float64

# Step 4: Feature Engineering and Preprocessing

In [28]:
# Convert data to Same Scale for better prediction
# normalization = MinMaxScaler
from sklearn.preprocessing import MinMaxScaler


X = df.iloc[:,:-1]
y = df['MedHouseVal']



scaler = MinMaxScaler()

scaler.fit(X)  # Learning: Fit data in 0 to 1
X_scaled = scaler.transform(X)

print('Done')

Done


In [29]:
print(X_scaled.min())
print(X_scaled.max())

0.0
1.0000000000000002


# Step 5: Train test Split: Split Data for Training and testing Part

In [30]:
# Train test Split: Divide into train Part Test Part
# train_test_split: Func: divide


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size=.2, random_state= 150 )
print('Done')

Done


In [31]:
b,a,c,d = [23,534,6,65] # Unpacking
print(a)

534


In [32]:
import random
random.seed(56)
random.randint(1,50)

36

In [33]:
print('Shape of Xtrain',X_train.shape)
print('Shape of X_test',X_test.shape)
print('Shape of y_train',y_train.shape)
print('Shape of y_test',y_test.shape)


Shape of Xtrain (16512, 8)
Shape of X_test (4128, 8)
Shape of y_train (16512,)
Shape of y_test (4128,)


In [34]:
X_scaled.shape

(20640, 8)

In [35]:
20640*.8

16512.0

In [36]:
df.shape

(20640, 9)

# Step 6: Model Building

###### Step 6.1 Linear Model Using Linear Regression

In [37]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression() # Object Creation

lr_model.fit(X_train,y_train) # Trainig Model

###### Step 6.1.2: Model prediction

In [38]:
# Test data: X_test

y_pred = lr_model.predict(X_test)
print('Done')

Done


In [39]:
# 20 %: Actual answer: y_test, Predcited: y_pred

lr_compare_df = pd.DataFrame({'Actual House price':y_test,
             'Predicted House price':y_pred})

print('Done')

Done


In [40]:
lr_compare_df

Unnamed: 0,Actual House price,Predicted House price
13569,0.93300,1.600182
6743,2.79900,2.290845
13230,3.92900,2.618237
18799,0.52000,0.209449
1052,1.23600,1.236397
...,...,...
19369,2.68800,2.276322
12808,0.69300,0.976300
6589,5.00001,3.410402
19719,0.71200,0.896588


In [41]:
lr_mae = mean_absolute_error(y_test,y_pred)
print('Lr MAE',lr_mae)

Lr MAE 0.5295622016018944


In [42]:
lr_mse = mean_squared_error(y_test,y_pred)
print('Lr MSE',lr_mse)

Lr MSE 0.532217053983186


In [43]:
lr_rmse = lr_mse**.5

print('Lr RMSE',lr_rmse)

Lr RMSE 0.729532078789676


In [44]:
# Trainig Score
lr_train_score = lr_model.score(X_train,y_train)
print('Training Score',lr_train_score)

# testing Score
lr_test_score = lr_model.score(X_test,y_test)
print('testing Score',lr_test_score)


Training Score 0.6081032361747606
testing Score 0.5984024263886711


In [45]:
# Model Score: apx: 60%, Model not that much good, we need, more parameter tuning
# Alternate Model call
# : feature vs Fetaure high corr( X col: 8)

# Variance Inflation Factor> 10 we can drop that col
# Bias vs Variance Trade off: Intersect

# Training score: High, Testing Score: Low = Overfit
# Training score: Low, Testing Score: Low  = Underfit
# Training score: Good, Testing Score: Good  = Balancedfit


# Principal Component Analysis (PCA) is a dimensionality reduction technique
# PCA:  Imp Features(cols) Find: (8: 5 Imp)

![mlconcepts_image5.png](attachment:9d0800a9-9f47-458b-a328-7e12c2b68c12.png)

![testset.webp](attachment:54ad481d-491d-4654-92fd-4149bd3f5e13.webp)

In [46]:
# Lasso,Ridge :  Linear regression: Balanced fit model creation

![Bias_and_variance_contributing_to_total_error.svg.png](attachment:aae01608-98d4-4f30-8ab7-6a0512f687ff.png)

###### Step 6.1.3: checking VIF and Drop column if value vif>10

In [47]:
X_train_df = pd.DataFrame(X_train, columns = data_dict['feature_names'])
X_train_df.sample()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
10690,0.292417,0.803922,0.030195,0.020467,0.031755,0.001855,0.144527,0.596614


In [48]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_train_df['intercept']  = 1

X_train_df.sample()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,intercept
2564,0.255748,0.156863,0.029634,0.023504,0.051935,0.001514,0.502657,0.25,1


In [49]:
# X_train_df.shape[1]

In [50]:
# X_train_df.columns

In [51]:
# X_train_df.values

In [52]:
vif_df = pd.DataFrame()

vif_df['Features'] = list(data_dict['feature_names']) + ['Intercept']

vif_value = [variance_inflation_factor(X_train_df.values,i) for i in range(X_train_df.shape[1])]

vif_df['VIF factor'] = vif_value

vif_df.sort_values(by = 'VIF factor',ascending= False)

# Because Latitude and Longitude has high corr and vif close to 10 we can drop one or both

Unnamed: 0,Features,VIF factor
8,Intercept,177.876568
6,Latitude,9.316394
7,Longitude,8.977028
2,AveRooms,8.752272
3,AveBedrms,7.46009
0,MedInc,2.459841
1,HouseAge,1.24029
4,Population,1.13696
5,AveOccup,1.009297


In [53]:
# X is an independent features

In [54]:
# print(['Hello'+str(i) for i in range(10)])
# # Run loop inside list:

In [55]:
col = list(X.columns)
col.remove('Latitude')

# print(col)

X_train_vif = X_train_df[col]
X_train_vif.sample()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Longitude
15261,0.31283,1.0,0.031264,0.018907,0.016228,0.001134,0.244024


![variance-inflation-factor.asp-Final-6cd8e4740c254821b0fa2ab057b5df88.jpg](attachment:25d879c8-fab7-46d3-928a-5ed08c635c7f.jpg)

In [56]:
def built_model(ml_model, col):

    new_X_train = pd.DataFrame(X_train,columns = data_dict['feature_names'])
    final_X_train = new_X_train[col]

    new_X_test = pd.DataFrame(X_test,columns = data_dict['feature_names'])
    final_X_test = new_X_test[col]

    machine_model = ml_model()
    machine_model.fit(final_X_train,y_train)

    model_y_pred = machine_model.predict(final_X_test)

    model_train_score = machine_model.score(final_X_train,y_train)
    model_test_score = machine_model.score(final_X_test,y_test)

    model_mae_error = mean_absolute_error(y_test,model_y_pred)
    model_mse_error = mean_squared_error(y_test,model_y_pred)
    model_rmse_error = model_mse_error**.5
    model_r2_score = r2_score(y_test,model_y_pred)

    model_metrics = {'Model Training Score':model_train_score,
                    'Model Test Score':model_test_score,
                    'MAE Error':model_mae_error,
                    'MSE Error':model_mse_error,
                    'RMSE Error':model_rmse_error,
                    'R2 Score':model_r2_score}

    model_matrix = pd.DataFrame(model_metrics,index = [1])

    return model_matrix,machine_model



In [57]:
ml_model = LinearRegression

built_model(ml_model,col)[0]

Unnamed: 0,Model Training Score,Model Test Score,MAE Error,MSE Error,RMSE Error,R2 Score
1,0.543073,0.531162,0.570342,0.621327,0.788243,0.531162


In [58]:
from sklearn.linear_model import Lasso, Ridge

In [59]:
ml_model = Lasso
# Lasso regression not giving much score, we can reject this model
built_model(ml_model,col)[0]

Unnamed: 0,Model Training Score,Model Test Score,MAE Error,MSE Error,RMSE Error,R2 Score
1,0.0,-9e-06,0.904335,1.325262,1.1512,-9e-06


In [60]:
ml_model = Ridge

built_model(ml_model,col)[0]

Unnamed: 0,Model Training Score,Model Test Score,MAE Error,MSE Error,RMSE Error,R2 Score
1,0.52581,0.513927,0.59007,0.644168,0.802601,0.513927


In [61]:
print(col)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Longitude']


In [62]:
col = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

In [83]:
all_model = [LinearRegression, Ridge]

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

ss_X = sc.fit_transform(X)

i = 1
while True:
  X_train, X_test, y_train, y_test = train_test_split(ss_X, y,random_state=i,test_size=0.1)
  i += 1

  temp_df,final_model = built_model(Ridge,col)
  score = temp_df['Model Test Score'].values[0]

  print('Score is:',score)
  if score >= 0.6:
    print('Best random State',i)

    display(temp_df)
    break

  display(clear=True)

Score is: 0.6067450510265391
Best random State 178


Unnamed: 0,Model Training Score,Model Test Score,MAE Error,MSE Error,RMSE Error,R2 Score
1,0.531671,0.606745,0.550302,0.556391,0.745916,0.606745


In [84]:
final_model

# Step 7: Model Save and Localhost Deployment

In [89]:
import pickle

with open('house_price_pred_ridge_model.pkl','wb') as f:
    pickle.dump(final_model,f)

with open('sc_scaler.pkl','wb') as f:
    pickle.dump(sc,f)

print('ML model and Scaler Saved Successfully!!')

ML model and Scaler Saved Successfully!!


# step 8: website building and localhost

In [90]:
#ml model: client
# website:input value:prediction
# website:python,django,flask,fastapi,: streamlit
# streamlit: python web-based,framework,fast ml model test website
# streamlit=development+deployment(for ml engineers)

In [91]:
#pip install streamlit

In [92]:
import streamlit as st
print("done!!")

done!!


In [93]:
#st.title("California Housing Price Prediction")

In [94]:
#!= represents to run this code in terminal

In [95]:
# !streamlit run house.py

In [96]:
import os
os.getcwd()

'C:\\Users\\91965'

In [97]:
print(col)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']


In [98]:
df.to_csv('california.csv')
print('done')

done


In [99]:
for i in df[col]:
    min_value,max_value=df[i].agg(['min','max'])
    print('min',i,min_value)
    print('max',i,max_value)

min MedInc 0.4999
max MedInc 15.0001
min HouseAge 1.0
max HouseAge 52.0
min AveRooms 0.8461538461538461
max AveRooms 141.9090909090909
min AveBedrms 0.3333333333333333
max AveBedrms 34.06666666666667
min Population 3.0
max Population 35682.0
min AveOccup 0.6923076923076923
max AveOccup 1243.3333333333333


In [100]:
temp_df=pd.read_csv('california.csv')

for i in temp_df[col]:
    min_value,max_value=df[i].agg(['min','max'])

# step 9: live deployment using streamlit

In [None]:
# import streamlit as st
# import pandas as pd
# import random
# from sklearn.preprocessing import StandardScaler
# import pickle
# import time


In [None]:
#github:folder housing price project
#requirements.txt= we will write those modules which is important for app.py tp project

In [101]:
pd.__version__

'2.2.3'

In [102]:
st.__version__

'1.45.1'

In [103]:
import sklearn

In [104]:
sklearn.__version__

'1.6.1'