# Chennai House Price Prediction using ML & Python Pandas.

### Importing Required Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import warnings 
warnings.filterwarnings("ignore")

### Data Wrangling.

In [2]:
df=pd.read_csv(r"C:\Users\ssrin\Downloads\Chennai houseing sale.csv")
df.N_BEDROOM.fillna(int(df.N_BEDROOM.mean()),inplace=True)
df.N_BATHROOM.fillna(int(df.N_BATHROOM.mean()),inplace=True)

In [3]:
df.isna().sum()

PRT_ID           0
AREA             0
Per_SQFT         0
Year             0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACILITY    0
year_BUILD       0
BUILD_TYPE       0
REG_FEE          0
COMMISION        0
SALES_PRICE      0
dtype: int64

In [4]:
len(df)

7109

In [5]:
df.dropna(inplace=True)

In [6]:
df.isna().sum()

PRT_ID           0
AREA             0
Per_SQFT         0
Year             0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACILITY    0
year_BUILD       0
BUILD_TYPE       0
REG_FEE          0
COMMISION        0
SALES_PRICE      0
dtype: int64

### One Hot Encoding for Text Columns

In [7]:
df.PARK_FACILITY=df.PARK_FACILITY.map({"Yes":1,"No":0})
df.head()

Unnamed: 0,PRT_ID,AREA,Per_SQFT,Year,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACILITY,year_BUILD,BUILD_TYPE,REG_FEE,COMMISION,SALES_PRICE
0,P03210,Karapakkam,1004,2011,131,1.0,1.0,3,AbNormal,1,1967,Commercial,380000,144400,7600000
1,P09411,Anna Nagar,1986,2006,26,2.0,1.0,5,AbNormal,0,1995,Commercial,760122,304049,21717770
2,P01812,Adyar,909,2012,70,1.0,1.0,3,AbNormal,1,1992,Commercial,421094,92114,13159200
3,P05346,Velachery,1855,2010,14,3.0,2.0,5,Family,0,1988,Other,356321,77042,9630290
4,P06210,Karapakkam,1226,2009,84,1.0,1.0,3,AbNormal,1,1979,Other,237000,74063,7406250


In [8]:
df=df.copy()
df.N_BEDROOM=df.N_BEDROOM.astype("int8")
df.N_BATHROOM=df.N_BATHROOM.astype("int8")
df.Year=df.Year.astype("int")
df.year_BUILD=df.year_BUILD.astype("int")
df.Per_SQFT=df.Per_SQFT.astype("int")

In [9]:
df1=df.iloc[:,1:]
df1.head()

Unnamed: 0,AREA,Per_SQFT,Year,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACILITY,year_BUILD,BUILD_TYPE,REG_FEE,COMMISION,SALES_PRICE
0,Karapakkam,1004,2011,131,1,1,3,AbNormal,1,1967,Commercial,380000,144400,7600000
1,Anna Nagar,1986,2006,26,2,1,5,AbNormal,0,1995,Commercial,760122,304049,21717770
2,Adyar,909,2012,70,1,1,3,AbNormal,1,1992,Commercial,421094,92114,13159200
3,Velachery,1855,2010,14,3,2,5,Family,0,1988,Other,356321,77042,9630290
4,Karapakkam,1226,2009,84,1,1,3,AbNormal,1,1979,Other,237000,74063,7406250


In [10]:
df1.BUILD_TYPE=df1.BUILD_TYPE.map({"Commercial":0,"House":1,"Other":2})
df1.isna().sum()

AREA             0
Per_SQFT         0
Year             0
DIST_MAINROAD    0
N_BEDROOM        0
N_BATHROOM       0
N_ROOM           0
SALE_COND        0
PARK_FACILITY    0
year_BUILD       0
BUILD_TYPE       0
REG_FEE          0
COMMISION        0
SALES_PRICE      0
dtype: int64

In [11]:
dum1=pd.get_dummies(df1.AREA,drop_first=True)
dum2=pd.get_dummies(df1.SALE_COND,drop_first=True)
df2=pd.concat([df1,dum1,dum2],axis=1)
df2.head(3)

Unnamed: 0,AREA,Per_SQFT,Year,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACILITY,year_BUILD,...,Anna Nagar,Chrompet,KK Nagar,Karapakkam,T Nagar,Velachery,AdjLand,Family,Normal Sale,Partial
0,Karapakkam,1004,2011,131,1,1,3,AbNormal,1,1967,...,False,False,False,True,False,False,False,False,False,False
1,Anna Nagar,1986,2006,26,2,1,5,AbNormal,0,1995,...,True,False,False,False,False,False,False,False,False,False
2,Adyar,909,2012,70,1,1,3,AbNormal,1,1992,...,False,False,False,False,False,False,False,False,False,False


In [12]:
model1=df2.drop(["SALE_COND","AREA","DIST_MAINROAD"],axis=1)
model1.head()

Unnamed: 0,Per_SQFT,Year,N_BEDROOM,N_BATHROOM,N_ROOM,PARK_FACILITY,year_BUILD,BUILD_TYPE,REG_FEE,COMMISION,...,Anna Nagar,Chrompet,KK Nagar,Karapakkam,T Nagar,Velachery,AdjLand,Family,Normal Sale,Partial
0,1004,2011,1,1,3,1,1967,0,380000,144400,...,False,False,False,True,False,False,False,False,False,False
1,1986,2006,2,1,5,0,1995,0,760122,304049,...,True,False,False,False,False,False,False,False,False,False
2,909,2012,1,1,3,1,1992,0,421094,92114,...,False,False,False,False,False,False,False,False,False,False
3,1855,2010,3,2,5,0,1988,2,356321,77042,...,False,False,False,False,False,True,False,True,False,False
4,1226,2009,1,1,3,1,1979,2,237000,74063,...,False,False,False,True,False,False,False,False,False,False


### Accuracy Measuring

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
m1=LinearRegression()
x_tr,x_t,y_tr,y_t=train_test_split(model1.drop(["SALES_PRICE"],axis=1),model1.SALES_PRICE,train_size=0.8,random_state=50)
m1.fit(x_tr,y_tr)
m1.score(x_t,y_t)

0.9134439603774468

In [14]:
from sklearn.model_selection import cross_val_score
c=cross_val_score(XGBRegressor(),model1.drop(["SALES_PRICE"],axis=1),model1.SALES_PRICE,cv=5)
c

array([0.9611757 , 0.96272373, 0.95953577, 0.95994482, 0.96891527])

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
cv=ShuffleSplit(n_splits=5,train_size=0.8,random_state=10)
m2=XGBRegressor()
m2.fit(x_tr,y_tr)
m2.score(x_t,y_t)

0.9638968689415204

In [16]:
c1=cross_val_score(m2,model1.drop(["SALES_PRICE"],axis=1),model1.SALES_PRICE,cv=cv)
c1.mean()

0.9628335860037824

### Hyper Parameter Tuning.

In [17]:
dict={
    "Lasso":{
        "model" :Lasso(),
        "params":{
            "selection":['cyclic', 'random'],
            "tol":[0.0001,0.001,0.01,0.1],
            "alpha":[0.25,0.5,0.75,1]}},
    "LinearRegression":{
        "model":LinearRegression(),
        "params":{
            "fit_intercept":[True,False]}}}

In [18]:
gs=GridSearchCV(DecisionTreeRegressor(),{
            "criterion":["squared_error", "friedman_mse", "absolute_error","poisson"],
            "splitter":["best","random"]},cv=cv)

In [50]:
X=model1.drop(["SALES_PRICE",'COMMISION',"REG_FEE"],axis=1)
len(X.columns)

18

### Creating Functions for Predicting

In [55]:
def predict():
    sq=int(input("How many Sqft?    :"))
    yr=int(input("Current Year"))
    bed=int(input("How many Bedrooms?    :"))
    bath=int(input("How many Bathrooms?   :"))
    room=int(input("How many Rooms?    :"))
    park=int(input("Park Facility ? (1/0)    :"))
    byr=int(input("Which Year it must be build ?    :"))
    bt=input("For what purpose looking for ? (Commercial,House,Other)    :")
    ar=input("In which place in chennai looking for ? (Anna Nagar,Chrompet,KK Nagar,Karapakkam,T Nagar,Velachery)      :")
    sc=input("What ids the Sale Condition you are looking for ? (AdjLand,Family,Normal Sale,Partial)       :")
    d={"Commercial":0,"House":1,"Other":2}
    X=model1.drop(["SALES_PRICE",'COMMISION',"REG_FEE"],axis=1)
    l1=list(X.columns)
    x=list(np.zeros(len(l1),dtype="int"))
    if ar in l1:
        area=l1.index(ar)
        x[area]=1
    if sc in l1:
        sk=l1.index(sc)
        x[sk]=1
    x[0]=sq
    x[1]=yr
    x[2]=bed
    x[3]=bath
    x[4]=room
    x[5]=park
    x[6]=byr
    x[7]=d[bt]
    return x

In [67]:
X.columns

Index(['Per_SQFT', 'Year', 'N_BEDROOM', 'N_BATHROOM', 'N_ROOM',
       'PARK_FACILITY', 'year_BUILD', 'BUILD_TYPE', 'Anna Nagar', 'Chrompet',
       'KK Nagar', 'Karapakkam', 'T Nagar', 'Velachery', 'AdjLand', 'Family',
       'Normal Sale', 'Partial'],
      dtype='object')

In [68]:
def sale_predicted_price(x):
    X=model1.drop(["SALES_PRICE",'COMMISION',"REG_FEE"],axis=1)
    y=model1.SALES_PRICE
    mod=RandomForestRegressor()
    mod.fit(X,y)
    pred=mod.predict([x])[0]
    return pred

In [75]:
def get_reg_fees(x):
    model2=model1.drop(["SALES_PRICE",'COMMISION'],axis=1)
    X=model2.drop(["REG_FEE"],axis=1)
    y=model2.REG_FEE
    mod=Lasso()
    mod.fit(X,y)
    q=mod.predict([x])[0]
    return q

In [81]:
from sklearn.model_selection import KFold
model2=df2.drop(["SALE_COND","AREA","DIST_MAINROAD"],axis=1)
X=model2.drop("REG_FEE",axis=1)
y=model2.REG_FEE
kf = KFold(n_splits=10,shuffle=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.2)
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,random_state=0,test_size=0.2)
lr = LinearRegression().fit(X_train,y_train)
cs = cross_val_score(lr,X_train,y_train,cv=kf)
cs.mean()

0.8642171114909389

In [83]:
x=predict()
pred=get_reg_fees(x)
q=sale_predicted_price(x)
print("Estimated Sales Price--->",q)
print("Estimated Reg Fees--->",round(pred,2))
print("Estimated Overall Charges---->",q+pred," + Commision")

How many Sqft?    :2000
Current Year2023
How many Bedrooms?    :3
How many Bathrooms?   :3
How many Rooms?    :3
Park Facility ? (1/0)    :1
Which Year it must be build ?    :2023
For what purpose looking for ? (Commercial,House,Other)    :House
In which place in chennai looking for ? (Anna Nagar,Chrompet,KK Nagar,Karapakkam,T Nagar,Velachery)      :Karapakkam
What ids the Sale Condition you are looking for ? (AdjLand,Family,Normal Sale,Partial)       :Family
Estimated Sales Price---> 8053457.5
Estimated Reg Fees---> 467993.77
Estimated Overall Charges----> 8521451.271815538  + Commision
