# Bengaluru flat price estimation



In [None]:
#Adding libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Load the data set into the notebook.
raw_data=pd.read_csv('/content/Bengaluru_House_Data.csv')
raw_data.head()
# Drop the columns that are not used as features.
data1=raw_data.drop(['area_type','society','balcony','availability'],axis=1)
data1.head()

# Count the null/empty values in each remaining column.
data1.isnull().sum()
# The original author judged these few enough to drop. Take an explicit copy of
# the result so the column added below is written to an independent frame;
# without the copy pandas emits SettingWithCopyWarning on the BHK assignment.
data2=data1.dropna().copy()
data2.isnull().sum()
# At this point no row contains a missing value.
data2.head()
# The size column mixes notations (e.g. "2 BHK", "4 Bedroom"); inspect them.
data2['size'].unique()
# Store the leading integer of size in a new BHK column.
data2['BHK']=data2['size'].apply(lambda x: int(x.split(' ')[0]))

# Checkpoint copy.
data3=data2.copy()
data3.head()
data3['BHK'].unique()
data3.head()

# Inspect the total_sqft column.
data3.total_sqft.unique()
# Some entries are ranges rather than single numbers; they must be converted
# to a number before they can be used in the analysis.
# Check every row:
def is_float(x):
  """Return True if ``float(x)`` succeeds, False otherwise.

  Only the errors ``float`` raises for unparsable input (``ValueError``) or
  unsupported types (``TypeError``) are treated as "not a float"; anything
  else (e.g. KeyboardInterrupt) propagates instead of being swallowed.
  """
  try:
    float(x)
  except (TypeError, ValueError):
    return False
  return True

# Peek at the first rows whose total_sqft value is not a plain number.
data3.loc[~data3['total_sqft'].apply(is_float)].head()
print(data3.head())
# Convert each such value to a number (a range becomes the mean of its bounds):
def converting_sqft_to_num(x):
  """Convert a total_sqft entry to a float.

  * ``"a - b"`` (two numeric parts separated by a dash) -> mean of a and b.
  * anything ``float()`` accepts -> that float.
  * everything else (e.g. ``"34.46Sq. Meter"``) -> ``None``.

  Unparsable input never raises: a dash-separated value whose parts are not
  numeric falls back to a plain ``float()`` attempt and then to ``None``,
  and non-string input is passed straight to ``float()``.
  """
  if isinstance(x,str):
    tokens=x.split("-")
    if len(tokens)==2:
      try:
        return (float(tokens[0])+float(tokens[1]))/2
      except ValueError:
        # Not a numeric range (e.g. "1e-5" or "abc - def"); try it as a number.
        pass
  try:
    return float(x)
  except (TypeError, ValueError):
    return None

# Replace every total_sqft entry with its numeric value, then checkpoint.
data3['total_sqft']=data3['total_sqft'].apply(converting_sqft_to_num)
data4=data3.copy()
data4.head()

# price is multiplied by 100000 before dividing by the area.
data4['Price_per_sqft']=data4['price'].mul(100000).div(data4['total_sqft'])
data4.head()

# Trim surrounding whitespace from location names, then count rows per location.
data4['location']=data4['location'].apply(str.strip)
location_stats=data4.groupby('location')['location'].count().sort_values(ascending=False)
location_stats

# Locations that occur 10 times or fewer.
location_stats_less_than_10=location_stats[location_stats<=10]
location_stats_less_than_10

# Fold those infrequent locations into a single 'other' bucket.
data4['location']=data4['location'].apply(
  lambda name:'other' if name in location_stats_less_than_10.index else name
)
data4.head()
data4['location'].unique().size

# Dealing with outliers:
# total sqft per BHK should not be below 300.
sqft_per_bhk=data4.total_sqft/data4.BHK
data4[sqft_per_bhk<300].head()
data5=data4[~(sqft_per_bhk<300)]
# Next: remove price-per-sqft outliers using the per-location mean and std.
def remove_pps_outliers(data5):
  """Drop rows whose Price_per_sqft is far from their location's mean.

  For each ``location`` group, keeps rows with
  ``mean - std < Price_per_sqft <= mean + std`` (std is numpy's default,
  i.e. population std). Rows whose Price_per_sqft is NaN fail both
  comparisons and are dropped.

  Parameters:
    data5: DataFrame with ``location`` and ``Price_per_sqft`` columns.

  Returns:
    A new DataFrame of the kept rows, ordered by location group, with a
    fresh 0..n-1 index. An empty DataFrame if the input has no groups.
  """
  kept_groups=[]
  for _,subdata5 in data5.groupby('location'):
    pps=subdata5.Price_per_sqft
    m=np.mean(pps)
    st=np.std(pps)
    kept_groups.append(subdata5[(pps>(m-st))&(pps<=(m+st))])
  if not kept_groups:
    return pd.DataFrame()
  # Concatenate once instead of re-copying the accumulated frame per group.
  return pd.concat(kept_groups,ignore_index=True)

# Apply the per-location price-per-sqft filter to data5 and print the resulting shape.
data6=remove_pps_outliers(data5)
data6.head()
print(data6.shape)

# At the same location, many apartments with fewer BHK are priced higher than apartments with more BHK; those rows are removed by the function below:
def remove_bhk_outliers(df):
  """Drop rows priced below the mean of the next-smaller BHK in the same location.

  Within each ``location``, a row with ``BHK == n`` is removed when its
  ``Price_per_sqft`` is lower than the mean ``Price_per_sqft`` of the
  ``BHK == n-1`` rows of that location, provided that the ``n-1`` group
  has more than 5 rows.

  Parameters:
    df: DataFrame with ``location``, ``BHK`` and ``Price_per_sqft`` columns.

  Returns:
    A new DataFrame without the excluded rows; the index labels of the
    remaining rows are unchanged.
  """
  # A plain list keeps the index labels in their original type; the previous
  # float64 accumulator coerced them to float.
  exclude_indices=[]
  for _,location_df in df.groupby('location'):
    bhk_groups=list(location_df.groupby('BHK'))
    bhk_stats={
      bhk:{'mean':np.mean(bhk_df.Price_per_sqft),'count':bhk_df.shape[0]}
      for bhk,bhk_df in bhk_groups
    }
    for bhk,bhk_df in bhk_groups:
      stats=bhk_stats.get(bhk-1)
      if stats is not None and stats['count']>5:
        below_mean=(bhk_df.Price_per_sqft<stats['mean']).to_numpy()
        exclude_indices.extend(bhk_df.index[below_mean])

  return df.drop(exclude_indices,axis='index')

data7=remove_bhk_outliers(data6)
print(data7.shape)

# Keep only rows whose bathroom count is below BHK+2; rows with more bathrooms
# than that are treated as outliers.
data8=data7[data7.bath<data7.BHK+2]
print(data8.shape)
# Few rows are removed by this rule, so data8 is used for modelling.

# Modelling:
# Locations are strings, so convert them to numeric columns with one-hot
# encoding. dtype=int yields 0/1 integers directly, replacing the per-element
# (and deprecated) DataFrame.applymap conversion.
dummies=pd.get_dummies(data8.location,dtype=int)
data9=pd.concat([data8,dummies],axis=1)
data9.head()
# Drop the 'other' dummy (it is implied when every remaining location dummy is
# zero), the now-redundant location column, and the size and Price_per_sqft
# columns.

data10=data9.drop(['other','location','size','Price_per_sqft'],axis=1)
data10
print(data10.shape)


# Separate the features from the price target and hold out 20% for testing.
x=data10.drop(columns=['price'])
y=data10.price
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=10,test_size=0.2)

# Fit a linear regression on the training split and print its test-split score.
from sklearn.linear_model import LinearRegression
lr_clf=LinearRegression().fit(x_train,y_train)
print(lr_clf.score(x_test,y_test))


from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Cross-validate a fresh linear regression over 5 shuffled 80/20 splits.
cv=ShuffleSplit(random_state=0,test_size=0.2,n_splits=5)
cross_val_score(LinearRegression(),x,y,cv=cv)
#applying Gridsearchcv to find the best algorithm to be applied:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(x,y):
  """Grid-search several regressors and report the best setting of each.

  Runs GridSearchCV for LinearRegression, Lasso and DecisionTreeRegressor
  over small parameter grids, using 5 shuffled splits with a 20% test
  share (random_state=0).

  Parameters:
    x: feature matrix passed to ``GridSearchCV.fit``.
    y: target values passed to ``GridSearchCV.fit``.

  Returns:
    DataFrame with one row per algorithm and the columns ``model``,
    ``best_score`` and ``best_params``.
  """
  algos={
      'linear_regression':{
          'model':LinearRegression(),
          'params':{
              'fit_intercept':[True,False]
          }
      },
      'lasso':{
          'model':Lasso(),
          'params':{
              'alpha':[1,2],
              'selection':['random','cyclic']
          }
      },
      'decision_tree':{
          'model':DecisionTreeRegressor(),
          'params':{
              # 'mse' is rejected by current scikit-learn (every fit using it
              # failed); 'squared_error' is the name of the same criterion.
              'criterion':['squared_error','friedman_mse'],
              'splitter':['best','random']
          }
      }
  }
  scores=[]
  cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
  for algo_name, config in algos.items():
    gs=GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)
    gs.fit(x,y)
    scores.append({
        'model':algo_name,
        'best_score':gs.best_score_,
        'best_params':gs.best_params_
    })

  return pd.DataFrame(scores,columns=['model','best_score','best_params'])

# Print the best cross-validated score and parameters found for each algorithm.
print(find_best_model_using_gridsearchcv(x,y))
# Inspect the feature columns; predict_price below looks locations up by column name.
x.columns
#predicting price:
def predict_price(loc,sqft,bath,bhk):
  """Predict the price for one flat with the fitted ``lr_clf`` model.

  Builds a single feature row shaped like ``x``: positions 0, 1 and 2 hold
  sqft, bath and bhk, and the column named ``loc`` (if any) is set to 1.

  Parameters:
    loc: location name; matched exactly against ``x.columns``.
    sqft: value written to feature position 0.
    bath: value written to feature position 1.
    bhk: value written to feature position 2.

  Returns:
    The first element of ``lr_clf.predict`` for that row.

  A location with no matching column leaves every location column at 0
  instead of raising IndexError.
  """
  Z=np.zeros(len(x.columns))
  Z[0]=sqft
  Z[1]=bath
  Z[2]=bhk

  # np.where returns an empty array when loc is not a column name.
  matches=np.where(x.columns==loc)[0]
  if matches.size>0:
    Z[matches[0]]=1

  return lr_clf.predict([Z])[0]
# Show a sample prediction.
print("the predicted price is:")
print(predict_price('Yelahanka',1000,2,3))

# Serialize the fitted model with pickle.
import pickle
with open('banglore_prices_model.pickel','wb') as f:
  pickle.dump(lr_clf,f)

# Write the lower-cased feature column names as JSON.
import json
columns={'data_columns':[name.lower() for name in x.columns]}
with open("columns.json","w") as f:
  json.dump(columns,f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['BHK']=data2['size'].apply(lambda x: int(x.split(' ')[0]))


                   location       size total_sqft  bath   price  BHK
0  Electronic City Phase II      2 BHK       1056   2.0   39.07    2
1          Chikka Tirupathi  4 Bedroom       2600   5.0  120.00    4
2               Uttarahalli      3 BHK       1440   2.0   62.00    3
3        Lingadheeranahalli      3 BHK       1521   3.0   95.00    3
4                  Kothanur      2 BHK       1200   2.0   51.00    2
(10241, 7)
(7329, 7)
(7251, 7)
(7251, 245)
0.8452277697874376


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/u

               model  best_score  \
0  linear_regression    0.819001   
1              lasso    0.687432   
2      decision_tree    0.712742   

                                         best_params  
0                           {'fit_intercept': False}  
1                {'alpha': 1, 'selection': 'random'}  
2  {'criterion': 'friedman_mse', 'splitter': 'best'}  
the predicted price is:
44.828294911543374




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['BHK'] = data2['size'].apply(lambda x: int(x.split(' ')[0]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['total_sqft'] = data2['total_sqft'].apply(converting_sqft_to_num)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Price_per_sqft'] = data2['price'] * 100000 / data2['total_s

0.9351389738256157
               model  best_score  \
0  linear_regression    0.941271   
1              lasso    0.924773   
2      decision_tree    0.972326   

                                         best_params  
0                            {'fit_intercept': True}  
1                {'alpha': 1, 'selection': 'cyclic'}  
2  {'criterion': 'friedman_mse', 'splitter': 'best'}  
-65.88305239879259


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/u