# Import Libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
dataset_path="bhp/Bengaluru_House_Data.csv"

In [6]:
df_house=pd.read_csv(dataset_path)
df_house.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [7]:
df_house.shape

(13320, 9)

# Column Examine

In [8]:
for col in df_house.select_dtypes(include='object').columns:
    unique_values = df_house[col].unique()
    print(f"Unique values in column '{col}': {unique_values}")
    print()

Unique values in column 'area_type': ['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']

Unique values in column 'availability': ['19-Dec' 'Ready To Move' '18-May' '18-Feb' '18-Nov' '20-Dec' '17-Oct'
 '21-Dec' '19-Sep' '20-Sep' '18-Mar' '20-Feb' '18-Apr' '20-Aug' '18-Oct'
 '19-Mar' '17-Sep' '18-Dec' '17-Aug' '19-Apr' '18-Jun' '22-Dec' '22-Jan'
 '18-Aug' '19-Jan' '17-Jul' '18-Jul' '21-Jun' '20-May' '19-Aug' '18-Sep'
 '17-May' '17-Jun' '21-May' '18-Jan' '20-Mar' '17-Dec' '16-Mar' '19-Jun'
 '22-Jun' '19-Jul' '21-Feb' 'Immediate Possession' '19-May' '17-Nov'
 '20-Oct' '20-Jun' '19-Feb' '21-Oct' '21-Jan' '17-Mar' '17-Apr' '22-May'
 '19-Oct' '21-Jul' '21-Nov' '21-Mar' '16-Dec' '22-Mar' '20-Jan' '21-Sep'
 '21-Aug' '14-Nov' '19-Nov' '15-Nov' '16-Jul' '15-Jun' '17-Feb' '20-Nov'
 '20-Jul' '16-Sep' '15-Oct' '15-Dec' '16-Oct' '22-Nov' '15-Aug' '17-Jan'
 '16-Nov' '20-Apr' '16-Jan' '14-Jul']

Unique values in column 'location': ['Electronic City Phase II' 'Chikka Tirupathi' 'Uttar

In [9]:
df_house['society'].unique().shape

(2689,)

In [10]:
df_house['society'].value_counts()

society
GrrvaGr    80
PrarePa    76
Sryalan    59
Prtates    59
GMown E    56
           ..
Amionce     1
JaghtDe     1
Jauraht     1
Brity U     1
RSntsAp     1
Name: count, Length: 2688, dtype: int64

In [11]:
df_house['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

 area type, location,size,total sqft are generally important in predicting price
 
 number of bathroom, number of balcony might or might not very useful but let just keep them
 
 availability will not affect the price .
 
 society also play role in price but we have 2689 different society and we have very less data per society . So let drop it.( Encoding these much types is difficult and reduce model performance.
 
 For location we will handle it later as some values have good amount of rows(data)

In [12]:
df2=df_house.drop(['availability','society'],axis=1)
df2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0


# Handling NA values

In [13]:
df2.isnull().sum()

area_type       0
location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

## fill median value for balcony

In [14]:
balcony_mdeian_val=df2['balcony'].median()
balcony_mdeian_val

2.0

In [15]:
balcony_mdeian_val=df2['balcony'].median()
df2['balcony']=df2['balcony'].fillna(balcony_mdeian_val)

In [16]:
df2.isnull().sum()

area_type      0
location       1
size          16
total_sqft     0
bath          73
balcony        0
price          0
dtype: int64

## For rest na value we can drop these rows our data set size is 13320

In [17]:
df3=df2.dropna()
df3.shape

(13246, 7)

In [18]:
df3.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

# Data Cleaning

In [19]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13246 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13246 non-null  object 
 1   location    13246 non-null  object 
 2   size        13246 non-null  object 
 3   total_sqft  13246 non-null  object 
 4   bath        13246 non-null  float64
 5   balcony     13246 non-null  float64
 6   price       13246 non-null  float64
dtypes: float64(3), object(4)
memory usage: 827.9+ KB


area_type and location are okay they are string
but why size and total_sqft are object . Let see

In [20]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

## create another column called bhk with unifrom data with int value

In [21]:
df3.loc[:, 'bhk'] = df3['size'].apply(lambda x: int(x.split()[0]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.loc[:, 'bhk'] = df3['size'].apply(lambda x: int(x.split()[0]))


In [22]:
df4=df3.drop(['size'],axis=1)

In [23]:
df4

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Chikka Tirupathi,2600,5.0,3.0,120.00,4
2,Built-up Area,Uttarahalli,1440,2.0,3.0,62.00,3
3,Super built-up Area,Lingadheeranahalli,1521,3.0,1.0,95.00,3
4,Super built-up Area,Kothanur,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13315,Built-up Area,Whitefield,3453,4.0,0.0,231.00,5
13316,Super built-up Area,Richards Town,3600,5.0,2.0,400.00,4
13317,Built-up Area,Raja Rajeshwari Nagar,1141,2.0,1.0,60.00,2
13318,Super built-up Area,Padmanabhanagar,4689,4.0,1.0,488.00,4


In [24]:
df4['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [25]:
def is_numeric(x):
    try:
        float(x)
        return True
    except ValueError:
        return False

In [26]:
df4['total_sqft'][~df4['total_sqft'].apply(lambda x: is_numeric(x))].head(15)

30        2100 - 2850
122       3067 - 8156
137       1042 - 1105
165       1145 - 1340
188       1015 - 1540
410    34.46Sq. Meter
549       1195 - 1440
648         4125Perch
661       1120 - 1145
672       3090 - 5002
772       1160 - 1195
775     1000Sq. Meter
850       1115 - 1130
872     1100Sq. Yards
886         520 - 645
Name: total_sqft, dtype: object

#### Only handle number and range

In [27]:
def cvt_sqft_num(x):
    tokens=x.split('-')
    try:
        if len(tokens)==2:
            return (float(tokens[0])+float(tokens[1]))/2
        else:
            return float(x)
    except:
        return None

In [28]:
df4['total_sqft']=df4['total_sqft'].apply(lambda x: cvt_sqft_num(x))

In [29]:
df4.isnull().sum()

area_type      0
location       0
total_sqft    46
bath           0
balcony        0
price          0
bhk            0
dtype: int64

#### Drop null

In [30]:
df5=df4.dropna()

In [31]:
df5.shape

(13200, 7)

In [32]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13200 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13200 non-null  object 
 1   location    13200 non-null  object 
 2   total_sqft  13200 non-null  float64
 3   bath        13200 non-null  float64
 4   balcony     13200 non-null  float64
 5   price       13200 non-null  float64
 6   bhk         13200 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 825.0+ KB


## Handling Location

In [33]:
df_location_count=df5['location'].value_counts().reset_index()

loc_greater_than_10 = list(df_location_count.loc[df_location_count['count'] > 10, 'location'])
len(loc_greater_than_10)

240

In [34]:
df6=df5.copy()

In [35]:
df6['location']=df6['location'].apply(lambda x : x if x in loc_greater_than_10 else 'Other' )

In [36]:
len(df6['location'].unique())

241

# Creating columns price_per_sqft , sqft_per_bhk

In [37]:
df6['price_per_sqft']=df6['price']*100000/df6['total_sqft']# bcz prices are in lakhs (100k INR)
df6.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


In [38]:
df6['sqft_per_bhk']=df6['total_sqft']/df6['bhk']
df6.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk
0,Super built-up Area,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606,528.0
1,Plot Area,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615,650.0
2,Built-up Area,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556,480.0
3,Super built-up Area,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861,507.0
4,Super built-up Area,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0,600.0


# Outlier Handling

let use domain knowledge , generally 400-500 is size of 1 bhk. For minimum let take 300

In [39]:
df6[df6['sqft_per_bhk']<300]

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk
9,Plot Area,Other,1020.0,6.0,2.0,370.0,6,36274.509804,170.000000
45,Plot Area,HSR Layout,600.0,9.0,2.0,200.0,8,33333.333333,75.000000
58,Plot Area,Murugeshpalya,1407.0,4.0,1.0,150.0,6,10660.980810,234.500000
68,Plot Area,Other,1350.0,7.0,0.0,85.0,8,6296.296296,168.750000
70,Plot Area,Other,500.0,3.0,2.0,100.0,3,20000.000000,166.666667
...,...,...,...,...,...,...,...,...,...
13277,Plot Area,Other,1400.0,7.0,2.0,218.0,7,15571.428571,200.000000
13279,Plot Area,Other,1200.0,5.0,2.0,130.0,6,10833.333333,200.000000
13281,Plot Area,Margondanahalli,1375.0,5.0,1.0,125.0,5,9090.909091,275.000000
13303,Plot Area,Vidyaranyapura,774.0,5.0,3.0,70.0,5,9043.927649,154.800000


In [40]:
df7=df6.copy()


In [41]:
df7=df7[df7['sqft_per_bhk']>=300]

In [42]:
df7[df7['sqft_per_bhk']<300]

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk


In [43]:
df6[df6['price_per_sqft']>50000]

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk
349,Plot Area,Other,11.0,3.0,2.0,74.0,3,672727.3,3.666667
1020,Carpet Area,Other,15.0,1.0,0.0,30.0,1,200000.0,15.0
1122,Built-up Area,Other,24.0,2.0,2.0,150.0,5,625000.0,4.8
4086,Plot Area,Sarjapur Road,1.0,4.0,2.0,120.0,4,12000000.0,0.25
4972,Built-up Area,Other,5.0,7.0,3.0,115.0,7,2300000.0,0.714286
5970,Plot Area,Mysore Road,45.0,1.0,0.0,23.0,1,51111.11,45.0
6421,Plot Area,Bommenahalli,2940.0,3.0,2.0,2250.0,4,76530.61,735.0
7088,Built-up Area,Other,650.0,1.0,3.0,500.0,1,76923.08,650.0
7657,Built-up Area,Other,425.0,1.0,1.0,750.0,1,176470.6,425.0
7883,Super built-up Area,Other,2000.0,3.0,2.0,1063.0,4,53150.0,500.0


### Price per sqft

In [44]:
df7['price_per_sqft'].describe()

count     12456.000000
mean       6308.502826
std        4168.127339
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

##### 68-95-99.7 rule
So, any value which is less than 0.15 percentile and more than 99.85 percentile are outliers

In [45]:
min_val=df7['price_per_sqft'].quantile(0.0015)
max_val=df7['price_per_sqft'].quantile(0.9985)
min_val,max_val

(1593.3157894736842, 34126.61094819184)

These data are also align with domain knowledge

In [46]:
df7[df7['price_per_sqft']<min_val]

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk
674,Built-up Area,Yelahanka,35000.0,3.0,3.0,130.0,3,371.428571,11666.666667
810,Plot Area,Other,10961.0,4.0,1.0,80.0,4,729.860414,2740.25
1894,Plot Area,Other,52272.0,2.0,1.0,140.0,3,267.829813,17424.0
2421,Plot Area,Other,2000.0,3.0,2.0,25.0,4,1250.0,500.0
3976,Super built-up Area,Other,1500.0,1.0,1.0,19.5,1,1300.0,1500.0
4105,Super built-up Area,Other,5800.0,5.0,2.0,80.0,5,1379.310345,1160.0
4548,Plot Area,Channasandra,3040.0,2.0,1.0,48.0,2,1578.947368,1520.0
5393,Super built-up Area,Other,42000.0,8.0,3.0,175.0,9,416.666667,4666.666667
5469,Super built-up Area,Ulsoor,36000.0,4.0,2.0,450.0,4,1250.0,9000.0
5652,Built-up Area,JP Nagar,1100.0,1.0,1.0,15.0,2,1363.636364,550.0


In [47]:
df7[df7['price_per_sqft']>max_val]

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk
2286,Plot Area,Other,3200.0,4.0,2.0,1200.0,4,37500.0,800.0
5365,Built-up Area,Banashankari Stage II,1500.0,2.0,0.0,650.0,4,43333.333333,375.0
6421,Plot Area,Bommenahalli,2940.0,3.0,2.0,2250.0,4,76530.612245,735.0
7088,Built-up Area,Other,650.0,1.0,3.0,500.0,1,76923.076923,650.0
7657,Built-up Area,Other,425.0,1.0,1.0,750.0,1,176470.588235,425.0
7727,Super built-up Area,Other,5422.0,6.0,2.0,1900.0,4,35042.419771,1355.5
7883,Super built-up Area,Other,2000.0,3.0,2.0,1063.0,4,53150.0,500.0
8170,Super built-up Area,Other,2750.0,3.0,2.0,943.0,3,34290.909091,916.666667
8244,Super built-up Area,Other,2230.0,4.0,2.0,792.0,4,35515.695067,557.5
8398,Super built-up Area,Bannerghatta Road,2500.0,4.0,2.0,1400.0,5,56000.0,500.0


In [48]:
df8=df7[(df7['price_per_sqft']>=min_val) & (df7['price_per_sqft']<=max_val)]

In [49]:
df8.shape

(12418, 9)

In [50]:
df8['price_per_sqft'].describe()

count    12418.000000
mean      6242.885636
std       3510.081563
min       1600.000000
25%       4213.835592
50%       5294.117647
75%       6906.543508
max      34050.179211
Name: price_per_sqft, dtype: float64

In [51]:
df8.bhk.describe()

count    12418.000000
mean         2.646400
std          0.970464
min          1.000000
25%          2.000000
50%          3.000000
75%          3.000000
max         16.000000
Name: bhk, dtype: float64

In [52]:
df8.bath.describe()

count    12418.000000
mean         2.559994
std          1.072540
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max         16.000000
Name: bath, dtype: float64

In [53]:
df8[(df8.bhk>10) | (df8.bath>10)]

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,sqft_per_bhk
459,Super built-up Area,Other,5000.0,9.0,3.0,360.0,11,7200.0,454.545455
1078,Plot Area,Other,3300.0,14.0,2.0,500.0,9,15151.515152,366.666667
3096,Super built-up Area,Other,12000.0,12.0,2.0,525.0,10,4375.0,1200.0
3609,Super built-up Area,Other,10000.0,16.0,2.0,550.0,16,5500.0,625.0
7979,Super built-up Area,Other,6000.0,12.0,2.0,150.0,11,2500.0,545.454545
8636,Super built-up Area,Neeladri Nagar,4000.0,12.0,2.0,160.0,10,4000.0,400.0
9935,Super built-up Area,Other,5425.0,13.0,0.0,275.0,13,5069.124424,417.307692


Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment

In [54]:

def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')
df9 = remove_bhk_outliers(df8)

df9.shape

(8737, 9)

In [55]:
df9=df9.drop(['price_per_sqft','sqft_per_bhk'],axis=1)

In [56]:
df9.head()

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
2,Built-up Area,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Kothanur,1200.0,2.0,1.0,51.0,2
6,Super built-up Area,Old Airport Road,2732.0,4.0,2.0,204.0,4


# Model Building

In [57]:
df9.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8737 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   8737 non-null   object 
 1   location    8737 non-null   object 
 2   total_sqft  8737 non-null   float64
 3   bath        8737 non-null   float64
 4   balcony     8737 non-null   float64
 5   price       8737 non-null   float64
 6   bhk         8737 non-null   int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 546.1+ KB


## Dummy column(One hot Encoding)

In [58]:
df10=pd.get_dummies(df9,dtype=pd.Int16Dtype())

In [59]:
df11=df10.copy()

In [60]:
df11.columns=df10.columns.str.lower()

In [61]:
df11.columns

Index(['total_sqft', 'bath', 'balcony', 'price', 'bhk',
       'area_type_built-up  area', 'area_type_carpet  area',
       'area_type_plot  area', 'area_type_super built-up  area',
       'location_ devarachikkanahalli',
       ...
       'location_vijayanagar', 'location_vishveshwarya layout',
       'location_vishwapriya layout', 'location_vittasandra',
       'location_whitefield', 'location_yelachenahalli', 'location_yelahanka',
       'location_yelahanka new town', 'location_yelenahalli',
       'location_yeshwanthpur'],
      dtype='object', length=250)

In [62]:
df11=df11.drop(['location_other','area_type_plot  area'],axis=1)

In [63]:
df11.shape

(8737, 248)

In [64]:
df11.dtypes.value_counts()

Int16      243
float64      4
int64        1
Name: count, dtype: int64

In [65]:
X=df11.drop(['price'],axis=1)
y=df11['price']

## Train-Test-Split

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2)

## Scaler

In [68]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_train_scaled.shape

(6989, 247)

In [69]:
X_test_scaled=scaler.transform(X_test)

In [70]:
from sklearn.linear_model import LinearRegression

In [71]:
lin_reg=LinearRegression()
lin_reg.fit(X_train_scaled,y_train)
lin_reg.score(X_test_scaled,y_test)

0.7558244308061074

In [72]:
from sklearn.model_selection import cross_val_score,ShuffleSplit

cv=ShuffleSplit(5,test_size=0.2,random_state=212)
cross_val_score(LinearRegression(),X,y,cv=cv)


array([0.76186755, 0.7266909 , 0.78003052, 0.7191964 , 0.7777652 ])

In [73]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2,3,5,10],
                'selection': ['random', 'cyclic']
            }
        },
        'ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [0.5,1,2,3,5,10]
                
            }
        },
        
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ["squared_error",'friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False,n_jobs=-1)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.733806,{}
1,lasso,0.669106,"{'alpha': 1, 'selection': 'random'}"
2,ridge,0.733083,{'alpha': 0.5}
3,decision_tree,0.572989,"{'criterion': 'friedman_mse', 'splitter': 'best'}"


best model we get is LinearRegression

# Pipeline

In [74]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('Linear_Regression', LinearRegression())])

In [75]:
pipe.fit(X_train,y_train)

In [76]:
pipe.score(X_train,y_train)

0.7499646399976463

In [77]:
pipe.score(X_test,y_test)

0.7558244308061074

In [78]:
df9.columns

Index(['area_type', 'location', 'total_sqft', 'bath', 'balcony', 'price',
       'bhk'],
      dtype='object')

In [79]:
X.columns

Index(['total_sqft', 'bath', 'balcony', 'bhk', 'area_type_built-up  area',
       'area_type_carpet  area', 'area_type_super built-up  area',
       'location_ devarachikkanahalli', 'location_1st block jayanagar',
       'location_1st phase jp nagar',
       ...
       'location_vijayanagar', 'location_vishveshwarya layout',
       'location_vishwapriya layout', 'location_vittasandra',
       'location_whitefield', 'location_yelachenahalli', 'location_yelahanka',
       'location_yelahanka new town', 'location_yelenahalli',
       'location_yeshwanthpur'],
      dtype='object', length=247)

In [80]:
def predict_price(area_type,location,total_sqft,bath,balcony,bhk):
    x=pd.DataFrame(columns=X.columns)
    x.loc[0]=np.zeros(len(X.columns))
    
    x['total_sqft'] = total_sqft
    x['bath'] = bath
    x['balcony'] = balcony
    x['bhk'] = bhk
    loc='location_'+location.lower()
    area='area_type_'+area_type.lower()
    if loc in x.columns:
        x[loc]=1
    if area in x.columns:
        x[area]=1
        
#     print(x.where(x > 0).dropna(axis=1))
    
    return pipe.predict(x)[0]

In [81]:
predict_price('super built-up  area','1st Phase JP Nagar',1000, 2,1, 2)

85.12657305247484

In [82]:
df9.sample(5,random_state=3)

Unnamed: 0,area_type,location,total_sqft,bath,balcony,price,bhk
7962,Built-up Area,Sarjapur,3854.5,6.0,0.0,385.5,4
7478,Super built-up Area,Uttarahalli,1180.0,2.0,1.0,41.3,2
4203,Plot Area,Whitefield,3004.0,4.0,2.0,285.0,4
7369,Built-up Area,Sarjapur Road,3800.0,3.0,3.0,325.0,4
3043,Super built-up Area,Sultan Palaya,1100.0,2.0,2.0,40.0,2


In [83]:
predict_price('super built-up  area','Sultan Palaya',1100, 2,2, 2)

31.931894401599152

In [84]:
predict_price('super built-up  area','Uttarahalli',1180, 2,1, 2)

52.57881050781418

In [85]:
predict_price('built-up  area','Sarjapur',3854.5, 6,0, 4)

356.6239969077198

# Exporting model

In [86]:
import joblib

In [87]:
joblib.dump(pipe,'pipeline.pkl')

['pipeline.pkl']

In [88]:
import json


In [89]:
data_columns={
    'columns': X.columns.to_list()
}
with open('columns.json','w') as file:
    json.dump(data_columns,file,indent=4)