In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

In [2]:
df =  pd.read_csv("Bengaluru_House_Data (1).csv")
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [3]:
df.shape

(13320, 9)

In [4]:
df.info


<bound method DataFrame.info of                   area_type   availability                  location  \
0      Super built-up  Area         19-Dec  Electronic City Phase II   
1                Plot  Area  Ready To Move          Chikka Tirupathi   
2            Built-up  Area  Ready To Move               Uttarahalli   
3      Super built-up  Area  Ready To Move        Lingadheeranahalli   
4      Super built-up  Area  Ready To Move                  Kothanur   
...                     ...            ...                       ...   
13315        Built-up  Area  Ready To Move                Whitefield   
13316  Super built-up  Area  Ready To Move             Richards Town   
13317        Built-up  Area  Ready To Move     Raja Rajeshwari Nagar   
13318  Super built-up  Area         18-Jun           Padmanabhanagar   
13319  Super built-up  Area  Ready To Move              Doddathoguru   

            size  society total_sqft  bath  balcony   price  
0          2 BHK  Coomee        1056   2.

In [5]:
#First five rows of dataset
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [6]:
#Printing the number of missing values in each column
missing_values = df.isnull().sum()
print(missing_values)

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [7]:
for i in df.columns:
    print(df[i].value_counts())
    print("\n")

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64


availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64


location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64


size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK    

In [8]:
#Printing the number of missing values in each column
missing_values = df.isnull().sum()
print(missing_values)

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [9]:
df = df.drop(columns=['area_type','society','availability','balcony'])

In [10]:
df.describe()


Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [12]:
df["location"]= df["location"].fillna('Sarojapur Road')
df["size"]= df["size"].fillna('2 BHK')
df["bath"]= df["bath"].fillna(df['bath'].median())

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [14]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [15]:
def convert(x):
    temp = x.split("-")
    if len(temp) == 2 :
        return(float(temp[0]) + float(temp[1]))/2
    try :
        return float(x)
    except :
        return None 

In [16]:
df['total_sqft']= df['total_sqft'].apply(convert)

In [17]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0
2,Uttarahalli,3 BHK,1440.0,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0
4,Kothanur,2 BHK,1200.0,2.0,51.0


In [18]:
df['bhk'] = df['size'].str.split().str.get(0).astype(int)

In [19]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [20]:
df['pricepersqft']= df['price']*100000/df['total_sqft']

In [21]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,pricepersqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [22]:
df['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Uvce Layout                         1
Abshot Layout                       1
Name: count, Length: 1306, dtype: int64

In [23]:
df['location']= df['location'].apply(lambda x: x.strip())
lc = df['location'].value_counts()
lc

location
Whitefield                            541
Sarjapur  Road                        399
Electronic City                       304
Kanakpura Road                        273
Thanisandra                           237
                                     ... 
1Channasandra                           1
Hosahalli                               1
Vijayabank bank layout                  1
near Ramanashree California resort      1
Abshot Layout                           1
Name: count, Length: 1295, dtype: int64

In [24]:
(df['total_sqft']/df['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [25]:
lc10 = lc[lc<=50]
lc10

location
Ramagondanahalli                      50
Bhoganhalli                           50
Mysore Road                           50
Hegde Nagar                           49
Gottigere                             48
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1242, dtype: int64

In [26]:
df['location'] = df['location'].apply(lambda x: 'Banglore city' if x in lc10 else x)

In [27]:
df['location'].value_counts()

location
Banglore city               7242
Whitefield                   541
Sarjapur  Road               399
Electronic City              304
Kanakpura Road               273
Thanisandra                  237
Yelahanka                    213
Uttarahalli                  186
Hebbal                       177
Marathahalli                 175
Raja Rajeshwari Nagar        171
Hennur Road                  152
Bannerghatta Road            152
7th Phase JP Nagar           149
Haralur Road                 142
Electronic City Phase II     132
Rajaji Nagar                 107
Chandapura                   100
Bellandur                     96
KR Puram                      91
Electronics City Phase 1      88
Hoodi                         88
Yeshwanthpur                  85
Begur Road                    84
Sarjapur                      82
Kasavanhalli                  80
Harlur                        79
Banashankari                  75
Hormavu                       74
Kengeri                       73
R

In [28]:
df = df[((df['total_sqft']/df['bhk']) >= 300)]
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,pricepersqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [29]:
df.shape

(12530, 7)

In [30]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.pricepersqft)
        st = np.std(subdf.pricepersqft)
        gen_df = subdf[(subdf.pricepersqft > (m-st)) & (subdf.pricepersqft <=(m+st))]
        df_output= pd.concat([df_output,gen_df], ignore_index = True)
    return df_output
df = remove_outliers_sqft(df)
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,pricepersqft
0,7th Phase JP Nagar,2 BHK,1080.0,2.0,72.00,2,6666.666667
1,7th Phase JP Nagar,2 BHK,1270.0,2.0,93.00,2,7322.834646
2,7th Phase JP Nagar,3 BHK,1420.0,2.0,100.00,3,7042.253521
3,7th Phase JP Nagar,3 BHK,1850.0,3.0,150.00,3,8108.108108
4,7th Phase JP Nagar,2 BHK,1245.0,2.0,94.00,2,7550.200803
...,...,...,...,...,...,...,...
10823,Yeshwanthpur,3 BHK,1676.0,3.0,92.13,3,5497.016706
10824,Yeshwanthpur,3 BHK,2503.0,3.0,138.00,3,5513.383939
10825,Yeshwanthpur,3 BHK,1855.0,3.0,135.00,3,7277.628032
10826,Yeshwanthpur,3 BHK,1876.0,3.0,160.00,3,8528.784648


In [31]:
def bhk_outlier_remover(df2):
    exclude_incidies = np.array([])
    for location, location_df2 in df2.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df2 in df2.groupby('bhk'):
            bhk_stats[bhk] = {
            'mean' : np.mean(bhk_df2.pricepersqft),
            'std' : np.std(bhk_df2.pricepersqft),
            'count' : bhk_df2.shape[0]
        }
        for bhk, bhk_df in location_df2.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_incidies = np.append(exclude_incidies, bhk_df2[bhk_df2.pricepersqft<(stats['mean'])].index.values)
    
    return df.drop(exclude_incidies, axis = 'index')   
        

In [32]:
df = bhk_outlier_remover(df)

In [33]:
df.shape

(10827, 7)

In [34]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,pricepersqft
0,7th Phase JP Nagar,2 BHK,1080.0,2.0,72.00,2,6666.666667
1,7th Phase JP Nagar,2 BHK,1270.0,2.0,93.00,2,7322.834646
2,7th Phase JP Nagar,3 BHK,1420.0,2.0,100.00,3,7042.253521
3,7th Phase JP Nagar,3 BHK,1850.0,3.0,150.00,3,8108.108108
4,7th Phase JP Nagar,2 BHK,1245.0,2.0,94.00,2,7550.200803
...,...,...,...,...,...,...,...
10823,Yeshwanthpur,3 BHK,1676.0,3.0,92.13,3,5497.016706
10824,Yeshwanthpur,3 BHK,2503.0,3.0,138.00,3,5513.383939
10825,Yeshwanthpur,3 BHK,1855.0,3.0,135.00,3,7277.628032
10826,Yeshwanthpur,3 BHK,1876.0,3.0,160.00,3,8528.784648


In [35]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,pricepersqft
0,7th Phase JP Nagar,2 BHK,1080.0,2.0,72.0,2,6666.666667
1,7th Phase JP Nagar,2 BHK,1270.0,2.0,93.0,2,7322.834646
2,7th Phase JP Nagar,3 BHK,1420.0,2.0,100.0,3,7042.253521
3,7th Phase JP Nagar,3 BHK,1850.0,3.0,150.0,3,8108.108108
4,7th Phase JP Nagar,2 BHK,1245.0,2.0,94.0,2,7550.200803


In [36]:
df.isnull().sum()

location        0
size            0
total_sqft      0
bath            0
price           0
bhk             0
pricepersqft    0
dtype: int64

In [75]:
X = df.drop(columns = ['price','size'])
y = df['price']
X

Unnamed: 0,location,total_sqft,bath,bhk,pricepersqft
0,7th Phase JP Nagar,1080.0,2.0,2,6666.666667
1,7th Phase JP Nagar,1270.0,2.0,2,7322.834646
2,7th Phase JP Nagar,1420.0,2.0,3,7042.253521
3,7th Phase JP Nagar,1850.0,3.0,3,8108.108108
4,7th Phase JP Nagar,1245.0,2.0,2,7550.200803
...,...,...,...,...,...
10823,Yeshwanthpur,1676.0,3.0,3,5497.016706
10824,Yeshwanthpur,2503.0,3.0,3,5513.383939
10825,Yeshwanthpur,1855.0,3.0,3,7277.628032
10826,Yeshwanthpur,1876.0,3.0,3,8528.784648


In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Assuming `df` is your dataframe, and `X` and `y` are your input and output variables
X = df[['location', 'total_sqft', 'bath', 'bhk']]  # Features
y = df['price']  # Target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer for preprocessing
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['location']),  # OneHotEncode the 'location' column
    (StandardScaler(with_mean=False), ['total_sqft', 'bath', 'bhk'])  # Scale numeric columns with `with_mean=False`
)

# Create a pipeline that first transforms the data, then fits a model
pipeline = make_pipeline(
    column_trans,
    Ridge()  # You can replace LinearRegression with Lasso() or Ridge()
)

# Fit the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
print(f"R^2 score: {r2_score(y_test, y_pred)}")


R^2 score: 0.7849383413068937


In [77]:
import pickle

In [78]:

# Assuming `pipeline` is your trained pipeline model from your code
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [79]:
df.to_csv('Clean_data.csv', index=False)

In [80]:
# df['location'].value_counts()

location
Banglore city               5973
Whitefield                   484
Sarjapur  Road               305
Electronic City              279
Kanakpura Road               196
Yelahanka                    171
Uttarahalli                  167
Raja Rajeshwari Nagar        164
Thanisandra                  153
Marathahalli                 149
Bannerghatta Road            145
Hennur Road                  132
Hebbal                       128
Haralur Road                 127
Electronic City Phase II     122
7th Phase JP Nagar           101
Bellandur                     88
Chandapura                    83
KR Puram                      72
Harlur                        71
Rajaji Nagar                  71
Kasavanhalli                  70
Yeshwanthpur                  70
Sarjapur                      68
Begur Road                    64
Banashankari                  63
Electronics City Phase 1      60
Kothanur                      57
Hormavu                       55
Koramangala                   54
J

In [81]:
# new_data = pd.DataFrame([['7th Phase JP Nagar', 1080.0, 2.0, 2]], columns=['location', 'total_sqft', 'bath', 'bhk'])

# # Predict the price
# predicted_price = pipeline.predict(new_data)
# print(predicted_price)

[69.21135692]
