In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
df.shape


(13320, 9)

In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [None]:
df.describe()


Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [None]:
df = df.drop(['area_type', 'society', 'balcony', 'availability'], axis=1)


In [None]:
df.isnull().sum()


Unnamed: 0,0
location,1
size,16
total_sqft,0
bath,73
price,0


In [None]:
df = df.dropna()


In [None]:
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df = df.drop('size', axis=1)


In [None]:
def convert_sqft(x):
    try:
        return float(x)
    except:
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
df = df.dropna()


In [None]:
df['price_per_sqft'] = df['price']*100000 / df['total_sqft']


In [None]:
location_stats = df['location'].value_counts()
location_stats_less_than_10 = location_stats[location_stats <= 10]

df['location'] = df['location'].apply(
    lambda x: 'other' if x in location_stats_less_than_10 else x
)


In [None]:
df = pd.get_dummies(df, drop_first=True)
df.head()


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,...,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_other
0,1056.0,2.0,39.07,2,3699.810606,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2600.0,5.0,120.0,4,4615.384615,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1440.0,2.0,62.0,3,4305.555556,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1521.0,3.0,95.0,3,6245.890861,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1200.0,2.0,51.0,2,4250.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
X = df.drop('price', axis=1)
y = df['price']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
model.score(X_test, y_test)


0.34023601097110456

In [None]:
y_pred = model.predict(X_test)
y_pred[:5]


array([ 30.1110121 , 173.4278207 , 118.35803483,  56.08719475,
        69.54183558])

In [None]:
comparison = pd.DataFrame({
    "Actual Price": y_test[:5],
    "Predicted Price": y_pred[:5]
})
comparison


Unnamed: 0,Actual Price,Predicted Price
2677,40.44,30.111012
10212,115.0,173.427821
11720,95.0,118.358035
2995,130.0,56.087195
2555,103.0,69.541836


In [None]:
def predict_price(location, sqft, bath, bhk):
    loc_index = np.where(X.columns == 'location_' + location)[0]
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if len(loc_index) > 0:
        x[loc_index[0]] = 1
    return model.predict([x])[0]

predict_price('Whitefield', 1000, 2, 2)




np.float64(63.29878537610928)