In [None]:
# project on property prices

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

In [2]:
file = pd.read_csv("Bengaluru_House_Data.csv")
file.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
file.drop(['area_type','availability','society','balcony'], axis='columns', inplace=True)

In [4]:
file.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [5]:
file.bath.fillna(file.bath.mean(), inplace=True)

In [6]:
file.isnull().sum()

location       1
size          16
total_sqft     0
bath           0
price          0
dtype: int64

In [7]:
file.replace({
    "size" : '[A-Za-z]'
},"", regex=True, inplace=True)

In [8]:
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056,2.0,39.07
1,Chikka Tirupathi,4,2600,5.0,120.0
2,Uttarahalli,3,1440,2.0,62.0
3,Lingadheeranahalli,3,1521,3.0,95.0
4,Kothanur,2,1200,2.0,51.0


In [9]:
file.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [10]:
def convertToFloat(x):
    split = x.split("-")
    if len(split) == 2:
        final = (float(split[0]) + float(split[1]))/2
        return final
    try:
        return float(x)
    except:
        return 0.0

In [11]:
file['total_sqft'] = file['total_sqft'].apply(convertToFloat)
file.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.0
2,Uttarahalli,3,1440.0,2.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,95.0
4,Kothanur,2,1200.0,2.0,51.0


In [12]:
file.rename(columns = {'size':'bhk'}, inplace = True)
file.head()

Unnamed: 0,location,bhk,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.0
2,Uttarahalli,3,1440.0,2.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,95.0
4,Kothanur,2,1200.0,2.0,51.0


In [13]:
file.location.dropna()

0        Electronic City Phase II
1                Chikka Tirupathi
2                     Uttarahalli
3              Lingadheeranahalli
4                        Kothanur
                   ...           
13315                  Whitefield
13316               Richards Town
13317       Raja Rajeshwari Nagar
13318             Padmanabhanagar
13319                Doddathoguru
Name: location, Length: 13319, dtype: object

In [14]:
file.isnull().sum()

location       1
bhk           16
total_sqft     0
bath           0
price          0
dtype: int64

In [15]:
file.location = file.location.str.strip()
file.head()

Unnamed: 0,location,bhk,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.0
2,Uttarahalli,3,1440.0,2.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,95.0
4,Kothanur,2,1200.0,2.0,51.0


In [16]:
len(file.location.unique())

1295

In [17]:
location = file.groupby('location')['location'].agg('count').sort_values(ascending=False)

In [18]:
len(location)

1294

In [19]:
len(location[location>10])

241

In [20]:
len(location[location<10])

1040

In [21]:
len(location[location==10])

13

In [22]:
location_count = location[location<=10]

In [23]:
file.location = file.location.apply(lambda x: 'others' if x in location_count else x)

In [24]:
len(file.location.unique())

243

In [25]:
file.head(10)

Unnamed: 0,location,bhk,total_sqft,bath,price
0,Electronic City Phase II,2,1056.0,2.0,39.07
1,Chikka Tirupathi,4,2600.0,5.0,120.0
2,Uttarahalli,3,1440.0,2.0,62.0
3,Lingadheeranahalli,3,1521.0,3.0,95.0
4,Kothanur,2,1200.0,2.0,51.0
5,Whitefield,2,1170.0,2.0,38.0
6,Old Airport Road,4,2732.0,4.0,204.0
7,Rajaji Nagar,4,3300.0,4.0,600.0
8,Marathahalli,3,1310.0,3.0,63.25
9,others,6,1020.0,6.0,370.0


In [26]:
file.shape

(13320, 5)

In [27]:
def isFloat(x):
    return float(x)

In [28]:
file.bhk = file.bhk.apply(isFloat)

In [29]:
file.head()

Unnamed: 0,location,bhk,total_sqft,bath,price
0,Electronic City Phase II,2.0,1056.0,2.0,39.07
1,Chikka Tirupathi,4.0,2600.0,5.0,120.0
2,Uttarahalli,3.0,1440.0,2.0,62.0
3,Lingadheeranahalli,3.0,1521.0,3.0,95.0
4,Kothanur,2.0,1200.0,2.0,51.0


In [30]:
file = file[~(file.total_sqft/file.bhk<300)]
file.shape

(12530, 5)

In [31]:
file['price_per_square'] = file.price*100000/file.total_sqft
file.head()

Unnamed: 0,location,bhk,total_sqft,bath,price,price_per_square
0,Electronic City Phase II,2.0,1056.0,2.0,39.07,3699.810606
1,Chikka Tirupathi,4.0,2600.0,5.0,120.0,4615.384615
2,Uttarahalli,3.0,1440.0,2.0,62.0,4305.555556
3,Lingadheeranahalli,3.0,1521.0,3.0,95.0,6245.890861
4,Kothanur,2.0,1200.0,2.0,51.0,4250.0
