In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**About this file**
What are the things that a potential home buyer considers before purchasing a house? The location, the size of the property, vicinity to offices, schools, parks, restaurants, hospitals or the stereotypical white picket fence? What about the most important factor — the price?

Now, with the lingering impact of demonetization, the enforcement of the Real Estate (Regulation and Development) Act (RERA), and the lack of trust in property developers in the city, housing units sold across India in 2017 dropped by 7 percent. In fact, property prices in Bengaluru fell by almost 5 percent in the second half of 2017, said a study published by property consultancy Knight Frank.
For example, for a potential homeowner, over 9,000 apartment projects and flats for sale are available in the range of ₹42-52 lakh, followed by over 7,100 apartments that are in the ₹52-62 lakh budget segment, says a report by property website Makaan. According to the study, there are over 5,000 projects in the ₹15-25 lakh budget segment followed by those in the ₹34-43 lakh budget category.

Buying a home, especially in a city like Bengaluru, is a tricky choice. While the major factors are usually the same for all metros, there are others to be considered for the Silicon Valley of India. With its millennial crowd, vibrant culture, great climate and a slew of job opportunities, it is difficult to ascertain the price of a house in Bengaluru.

In [2]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the raw Bengaluru housing CSV from the Kaggle input directory
# (path is relative to the notebook's working directory).
df = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

In [4]:
df.info()

In [5]:
df.duplicated().sum()

In [6]:
# Remove fully duplicated rows; the original index labels are kept.
df = df.drop_duplicates()

In [7]:
# Impute missing values.
#
# Categorical columns are filled BEFORE the cast to str: `astype(str)` turns
# NaN into the literal string 'nan', after which `fillna` finds nothing to
# fill. `mode()` also returns a Series, so handing it to `fillna` aligns on
# the index instead of broadcasting one value; `.iloc[0]` extracts the most
# frequent value as a scalar.
for col in ['size', 'location']:
    df[col] = df[col].fillna(df[col].mode().iloc[0]).astype(str)

# 'society' gets the explicit 'Others' placeholder -- the same value the
# notebook later substitutes for leftover 'nan' strings in `final`.
# NOTE(review): that later cell also hard-codes '2 BHK' / 'Whitefield' for
# size / location; presumably those are the modes computed above -- confirm.
df['society'] = df['society'].fillna('Others').astype(str)

# Numeric columns: fill gaps with the column median (a scalar, so it broadcasts).
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].median())

In [8]:
df.isna().sum()

In [9]:
# Report the number of distinct values per column (a missing value counts as one).
for col in df.columns:
    print(col, ' : ', df[col].nunique(dropna=False), 'Unique Value')

In [10]:
def total_sqft(x):
    """Convert a raw ``total_sqft`` entry into a single float.

    Every number in the text is extracted, decimals included, then:

    * one number      -> that number            ('1200.5'      -> 1200.5)
    * several numbers -> their arithmetic mean  ('2100 - 2850' -> 2475.0)
    * no number       -> NaN

    Any surrounding text (for example a unit label) is ignored; no unit
    conversion is attempted.

    Parameters
    ----------
    x : str or any value convertible with ``str()``
        The raw cell value.

    Returns
    -------
    float
        The parsed area, or NaN when the text contains no number.
    """
    # Raw-string pattern with an optional fractional part: a bare '\d+' would
    # split '1200.5' into 1200 and 5 and average them.
    numbers = [float(tok) for tok in re.findall(r'\d+(?:\.\d+)?', str(x))]
    if not numbers:
        return float('nan')
    if len(numbers) == 1:
        return numbers[0]
    return np.mean(numbers)

In [11]:
# Parse every raw total_sqft entry into a float (the function is passed directly).
df['total_sqft'] = df['total_sqft'].apply(total_sqft)

In [12]:
# Bar chart of availability labels.
# NOTE(review): `[:50]` slices the first 50 rows BEFORE counting, so only those
# rows are summarised -- confirm this is intended rather than, say, the 50 most
# frequent labels.
df['availability'][:50].value_counts().plot(kind = 'bar')

In [13]:
def finder(text):
    """Collapse an availability label into two classes.

    Returns 'Ready To Move' only for an exact match on that string; every
    other value is reported as 'Not Ready'.
    """
    return 'Ready To Move' if text == 'Ready To Move' else 'Not Ready'

In [14]:
# Reduce availability to the two classes produced by `finder`.
df['availability'] = df['availability'].apply(finder)

# Pass the column as `x=`: recent seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x=df['availability'])

In [15]:
# Frequency of each area_type category as a bar chart.
df['area_type'].value_counts().plot.bar()

In [16]:
# Distribution of log(total_sqft). `histplot(kde=True, stat='density')` is the
# replacement for the deprecated `distplot` (histogram + KDE on a density scale).
sns.histplot(x=np.log(df['total_sqft']), kde=True, stat='density')

In [17]:
# Distribution of log(bath). `histplot(kde=True, stat='density')` is the
# replacement for the deprecated `distplot` (histogram + KDE on a density scale).
sns.histplot(x=np.log(df['bath']), kde=True, stat='density')

In [18]:
# Horizontal box plot of balcony counts. The column is passed as `x=` because
# recent seaborn releases treat the first positional argument as `data`.
sns.boxplot(x=df['balcony'])

In [19]:
# Distribution of log(price). `histplot(kde=True, stat='density')` is the
# replacement for the deprecated `distplot` (histogram + KDE on a density scale).
sns.histplot(x=np.log(df['price']), kde=True, stat='density')

In [20]:
# Replace the three numeric columns by their natural log.
# This overwrites df in place, so re-running the cell applies the log again.
for log_col in ('total_sqft', 'bath', 'price'):
    df[log_col] = np.log(df[log_col])

In [22]:
pip install feature_engine

In [23]:
from feature_engine.outliers import Winsorizer

# Cap values outside 1.5 * IQR on both tails for the log-scaled columns.
# NOTE(review): the caps are fitted on the full frame (before the later
# train/test split) and also applied to the target 'price' -- confirm intended.
winsor_cols = ['total_sqft', 'bath', 'price']
w = Winsorizer(
    capping_method='iqr',
    tail='both',
    fold=1.5,
    variables=winsor_cols,
)
winsor_df = w.fit_transform(df[winsor_cols])

In [24]:
winsor_df.boxplot()

In [25]:
# Remove the un-capped numeric columns from df; their winsorized versions live
# in winsor_df. In-place, so running this cell twice raises a KeyError.
df.drop(columns=['total_sqft', 'bath', 'price'], inplace=True)

In [26]:
# Put the winsorized columns back next to the remaining df columns.
# concat along axis=1 aligns rows on the index; winsor_df was built from df,
# so the labels are presumably identical -- verify no NaN rows appear.
final = pd.concat([df, winsor_df], axis = 1)

In [27]:
final

In [28]:
# Replace the literal string 'nan' (produced by the earlier astype(str) cast)
# with a fixed fallback value per column.
nan_fallbacks = {
    'size': '2 BHK',
    'location': 'Whitefield',
    'society': 'Others',
}
for column, fallback in nan_fallbacks.items():
    final[column] = np.where(final[column] == 'nan', fallback, final[column])

In [29]:
final

In [30]:
# Wide figure so the society names on the x-axis stay readable.
plt.figure(figsize=(15, 7))
# NOTE(review): `[:50]` takes the first 50 rows before counting, so the chart
# only describes those rows -- confirm this is the intended sample.
final['society'][:50].value_counts().plot(kind = 'bar')

In [31]:
# Bar chart of how often each raw size label occurs, on a wide canvas.
fig, ax = plt.subplots(figsize=(15, 7))
final['size'].value_counts().plot(kind='bar', ax=ax)

In [32]:
final['size'].unique()

In [33]:
def bhk(text):
    """Bucket a raw ``size`` label by its leading room count.

    The first integer in the label decides the bucket, so every spelling
    ('1 BHK', '1 RK', '2 Bedroom', ...) is handled the same way:

    * 2 or fewer -> 'One or Two BHK'
    * exactly 3  -> 'Three BHK'
    * 4 or more  -> 'More than 3 BHK'

    Labels that do not start with a number fall into 'More than 3 BHK',
    which is what the previous explicit-match version returned for any
    unrecognised label.

    Parameters
    ----------
    text : str or any value convertible with ``str()``
        Raw size label such as '2 BHK' or '4 Bedroom'.

    Returns
    -------
    str
        One of the three bucket names above.
    """
    match = re.match(r'\s*(\d+)', str(text))
    if match is None:
        return 'More than 3 BHK'

    rooms = int(match.group(1))
    if rooms <= 2:
        # Previously '1 BHK' was missing from the explicit list and ended up
        # in the 'More than 3 BHK' bucket.
        return 'One or Two BHK'
    if rooms == 3:
        return 'Three BHK'
    return 'More than 3 BHK'

In [34]:
# Collapse the raw size labels into the coarse buckets defined by `bhk`.
final['size'] = final['size'].map(bhk)

In [35]:
# Count of rows per size bucket. The column is passed as `x=` because recent
# seaborn releases treat the first positional argument as `data`.
sns.countplot(x=final['size'])

In [36]:
final.info()

In [37]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummy columns are not perfectly collinear.
categorical_cols = ['area_type', 'availability', 'location', 'size', 'society']
final_dummy = pd.get_dummies(
    final,
    columns=categorical_cols,
    drop_first=True,
)

In [38]:
# Features: every column except the target.
x = final_dummy.drop(columns=['price'])
# Target as a 1-D Series. A one-column DataFrame makes scikit-learn estimators
# emit a DataConversionWarning (hidden here by the global warnings filter) and
# makes some of them return 2-D predictions.
y = final_dummy['price']

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

In [40]:
from sklearn.svm import SVR
from xgboost import XGBRegressor 
from sklearn.linear_model import Ridge 
from sklearn.ensemble import  RandomForestRegressor, BaggingRegressor

In [41]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every score reported below -- reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

xtrain.shape, xtest.shape

In [42]:
# Bagging ensemble with default base estimator. The seed fixes the bootstrap
# samples so the reported scores can be reproduced.
bagg = BaggingRegressor(random_state=42)

bagg.fit(xtrain, ytrain)

ypred = bagg.predict(xtest)

# Scores are on the log-price scale used for the target.
print('R2 score: {:.4f}'.format(r2_score(ytest, ypred)))
print('Mean Absolute Error: {:.4f}'.format(mean_absolute_error(ytest, ypred)))

In [43]:
# Random forest with default hyper-parameters. The seed fixes bootstrap and
# feature sampling so the reported scores can be reproduced.
ranf = RandomForestRegressor(random_state=42)

ranf.fit(xtrain, ytrain)

ypred = ranf.predict(xtest)

# Scores are on the log-price scale used for the target.
print('R2 score: {:.4f}'.format(r2_score(ytest, ypred)))
print('Mean Absolute Error: {:.4f}'.format(mean_absolute_error(ytest, ypred)))

In [44]:
# Support vector regression with default settings, scored on the held-out rows.
svr = SVR()
svr.fit(xtrain, ytrain)
ypred = svr.predict(xtest)

print(f'R2 score: {r2_score(ytest, ypred):.4f}')
print(f'Mean Absolute Error: {mean_absolute_error(ytest, ypred):.4f}')

In [45]:
# Ridge regression with default regularisation, scored on the held-out rows.
ridge = Ridge()
ridge.fit(xtrain, ytrain)
ypred = ridge.predict(xtest)

print(f'R2 score: {r2_score(ytest, ypred):.4f}')
print(f'Mean Absolute Error: {mean_absolute_error(ytest, ypred):.4f}')

In [46]:
# Gradient-boosted trees (XGBoost) with default settings, scored on the held-out rows.
xgb = XGBRegressor()
xgb.fit(xtrain, ytrain)
ypred = xgb.predict(xtest)

print(f'R2 score: {r2_score(ytest, ypred):.4f}')
print(f'Mean Absolute Error: {mean_absolute_error(ytest, ypred):.4f}')

**Best accuracy (R2 score) by Ridge Regressor: 83%**