In [135]:
import pandas as pd
import numpy as np

### Reading Data

In [136]:
data = pd.read_csv(r'C:\Users\Saptarshi Majumder\Projects\house_price_predictor\Bengaluru_House_Data.csv')

### Dropping the unnecessary columns

In [137]:
data.drop(columns=['area_type','availability','society','balcony'],inplace = True)

### Filling the missing values

In [138]:
data['location'] = data['location'].fillna('Sarjapur Road')    #filling the missing values of 'location'

In [139]:
data['bath'] = data['bath'].fillna('2.0')    #filling the missing values of 'bath'

In [140]:
data['size'] = data['size'].fillna('2 BHK')     #filling the missing values of 'size'

### Organizing the 'size' and 'BHK' columns

In [141]:
data['BHK'] = data['size'].str.split().str.get(0).astype(int)

In [142]:
 data.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


### Removing the ranged 'total_sqft' values (Eg: 1133 - 1384)

In [143]:
def convertRange_total_sqft(x):
    
    temp = x.split('-')
    
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1]))/2
    
    try:
        return float(x)
    
    except:
        return None

In [144]:
data['total_sqft'] = data['total_sqft'].apply(convertRange_total_sqft)

### Price_per_sqft column 

In [145]:
data['price_per_sqft'] = (data['price'] * 100000) / data['total_sqft']

### Filtering 'location' column

In [146]:
data['location'] = data['location'].apply(lambda x: x.strip())    # removing the ahead and trailing whitespaces 

location_count_below_10 =  data['location'].value_counts()[data['location'].value_counts() <= 10]    # filtering all the location with count <=10 

In [147]:
data['location'] = data['location'].apply(lambda x: 'others' if x in location_count_below_10 else x)

### Outliers detection and removal

In [148]:
data = data[(data['total_sqft'] / data['BHK']) >= 300]

In [149]:
data['price_per_sqft'].describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [150]:
def remove_outliers_sqft(df):
    df_op = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        sd = np.std(subdf.price_per_sqft)
        
        gen_df = subdf[(subdf.price_per_sqft  >  (m-sd)) & (subdf.price_per_sqft   <=   (m+sd))]
        df_op = pd.concat([df_op , gen_df], ignore_index = True)
    return df_op

data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,total_sqft,price,BHK,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,91.286372,2.574896,5659.062876
std,880.694214,86.342786,0.897649,2265.774749
min,300.0,10.0,1.0,1250.0
25%,1110.0,49.0,2.0,4244.897959
50%,1286.0,67.0,2.0,5175.600739
75%,1650.0,100.0,3.0,6428.571429
max,30400.0,2200.0,16.0,24509.803922


In [151]:
def BHK_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        BHK_stats = { }
        for BHK, BHK_df in location_df.groupby('BHK'):
            BHK_stats[BHK] = {
                'mean' : np.mean(BHK_df.price_per_sqft),
                'std' : np.std(BHK_df.price_per_sqft),
                'count' : BHK_df.shape[0]
            }
            
        for BHK, BHK_df in location_df.groupby('BHK'):
            stats = BHK_stats.get(BHK-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, BHK_df[BHK_df.price_per_sqft < (stats['mean'])].index.values)
    return df.drop(exclude_indices, axis = 'index')

In [152]:
data = BHK_outlier_remover(data)

### Dropping unnecessary columns

In [153]:
data.drop(columns=['size', 'price_per_sqft'],inplace = True)         # 'size' and 'price_per_sqft' were necessary only to eliminate the outliers

### Cleaned Data


In [155]:
data.to_csv(r'C:\Users\Saptarshi Majumder\Projects\house_price_predictor\Cleaned_Data.csv')

In [156]:
x = data.drop(columns=['price'])
y = data['price']

### Importing ML libs

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [82]:
x_train , x_test , y_train , y_test = train_test_split(x,y, test_size = 0.2, random_state = 0)

In [95]:
print(x_train.shape)
print(x_test.shape)

(5888, 4)
(1473, 4)


### Linear Regression

In [96]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['location']), remainder='passthrough')

In [123]:
scaler = StandardScaler()

In [125]:
lr = LinearRegression()

In [102]:
pipe = make_pipeline(column_trans, scaler, lr)

In [107]:
pipe.fit(x_train, y_train)



In [108]:
y_pred_lr = pipe.predict(x_test)

In [109]:
r2_score(y_test, y_pred_lr)

0.8234153939332131

### Lasso

In [110]:
lasso = Lasso()

In [111]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [112]:
pipe.fit(x_train, y_train)



In [113]:
y_pred_lasso = pipe.predict(x_test)

In [114]:
r2_score(y_test, y_pred_lasso)

0.8128285650772719

### Ridge

In [115]:
ridge = Ridge()

In [116]:
pipe  = make_pipeline(column_trans, scaler, ridge)

In [117]:
pipe.fit(x_train, y_train)



In [118]:
y_pred_ridge = pipe.predict(x_test)

In [119]:
r2_score(y_test, y_pred_ridge)

0.8234146633312699

In [158]:
print("Linear Regression: ", r2_score(y_test, y_pred_lr))
print("Lasso: ", r2_score(y_test, y_pred_lasso))
print("Ridge: ", r2_score(y_test, y_pred_ridge))

Linear Regression:  0.8234153939332131
Lasso:  0.8128285650772719
Ridge:  0.8234146633312699


In [159]:
import pickle

pickle.dump(pipe, open(r'C:\Users\Saptarshi Majumder\Projects\house_price_predictor\RidgeModel.pkl' , 'wb'))