In [386]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

In [443]:
# Load the raw listings from the CSV that sits next to the notebook.
df1 = pd.read_csv(filepath_or_buffer='bengaluru_house_prices.csv')
# Preview the first five rows.
df1.head(n=5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [444]:
# Remove the columns that later cells do not use.
df2 = df1.drop(columns=['area_type', 'society', 'balcony', 'availability'])
# NOTE(review): not the last expression of the cell, so this preview is not rendered.
df2.head()
# Count missing values per remaining column.
df2.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [445]:
# Discard every row that has at least one missing value.
df3 = df2.dropna(axis='index', how='any')
# Confirm that no nulls are left.
df3.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [446]:
# Parse the leading integer of `size` (e.g. "2 BHK", "4 Bedroom") into a numeric `bhk` column.
# `assign` returns a new frame instead of writing a column into the result of `dropna()`,
# which is what triggers pandas' SettingWithCopyWarning.
df3 = df3.assign(bhk=df3['size'].apply(lambda x: int(x.split(' ')[0])))
# NOTE(review): not the last expression of the cell, so this filter result is not rendered.
df3[df3.bhk > 20]
# Distinct raw `total_sqft` values; they are still strings at this point.
df3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [447]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a float". Other exceptions
    (e.g. KeyboardInterrupt) propagate instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
        # OverflowError: an int too large to be represented as a float.
        return False
    return True

# Show the first ten rows whose `total_sqft` cannot be parsed as a float.
df3.loc[~df3['total_sqft'].map(is_float)].head(10)


Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [442]:
# Report (rows, columns) for each stage so far, one per line.
print(df1.shape, df2.shape, df3.shape, sep='\n')

(13320, 9)
(13320, 5)
(13246, 6)


In [454]:
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` value to a float.

    Args:
        x: A number, a numeric string such as ``'1056'``, or a range string
            such as ``'2100 - 2850'``.

    Returns:
        The parsed float; for a two-part ``'low - high'`` range, the mean of
        the two bounds; ``None`` when the value cannot be interpreted
        (e.g. ``'34.46Sq. Meter'``, ``'a - b'``, ``None``).
    """
    # A plain number (or numeric string) converts directly.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can carry the 'low - high' range form.
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Two parts, but at least one of them is not numeric.
            return None
    return None
        
# Build df4 as a copy of df3 whose `total_sqft` holds parsed numbers (or missing values).
df4 = df3.assign(total_sqft=df3['total_sqft'].apply(convert_sqft_to_num))

# NOTE(review): mid-cell expression, so nothing is rendered for this lookup of index label 30.
df4.loc[30]
# Remove the rows whose `total_sqft` could not be parsed.
df4 = df4.dropna(subset=['total_sqft'])
print('shape ku waaa:',df4.shape)

shape ku waaa: (13200, 6)


In [465]:
# Keep rows with at least 300 sqft per bedroom. The explicit `.copy()` makes df5
# an independent frame, so the column assignments below do not write into a
# slice of df4 (SettingWithCopyWarning). It replaces the earlier `df4.copy()`,
# which was immediately overwritten by the filtered slice.
df5 = df4[~(df4.total_sqft/df4.bhk < 300)].copy()

# Normalise location names by trimming surrounding whitespace.
df5['location'] = df5['location'].apply(lambda x: x.strip())

# NOTE(review): the 100000 factor presumably converts `price` from lakhs to rupees — confirm units.
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']

location_stats = df5['location'].value_counts(ascending=False)
# Despite the name, this also includes locations with exactly 10 listings (<= 10).
location_stats_less_than_10 = location_stats[location_stats <= 10]

# `x in <Series>` tests the Series index, which here holds the location names.
df5['location'] = df5['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
print("Tirada 'other' ee df5:", df5[df5.location == 'other'].shape[0])
print("Inta xaafadood ee hadda haray:", len(df5.location.unique())) # Original note: expected to be 240+ — verify against the printed value

Tirada 'other' ee df5: 2714
Inta xaafadood ee hadda haray: 223


In [466]:
def remove_pps_outliers(df):
    """Drop per-location price-per-sqft outliers.

    For every ``location`` group, keeps the rows whose ``price_per_sqft`` lies
    in the half-open interval ``(mean - std, mean + std]``, where ``std`` is
    the population standard deviation (ddof=0) of that group.

    Args:
        df: DataFrame with ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame with the kept rows of all groups, re-indexed from 0.
        An empty DataFrame is returned when ``df`` has no groups.

    Note:
        Because the lower bound is strict, a group whose std is 0 (a single
        row, or identical prices) contributes no rows.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)
        kept_groups.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_groups:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, axis=0, ignore_index=True)

# Apply the per-location price-per-sqft filter.
df6 = remove_pps_outliers(df5)
# The count below is taken from df6, so the label names df6 (it previously said df5).
print("Tirada 'other' ee df6:", df6[df6.location == 'other'].shape[0])

print(df6.shape)

Tirada 'other' ee df5: 2437
(10270, 7)


In [469]:
def remove_bhk_outliers(df):
    """Drop rows priced below the mean of the next-smaller bhk in the same location.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows of that location, provided that the ``n - 1`` group
    has more than 5 rows.

    Args:
        df: DataFrame with ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame without the excluded rows; the original index labels
        are preserved.
    """
    # Labels are gathered in a plain list so they keep their original type
    # (accumulating them in a float array would coerce integer labels to float).
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                cheaper = bhk_df[bhk_df.price_per_sqft < stats['mean']]
                exclude_labels.extend(cheaper.index.tolist())
    return df.drop(exclude_labels, axis='index')

# Apply the bhk-based filter to the pps-filtered frame.
df7 = remove_bhk_outliers(df6)
# Overview of the data that remains.
print("Wadarta guud ee xogta hartay:", df7.shape)
print(df7['location'].value_counts().head(5))

Wadarta guud ee xogta hartay: (7264, 7)
location
other                    1194
Whitefield                241
Sarjapur  Road            191
Electronic City           162
Raja Rajeshwari Nagar     140
Name: count, dtype: int64


In [473]:
# Keep rows whose bathroom count is below bedrooms + 2.
df8 = df7.loc[df7['bath'] < df7['bhk'] + 2]

# Remove the helper columns that are not used as model inputs.
df9 = df8.drop(columns=['size', 'price_per_sqft'])
print("Xogta halkan ku hartay waadf9:", df9.shape)

Xogta halkan ku hartay waadf9: (7188, 5)


In [478]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# One-hot encode the location; the 'other' column is dropped below.
dummies = pd.get_dummies(df9['location'], dtype=int)
df10 = pd.concat([df9, dummies.drop(columns='other')], axis=1)
df11 = df10.drop(columns='location')

# Features are every column except the target `price`.
X = df11.drop(columns='price')
y = df11['price']

# Hold out 20% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

lr = LinearRegression().fit(X_train, y_train)
print(f"Linear Regression Score: {lr.score(X_test, y_test)}")

Linear Regression Score: 0.8595050035383578


In [480]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Five seeded random 80/20 splits over the full feature matrix.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)
cv_results = cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

print(f"Cross Validation Scores: {cv_results}")
print(f"Average CV Score: {cv_results.mean()}")

Cross Validation Scores: [0.859505   0.8316122  0.8735384  0.84487437 0.8551387 ]
Average CV Score: 0.8529337351263481


In [486]:
# Imports needed by this cell (previously listed twice; now one line per module).
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

# Candidate estimators and the hyper-parameter grid searched for each one.
# Lasso (selection='random') and the decision tree (splitter='random') are
# stochastic, so both are seeded to make the comparison repeatable.
algos = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
        }
    },
    'lasso': {
        'model': Lasso(random_state=20),
        'params': {
            'alpha': [1, 2],
            'selection': ['random', 'cyclic']
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(random_state=20),
        'params': {
            'criterion': ['squared_error', 'friedman_mse'],
            'splitter': ['best', 'random']
        }
    }
}

scores = []
# Five seeded random 80/20 splits, shared by every grid search.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=20)

for algo_name, config in algos.items():
    gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
    gs.fit(X, y)
    # Keep only the best configuration found for each algorithm.
    scores.append({
        'model': algo_name,
        'best_score': gs.best_score_,
        'best_params': gs.best_params_
    })

results_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
print(results_df)

               model  best_score  \
0  linear_regression    0.852934   
1              lasso    0.718073   
2      decision_tree    0.698468   

                                         best_params  
0                                                 {}  
1                {'alpha': 2, 'selection': 'random'}  
2  {'criterion': 'friedman_mse', 'splitter': 'ran...  


In [488]:
def predict_price(location, sqft, bath, bhk):
    """Predict the price of a home with the fitted module-level model ``lr``.

    Args:
        location: Location name; it should match one of the one-hot columns of
            the module-level feature frame ``X``. A name with no matching
            column leaves every location indicator at 0 instead of raising
            IndexError.
        sqft: Value for the first feature column.
        bath: Value for the second feature column.
        bhk: Value for the third feature column.

    Returns:
        The first element of ``lr.predict`` for the single assembled row.
    """
    # NOTE(review): assumes the first three columns of X are total_sqft, bath, bhk — confirm.
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    matches = np.where(X.columns == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # Pass a frame that carries X's column names rather than a bare list,
    # so the model receives the same feature names as X.
    return lr.predict(pd.DataFrame([x], columns=X.columns))[0]

# Sample predictions, emitted one per line under a header.
demo_lines = [
    "\n--- Tijaabinta Saadaalinta ---",
    f"1st Phase JP Nagar, 1000sqft, 2 Bath, 2 BHK: {predict_price('1st Phase JP Nagar', 1000, 2, 2)} Lakhs",
    f"1st Phase JP Nagar, 1000sqft, 3 Bath, 3 BHK: {predict_price('1st Phase JP Nagar', 1000, 3, 3)} Lakhs",
    f"Indira Nagar, 1000sqft, 2 Bath, 2 BHK: {predict_price('Indira Nagar', 1000, 2, 2)} Lakhs",
]
print(*demo_lines, sep="\n")


--- Tijaabinta Saadaalinta ---
1st Phase JP Nagar, 1000sqft, 2 Bath, 2 BHK: 87.98172738474646 Lakhs
1st Phase JP Nagar, 1000sqft, 3 Bath, 3 BHK: 91.15890167988199 Lakhs
Indira Nagar, 1000sqft, 2 Bath, 2 BHK: 175.25638385733532 Lakhs


In [490]:
import json
import pickle

# Serialise the fitted model to disk.
with open('banglore_home_prices_model.pickle', mode='wb') as model_file:
    pickle.dump(lr, model_file)

# Store the lower-cased feature column names in their current order.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json", mode="w") as columns_file:
    json.dump(columns, columns_file)