In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as pt
%matplotlib inline
import matplotlib
# Default figure size (width, height) applied to the plots drawn in this notebook.
matplotlib.rcParams["figure.figsize"] = (20,10)


In [None]:
# Load the raw dataset from data.csv (relative to the working directory) and preview the first rows.
df1 = pd.read_csv("data.csv")
df1.head()

In [None]:
# (rows, columns) of the raw frame.
df1.shape


In [None]:
# Number of rows recorded for each area_type value.
df1.groupby('area_type')['area_type'].count()

In [None]:
# Keep only the columns used from here on; these four are dropped and not used again.
df2 = df1.drop(columns=['area_type', 'society', 'balcony', 'availability'])
df2.head()

In [None]:
# Number of rows per location.
df2.groupby('location')['location'].count()

In [None]:
# Count the missing (null) values in each column.
df2.isnull().sum()

In [None]:
# Drop the rows that contain missing values, then check how many rows remain.
df3 = df2.dropna()
df3.shape

In [None]:
# Distinct raw values of the size column.
df3['size'].unique()

In [None]:
# Parse the leading integer of `size` (its first space-separated token) into a numeric
# `bedrooms` column. `assign` returns a new frame, so nothing is written in place into the
# result of `dropna()` -- the in-place write is what raises pandas' SettingWithCopyWarning.
df3 = df3.assign(bedrooms=df3['size'].apply(lambda x: int(x.split(' ')[0])))
df3.head()

In [None]:
# Distinct bedroom counts after parsing.
df3['bedrooms'].unique()

In [None]:
# Rows that report more than 20 bedrooms.
df3[df3.bedrooms>20]

In [None]:
# Distinct raw values of the total_sqft column.
df3['total_sqft'].unique()

In [None]:
# Some total_sqft entries are ranges or other non-numeric text; this helper is used to find them.
def isfloat(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a float"; unrelated exceptions
    (e.g. KeyboardInterrupt) are allowed to propagate instead of being swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# Show the first 10 rows whose total_sqft cannot be parsed as a float.
df3.loc[~df3['total_sqft'].map(isfloat)].head(10)

In [None]:
def convert_to_float(x):
    """Convert a total_sqft entry to a float.

    A value of the form ``"low-high"`` is converted to the midpoint of the two
    numbers. Any other value is passed to ``float``. Returns ``None`` when the
    value cannot be interpreted either way.
    """
    parts = str(x).split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. "-5" or "1e-3"); fall through to a plain parse.
            pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
# Quick check on a range-style value.
convert_to_float('2932-2850')

In [None]:
# Work on a copy of df3 and replace each raw total_sqft entry with its numeric value
# (convert_to_float returns None for entries it cannot parse).
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_to_float)
df4.head(3)

In [None]:
# `price` is expressed in lakhs (the plots and the prediction output label it "lacs"), and
# 1 lakh = 100,000 rupees, so scale by 100,000 to get the price per square foot in rupees.
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()

In [None]:
# location is a categorical feature, and categorical features are usually one-hot encoded:
# one column per category, holding 1 for rows in that category and 0 for all others.
# With many distinct locations that means many columns.
df5['location'].nunique(dropna=False)
# 1304 columns is too many features (the curse of dimensionality)

In [None]:
# Solution: fold rarely-seen locations into a single "other" category.
df5['location'] = df5['location'].str.strip()  # trim leading/trailing whitespace
location_stats = (
    df5.groupby('location')['location']
       .count()
       .sort_values(ascending=False)
)
location_stats

In [None]:
# Locations with 10 or fewer rows (note: the threshold is inclusive).
less_than_ten = location_stats.loc[location_stats.le(10)]
less_than_ten

In [None]:
# Relabel every location listed in less_than_ten as 'others', then count the remaining categories.
df5['location'] = df5['location'].mask(df5['location'].isin(less_than_ten.index), 'others')
df5['location'].nunique(dropna=False)

In [None]:
# Remove outliers.
# Techniques: standard deviation, domain knowledge.
# With domain knowledge we pick a threshold and check rows against it; here the threshold is
# 300 sqft per bedroom, so any home with less than that per bedroom is treated as an outlier.
df5[df5.total_sqft/df5.bedrooms< 300].head()
# e.g. a house with 8 bedrooms in only 600 sqft is implausible, so it is an outlier

In [None]:
# Drop the rows with less than 300 sqft per bedroom.
# The test is negated (rather than written as `>= 300`), so rows where the ratio is
# missing compare False, are negated to True, and are therefore kept at this step.
df6 = df5[~(df5.total_sqft/df5.bedrooms< 300)]
df6.shape

In [None]:
# Summary statistics of price_per_sqft before the per-location outlier removal.
df6.price_per_sqft.describe()

In [None]:
# Remove price_per_sqft outliers per location using the standard deviation.
def remove_outliers(df):
    """Keep, for each location, the rows within one standard deviation of the mean price_per_sqft.

    For every ``location`` group the mean and population standard deviation
    (``ddof=0``) of ``price_per_sqft`` are computed, and rows with
    ``mean - std < price_per_sqft <= mean + std`` are kept (lower bound
    exclusive, upper bound inclusive).

    Returns a new DataFrame with a fresh 0..n-1 index, groups ordered as
    ``groupby`` yields them. An input with no groups yields an empty frame
    that keeps the input's columns.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        mean = pps.mean()
        std = pps.std(ddof=0)
        kept.append(subdf[(pps > (mean - std)) & (pps <= (mean + std))])
    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
# Apply the per-location price_per_sqft filter and check how many rows remain.
df7 = remove_outliers(df6)
df7.shape

In [None]:
# Within one location, some homes with fewer bedrooms cost more than homes with more
# bedrooms at the same total_sqft; this plot makes that visible.
def scatter_plot(df,location):
    """Scatter price against total_sqft for the 2- and 3-bedroom rows of one location."""
    in_location = df[df.location == location]
    matplotlib.rcParams['figure.figsize'] = (15,10)
    series_specs = (
        (2, {'color': 'blue', 'label': '2 Bedroom', 's': 50}),
        (3, {'marker': '+', 'color': 'green', 'label': '3 Bedroom', 's': 50}),
    )
    for bedroom_count, style in series_specs:
        subset = in_location[in_location.bedrooms == bedroom_count]
        pt.scatter(subset.total_sqft, subset.price, **style)
    pt.xlabel('total squre_feet')
    pt.ylabel('price in lacs')
    pt.title(location)
    pt.legend()
    
# Plot the 2- and 3-bedroom homes of one location before the bedroom-based outlier removal.
scatter_plot(df7,"Hebbal")

In [None]:
# Remove homes whose price_per_sqft is below the mean price_per_sqft of the homes
# with one fewer bedroom in the same location.
def remove_bhk_outliers(df):
    """Drop rows priced below the mean of the next-smaller bedroom group in their location.

    Within each ``location``, a row with ``bedrooms == n`` is dropped when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the rows with
    ``bedrooms == n - 1``, provided that smaller group has more than 5 rows.

    Returns a new DataFrame; the original index labels of the surviving rows are kept.
    """
    drop_labels = []  # original index labels, collected as-is (no float coercion)
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bedrooms'))
        # bedrooms -> (mean price_per_sqft, row count) for this location
        bhk_stats = {
            bhk: (bhk_df.price_per_sqft.mean(), bhk_df.shape[0])
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            smaller = bhk_stats.get(bhk - 1)
            if smaller is None:
                continue
            smaller_mean, smaller_count = smaller
            if smaller_count > 5:
                drop_labels.extend(bhk_df[bhk_df.price_per_sqft < smaller_mean].index)
    return df.drop(index=drop_labels)

# Apply the bedroom-based filter and check how many rows remain.
df8 = remove_bhk_outliers(df7)
df8.shape


In [None]:
# In this plot most of the green plus markers that sat below the blue dots at the same
# total_sqft are gone.
# scatter_plot is already defined in the earlier plotting cell; an identical second
# definition only shadows it, so this cell just calls it on the filtered frame.
scatter_plot(df8,"Hebbal")

In [None]:
# Histogram of price_per_sqft after the outlier filters.
matplotlib.rcParams['figure.figsize'] = (20,10)
pt.hist(df8.price_per_sqft,rwidth= 0.8)
pt.xlabel('price per sqft')
pt.ylabel('count')

In [None]:
# Bathroom outliers: start by looking at the distribution of bathroom counts.
matplotlib.rcParams['figure.figsize'] = (20,10)
pt.hist(df8.bath,rwidth= 0.8)
pt.xlabel('bath')
pt.ylabel('count')

In [None]:
# Domain-knowledge threshold: keep only the homes with fewer than bedrooms + 2 bathrooms,
# i.e. remove every home where bath >= bedrooms + 2.
df9 = df8[(df8.bath)< (df8.bedrooms+2)]

In [None]:
# (rows, columns) after the bathroom filter.
df9.shape


In [None]:
# Preview the columns that will be used for training: `size` is redundant now that we have
# `bedrooms`, and `price_per_sqft` was only needed for outlier detection.
# The result is not assigned (the actual drop happens when df10 is built), so show only the
# first rows instead of rendering the whole frame.
df9.drop(['size','price_per_sqft'],axis = 'columns').head()

In [None]:
# A machine-learning model cannot work with text directly, so the location strings are
# converted to numeric indicator columns (one-hot encoding).
dummies = pd.get_dummies(df9.location)

dummies.head(10)


In [None]:
# Leave out one dummy column ('others') to avoid the dummy-variable trap, then discard the
# columns that are not used for training.
df10 = (
    pd.concat([df9, dummies.drop(columns='others')], axis=1)
      .drop(columns=['location', 'size', 'price_per_sqft'])
)

In [None]:
# df10 now holds only the feature columns and the price label.
df10.head(10)

In [None]:
# Separate the independent variables (features) from the dependent variable (price).
X = df10.drop(columns='price')
X.head()



In [None]:
# Target variable: the price column.
y = df10.price
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.2,random_state =10)

In [None]:
# Fit a linear regression model on the training split and report its score on the
# held-out test split.
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

In [None]:
# Cross-validate with ShuffleSplit: 5 shuffled train/test splits, each holding out 20% of the
# rows (repeated random splits rather than a strict k-fold partition).
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
cross_val_score(LinearRegression(),X,y,cv=cv)

In [None]:
def predict_price(location,sqft,bath,bedrooms):
    """Predict the price of a home with the fitted ``lr_clf`` model.

    Builds a single feature row laid out like ``X``: the first three positions
    hold ``sqft``, ``bath`` and ``bedrooms``, and the dummy column named
    ``location`` (if any) is set to 1.

    A location with no matching column in ``X`` leaves every location dummy
    at 0 instead of raising IndexError; that all-zero encoding is the same one
    used for the 'others' category, whose dummy column was dropped.

    Returns the model's prediction for that row.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bedrooms
    matches = np.where(X.columns == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # Pass a frame carrying X's column names so the input matches what the model was fitted on.
    row = pd.DataFrame([x], columns=X.columns)
    return lr_clf.predict(row)[0]

In [None]:
# Example prediction: 1000 sqft, 2 bathrooms, 2 bedrooms in '1st Phase JP Nagar'.
price = predict_price('1st Phase JP Nagar',1000,2,2)

print(price,"lacs")

In [None]:
import pickle
import json

# Persist the fitted model to the file 'prediction'.
with open('prediction','wb') as model_file:
    pickle.dump(lr_clf, model_file)

# Persist the lower-cased feature names, in the column order of X, to columns.json.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json","w") as columns_file:
    json.dump(columns, columns_file)