#### This is a realistic housing dataset obtained from Kaggle. We focus on feature engineering and the necessary data cleaning, followed by applying various machine learning models to perform the regression.

https://www.kaggle.com/harlfoxem/housesalesprediction

Feature Columns:
    
* id - Unique ID for each home sold
* date - Date of the home sale
* price - Price of each home sold
* bedrooms - Number of bedrooms
* bathrooms - Number of bathrooms, where .5 accounts for a room with a toilet but no shower
* sqft_living - Square footage of the apartment's interior living space
* sqft_lot - Square footage of the land space
* floors - Number of floors
* waterfront - A dummy variable for whether the apartment was overlooking the waterfront or not
* view - An index from 0 to 4 of how good the view of the property was
* condition - An index from 1 to 5 on the condition of the apartment
* grade - An index from 1 to 13, where 1-3 falls short of building construction and design, 4-7 has an average level of construction, 8-10 has good quality design, and 11-13 have a high quality level of construction and design.
* sqft_above - The square footage of the interior housing space that is above ground level
* sqft_basement - The square footage of the interior housing space that is below ground level
* yr_built - The year the house was initially built
* yr_renovated - The year of the house’s last renovation
* zipcode - What zipcode area the house is in
* lat - Latitude
* long - Longitude
* sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors
* sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors


The task is to predict the price of a new house in the given area.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Read the King County house-sales CSV and preview the first rows.
data_path = '../../DATA/kc_house_data.csv'
housing = pd.read_csv(data_path)
housing.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [None]:
def add_grade_string(x):
    """Translate a numeric building grade into a coarse quality label.

    Grades 1-3 map to 'Poor', 4-7 to 'Average' and 8-10 to 'Good'.
    Any other value falls through to 'Best'.
    """
    grade_bands = (
        ((1, 2, 3), 'Poor'),
        ((4, 5, 6, 7), 'Average'),
        ((8, 9, 10), 'Good'),
    )
    for grades, label in grade_bands:
        if x in grades:
            return label
    # Everything not listed above (including 11-13) is labelled 'Best'.
    return 'Best'

# Encode the grade bands as ordinal integers: Poor=0 < Average=1 < Good=2 < Best=3.
# Going through `.astype('category').cat.codes` would number the labels
# alphabetically (Average=0, Best=1, Good=2, Poor=3), which scrambles the quality
# ordering that the correlation filter and the regressors depend on.
# NOTE: this overwrites the raw grades in place, so run this cell only once per
# freshly loaded `housing` frame.
grade_codes = {'Poor': 0, 'Average': 1, 'Good': 2, 'Best': 3}
housing['grade'] = (
    housing['grade']
    .apply(add_grade_string)
    .map(grade_codes)
    .astype('int8')  # same dtype that `.cat.codes` produced
)
housing.loc[:, ['grade']].head()

In [None]:
# Replace the view index with its integer category codes.
housing['view'] = housing['view'].astype('category').cat.codes

In [None]:
# Correlation of every numeric column with the sale price.
# `date` is still a string column at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict to numeric columns first.
housing.select_dtypes(include='number').corr()['price']

In [None]:
# Show the current column labels of `housing`.
housing.columns

In [None]:
# Collapse the renovation year into a 0/1 flag before the raw year column is dropped.
housing['renovated'] = (housing['yr_renovated'] != 0).astype('int64')

# Columns that are not used as model inputs.
remove_feature = ['id', 'date', 'zipcode', 'yr_built', 'yr_renovated']
housing.drop(columns=remove_feature, inplace=True)
housing.columns

In [None]:
# Keep only the columns whose correlation with price exceeds 0.32
# (price itself correlates 1.0 with price, so it is retained as well).
price_corr = housing.corr()['price']
filter_features = price_corr > 0.32
principal_features = filter_features[filter_features].index.tolist()
len(principal_features)

In [None]:
# Annotated correlation matrix of the retained features.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(housing[principal_features].corr(), annot=True, ax=ax);

In [None]:
# Work on an independent copy restricted to the selected features.
df = housing.loc[:, principal_features].copy()
print(df.shape, df.dtypes, sep='\n')
df.head()

#### Scaling and Train Test Split

In [None]:
# Target vector (sale price) and feature matrix (everything else), as NumPy arrays.
y = df['price'].to_numpy()
X = df.drop(columns='price').to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# `train_test_split` is already imported in the notebook's import cell, so it is
# not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

##### Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max from the training features only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# The target (y_train / y_test) is left unscaled.

#### Creating a Model

In [None]:
def regressor(string):
    """Fit a default-configured regressor and score it on the held-out split.

    Parameters
    ----------
    string : str
        Name of an estimator class available in the notebook namespace,
        e.g. ``'LinearRegression'``.

    Returns
    -------
    tuple
        ``(y_pred, rmse, r2)``: the predictions for ``X_test``, the root mean
        squared error and the R^2 score, both measured against ``y_test``.

    Raises
    ------
    NameError
        If ``string`` does not name a callable in the notebook namespace.
    """
    # Resolve the class by name instead of eval()-ing the string, so only
    # existing names can be instantiated, never arbitrary expressions.
    estimator_cls = globals().get(string)
    if not callable(estimator_cls):
        raise NameError('Unknown regressor: {!r}'.format(string))

    model = estimator_cls()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). r2_score is not symmetric,
    # so passing the arguments swapped reports a different (wrong) score.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    return (y_pred, rmse, r2)

In [None]:
models = ['LinearRegression', 'DecisionTreeRegressor', 'RandomForestRegressor']

# Train each model exactly once and unpack all three results from that single
# fit. Calling regressor() separately per metric would retrain the model each
# time, and the unseeded tree/forest models would then report a prediction,
# RMSE and R^2 that come from three different fits.
predicted_values = {}
for model_name in models:
    y_pred, rmse, r2 = regressor(model_name)
    predicted_values[model_name] = {
        'prediction': y_pred,
        'rmse': rmse,
        'r2 score': r2,
    }

for model_name in models:
    print('{}: RMSE= {}, r2 score= {}\n'.format(model_name, predicted_values[model_name]['rmse'], predicted_values[model_name]['r2 score']))

# Put the error magnitudes in context of typical sale prices.
print('Compare the errors with actual value: \nMean price: {}, Median price: {}'.format(housing['price'].mean(), housing['price'].median()))

In [None]:
# Summary statistics of the sale price, for comparison with the model errors.
housing['price'].describe()

## Building a Neural network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [None]:
# Small fully connected network: three ReLU hidden layers (8, 4, 2 units)
# followed by a single linear output unit for the predicted price.
model = Sequential([
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='relu'),
    Dense(1),
])

# Mean squared error loss, optimised with Adam.
model.compile(optimizer='adam', loss='mse')

### Training the Model

In [None]:
# Train for 200 epochs in mini-batches of 128; the test split is passed as
# validation data so its loss is tracked after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=200,
)

In [None]:
# Per-epoch metrics recorded by the last fit() call, plotted as curves.
history_dict = model.history.history
losses = pd.DataFrame(history_dict)
losses.plot(grid=True)

### Evaluation on Test Data

In [None]:
# Root mean squared error of the network on the test split.
predictions = model.predict(X_test)
test_mse = mean_squared_error(y_test, predictions)
np.sqrt(test_mse)

In [None]:
# Residuals on the test split. y_test is reshaped into a column so the
# subtraction is element-wise against `predictions`; reshape(-1, 1) infers the
# row count instead of hard-coding 6480, so the cell keeps working if the
# dataset or test_size changes.
errors = y_test.reshape(-1, 1) - predictions

# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation draws the equivalent figure.
sns.histplot(errors.ravel(), kde=True, stat='density')