# PROBLEM STATEMENT





---

* Dataset includes house sale prices for King County in USA. 
* It covers homes sold between May 2014 and May 2015.
* Columns:

> 1. id: unique identifier for a house
1. date: Date house was sold
2. price: Price is prediction target
3. bedrooms: Number of Bedrooms/House
4. bathrooms: Number of bathrooms/House
5. sqft_living: square footage of the home
6. sqft_lot: square footage of the lot
7. floors: Total floors (levels) in house
8. waterfront: House which has a view to a waterfront
9.  view: Has been viewed
10. condition: How good the condition is ( Overall )
11. grade: overall grade given to the housing unit, based on King County grading system
12. sqft_above: square footage of house apart from basement
13. sqft_basement: square footage of the basement
14. yr_built: Built Year
15. yr_renovated: Year when house was renovated
16. zipcode: zip
17. lat: Latitude coordinate
18. long: Longitude coordinate
19. sqft_living15: Living room area in 2015 (implies some renovations)
20. sqft_lot15: Lot size area in 2015 (implies some renovations)




---








STEP 0: IMPORT LIBRARIES

In [None]:
#!pip install tensorflow-gpu==2.0.0.alpha0
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from  datetime import datetime 
from datetime import timedelta
import warnings
import tensorflow as tf
from sklearn.feature_selection import SelectKBest,f_regression
warnings.filterwarnings("ignore")
%matplotlib inline

STEP 1: IMPORT DATASETS

In [None]:
# You will need to mount your drive using the following commands:
# For more information regarding mounting, please check this out: https://stackoverflow.com/questions/46986398/import-data-into-google-colaboratory

# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the raw King County sales data. The path is relative, so the CSV must
# sit in the notebook's working directory (or be replaced with a full path).
df = pd.read_csv('kc_house_data.csv', encoding = 'ISO-8859-1')


In [None]:
# Remove the identifier and sale-date columns so only the remaining columns
# reach the correlation and feature-scoring steps.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns=['id', 'date'], errors='ignore')
df.head()

In [None]:
# Column labels remaining after the drop above
df.columns

In [None]:
# Per-column dtype and non-null count summary
df.info()

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Summary statistics, transposed so each column of df becomes one row
df.describe().T

In [None]:
# Number of columns that contain at least one missing value (0 means none do)
df.isnull().any().sum()

In [None]:
# Sorted correlation heatmap: columns are ordered by increasing correlation
# with price; rows are the original column order, reversed.
corr_sorted = df.corr().sort_values(by='price').T[::-1]

plt.figure(figsize=(12, 6))
sns.heatmap(
    corr_sorted,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
);

In [None]:
# UNIVARIATE SELECTION
# Score every feature against the target with a univariate F-test
# (f_regression). Only the per-feature scores are used below, so the value of
# k does not affect the table.

# Features / target
X = df.drop('price', axis=1)
y = df['price']

model = SelectKBest(score_func=f_regression, k=4)
results = model.fit(X, y)

print(results.scores_)

# One row per feature with its F-score, built directly instead of
# concatenating two single-column frames and renaming afterwards.
scored = pd.DataFrame({"Feature": X.columns, "Score": results.scores_})

# Keep features whose score exceeds 1 and display them from strongest to
# weakest. (The earlier standalone sort of `scored` discarded its result and
# had no effect, so it was removed.)
final_columns = scored[scored.Score > 1]
final_columns.sort_values(by=['Score'], ascending=False)

In [None]:
# Persist the frame to disk. The default index is written as an unnamed first
# column, which appears as 'Unnamed: 0' when the file is read back.
df.to_csv('final_df.csv')

### STEP #3: CREATE TESTING AND TRAINING DATASET/DATA CLEANING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Reload the saved frame and discard the index column that to_csv wrote out
df_final = pd.read_csv('final_df.csv').drop(columns=['Unnamed: 0'])
df_final.head()

In [None]:
# Target is the sale price; every other column is a feature
y = df_final['price']
X = df_final.drop(columns=['price'])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def create_model(neurons):
    """Build an uncompiled fully connected regression network.

    The network stacks three ReLU hidden layers of widths ``neurons``,
    ``neurons // 2`` and ``neurons // 4`` followed by a single output unit with
    no activation. The input width is read from the global ``X_train``.

    Parameters
    ----------
    neurons : int
        Width of the first hidden layer.

    Returns
    -------
    tf.keras.models.Sequential
        The uncompiled model. Its summary is printed as a side effect.
    """
    reg_model = tf.keras.models.Sequential()

    # Input layer sized to the number of feature columns
    reg_model.add(tf.keras.Input(shape=(X_train.shape[1],)))

    # Hidden layers 1-3: each is narrower than the one before it
    hidden_widths = (neurons, neurons // 2, neurons // 4)
    for width in hidden_widths:
        reg_model.add(tf.keras.layers.Dense(width, activation='relu'))

    # Output layer: one unit for the predicted price
    reg_model.add(tf.keras.layers.Dense(1))

    # Print the summary to understand the network flow
    reg_model.summary()

    return reg_model

In [None]:
def create_model_functional(layer_sizes):
    """Build an uncompiled Sequential network from a list of layer widths.

    Every entry except the last becomes a ReLU ``Dense`` layer; the last entry
    becomes a ``Dense`` layer with no activation. The input width is read from
    the global ``X_train``.

    Parameters
    ----------
    layer_sizes : sequence of int
        Widths of the layers in order, e.g. ``[128, 64, 32, 1]``.

    Returns
    -------
    tf.keras.models.Sequential
        The uncompiled model. Its summary is printed as a side effect.
    """
    reg_model = tf.keras.models.Sequential()

    # Input layer sized to the number of feature columns
    reg_model.add(tf.keras.Input(shape=(X_train.shape[1],)))

    # Single pass over the widths: only the final layer is left without an
    # activation, all earlier ones use ReLU.
    final_position = len(layer_sizes) - 1
    for position, width in enumerate(layer_sizes):
        activation = None if position == final_position else "relu"
        reg_model.add(tf.keras.layers.Dense(width, activation=activation))

    # Print the summary to understand the network flow
    reg_model.summary()

    return reg_model

In [None]:
# Quick check of the builder: ReLU layers of 128, 64 and 32 units plus a
# 1-unit output layer; the builder prints the summary itself.
create_model_functional([128,64,32,1])

TRAINING THE MODEL

In [None]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

In [None]:
# Seed TensorFlow's random number generator
seed = 7
tf.random.set_seed(seed)

# Candidate values for the grid search (one value each, i.e. one configuration)
neurons = [64]
learn_rate = [0.001]
batch_size = [50]
epochs = [50]

# scikit-learn compatible wrapper around create_model; loss, optimizer and
# metrics are handed to the wrapper rather than set inside the builder
model = KerasRegressor(
    model=create_model,
    loss="mean_squared_error",
    optimizer="Adam",
    metrics=['mae', 'mse'],
    verbose=1,
)

In [None]:
# Map the candidate lists onto the wrapper's parameter names: `model__*` is
# routed to create_model, `optimizer__*` to the optimizer.
param_grid = dict(optimizer__learning_rate=learn_rate, batch_size=batch_size, epochs= epochs, model__neurons=neurons)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)

# Search on the standardised training split only. Fitting on the full `X, y`
# would (a) let the held-out test rows influence model selection and (b) tune
# the network on raw, unscaled features although the final model is trained on
# the scaled `X_train`.
grid_result = grid.fit(X_train, y_train)

In [None]:
# Summarize results: mean cross-validated score of the best candidate and its
# parameters. No `scoring` was passed to GridSearchCV, so the score is the
# estimator's own default (presumably R^2 for KerasRegressor; confirm).
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Width of the first hidden layer chosen by the grid search
grid_result.best_params_['model__neurons']

In [None]:
# Train one network with the hyper-parameters picked by the grid search
best_params = grid_result.best_params_

Optimizer = tf.keras.optimizers.Adam(
    learning_rate=best_params['optimizer__learning_rate']
)
model = create_model(best_params['model__neurons'])
model.compile(
    optimizer=Optimizer,
    loss='mean_squared_error',
    metrics=['mae', 'mse'],
)

# The test split is passed as validation data, so its loss and metrics are
# reported after every epoch.
epochs_hist = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=best_params['batch_size'],
    epochs=best_params['epochs'],
)

In [None]:
# Loss and compiled metrics of the trained network on the held-out test split
model.evaluate(X_test, y_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

Using Kfold

In [None]:
# define the model
def larger_model():
    """Build and compile the larger regression network used with K-fold.

    Layer order: input, then BatchNormalization -> Dense(relu) -> Dropout(0.2)
    for 128 and for 64 units, then BatchNormalization -> Dense(16, relu), then
    a single-unit output layer. The input width is read from the global
    ``X_train``.

    Returns
    -------
    tf.keras.models.Sequential
        The model compiled with mean-squared-error loss and the 'adam'
        optimizer. Its summary is printed as a side effect.
    """
    layers = tf.keras.layers

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(X_train.shape[1],)))

    # Hidden blocks 1 and 2 share the same normalise / dense / dropout pattern
    for units in (128, 64):
        model.add(layers.BatchNormalization())
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(0.2))

    # Hidden block 3 has no dropout
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(16, activation='relu'))

    # Output layer
    model.add(layers.Dense(1))

    # Print the summary to understand the network flow
    model.summary()
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Evaluate the larger network with 3-fold cross-validation. The scaler lives
# inside the pipeline, so it is fitted on the training part of each fold only.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(model=larger_model, epochs=50, batch_size=5, verbose=1)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=3)
results = cross_val_score(pipeline, X, y, cv=kfold, scoring='neg_mean_squared_error')

# 'neg_mean_squared_error' returns negated MSE values (higher is better), so
# flip the sign of the mean: the figure printed as "MSE" is then the actual,
# positive mean squared error. The standard deviation is unaffected by sign.
print("Larger: %.2f (%.2f) MSE" % (-results.mean(), results.std()))

In [None]:
# NOTE(review): `model` is still the network trained in the grid-search
# section. The K-fold cell only cross-validates `pipeline` and never assigns
# `model`, so this repeats the earlier R^2 instead of scoring the larger
# network. Confirm whether `pipeline` should be fitted and scored here.
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)