<a href="https://colab.research.google.com/github/chakravarthipothureddy/chakravarthi.pothureddy/blob/main/Housing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Set Characteristics:

##### Number of Instances:
 	
20640

##### Number of Attributes:
 	
8 numeric, predictive attributes and the target

##### Attribute Information:
 	
* MedInc median income in block
* HouseAge median house age in block
* AveRooms average number of rooms
* AveBedrms average number of bedrooms
* Population block population
* AveOccup average house occupancy
* Latitude house block latitude
* Longitude house block longitude

#### Target

The target variable is the median house value in units of 100,000 for California districts.

#### Documentation 
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html#examples-using-sklearn-datasets-fetch-california-housing

In [None]:
# conda install -c conda-forge tensorflow, to install tensorflow package

In [None]:
# Import Required Libraries:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# To represent the results in a data frame:
from sklearn import metrics
import numpy as np
scores = pd.DataFrame(columns=['Model','MAE_Train','MSE_Train','RMSE_Train','MAPE_Test','MAE_Test','MSE_Test','RMSE_Test','MAPE_Test'])

def get_metrics(train_act,train_pred,test_act,test_pred,model_description,dataframe):
    MAE_Train = metrics.mean_absolute_error(train_act,train_pred)
    MSE_Train = metrics.mean_squared_error(train_act,train_pred)
    RMSE_Train = np.sqrt(metrics.mean_squared_error(train_act,train_pred))
    MAPE_Train = metrics.mean_absolute_percentage_error(train_act,train_pred)
    MAE_Test = metrics.mean_absolute_error(test_act,test_pred)
    MSE_Test = metrics.mean_squared_error(test_act,test_pred)
    RMSE_Test = np.sqrt(metrics.mean_squared_error(test_act,test_pred))
    MAPE_Test = metrics.mean_absolute_percentage_error(test_act,test_pred)
       
    dataframe=dataframe.append(pd.Series([model_description,MAE_Train,MSE_Train,RMSE_Train,MAPE_Train,MAE_Test,MSE_Test,RMSE_Test,MAPE_Test],index=scores.columns ), ignore_index=True)
    
    return(dataframe)

In [None]:
# Library to build Neural Networks:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Fetching the data from sklearn:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [None]:
housing.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [None]:
# converting the data into a data frame:

In [None]:
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df["target"] = housing.target

In [None]:
# To display first five records:

In [None]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Assigning names to columns:

# 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'

In [None]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'target'],
      dtype='object')

In [None]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [None]:
# Splitting the data:

In [None]:
x = df.drop('target',axis =1)
y = df['target']

In [None]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [None]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: target, Length: 20640, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state = 1)
x_train, x_val, y_train, y_val=train_test_split(x_train, y_train, test_size=0.25,random_state=1)

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_test.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(12384, 8)
(4128, 8)
(4128,)
(12384,)
(4128, 8)
(4128,)


In [None]:
x_val

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
4686,3.3594,45.0,3.823126,1.055365,3359.0,1.645762,34.07,-118.35
14481,9.5245,23.0,7.607187,1.023544,2036.0,2.522924,32.82,-117.25
1102,2.4028,17.0,31.777778,9.703704,47.0,1.740741,40.06,-121.54
10662,5.8752,20.0,6.157667,1.058315,1192.0,2.574514,33.66,-117.81
11906,3.4044,48.0,5.093085,0.925532,1060.0,2.819149,33.97,-117.39
...,...,...,...,...,...,...,...,...
1452,5.0092,24.0,6.487805,0.892683,656.0,3.200000,37.97,-121.97
20283,3.2196,31.0,4.293160,1.039088,2695.0,4.389251,34.18,-119.18
16084,5.1991,39.0,6.309446,1.094463,742.0,2.416938,37.73,-122.49
20483,5.5955,18.0,5.881266,0.978892,1283.0,3.385224,34.28,-118.72


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Standardize the train data 
x_train = scaler.fit_transform(x_train)
# Standardize the validation data 
x_valid = scaler.fit_transform(x_val)
# Standardize the test data
x_test =  scaler.fit_transform(x_test)

In [None]:
x_train.shape

(12384, 8)

In [None]:
# print the splitting of data:
x_train,x_test,x_valid

(array([[ 0.15127046, -0.76219449,  0.26464844, ..., -0.01433711,
         -0.46720479,  0.69282843],
        [-1.78923511, -1.47770514,  0.47577811, ..., -0.07583071,
         -0.43917997,  1.33076719],
        [-0.96902743, -0.12618502, -0.6654751 , ...,  0.05434519,
         -0.86422307,  0.65295726],
        ...,
        [ 0.10638426, -0.84169567, -0.54158532, ..., -0.03393509,
         -0.80817343,  0.60311829],
        [-0.82407579,  0.98683155, -0.02656195, ..., -0.02934192,
          0.53701791, -0.10957892],
        [ 0.89719838, -1.55720633,  0.65901278, ...,  0.06608434,
         -0.75212379,  0.97192664]]),
 array([[-0.3098735 ,  0.81226638, -0.48452945, ..., -1.03109037,
         -0.73685251,  0.59953305],
        [-0.96796106,  0.65331708, -0.21646778, ..., -0.31360171,
          0.53929953, -0.11505424],
        [ 0.08510325,  1.36858896, -0.49718178, ...,  0.69385846,
          0.9850144 , -1.44358273],
        ...,
        [ 0.62064037, -0.22090411, -0.2101633 , ...,  

In [None]:
### MODEL BUILDING

In [None]:
## Linear Regression Model:

In [None]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(x_train,y_train)

LinearRegression()

In [None]:
y_pred = regression.predict(x_test)

In [None]:
y_pred

array([2.22070901, 0.99624459, 2.71906848, ..., 2.91339015, 2.6279541 ,
       1.48098913])

In [None]:
scores=get_metrics(y_train,regression.predict(x_train),y_test,regression.predict(x_test),'Lin_model',scores)
scores

Unnamed: 0,Model,MAE_Train,MSE_Train,RMSE_Train,MAPE_Test,MAE_Test,MSE_Test,RMSE_Test,MAPE_Test.1
0,Lin_model,0.532789,0.529195,0.727458,0.315534,0.533631,0.52298,0.723173,0.336336


In [None]:
## Model Building for Random Forest
from sklearn.ensemble import RandomForestRegressor  
Regressor= RandomForestRegressor(n_estimators= 50)  
Regressor.fit(x_train,y_train)

RandomForestRegressor(n_estimators=50)

In [None]:
# To represent the results in a data frame:
from sklearn import metrics
import numpy as np
scores = pd.DataFrame(columns=['Model','MAE_Train','MSE_Train','RMSE_Train','MAPE_Test','MAE_Test','MSE_Test','RMSE_Test','MAPE_Test'])

def get_metrics(train_act,train_pred,test_act,test_pred,model_description,dataframe):
    MAE_Train = metrics.mean_absolute_error(train_act,train_pred)
    MSE_Train = metrics.mean_squared_error(train_act,train_pred)
    RMSE_Train = np.sqrt(metrics.mean_squared_error(train_act,train_pred))
    MAPE_Train = metrics.mean_absolute_percentage_error(train_act,train_pred)
    MAE_Test = metrics.mean_absolute_error(test_act,test_pred)
    MSE_Test = metrics.mean_squared_error(test_act,test_pred)
    RMSE_Test = np.sqrt(metrics.mean_squared_error(test_act,test_pred))
    MAPE_Test = metrics.mean_absolute_percentage_error(test_act,test_pred)
       
    dataframe=dataframe.append(pd.Series([model_description,MAE_Train,MSE_Train,RMSE_Train,MAPE_Train,MAE_Test,MSE_Test,RMSE_Test,MAPE_Test],index=scores.columns ), ignore_index=True)
    
    return(dataframe)

In [None]:
scores=get_metrics(y_train,Regressor.predict(x_train),y_test,Regressor.predict(x_test),'RandomForest',scores)
scores

Unnamed: 0,Model,MAE_Train,MSE_Train,RMSE_Train,MAPE_Test,MAE_Test,MSE_Test,RMSE_Test,MAPE_Test.1
0,RandomForest,0.126524,0.03939,0.198468,0.071134,0.618751,0.747825,0.864769,0.406053


In [None]:
# ANN Model with 2 layers with xx nodes in each layer
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 

In [None]:
model=Sequential()
model.add(Dense(10, input_shape=(8,), activation='sigmoid'))
model.add(Dense(20,activation='sigmoid'))
model.add(Dense(1,activation='linear'))

In [None]:
model.summary

<bound method Model.summary of <keras.engine.sequential.Sequential object at 0x7f0b41a2f210>>

In [None]:
# Compile the model with loss as mean_squared error 
# and Optimizer as SGD 
model.compile(optimizer='SGD',loss='mean_squared_error',metrics=['accuracy'])

In [None]:
model_history = model.fit(x_train, y_train, epochs=20, validation_data=(x_valid, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
scores=get_metrics(y_train,model.predict(x_train),y_test,model.predict(x_test),'ANN_model',scores)
scores

Unnamed: 0,Model,MAE_Train,MSE_Train,RMSE_Train,MAPE_Test,MAE_Test,MSE_Test,RMSE_Test,MAPE_Test.1
0,RandomForest,0.126524,0.03939,0.198468,0.071134,0.618751,0.747825,0.864769,0.406053
1,ANN_model,0.48835,0.45039,0.671111,0.286338,0.593673,0.615919,0.784805,0.388079
