# Housing price prediction
In this notebook we train a simple neural network with TensorFlow/Keras to predict a house's price from its other attributes. The dataset has the following columns:
- price (the prediction target)
- area
- bedrooms
- bathrooms
- stories 
- mainroad
- guestroom
- basement
- hotwaterheating
- airconditioning
- parking
- prefarea
- furnishingstatus

Download the Housing Prices dataset from Kaggle and save it as `Housing.csv` in the working directory.

# Importing data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
# Load the housing table from Housing.csv (path is relative to the working
# directory); the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('Housing.csv', encoding='ISO-8859-1')

In [3]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [None]:
# Pairwise scatter/density grid of the numeric columns, coloured by furnishing status.
g = sns.pairplot(df, hue="furnishingstatus")
# Overlay 4-level KDE contours on the upper-triangle panels.
g.map_upper(sns.kdeplot, levels=4, color=".2")
# Title the whole figure: plt.title() would only label the last subplot of the grid.
# y > 1 lifts the title above the top row of panels.
g.figure.suptitle('Pairplots for all the Feature', y=1.02)
plt.show()

In [None]:
df.columns

In [None]:
display(df.describe())

# Dealing with Outliers 

In [None]:
# One box plot per numeric column on a 2x3 grid, to eyeball outliers.
boxplot_columns = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']

fig, axs = plt.subplots(2, 3, figsize=(10, 5))
for ax, column in zip(axs.flat, boxplot_columns):
    # Pass the values by keyword: seaborn has interpreted a bare positional
    # argument differently across releases (x vs. data), so `x=` pins the
    # horizontal orientation explicitly.
    sns.boxplot(x=df[column], ax=ax)

fig.tight_layout()

In [None]:
# Drop rows whose price falls outside the 1.5 * IQR fences around the quartiles.
Q1, Q3 = df['price'].quantile(0.25), df['price'].quantile(0.75)
IQR = Q3 - Q1
price_floor = Q1 - 1.5 * IQR
price_ceiling = Q3 + 1.5 * IQR

within_price_fences = (df['price'] >= price_floor) & (df['price'] <= price_ceiling)
df = df[within_price_fences]

# Box plot of the remaining prices to confirm the extreme values are gone.
plt.boxplot(df['price'])



In [None]:
# Same 1.5 * IQR fence filter as for price, now applied to the area column.
Q1, Q3 = df['area'].quantile(0.25), df['area'].quantile(0.75)
IQR = Q3 - Q1
area_floor = Q1 - 1.5 * IQR
area_ceiling = Q3 + 1.5 * IQR

within_area_fences = (df['area'] >= area_floor) & (df['area'] <= area_ceiling)
df = df[within_area_fences]

# Box plot of the remaining areas.
plt.boxplot(df['area'])

# Preprocessing

# Encoding categorical features
- `bedrooms`, `bathrooms`, `stories`, `parking` and `furnishingstatus` are one-hot encoded; the remaining yes/no columns are then label-encoded so that every feature is numeric.

In [None]:
# One-hot encode the listed columns: each distinct value becomes its own
# indicator column and the original column is removed. Note that the count-like
# columns (bedrooms, bathrooms, stories, parking) are treated as categories here.
df = pd.get_dummies(df,columns=['bedrooms', 'bathrooms', 'stories', 'parking', 'furnishingstatus'])

In [None]:
df

In [None]:
# Columns that still need integer encoding: every non-numeric column, i.e. the
# yes/no string flags and (on pandas versions where get_dummies returns booleans)
# the one-hot indicator columns.
# Selecting by dtype rather than comparing cell values against
# 'yes'/'no'/True/False avoids picking up numeric columns that merely contain
# a 0 or a 1 (0 == False and 1 == True in element-wise comparisons).
cat_col = df.select_dtypes(exclude='number').columns.tolist()
cat_col

In [None]:
# Replace each categorical column with integer codes. The single encoder is
# re-fit per column, so it only remembers the classes of the last column.
label_Encoder = LabelEncoder()
for column_name in cat_col:
    encoded_values = label_Encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values


In [None]:
df

In [None]:
# Feature matrix: every column except the prediction target.
X = df.drop(columns=['price'])

In [None]:
X.head(2)

In [None]:
X.tail(2)

# Standardization using StandardScaler

In [None]:
# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fit on all rows here, before the train/test split
# further down, so test-set statistics leak into the scaling. Consider fitting
# on the training split only.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [None]:
y = df['price']

In [None]:
y.shape

In [None]:
y = y.values.reshape(-1,1)

In [None]:
y.shape

In [None]:
# Standardize the target with its own scaler. Re-fitting the shared `scaler`
# on y would overwrite the feature statistics it learned from X, leaving no
# way to transform new feature rows consistently. Keeping `y_scaler` also lets
# predictions be mapped back to prices with y_scaler.inverse_transform(...).
y_scaler = StandardScaler()
y = y_scaler.fit_transform(y)

# Feature Extraction
It seems that there is strong multicollinearity among the features. We can use the following methods to address it:
- Variance Inflation Factor (VIF)
- Recursive Feature Elimination (RFE)
- Feature elimination using PCA decomposition

In [None]:
# Apply PCA to the standardized features, keeping every component so the
# explained-variance profile can be inspected before deciding how many to keep.
from sklearn.decomposition import PCA

pca = PCA()  # no n_components given, so all components are retained
principal_components = pca.fit_transform(scaled_X)


In [None]:
# Annotated correlation matrix of all (now numeric) columns, colour scale
# fixed to [-1, 1] and centred on 0.
corr_fig, corr_ax = plt.subplots(figsize=(25, 20))
correlations = df.corr()
sns.heatmap(correlations, annot=True, vmin=-1, vmax=1, center=0, ax=corr_ax)
plt.show()

In [None]:
# Cumulative explained variance in percent for (up to) the first 30 components.
explained_pct = np.round(pca.explained_variance_ratio_, decimals=4) * 100
np.cumsum(explained_pct)[:30]

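The conclusion is that seven principal components should be used in the regression learner. (Note: the PCA cell after the train/test split currently keeps the first 23 components, so the two need to be reconciled.)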

# Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state = 42)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
# Re-fit the PCA on the training rows only, project both splits with it, and
# keep the leading components.
N_COMPONENTS = 23  # number of leading principal components kept as model inputs
train_scores = pca.fit_transform(X_train)
test_scores = pca.transform(X_test)
X_train = train_scores[:, :N_COMPONENTS]
X_test = test_scores[:, :N_COMPONENTS]

# Creating Neural Network Model

In [None]:
# Import every Keras symbol from the copy bundled with TensorFlow. Mixing the
# standalone `keras` package with `tensorflow.keras` can combine objects from
# two different Keras installations, which is a known source of hard-to-read errors.
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l1, l2

In [None]:
X_train.shape[1]

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and training are repeatable, matching the seeded
# train/test split.
tensorflow.keras.utils.set_random_seed(42)

# Fully connected regressor: two hidden ReLU layers of 50 units, each with an
# L1 penalty (0.01) on its weights, followed by a single linear output unit.
model = Sequential()
model.add(Dense(50, input_dim = X_train.shape[1], activation = 'relu', kernel_regularizer=l1(0.01)))
model.add(Dense(50, activation = 'relu',kernel_regularizer=l1(0.01)))
model.add(Dense(1, activation = 'linear'))

In [None]:
model.summary()

# Compiling the model

In [None]:
# Train with the Adam optimizer, minimizing mean squared error on the target.
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Fit for 60 epochs in mini-batches of 25, with 20% of the training rows set
# aside by Keras as a validation set. The returned history object is kept under
# the name `eposhs_hist` (sic), which the evaluation cells below refer to.
eposhs_hist = model.fit(X_train, y_train, epochs=60, batch_size = 25, verbose=1, validation_split= 0.2)

# Model Evaluation

In [None]:
eposhs_hist.history.keys()

In [None]:

# Training vs. validation loss per epoch. Curves are labelled at plot time so
# the legend cannot get out of step with the order of the plot calls.
plt.plot(eposhs_hist.history['loss'], label='Training loss')
plt.plot(eposhs_hist.history['val_loss'], label='Validation loss')
plt.title('Model loss progress during training')
plt.ylabel('Training and validation loss')
plt.xlabel('Epoch number')
plt.legend()
# plt.show() renders the figure without echoing the Legend object's repr.
plt.show()

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Regression metrics on the held-out split. The target was standardized earlier
# in the notebook, so MAE/MSE/RMSE are expressed in standardized units, not in
# the original price units.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

metric_report = (
    ('Mean Absolute Error (MAE)', mae),
    ('Mean Squared Error (MSE)', mse),
    ('Root Mean Squared Error (RMSE)', rmse),
    ('R-squared (R^2)', r2),
)
for metric_label, metric_value in metric_report:
    print(f'{metric_label}: {metric_value:.2f}')