# House price regression - neural networks

![houses](https://storage.googleapis.com/kaggle-media/competitions/House%20Prices/kaggle_5407_media_housesbanner.png)

In [None]:
## Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

Each time we start a new Colab session, we need to upload our `kaggle.json` file (the Kaggle API token) so that the dataset can be downloaded.

In [None]:
# Then, we need to copy the uploaded kaggle.json file to the location the Kaggle CLI expects (~/.kaggle/)
# NOTE: assumes kaggle.json is already present in the current working directory

# Create the config directory (-p: no error if it already exists), copy the token in,
# and restrict the file to owner read/write only (mode 600)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## Import data

In [None]:
!kaggle competitions download -c house-prices-advanced-regression-techniques
!unzip house-prices-advanced-regression-techniques.zip
df = pd.read_csv('train.csv')

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

In [None]:
# Summarise the frame: column names, dtypes and non-null counts (useful for spotting missing values)
df.info()

## Preprocessing

In [None]:
# Drop columns with too many missing values
# Start from the raw frame `df`: `data` has not been assigned before this cell,
# so reading it here would raise a NameError on a fresh kernel.
data = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu'])
# Keep only the numeric columns, fill gaps by interpolation and drop rows that still have NaNs.
# NOTE: a later cell reassigns `data` from the one-hot encoded frame, which supersedes this result.
data = data.select_dtypes(include=[np.number]).interpolate().dropna()

In [None]:
# Collect the names of the text (object-dtype) columns; these need encoding before modelling
non_numeric_columns = df.select_dtypes(include='object').columns
print("Non-numeric columns:", non_numeric_columns)

In [None]:
# One-hot encode every non-numeric column.
# drop_first=True omits the first category of each column to avoid a redundant indicator.
df_encoded = pd.get_dummies(
    df,
    columns=non_numeric_columns,
    drop_first=True,
)

In [None]:
# Drop the Id column (presumably just a row identifier, so not useful as a feature).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run no longer raises KeyError.
df_encoded = df_encoded.drop(columns=['Id'], errors='ignore')

In [None]:
# Fill remaining gaps with pandas' default (linear) interpolation along each column,
# then discard any rows that still contain a missing value.
data = (
    df_encoded
    .interpolate()
    .dropna()
)

In [None]:
# Target/label: the sale price we want to predict
y = data['SalePrice']
# Features: every remaining column
X = data.drop('SalePrice', axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardise the features. The scaler is fitted on the training rows only,
# and the same fitted scaler is then applied to the test rows.
scaler = StandardScaler()
X_train, X_test = (
    scaler.fit_transform(X_train),  # fit on the training set and scale it
    scaler.transform(X_test),       # scale the test set with the already-fitted scaler
)

In [None]:
# Convert everything to float32 PyTorch tensors (the data format PyTorch expects)

# Features: converted as-is
X_train, X_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_test)
)

# Targets: take the underlying values and reshape to a single column (n_rows, 1),
# matching the one-value-per-row output of the network's final layer
y_train, y_test = (
    torch.tensor(target.values, dtype=torch.float32).view(-1, 1)
    for target in (y_train, y_test)
)

## Construct neural network

In [None]:
# Seed PyTorch's random number generator so the randomly initialised layer weights
# are the same on every run (42 matches the random_state used for the train/test split)
torch.manual_seed(42)

# Define the layers of the network
fc1 = nn.Linear(X_train.shape[1], 64)  # one input per feature column -> 64 hidden units
fc2 = nn.Linear(64, 32)  # 64 hidden units -> 32 hidden units
fc3 = nn.Linear(32, 1)  # 32 hidden units -> a single output value (the predicted sale price)

In [None]:
# Define the forward pass function
def forward(x):
    """Run a batch of inputs through the three-layer network.

    Args:
        x: input tensor whose last dimension matches the input size of ``fc1``.

    Returns:
        The output of ``fc3``: one raw (un-activated) value per input row.
    """
    hidden_1 = torch.relu(fc1(x))  # first layer followed by ReLU
    hidden_2 = torch.relu(fc2(hidden_1))  # second layer followed by ReLU
    return fc3(hidden_2)  # final layer, no activation: this is a regression output

## Train the model

In [None]:
# Loss function: mean squared error between predictions and targets
criterion = nn.MSELoss()

# Optimiser: Adam, a commonly used variant of gradient descent.
# The layers are standalone objects, so their parameters are gathered explicitly
# (fc1 first, then fc2, then fc3) and handed to the optimiser as one list.
optimizer = optim.Adam(
    [param for layer in (fc1, fc2, fc3) for param in layer.parameters()],
    lr=0.001,
)


In [None]:
# Pair each training row with its target, then serve the pairs in mini-batches of 32.
# shuffle=True reshuffles the rows at the start of every epoch.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Training loop: one pass over all mini-batches per epoch, followed by a test-set evaluation
epochs = 100
train_losses = []  # mean mini-batch loss, one entry per epoch
test_losses = []   # full test-set loss, one entry per epoch

for epoch in range(epochs):
    epoch_loss = 0.0

    # --- Training phase ---
    for batch_features, batch_targets in train_loader:
        optimizer.zero_grad()                    # reset gradients left over from the previous step
        y_pred = forward(batch_features)         # forward pass
        loss = criterion(y_pred, batch_targets)  # loss for this mini-batch
        loss.backward()                          # backward pass: compute gradients
        optimizer.step()                         # update the weights
        epoch_loss += loss.item()

    # Average of the per-batch losses for this epoch
    train_losses.append(epoch_loss / len(train_loader))

    # --- Evaluation phase (no gradients needed) ---
    with torch.no_grad():
        y_test_pred = forward(X_test)
        test_loss = criterion(y_test_pred, y_test).item()
    test_losses.append(test_loss)

    # Only report progress on every 10th epoch
    if (epoch + 1) % 10 != 0:
        continue
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {epoch_loss / len(train_loader):.4f}, Test Loss: {test_loss:.4f}')


## Evaluate the model

In [None]:
# Final mean squared error on the test set, using the predictions
# produced by the evaluation phase of the last training epoch
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print(f'Mean Squared Error: {mse}')

In [None]:
# Plot the training and test loss curves over the epochs (explicit figure/axes interface)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label='Training Loss')
ax.plot(test_losses, label='Test Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()