# Workshop 1: House Pricing Regression
In this workshop you will train a regression neural network to estimate house prices in California. The main blocks of the workshop are:

- Get the data (already available in Colab's `sample_data` folder) and visualize it.
- Pre-process the data.
- Design the network.
- Train the network.
- Evaluate the model.

[Information about the data](https://www.kaggle.com/datasets/camnugent/california-housing-prices)

In [None]:
!pip install torchmetrics

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from torchmetrics import MeanSquaredLogError

# Seed PyTorch's global random number generator so that repeated runs of the
# notebook start from the same random state.
torch.manual_seed(12345)

# 1. Get the data (Already in Colab)

In [None]:
# CSV files read from Colab's /content/sample_data directory.
TRAIN_DATA_PATH = '/content/sample_data/california_housing_train.csv'
TEST_DATA_PATH = '/content/sample_data/california_housing_test.csv'
# Column to predict; every other column is used as an input feature.
TARGET_NAME = 'median_house_value'

In [None]:
# Load both splits into DataFrames; the target column is still part of each
# frame at this point.
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

In [None]:
train_data.head()

In [None]:
train_data.shape, test_data.shape

In [None]:
# Separate the input features from the regression target for each split.
x_train = train_data.drop(columns=TARGET_NAME)
y_train = train_data[TARGET_NAME]
x_test = test_data.drop(columns=TARGET_NAME)
y_test = test_data[TARGET_NAME]

In [None]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

# 2. Pre-process the data.

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both splits so the test set never influences the
# scaling statistics.
standard_scaler = StandardScaler().fit(x_train)
x_train_scaled = pd.DataFrame(
    standard_scaler.transform(x_train),
    columns=x_train.columns,
)
x_test_scaled = pd.DataFrame(
    standard_scaler.transform(x_test),
    columns=x_test.columns,
)

In [None]:
x_train_scaled.head()

In [None]:
# Create a PyTorch dataset
class HousingDataset(Dataset):
    """Map-style dataset pairing each feature row with its target value.

    Both inputs are converted once, at construction time, to float32
    tensors, so indexing afterwards is a plain tensor lookup.

    Args:
        X: Object exposing ``to_numpy()`` (e.g. a pandas DataFrame) with one
            row of features per sample.
        y: Object exposing ``to_numpy()`` (e.g. a pandas Series) with one
            target per sample.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X.to_numpy())
        targets = torch.from_numpy(y.to_numpy())
        self.X = features.to(torch.float32)
        self.y = targets.to(torch.float32)

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
# Positional hold-out split: the first 15000 training rows are used for
# fitting, the remaining rows for validation.
# NOTE(review): rows are taken in file order without shuffling — confirm the
# CSV is not sorted in a way that biases the validation set.
train_set = HousingDataset(x_train_scaled.iloc[:15000], y_train.iloc[:15000])
valid_set = HousingDataset(x_train_scaled.iloc[15000:], y_train.iloc[15000:])
test_set = HousingDataset(x_test_scaled, y_test)
tuple(len(split) for split in (train_set, valid_set, test_set))

In [None]:
train_set[0]

In [None]:
# Only the training data is shuffled and batched (64 samples per batch);
# validation and test data are read in order, one sample per batch.
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

In [None]:
len(train_loader)

In [None]:
64*235

In [None]:
next(iter(train_loader))

# 3. Design the network.

In [None]:
class Net(nn.Module):
    """Fully connected regressor: in_features -> 32 -> 64 -> 32 -> 1.

    Args:
        in_features: Number of input features per sample. Defaults to 8,
            the width this network was originally hard-coded to, so
            ``Net()`` builds exactly the same architecture as before.
    """

    def __init__(self, in_features: int = 8):
        super().__init__()
        # Layers are created in the same order and with the same sizes as
        # before, so for a given seed the default initialisation is unchanged.
        self.fc1 = nn.Linear(in_features, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        """Map ``x`` of shape ``(..., in_features)`` to shape ``(..., 1)``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        # No activation on the last layer: the output is left unbounded.
        return self.fc4(x)

In [None]:
# Model, optimizer and loss are notebook-level globals; train() reads them
# directly instead of taking them as arguments.
net = Net()
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.MSELoss()

# 4. Train the network.

In [None]:
def train(epochs=10):
    """Train the global ``net`` and plot the training/validation loss curves.

    Relies on the notebook-level ``net``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``valid_loader``. After every epoch the mean
    per-batch loss on both loaders is printed; once training ends, both
    histories are plotted against the epoch index.

    Args:
        epochs: Number of full passes over ``train_loader``.
    """
    train_history = []
    valid_history = []
    for epoch in range(epochs):
        # Put the model in training mode for the optimisation pass.
        net.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = net(inputs)
            # The network emits one column per sample; index it away so the
            # prediction has the same shape as the labels.
            loss = criterion(outputs[:, 0], labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        train_history.append(running_loss / len(train_loader))

        # Validation: inference mode, no gradient tracking.
        net.eval()
        with torch.no_grad():
            running_loss = 0
            for inputs, labels in valid_loader:
                outputs = net(inputs)
                loss = criterion(outputs[:, 0], labels)
                running_loss += loss.item()
            valid_history.append(running_loss / len(valid_loader))

        print(
            f"[{epoch + 1}] loss: {train_history[-1]:.3f}, "
            f"valid_loss: {valid_history[-1]:.3f}"
        )

    plt.plot(train_history, label="train")
    plt.plot(valid_history, label="valid")
    plt.xlabel("Epochs")
    # Label the axis with the criterion actually in use: train() is also run
    # with losses other than MSE, so a fixed "MSE" label would be wrong.
    plt.ylabel(type(criterion).__name__)
    plt.legend()

In [None]:
train()

In [None]:
# Second experiment: a fresh network and optimizer, trained with mean squared
# logarithmic error as the criterion instead of MSE.
# NOTE(review): MeanSquaredLogError comes from torchmetrics (a metric object,
# not an nn loss module), and its log term needs predictions > -1 while the
# network's output layer is unbounded — confirm it behaves as intended when
# used as a training loss.
net = Net()
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = MeanSquaredLogError()
train()

# Exercise 1: Re-train the model to improve its performance and evaluate it on the test set

In [None]:
# Train the model


In [None]:
# Predict on the test set


In [None]:
# Compare the results with the ground truth
