# Airbnb Property Listing - Neural Network Modelling

## Library Imports

In [63]:
import numpy as np
import pandas as pd
from zipfile import ZipFile
import data_cleaning
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import random_split

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [43]:
'''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# When running on the CuDNN backend, two further options must be set
torch.backends.cudnn.deterministic = True
# Set a fixed value for the hash seed
# os.environ['PYTHONHASHSEED'] = str(SEED)

## Create the dataset

In [44]:
with ZipFile('./airbnb-property-listings.zip') as myzip:
    data = myzip.open("AirbnbDataSci/tabular_data/AirBnbData.csv")

data_df = pd.read_csv(data)
cleaned_data_df = data_df.pipe(data_cleaning.clean_tabular_data)
cleaned_data_df.head()

Unnamed: 0,id,category,title,description,amenities,location,guests,beds,bathrooms,price_night,cleanliness_rate,accuracy_rate,communication_rate,location_rate,check-in_rate,value_rate,amenities_count,url,bedrooms
0,f9dcbd09-32ac-41d9-a0b1-fdb2793378cf,Treehouses,Red Kite Tree Tent - Ynys Affalon,Escape to one of these two fabulous Tree Tents...,"['What this place offers', 'Bathroom', 'Shampo...",Llandrindod Wells United Kingdom,2.0,1.0,1.0,105.0,4.6,4.7,4.3,5.0,4.3,4.3,13.0,https://www.airbnb.co.uk/rooms/26620994?adults...,1.0
1,1b4736a7-e73e-45bc-a9b5-d3e7fcf652fd,Treehouses,Az Alom Cabin - Treehouse Tree to Nature Cabin,Come and spend a romantic stay with a couple o...,"['What this place offers', 'Bedroom and laundr...",Guyonvelle Grand Est France,3.0,3.0,0.0,92.0,4.3,4.7,4.6,4.9,4.7,4.5,8.0,https://www.airbnb.co.uk/rooms/27055498?adults...,1.0
2,d577bc30-2222-4bef-a35e-a9825642aec4,Treehouses,Cabane Entre Les Pins\n🌲🏕️🌲,"Rustic cabin between the pines, 3 meters high ...","['What this place offers', 'Scenic views', 'Ga...",Duclair Normandie France,4.0,2.0,1.5,52.0,4.2,4.6,4.8,4.8,4.8,4.7,51.0,https://www.airbnb.co.uk/rooms/51427108?adults...,1.0
3,ca9cbfd4-7798-4e8d-8c17-d5a64fba0abc,Treehouses,Tree Top Cabin with log burner & private hot tub,The Tree top cabin is situated in our peaceful...,"['What this place offers', 'Bathroom', 'Hot wa...",Barmouth Wales United Kingdom,2.0,1.0,1.0,132.0,4.8,4.9,4.9,4.9,5.0,4.6,23.0,https://www.airbnb.co.uk/rooms/49543851?adults...,1.0
5,cfe479b9-c8f8-44af-9bc6-46ede9f14bb5,Treehouses,Treehouse near Paris Disney,"Charming cabin nestled in the leaves, real unu...","['What this place offers', 'Bathroom', 'Hair d...",Le Plessis-Feu-Aussoux Île-de-France France,4.0,3.0,1.0,143.0,5.0,4.9,5.0,4.7,5.0,4.7,32.0,https://www.airbnb.co.uk/rooms/935398?adults=1...,2.0


In [71]:
X = cleaned_data_df.select_dtypes('number').drop(columns='price_night').values
y = np.array(cleaned_data_df.price_night.values).reshape(-1,1)

std_scaler_input = StandardScaler()
std_scaler_output = StandardScaler()

train_set_len = round(0.7*len(X))
val_set_len = round(0.15*len(X))
test_set_len = len(X) - train_set_len - val_set_len

X_train, X_val, X_test = random_split(X, [train_set_len, val_set_len, test_set_len], generator=torch.Generator().manual_seed(42))
y_train, y_val, y_test = random_split(y, [train_set_len, val_set_len, test_set_len], generator=torch.Generator().manual_seed(42))

X_train_scaled = std_scaler_input.fit_transform(X_train)
X_val_scaled = std_scaler_input.transform(X_val)
X_test_scaled = std_scaler_input.transform(X_test)

# print(np.array(y_train).shape)

y_train_scaled = std_scaler_output.fit_transform(y_train)
y_val_scaled = std_scaler_output.transform(y_val)
y_test_scaled = std_scaler_output.transform(y_test)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [54]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        assert len(X) == len(y), "Data and labels have to be of equal length!"
        self.X = torch.tensor(np.array(X)).float()
        self.y = torch.tensor(np.array(y)).float()

    # Not dependent on index
    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.y)

In [72]:
batch_size = 64
train_dataset = Dataset(X_train_scaled, y_train_scaled)
val_dataset = Dataset(X_val_scaled, y_val_scaled)
test_dataset = Dataset(X_test_scaled, y_test_scaled)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for (X, y) in train_dataloader:
    print(X.shape, y.shape)

torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([64, 11]) torch.Size([64, 1])
torch.Size([5, 11]) torch.Size([5, 1])


## Creating the model

In [74]:
# Creating the Network
# create the two layers with hyper-parameter sizes (number of nodes)
class RegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        layer0 = nn.Linear(11,4)
        nn.init.kaiming_uniform_(layer0.weight, mode='fan_in', nonlinearity='relu')
        nn.init.constant_(layer0.bias, 1)
        layer1 = nn.Linear(4,2)
        nn.init.kaiming_uniform_(layer1.weight, mode='fan_in', nonlinearity='relu')
        nn.init.constant_(layer1.bias, 1)
        layer2 = nn.Linear(2,1)
        nn.init.kaiming_uniform_(layer2.weight, mode='fan_in', nonlinearity='relu')
        nn.init.constant_(layer2.bias, 1)
        self.layers = nn.Sequential(
            layer0,
            nn.ReLU(),
            layer1,
            nn.ReLU(),
            layer2
        )
    
    def forward(self, features):
        # return torch.reshape(self.layers(features), (-1,))
        return self.layers(features)

In [77]:
def train(model, train_loader, val_loader, lr=1e-3, epochs=int(1e4), optimiser=torch.optim.SGD):
    writer = SummaryWriter()
    optimiser = optimiser(model.parameters(), lr=lr)
    batch_idx = 0
    model.train()

    for epoch in range(epochs):
        for batch in train_loader:
            features, labels = batch
            predictions = model(features)
            loss = F.mse_loss(predictions, labels)
            loss.backward()
            optimiser.step()
            optimiser.zero_grad()
            writer.add_scalar('loss-train', loss.item(), batch_idx)
            batch_idx += 1
            if batch_idx % 10 == 0:
                val_loss = evaluate(model, val_loader)
                writer.add_scalar('loss-val', val_loss, batch_idx)

def evaluate(model, data_loader):
    losses = []
    model.eval()

    for batch in data_loader:
        features, labels = batch
        predictions = model(features)
        loss = F.mse_loss(predictions, labels)
        losses.append(loss.detach())
        
    avg_loss = np.mean(losses)
    return avg_loss

model = RegressionModel()

train(model, train_dataloader, val_dataloader, epochs = 3000)

## Evaluate on Test Set

In [78]:
losses = []
model.eval()

for batch in test_dataloader:
    features, labels = batch
    predictions = model(features)
    loss = F.mse_loss(predictions, labels)
    losses.append(loss.detach())
    
avg_loss = np.mean(losses)
avg_loss

0.59107965