<a href="https://colab.research.google.com/github/dernameistegal/airbnb_price/blob/main/models/pictures_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Preparation

In [1]:
#@title remove repos from disc
%cd /content
!rm -r airbnb_price

/content


In [2]:
#@title Clone repo
# Clone the project repository into the current directory; its
# airbnb_price/custom_functions folder is put on sys.path in a later cell.
!git clone https://github.com/dernameistegal/airbnb_price.git

Cloning into 'airbnb_price'...
remote: Enumerating objects: 509, done.[K
remote: Counting objects: 100% (509/509), done.[K
remote: Compressing objects: 100% (479/479), done.[K
remote: Total 509 (delta 267), reused 139 (delta 24), pack-reused 0[K
Receiving objects: 100% (509/509), 3.60 MiB | 9.18 MiB/s, done.
Resolving deltas: 100% (267/267), done.


In [3]:
#@title add paths to library search path
import sys

# Directory inside the cloned repo that is put on the module search path.
CUSTOM_FUNCTIONS_DIR = "/content/airbnb_price/custom_functions"

# Guard against duplicate entries when this cell is re-run in the same kernel.
if CUSTOM_FUNCTIONS_DIR not in sys.path:
    sys.path.append(CUSTOM_FUNCTIONS_DIR)

In [4]:
#@title Imports and drive
# standard library
import os

# third party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchvision
from google.colab import drive
# pearsonr lives in the public scipy.stats namespace; the scipy.stats.stats
# module path is deprecated
from scipy.stats import pearsonr
from tqdm import tqdm

# own modules
import general_utils as gu
import picture_model_utils as pu

# Mount Google Drive; the data paths used below live under /content/drive/
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [5]:
#@title define device

# Pick the compute device through the project helper and report the CPU count.
device = gu.get_device()
num_cpus = os.cpu_count()
print(f"{num_cpus} CPUs available")

cuda available: True ; cudnn available: True ; num devices: 1
Using device Tesla P100-PCIE-16GB
4 CPUs available


# 1. Model thumbnail pictures

In [6]:
# Load the listings table and split its index into train and validation parts
# (the third value returned by the helper is not used in this notebook section).
listings_path = "/content/drive/MyDrive/Colab/airbnb/data/data1/listings_workfile.pickle"
listings = pd.read_pickle(listings_path)
trainsplit, valsplit, _ = gu.train_val_test_split(listings.index)

In [7]:
# Predictor: directory that holds the raw thumbnail images.
thumbnail_dir = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_raw"

# Response: the log_price column as a plain dict keyed by the listings index.
log_price = listings["log_price"].to_dict()

In [8]:
# Build one thumbnail dataset per split and wrap each in a DataLoader.
batch_size = 128
dataset_kwargs = dict(thumbnail_dir=thumbnail_dir, response=log_price)
train_dataset = pu.ThumbnailsDataset(split=trainsplit, **dataset_kwargs)
val_dataset = pu.ThumbnailsDataset(split=valsplit, **dataset_kwargs)

# Only the training batches are shuffled; validation keeps the dataset order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [9]:
# Gather the validation targets (second element of every dataset item); they
# feed the reference root-MSE that model performance is compared against.
logprice = [val_dataset[i][1] for i in tqdm(range(len(val_dataset)))]

100%|██████████| 1711/1711 [00:06<00:00, 259.26it/s]


In [10]:
# Reference value: root-MSE obtained when every validation target is predicted
# by the mean of the validation targets.
mean_logprice = np.mean(logprice)
squared_error = np.square(np.array(logprice) - mean_logprice)
mean_squared_error = np.mean(squared_error)
root_mean_squared_error = np.sqrt(mean_squared_error)
root_mean_squared_error

0.64899296

In [11]:
#@title define models classes
class Model(torch.nn.Module):
    """Apply a feature extractor followed by a finalizer.

    Args:
        feature_extractor: module applied to the input first.
        finalizer: module applied to the feature extractor's output.
    """

    def __init__(self, feature_extractor, finalizer):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.finalizer = finalizer

    def forward(self, x):
        """Return ``finalizer(feature_extractor(x))``."""
        return self.finalizer(self.feature_extractor(x))

class Finalizer(torch.nn.Module):
    """Head that maps a feature map to one output value per sample.

    Layout: three 3x3 convolutions (each followed by ReLU and batch norm),
    a 2x2 max-pool, flatten, dropout, a hidden linear layer (ReLU, batch norm,
    dropout) and a final linear layer with a single output.

    Args:
        in_channels: channel count of the incoming feature map; the three
            convolutions keep this channel count.
        flat_features: length of the flattened feature map after pooling,
            i.e. ``in_channels * (height // 2) * (width // 2)``. The default
            25088 equals 512 * 7 * 7 (a 512 x 14 x 14 input).
        hidden_features: width of the hidden linear layer.
        dropout: drop probability used by both dropout layers.
    """

    def __init__(self, in_channels=512, flat_features=25088, hidden_features=256, dropout=0.5):
        super().__init__()
        # kernel 3 / stride 1 / padding 1 keeps the spatial size unchanged
        self.conv1 = torch.nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = torch.nn.BatchNorm2d(in_channels)
        self.conv2 = torch.nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = torch.nn.BatchNorm2d(in_channels)
        self.conv3 = torch.nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1)
        self.bn3 = torch.nn.BatchNorm2d(in_channels)
        # halves height and width
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.drop1 = torch.nn.Dropout(p=dropout)
        self.linear1 = torch.nn.Linear(in_features=flat_features, out_features=hidden_features)
        self.bn4 = torch.nn.BatchNorm1d(hidden_features)
        self.drop2 = torch.nn.Dropout(p=dropout)
        self.linear2 = torch.nn.Linear(in_features=hidden_features, out_features=1)

    def forward(self, x):
        """Map a ``(batch, in_channels, H, W)`` feature map to ``(batch, 1)``."""
        # batch norm is applied after the ReLU in every block
        x = self.bn1(F.relu(self.conv1(x)))
        x = self.bn2(F.relu(self.conv2(x)))
        x = self.bn3(F.relu(self.conv3(x)))
        x = self.pool1(x)
        # keep the batch dimension, flatten everything else
        x = torch.flatten(x, start_dim=1)
        x = self.drop1(x)
        x = self.bn4(F.relu(self.linear1(x)))
        x = self.drop2(x)
        x = self.linear2(x)

        return x
        

In [13]:
#@title define train functions
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, ExponentialLR, StepLR
import torchvision
from torchvision import datasets, transforms, models
import fastprogress
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
    
def train(dataloader, optimizer, model, loss_fn, device, master_bar, scaler):
    """Run one training epoch and return the root of the epoch's mean loss.

    Args:
        dataloader: iterable yielding ``(image, target)`` tensor batches.
        optimizer: optimizer that updates ``model``'s parameters.
        model: network to train; it is switched to train mode.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar
            tensor. It is assumed to be a per-batch mean (e.g. ``MSELoss``),
            so the returned value is a root-MSE.
        device: device the batches are moved to.
        master_bar: parent progress bar handed to ``fastprogress``.
        scaler: gradient scaler providing ``scale``, ``step`` and ``update``.

    Returns:
        float: square root of the sample-weighted mean of the batch losses,
        or ``nan`` when the dataloader yields no samples.
    """
    model.train()
    loss_sum = 0.0
    num_samples = 0

    for image, target in fastprogress.progress_bar(dataloader, parent=master_bar):

        image, target = image.to(device), target.to(device)
        # flatten to (batch,) so prediction and target never broadcast against each other
        target = target.reshape(-1)

        # zero gradient
        optimizer.zero_grad()

        # Forward pass; calling the module (not .forward) also runs registered hooks
        prediction = model(image).reshape(-1)

        # loss calculation
        loss = loss_fn(prediction, target)

        # Backward pass through the gradient scaler.
        # NOTE: the forward pass is not wrapped in autocast, so the scaler only
        # scales the loss here.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Weight each batch by its size: averaging per-batch roots would give
        # the smaller last batch the same weight and underestimate the root-MSE.
        batch_size = target.numel()
        loss_sum += loss.item() * batch_size
        num_samples += batch_size

    if num_samples == 0:
        return float("nan")
    return float(np.sqrt(loss_sum / num_samples))


def validate(dataloader, model, loss_fn, device, master_bar):
    """Evaluate the model on a dataloader and return the root of the mean loss.

    Args:
        dataloader: iterable yielding ``(image, target)`` tensor batches.
        model: network to evaluate; it is switched to eval mode.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar
            tensor. It is assumed to be a per-batch mean (e.g. ``MSELoss``),
            so the returned value is a root-MSE.
        device: device the batches are moved to.
        master_bar: parent progress bar handed to ``fastprogress``.

    Returns:
        float: square root of the sample-weighted mean of the batch losses,
        or ``nan`` when the dataloader yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    num_samples = 0

    with torch.no_grad():
        for image, target in fastprogress.progress_bar(dataloader, parent=master_bar):

            image, target = image.to(device), target.to(device)
            # flatten to (batch,) so prediction and target never broadcast against each other
            target = target.reshape(-1)

            # Forward pass; calling the module (not .forward) also runs registered hooks
            prediction = model(image).reshape(-1)

            # loss calculation
            loss = loss_fn(prediction, target)

            # Weight each batch by its size so the result is the root-MSE over
            # all samples rather than a mean of per-batch roots.
            batch_size = target.numel()
            loss_sum += loss.item() * batch_size
            num_samples += batch_size

    if num_samples == 0:
        return float("nan")
    return float(np.sqrt(loss_sum / num_samples))
    

def run_training(model, optimizer, scheduler, loss_fn, device, num_epochs,
                 train_dataloader, val_dataloader, verbose, 
                 savefolder="training_results1"):
    """Train and validate for ``num_epochs`` epochs, checkpointing the best model.

    After every epoch the scheduler is stepped, and the model's state dict is
    written to ``<savepath>/checkpoint.pt`` whenever the validation loss is at
    least as good as the best one seen so far.

    Args:
        model: network to train.
        optimizer: optimizer for ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch. A
            ``ReduceLROnPlateau`` instance receives the validation loss.
        loss_fn: loss passed on to ``train`` and ``validate``.
        device: device passed on to ``train`` and ``validate``.
        num_epochs: number of epochs to run.
        train_dataloader: batches used for training.
        val_dataloader: batches used for validation.
        verbose: if truthy, write both losses to the progress bar each epoch.
        savefolder: sub-folder of the thumbnails_model directory that
            receives the checkpoint. An existing folder is reused, so a
            checkpoint from an earlier run with the same name is overwritten.

    Returns:
        tuple: ``(train_loss, val_loss)``, two lists with one value per epoch.
    """
    # make path to save performance measures and state dict
    savepath = "/content/drive/MyDrive/Colab/airbnb/data/thumbnails/thumbnails_model/" + savefolder
    # makedirs + exist_ok: re-running the notebook must not crash on an existing folder
    os.makedirs(savepath, exist_ok=True)

    # technical stuff
    start_time = time.time()
    scaler = torch.cuda.amp.GradScaler()
    master_bar = fastprogress.master_bar(range(num_epochs))

    # instantiate losses
    train_loss = []
    val_loss = []
    best_val_loss = float("inf")

    for epoch in master_bar:

        # Train the model
        epoch_train_loss = train(train_dataloader, optimizer, model, loss_fn, device, master_bar, scaler)
        # Validate the model
        epoch_val_loss = validate(val_dataloader, model, loss_fn, device, master_bar)

        # update scheduler; ReduceLROnPlateau needs the monitored metric
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(epoch_val_loss)
        else:
            scheduler.step()

        # Save losses for plotting
        train_loss.append(epoch_train_loss)
        val_loss.append(epoch_val_loss)

        # checkpoint whenever the validation loss ties or beats the best so far
        if epoch_val_loss <= best_val_loss:
            best_val_loss = epoch_val_loss
            torch.save(model.state_dict(), savepath + "/checkpoint.pt")
            print("saving model...")

        if verbose:
            master_bar.write(
                f'Train root mse: {epoch_train_loss:.4f}, val root mse: {epoch_val_loss:.4f}')

    time_elapsed = int(np.round(time.time() - start_time, 0))
    print(f'Finished training after {time_elapsed} seconds.')
    return train_loss, val_loss
    

In [14]:
# Feature extractor: the first 31 modules of the pretrained VGG19 `features` stack.
vgg = torchvision.models.vgg19(pretrained=True)
feature_extractor = vgg.features[:31]

# Head applied on top of the extracted features.
finalizer = Finalizer()

# Full model, moved to the selected device.
model = Model(feature_extractor=feature_extractor, finalizer=finalizer).to(device)

# All parameters are trained. To freeze the feature extractor instead, enable:
# for name, p in model.named_parameters():
#     if "feature_extractor" in name:
#         p.requires_grad = False

# To print the parameters that are optimized, enable:
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(name)

In [15]:
# Optimizer, learning-rate schedule and loss used for training.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def lr_factor(epoch):
    """Return the factor applied to the base learning rate: x0.1 every 10 epochs."""
    return 0.1 ** (epoch // 10)


scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lr_factor,
    last_epoch=-1,
    verbose=True,
)
loss_fn = torch.nn.MSELoss()

Adjusting learning rate of group 0 to 1.0000e-03.


In [None]:
# Train for 20 epochs and keep the per-epoch loss histories for plotting.
train_loss, val_loss = run_training(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_fn=loss_fn,
    device=device,
    num_epochs=20,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    verbose=True,
)

Adjusting learning rate of group 0 to 1.0000e-03.


In [None]:
# Collect validation targets and model predictions as flat 1-D arrays.
model.eval()
target_batches = []
prediction_batches = []

with torch.no_grad():
    for x, y in tqdm(val_dataloader):
        # reshape(-1) turns the (batch, 1) model output into (batch,) so that
        # predictions and targets line up element by element
        prediction_batches.append(model(x.to(device)).reshape(-1).cpu())
        target_batches.append(y.reshape(-1).cpu())

# Concatenate once at the end instead of re-building Python lists per batch
ys = torch.cat(target_batches).numpy()
ypreds = torch.cat(prediction_batches).numpy()

In [None]:
# Correlation between observed and predicted validation targets, plus a labelled scatter plot.
print(pearsonr(ys, ypreds))

fig, ax = plt.subplots()
ax.scatter(ys, ypreds)
ax.set(xlabel="Observed log price", ylabel="Predicted log price",
       title="Validation set: predicted vs. observed")
plt.show()

In [None]:
# Plot the per-epoch training and validation root-MSE with the project plotting helper.
gu.plot(
    "Root-MSE during training",
    "Root-MSE",
    train_loss,
    val_loss,
    yscale='linear',
    legend=["Training", "Validation"],
    thinning=1,
    save_path=None,
)
