In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import pandas as pd
import os
import cv2
from PIL import Image
from tqdm import tqdm
import pickle
import torch.nn as nn

In [2]:
listings = pd.read_csv('LA_Airbnb/listings_detailed.csv')
listings['clean_price'] = [float(i.replace('$','').replace(',','')) for i in listings['price']]
room_codes = [i.split('/')[-1] for i in listings['listing_url'].values]


In [3]:
# all_paths = []
# all_labels = []
# for room_code in tqdm(room_codes):
#     try: 
#         photos = os.listdir(f'LA_photos/{room_code}')
#     except: 
#         continue
#     all_paths.extend([f'LA_photos/{room_code}/{i}' for i in photos])
#     labels = listings.set_index('id').clean_price.loc[float(room_code)]
#     if not isinstance(labels, float):
#         raise
#     all_labels.append([labels for count in range(len(photos))])




In [4]:
training_room_codes = np.random.choice(room_codes,size=int(len(room_codes)*0.90),replace=False)
testing_room_codes = [i for i in room_codes if not i in training_room_codes]
training_labels = listings.set_index('id').clean_price.loc[[float(i) for i in training_room_codes]].values
testing_labels = listings.set_index('id').clean_price.loc[[float(i) for i in testing_room_codes]].values


In [5]:
len(training_room_codes),len(testing_room_codes)

(36394, 4044)

In [6]:
from cnn_utiles import train_loader,SimpleCNN

In [7]:
model = SimpleCNN()
device = torch.device("mps")
model = model.to(device)


In [8]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)


In [9]:
class early_stopper():
    def __init__(self,patient = 3):
        self.val_loss_list = []
        self.train_loss_list = []
        self.lowest_val_loss = 1e8
        self.patient = patient
    def check_early_stopping_store_best(self,model,val_loss,train_loss):
        self.train_loss_list.append(train_loss)
        if len(self.val_loss_list)<self.patient:
            self.val_loss_list.append(val_loss)
            if val_loss<self.lowest_val_loss:
                self.lowest_val_loss = val_loss
                pickle.dump(model.to('cpu'), open('best_CNN.pkl','wb'))
        else:
            if val_loss>np.max(self.val_loss_list[-self.patient:]):
                return True
            else:
                if val_loss<self.lowest_val_loss:
                    self.lowest_val_loss = val_loss
                    pickle.dump(model.to('cpu'), open('best_CNN.pkl','wb'))
                return False

stopper = early_stopper(patient=3)

In [10]:
epoch_pogress = tqdm(range(20))
mean_training_loss = np.nan
mean_val_loss = np.nan


for epoch in epoch_pogress:  # loop over the dataset multiple times

    ### training
    tr_loader = train_loader(training_room_codes,training_labels, batch_size=50) ### reset data each epoch

    loss_list = []
    for i, data in enumerate(tr_loader.generate_dataset()):
        model.train()
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels, frac_rooms_trained = data
        inputs,labels = inputs.to(device),labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        score = loss.item()
        loss_list.append(loss.item())
        epoch_pogress.set_postfix_str(f"epoch {epoch}, batch loss:{score}, last training loss: {mean_training_loss}, last validation loss: {mean_val_loss}") 

    mean_training_loss = np.mean(loss_list)

    ### validation
    val_loader = train_loader(testing_room_codes,testing_labels, batch_size=50) ### regenerate val set each time

    ##
    loss_list = []
    for val_index, data in enumerate(val_loader.generate_dataset()):
        model.eval()
        inputs, labels, frac_rooms_tested = data
        inputs,labels = inputs.to(device),labels.to(device)

        pred = model(inputs)
        loss_val = criterion(pred, labels)
        loss_list.append(loss.item())
    
    mean_val_loss = np.mean(loss_list)
    stop = stopper.check_early_stopping_store_best(model, mean_val_loss, mean_training_loss)
    model = model.to(device)
    scheduler.step(mean_val_loss)
    epoch_pogress.set_postfix_str(f"epoch {epoch}, training loss: {mean_training_loss}, validation loss: {mean_val_loss}") 
    if stop:
        break
            
print('Finished Training')



100%|██████████| 20/20 [14:29:10<00:00, 2607.54s/it, epoch 19, training loss: 0.5866107593484337, validation loss: 0.6225075721740723]                                             

Finished Training



