In [36]:
%load_ext tensorboard
import os 
from helpers_image_classification import *
from load_data import *
import cv2
import random
import itertools
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10000)
from load_data import *
from itertools import chain
from collections import Counter
import sklearn.metrics
import io
import re
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import preprocessing
from collections import Counter
from PIL import ImageStat, Image
import shap

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Log Prices and Max Pooling

In [37]:
# load paths and labels
img_folder = "data/images_resized"
img_df = pd.read_csv("data/img_paths.csv")
img_df.reset_index(drop=True, inplace = True)

# load label book
label_cat = ["bathroom", "bedroom", "dining", "hallway", "kitchen", "living", "others"]
label = np.arange(7)
label_book = pd.DataFrame({"label": label_cat, "categorical_label": label})

In [38]:
# prices
url_listing = "http://data.insideairbnb.com/ireland/leinster/dublin/2021-11-07/data/listings.csv.gz"
listings = pd.read_csv(url_listing)
urls = listings["listing_url"]
ids = listings["id"]
price = listings["price"]
price = price.str.replace("$","")
price = price.str.replace(",","")
price = price.astype(float)
listings["price"] = price
listings["log_price"] = np.log(price)
listings = listings[listings["price"]<500]


In [39]:
# save relevant columns
price_df = listings[["log_price", "id"]]
# merge image and price data together to get the corresponding price for each image
df = pd.merge(img_df, price_df, on = "id", how = "left")
df.head()

Unnamed: 0,img_path,id,img_no,label,log_price
0,44077_0.png,44077,0,4.0,4.174387
1,44077_1.png,44077,1,5.0,4.174387
2,44077_2.png,44077,2,1.0,4.174387
3,44077_3.png,44077,3,1.0,4.174387
4,44077_4.png,44077,4,1.0,4.174387


In [40]:
filter = np.any(df.isna(), axis = 1)
df = df[~filter]

# drop "others" as we are only interested in the rooms
filter = df["label"] == 6.0
df = df[~filter]

# use only listings with images for 4 or categories 
bool = []
for id in df["id"]:
    tmp = df[df["id"] == id]
    if len(np.unique(tmp["label"])) >= 4:
        bool.append(True)
    else:
        bool.append(False)
df_new = df[bool]
np.unique(df_new["id"].values).shape
df = df_new
df.shape

(56704, 5)

In [41]:
def data_generator(df):
    ''' Extracts the features from a Resnet50 '''
    counter_overall_dummy = 0
    counter_overall_img = 0

    def input_pipeline(room = 0, df = df):
        ids = []
        features = []
        
        # Filter df by room type
        df_room = df[df["label"] == room]
        
        # load resnet
        resnet = tf.keras.applications.resnet.ResNet50(include_top=False, weights='imagenet', pooling="avg", input_shape = (None,None,3))
        resnet_pre = keras.applications.resnet50.preprocess_input
        resnet.trainable = False
        
        # count dummies and images overall
        counter_dummy = 0
        counter_img = 0
        
        # for every id route all images of a room through the resnet50
        for id in tqdm_notebook(np.unique(df["id"])):
            filter = df_room["id"] == id  

            # if an error occurs just add a black images otherwise get resnet features
            try:
                l = []
                
                # if no image of the room is present add a black image and route it throug the resnet
                if filter.sum() == 0:
                    dummy_image = np.zeros((1,256,256,3))
                    dummy_image = resnet_pre(dummy_image)
                    dummy_image = resnet(dummy_image)
                    l.append(dummy_image)
                    counter_dummy += 1             
                
                # load image and get features
                else:
                    path_id = df_room["img_path"][filter]
                    for p in path_id:
                        img_tmp = cv2.imread("data/images_resized/"+p)
                        img_tmp = np.expand_dims(img_tmp, axis = 0)
                        img_tmp = resnet_pre(img_tmp.astype(np.float32))
                        img_tmp = resnet(img_tmp)
                        l.append(img_tmp)
                        counter_img += 1
                        
                # stack all images and compute maximum
                l = np.stack(l)
                l = np.max(l, axis = 0)
                features.append(l)
                ids.append(id)
                
            except:
                dummy_image = np.zeros((1,256,256,3))
                dummy_image = resnet_pre(dummy_image)
                dummy_image = resnet(dummy_image)
                l.append(dummy_image)
                l = np.stack(l)
                l = np.max(l, axis = 0)
                features.append(l)
                ids.append(id)
                continue
            
        # stack all features of the images and leave out zeros features
        features = np.squeeze(np.stack(features))
        filter = np.nonzero(features.sum(axis = 0))[0]
        features = features[:,filter]
        print(len(filter), " features are nonzero.")
        
        features = features.tolist()
        print(counter_dummy, "dummy images were added.")
        
        return features, ids, counter_dummy, counter_img

    basis_df = df[["id","log_price"]]
    basis_df = basis_df.drop_duplicates()
    
    features = []
    ids = []
    
    # loop over all room categories
    for i in tqdm_notebook(np.unique(df["label"])):
        
        feat_cat, ids_cat, counter_dummy, counter_img = input_pipeline(i,df)
        df_tmp = pd.DataFrame({"features_"+str(i): feat_cat, "id": ids_cat})
        basis_df = pd.merge(basis_df, df_tmp, on = "id", how = "left")
        counter_overall_dummy += counter_dummy
        counter_overall_img += counter_img
        
    print(counter_overall_dummy)
    print(counter_overall_img)
    
    return basis_df
    

In [42]:
final_df = data_generator(df)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4356 [00:00<?, ?it/s]

2048  features are nonzero.
266 dummy images were added.


  0%|          | 0/4356 [00:00<?, ?it/s]

2048  features are nonzero.
42 dummy images were added.


  0%|          | 0/4356 [00:00<?, ?it/s]

2048  features are nonzero.
2316 dummy images were added.


  0%|          | 0/4356 [00:00<?, ?it/s]

2048  features are nonzero.
2428 dummy images were added.


  0%|          | 0/4356 [00:00<?, ?it/s]

2048  features are nonzero.
189 dummy images were added.


  0%|          | 0/4356 [00:00<?, ?it/s]

2048  features are nonzero.
342 dummy images were added.
5583
56704


In [43]:
# define input of model a tuples
def transform(ds):
    bath = tf.unstack(ds["bath"])
    bed = tf.unstack(ds["bed"])
    dining = tf.unstack(ds["dining"])
    hall = tf.unstack(ds["hall"])
    kitchen = tf.unstack(ds["kitchen"])
    living = tf.unstack(ds["living"])

    prices = ds["price"]
    
    return (bath, bed, dining, hall, kitchen, living), prices

In [44]:
X_train, X_test, X_val, y_train, y_test, y_val = load_data(drop_id = False)

train_ids = X_train["id"]
test_ids = X_test["id"]
val_ids = X_val["id"]

X_train = pd.merge(train_ids, final_df, on = "id", how = "left")
X_train = X_train.drop(["id","log_price"], axis = 1)
X_train.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]
filter = np.any(X_train.isna(), axis = 1)
X_train = X_train[~filter]
y_train = y_train[~filter]

X_test = pd.merge(test_ids, final_df, on = "id", how = "left")
X_test = X_test.drop(["id","log_price"], axis = 1)
X_test.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]
filter = np.any(X_test.isna(), axis = 1)
X_test = X_test[~filter]
y_test = y_test[~filter]

X_val = pd.merge(val_ids, final_df, on = "id", how = "left")
X_val = X_val.drop(["id","log_price"], axis = 1)
X_val.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]
filter = np.any(X_val.isna(), axis = 1)
X_val = X_val[~filter]
y_val = y_val[~filter]


In [45]:
# count how many training images we got
train_id_final = pd.merge(train_ids, final_df, on = "id", how = "left")["id"]
counter = 0
for i in train_id_final:
    counter += (df["id"] == i).sum()
counter

35785

In [46]:
# define weight constraint that ensures incoming weights for a layer to be greater than 0 and sum up top 1 
from tensorflow.keras import backend
class weight_constr(tf.keras.constraints.Constraint):
  ''' Constrains weight tensors to sum up to 1 and being greater than 0 '''

  def __init__(self):
    self.ref_value = 1

  def __call__(self, w):
    nonneg = w * tf.cast(tf.greater_equal(w, 0.), backend.floatx())
    sum_w = tf.reduce_sum(nonneg)
    nonneg_one = nonneg/sum_w
    return nonneg_one


In [47]:
# setup model
class Img_model(tf.keras.Model):

  def __init__(self, dropout = 0, l2 = 0, nodes1 = 512, nodes2 = 1):
    super().__init__()
    
    # bathroom
    self.bn1_bath = tf.keras.layers.BatchNormalization()
    self.drop1_bath = tf.keras.layers.Dropout(dropout)
    self.dense1_bath = tf.keras.layers.Dense(nodes1, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.bn2_bath = tf.keras.layers.BatchNormalization()
    self.drop2_bath = tf.keras.layers.Dropout(dropout)
    #self.dense2_bath = tf.keras.layers.Dense(nodes2, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.dense2_bath = tf.keras.layers.Dense(nodes2, kernel_regularizer = keras.regularizers.l2(l2))
 
    # bedroom
    self.bn1_bed = tf.keras.layers.BatchNormalization()
    self.drop1_bed = tf.keras.layers.Dropout(dropout)
    self.dense1_bed = tf.keras.layers.Dense(nodes1, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.bn2_bed = tf.keras.layers.BatchNormalization()
    self.drop2_bed = tf.keras.layers.Dropout(dropout)
    #self.dense2_bed = tf.keras.layers.Dense(nodes2, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.dense2_bed = tf.keras.layers.Dense(nodes2, kernel_regularizer = keras.regularizers.l2(l2))
       
      # dining
    self.bn1_dining = tf.keras.layers.BatchNormalization()
    self.drop1_dining = tf.keras.layers.Dropout(dropout)
    self.dense1_dining = tf.keras.layers.Dense(nodes1, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.bn2_dining = tf.keras.layers.BatchNormalization()
    self.drop2_dining = tf.keras.layers.Dropout(dropout)
    #self.dense2_dining = tf.keras.layers.Dense(nodes2, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.dense2_dining = tf.keras.layers.Dense(nodes2, kernel_regularizer = keras.regularizers.l2(l2))
   
    # hall
    self.bn1_hall = tf.keras.layers.BatchNormalization()
    self.drop1_hall = tf.keras.layers.Dropout(dropout)
    self.dense1_hall = tf.keras.layers.Dense(nodes1, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.bn2_hall = tf.keras.layers.BatchNormalization()
    self.drop2_hall = tf.keras.layers.Dropout(dropout)
    #self.dense2_hall = tf.keras.layers.Dense(nodes2, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.dense2_hall = tf.keras.layers.Dense(nodes2, kernel_regularizer = keras.regularizers.l2(l2))
    
      # kitchen
    self.bn1_kitchen = tf.keras.layers.BatchNormalization()
    self.drop1_kitchen = tf.keras.layers.Dropout(dropout)
    self.dense1_kitchen = tf.keras.layers.Dense(nodes1, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.bn2_kitchen = tf.keras.layers.BatchNormalization()
    self.drop2_kitchen = tf.keras.layers.Dropout(dropout)
    #self.dense2_kitchen = tf.keras.layers.Dense(nodes2, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.dense2_kitchen = tf.keras.layers.Dense(nodes2, kernel_regularizer = keras.regularizers.l2(l2))
  
      # livingroom
    self.bn1_living = tf.keras.layers.BatchNormalization()
    self.drop1_living = tf.keras.layers.Dropout(dropout)
    self.dense1_living = tf.keras.layers.Dense(nodes1, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.bn2_living = tf.keras.layers.BatchNormalization()
    self.drop2_living = tf.keras.layers.Dropout(dropout)
    #self.dense2_living = tf.keras.layers.Dense(nodes2, activation=tf.nn.relu, kernel_regularizer = keras.regularizers.l2(l2))
    self.dense2_living = tf.keras.layers.Dense(nodes2, kernel_regularizer = keras.regularizers.l2(l2))

    # final prediction
    self.bn_final = tf.keras.layers.BatchNormalization()
    self.drop_final = tf.keras.layers.Dropout(dropout)
    self.dense_final = tf.keras.layers.Dense(1, kernel_constraint =weight_constr())
    
  def call(self, inputs, training = None):
    
    # bathroom
    x = self.bn1_bath(inputs[0])
    x = self.drop1_bath(x)
    x = self.dense1_bath(x)
    x = self.bn2_bath(x)
    x = self.drop2_bath(x)
    bath_out = self.dense2_bath(x)
    
    
    # bedroom
    x = self.bn1_bed(inputs[1])
    x = self.drop1_bed(x)
    x = self.dense1_bed(x)
    x = self.bn2_bed(x)
    x = self.drop2_bed(x)
    bed_out = self.dense2_bed(x)
    
    # diningroom
    x = self.bn1_dining(inputs[2])
    x = self.drop1_dining(x)
    x = self.dense1_dining(x)
    x = self.bn2_dining(x)
    x = self.drop2_dining(x)
    dining_out = self.dense2_dining(x)
    
    # hallroom
    x = self.bn1_hall(inputs[3])
    x = self.drop1_hall(x)
    x = self.dense1_hall(x)
    x = self.bn2_hall(x)
    x = self.drop2_hall(x)
    hall_out = self.dense2_hall(x)
    
    # kitchen
    x = self.bn1_kitchen(inputs[4])
    x = self.drop1_kitchen(x)
    x = self.dense1_kitchen(x)
    x = self.bn2_kitchen(x)
    x = self.drop2_kitchen(x)
    kitchen_out = self.dense2_kitchen(x)
    
    # livingroom
    x = self.bn1_living(inputs[5])
    x = self.drop1_living(x)
    x = self.dense1_living(x)
    x = self.bn2_living(x)
    x = self.drop2_living(x)
    living_out = self.dense2_living(x)

    # join
    out = tf.keras.layers.concatenate([bath_out, bed_out, dining_out, hall_out, kitchen_out, living_out])#, others_out])
    out = self.bn_final(out)
    out = self.drop_final(out)
    return self.dense_final(out)

In [48]:
# define r^2 metric
def R_squared(y, y_pred):
  residual = tf.reduce_sum(tf.square(tf.subtract(y, y_pred)))
  total = tf.reduce_sum(tf.square(tf.subtract(y, tf.reduce_mean(y))))
  r2 = tf.subtract(1.0, tf.math.divide(residual, total))
  
  return r2

## Grid Search

In [None]:

# load data
X_train_init, X_test_init, X_val_init, y_train_init, y_test_init, y_val_init = load_data(drop_id = False)

# join train and validation data as we only need the indicies
X = pd.concat((X_train_init,X_val_init)).reset_index(drop = True)
y = pd.concat((y_train_init, y_val_init)).reset_index(drop=True)

# define metrics to track
para_do = []
para_l2 = []
para_nodes = []
best_val_mse = []
best_val_rsq = []
best_train_mse = []
best_train_rsq = []

kf = KFold(n_splits=5)

total = 5 * 3 * 3 * 2
with tqdm_notebook(total=total) as pbar:

    for do in [0,0.2,0.5]: # best 0.2, 0.001, 64 
        for l2 in [0,0.01,0.001]:
            for nodes in [64, 512]:
                best_val_mse_tmp = []
                best_val_rsq_tmp = []     
                best_train_mse_tmp = []
                best_train_rsq_tmp = []    
                for train_idx, val_idx in kf.split(X):
                    tf.random.set_seed(2)

                    # data processing
                    X_train = X.loc[train_idx,:]
                    X_val = X.loc[val_idx,:]
                    X_test = X_test_init

                    y_train = y[train_idx]
                    y_val = y[val_idx]
                    y_test = y_test_init

                    train_ids = X_train["id"]
                    test_ids = X_test_init["id"]
                    val_ids = X_val["id"]

                    X_train = pd.merge(train_ids, final_df, on = "id", how = "left")
                    X_train = X_train.drop(["id","log_price"], axis = 1)
                    X_train.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
                    filter = np.any(X_train.isna(), axis = 1).values
                    X_train = X_train[~filter]
                    y_train = y_train[~filter]

                    X_test = pd.merge(test_ids, final_df, on = "id", how = "left")
                    X_test = X_test.drop(["id","log_price"], axis = 1)
                    X_test.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
                    filter = np.any(X_test.isna(), axis = 1).values
                    X_test = X_test[~filter]
                    y_test = y_test[~filter]

                    X_val = pd.merge(val_ids, final_df, on = "id", how = "left")
                    X_val = X_val.drop(["id","log_price"], axis = 1)
                    X_val.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
                    filter = np.any(X_val.isna(), axis = 1).values
                    X_val = X_val[~filter]
                    y_val = y_val[~filter]
                    
                    batch_size = 128

                    # setup tf datasets
                    data_train = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_train["bath"])),
                                                                    "bed": np.squeeze(np.stack(X_train["bed"])),
                                                                    "dining": np.squeeze(np.stack(X_train["dining"])),
                                                                    "hall": np.squeeze(np.stack(X_train["hall"])),
                                                                    "kitchen": np.squeeze(np.stack(X_train["kitchen"])),
                                                                    "living": np.squeeze(np.stack(X_train["living"])),
                                                                    "price": y_train})
                    data_train = data_train.cache()
                    data_train = data_train.shuffle(6000, seed = 13)
                    train_dataset = data_train.take(len(y_train))
                    train_dataset = train_dataset.map(transform)
                    train_dataset = train_dataset.batch(batch_size)

                    data_test = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_test["bath"])),
                                                                    "bed": np.squeeze(np.stack(X_test["bed"])),
                                                                    "dining": np.squeeze(np.stack(X_test["dining"])),
                                                                    "hall": np.squeeze(np.stack(X_test["hall"])),
                                                                    "kitchen": np.squeeze(np.stack(X_test["kitchen"])),
                                                                    "living": np.squeeze(np.stack(X_test["living"])),
                                                                    "price": y_test})
                    data_test = data_test.cache()
                    test_dataset = data_test.take(len(y_test))
                    test_dataset = test_dataset.map(transform)
                    test_dataset = test_dataset.batch(batch_size)

                    data_val = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_val["bath"])),
                                                                    "bed": np.squeeze(np.stack(X_val["bed"])),
                                                                    "dining": np.squeeze(np.stack(X_val["dining"])),
                                                                    "hall": np.squeeze(np.stack(X_val["hall"])),
                                                                    "kitchen": np.squeeze(np.stack(X_val["kitchen"])),
                                                                    "living": np.squeeze(np.stack(X_val["living"])),
                                                                    "price": y_val})
                    data_val = data_val.cache()
                    val_dataset = data_val.take(len(y_val))
                    val_dataset = val_dataset.map(transform)
                    val_dataset = val_dataset.batch(batch_size)
                    
                    # setup model and train
                    model = Img_model(dropout= do, nodes1 = nodes, nodes2 = 1, l2 = l2)
                    
                    lr = tf.keras.optimizers.schedules.ExponentialDecay(0.01, decay_steps=100, decay_rate=0.9, staircase=False)
                    
                    model.compile(optimizer=keras.optimizers.Adam(lr),
                                    loss= "mse", metrics= R_squared)
                    tf.random.set_seed(2)
                    history = model.fit(train_dataset, epochs = 50, validation_data = val_dataset, verbose = 0)
                    best_val_mse_tmp.append(np.nanmin(np.array(history.history["val_loss"])))
                    best_val_rsq_tmp.append(np.nanmax(np.array(history.history["val_R_squared"])))
                    best_train_mse_tmp.append(np.nanmin(np.array(history.history["loss"])))
                    best_train_rsq_tmp.append(np.nanmax(np.array(history.history["R_squared"])))
                    print(np.nanmax(np.array(history.history["val_R_squared"])))
                    pbar.update(1)
                                            
                # save mean of metrics across folds                           
                best_val_mse.append(np.nanmean(np.array(best_val_mse_tmp)))
                best_val_rsq.append(np.nanmean(np.array(best_val_rsq_tmp)))
                best_train_mse.append(np.nanmean(np.array(best_train_mse_tmp)))
                best_train_rsq.append(np.nanmean(np.array(best_train_rsq_tmp)))
                
                # save parameter combination
                para_do.append(do)
                para_l2.append(l2)
                para_nodes.append(nodes)
pbar.close()
                

In [17]:
grid_results = pd.DataFrame({"Dropout": para_do, "L2": para_l2, "Nodes": para_nodes, "Val_MSE": best_val_mse, "Val_Rsq": best_val_rsq, "Train_MSE": best_train_mse, "Train_Rsq": best_train_rsq})
#grid_results = pd.read_csv("grid_results_priceimage.csv")
#grid_results.drop(grid_results.columns[0], axis = 1, inplace = True)

In [18]:
grid_results.to_csv("grid_results_priceimage.csv")

In [19]:
print(grid_results.to_latex(index = False))

\begin{tabular}{rrrrrrr}
\toprule
 Dropout &    L2 &  Nodes &  Val\_MSE &  Val\_Rsq &  Train\_MSE &  Train\_Rsq \\
\midrule
     0.0 & 0.000 &     64 & 0.340157 & 0.158453 &   0.009895 &   0.975213 \\
     0.0 & 0.000 &    512 & 0.320462 & 0.212454 &   0.008252 &   0.979390 \\
     0.0 & 0.010 &     64 & 0.472604 & 0.156609 &   0.195085 &   0.780643 \\
     0.0 & 0.010 &    512 & 0.553412 & 0.196390 &   0.293380 &   0.717599 \\
     0.0 & 0.001 &     64 & 0.400306 & 0.159742 &   0.085048 &   0.905126 \\
     0.0 & 0.001 &    512 & 0.423162 & 0.165022 &   0.115019 &   0.887467 \\
     0.2 & 0.000 &     64 & 0.308538 & 0.251441 &   0.299438 &   0.272605 \\
     0.2 & 0.000 &    512 & 0.305444 & 0.256064 &   0.286204 &   0.304451 \\
     0.2 & 0.010 &     64 & 0.625010 & 0.227250 &   0.756768 &  -0.063806 \\
     0.2 & 0.010 &    512 & 0.896431 & 0.215529 &   1.093091 &  -0.176111 \\
     0.2 & 0.001 &     64 & 0.487127 & 0.234388 &   0.518781 &   0.145123 \\
     0.2 & 0.001 &    512 & 0

In [20]:
# visualize grid search results
grid_res = grid_results.groupby(["Dropout","L2","Nodes"]).sum()
grid_res.columns = ["Val MSE", r"Val. R^2", "Train MSE", r"Train R^2"]
grid_nice = grid_res[[r"Val. R^2", r"Train R^2"]].style.background_gradient(cmap ='Greens')\
        .set_properties(**{'font-size': '10px'})
grid_nice

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Val. R^2,Train R^2
Dropout,L2,Nodes,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,64,0.158453,0.975213
0.0,0.0,512,0.212454,0.97939
0.0,0.001,64,0.159742,0.905126
0.0,0.001,512,0.165022,0.887467
0.0,0.01,64,0.156609,0.780643
0.0,0.01,512,0.19639,0.717599
0.2,0.0,64,0.251441,0.272605
0.2,0.0,512,0.256064,0.304451
0.2,0.001,64,0.234388,0.145123
0.2,0.001,512,0.2355,0.105202


In [22]:
grid_res = grid_results.groupby(["Dropout","L2","Nodes"]).sum()
grid_res.columns = ["Val MSE", r"Val. R^2", "Train MSE", r"Train R^2"]
grid_nice2 = grid_res[["Val MSE", "Train MSE"]].style.background_gradient(cmap ='Greens_r')\
        .set_properties(**{'font-size': '10px'})
grid_nice2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Val MSE,Train MSE
Dropout,L2,Nodes,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,64,0.340157,0.009895
0.0,0.0,512,0.320462,0.008252
0.0,0.001,64,0.400306,0.085048
0.0,0.001,512,0.423162,0.115019
0.0,0.01,64,0.472604,0.195085
0.0,0.01,512,0.553412,0.29338
0.2,0.0,64,0.308538,0.299438
0.2,0.0,512,0.305444,0.286204
0.2,0.001,64,0.487127,0.518781
0.2,0.001,512,0.609311,0.664445


Train again for best parameters to find best fold

In [28]:
weight_list = []

for i in np.arange(5):
    tf.random.set_seed(2)
    X_train, X_test, X_val, y_train, y_test, y_val = load_data_fold(fold = i+1,drop_id=False)
    
    train_ids = X_train["id"]
    test_ids = X_test["id"]
    val_ids = X_val["id"]

    X_train = pd.merge(train_ids, final_df, on = "id", how = "left")
    X_train = X_train.drop(["id","log_price"], axis = 1)
    X_train.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
    filter = np.any(X_train.isna(), axis = 1).values
    X_train = X_train[~filter]
    y_train = y_train[~filter]

    X_test = pd.merge(test_ids, final_df, on = "id", how = "left")
    X_test = X_test.drop(["id","log_price"], axis = 1)
    X_test.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
    filter = np.any(X_test.isna(), axis = 1).values
    X_test = X_test[~filter]
    y_test = y_test[~filter]

    X_val = pd.merge(val_ids, final_df, on = "id", how = "left")
    X_val = X_val.drop(["id","log_price"], axis = 1)
    X_val.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
    filter = np.any(X_val.isna(), axis = 1).values
    X_val = X_val[~filter]
    y_val = y_val[~filter]

    batch_size = 128

    data_train = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_train["bath"])),
                                                    "bed": np.squeeze(np.stack(X_train["bed"])),
                                                    "dining": np.squeeze(np.stack(X_train["dining"])),
                                                    "hall": np.squeeze(np.stack(X_train["hall"])),
                                                    "kitchen": np.squeeze(np.stack(X_train["kitchen"])),
                                                    "living": np.squeeze(np.stack(X_train["living"])),
                                                    "price": y_train})
    data_train = data_train.cache()
    data_train = data_train.shuffle(6000, seed = 13)
    train_dataset = data_train.take(len(y_train))
    train_dataset = train_dataset.map(transform)
    train_dataset = train_dataset.batch(batch_size)

    data_test = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_test["bath"])),
                                                    "bed": np.squeeze(np.stack(X_test["bed"])),
                                                    "dining": np.squeeze(np.stack(X_test["dining"])),
                                                    "hall": np.squeeze(np.stack(X_test["hall"])),
                                                    "kitchen": np.squeeze(np.stack(X_test["kitchen"])),
                                                    "living": np.squeeze(np.stack(X_test["living"])),
                                                    "price": y_test})
    data_test = data_test.cache()
    test_dataset = data_test.take(len(y_test))
    test_dataset = test_dataset.map(transform)
    test_dataset = test_dataset.batch(batch_size)

    data_val = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_val["bath"])),
                                                    "bed": np.squeeze(np.stack(X_val["bed"])),
                                                    "dining": np.squeeze(np.stack(X_val["dining"])),
                                                    "hall": np.squeeze(np.stack(X_val["hall"])),
                                                    "kitchen": np.squeeze(np.stack(X_val["kitchen"])),
                                                    "living": np.squeeze(np.stack(X_val["living"])),
                                                    "price": y_val})
    data_val = data_val.cache()
    val_dataset = data_val.take(len(y_val))
    val_dataset = val_dataset.map(transform)
    val_dataset = val_dataset.batch(batch_size)

    model = Img_model(dropout= 0.2, nodes1 = 64, nodes2 = 1, l2 = 0)

    lr = tf.keras.optimizers.schedules.ExponentialDecay(0.01, decay_steps=100, decay_rate=0.9, staircase=False)
    
    tf.random.set_seed(2)
    
    chkpt_path = "img_models/price/" + str(i) + "/checkpoint"
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = chkpt_path, monitor='val_R_squared', save_best_only=True, mode='max')

    model.compile(optimizer=keras.optimizers.Adam(lr),
                    loss= "mse", metrics= ["mae", R_squared])
    history = model.fit(train_dataset, epochs = 300, validation_data = val_dataset, verbose = 0, callbacks=[checkpoint_callback])
    
    print("Val loss: " ,np.nanmin(np.array(history.history["val_loss"])))
    print("Val MAE: " ,np.nanmin(np.array(history.history["val_mae"])))
    print("Val R2: ", np.nanmax(np.array(history.history["val_R_squared"])))
    
    model.load_weights(chkpt_path)
    weight_list.append(model.layers[-1].get_weights()[0])
    
    model.evaluate(test_dataset)
                            

2022-02-28 22:00:23.287004: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-28 22:00:30.543445: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/0/checkpoint

2022-02-28 22:49:27.523503: W tensorflow/core/util/tensor_slice_reader.cc:95] Could not open img_models/price/0/checkpoint: Failed precondition: img_models/price/0/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?




2022-02-28 22:50:36.932936: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-28 22:50:46.621297: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/1/checkpoint

2022-02-28 23:41:32.134695: W tensorflow/core/util/tensor_slice_reader.cc:95] Could not open img_models/price/1/checkpoint: Failed precondition: img_models/price/1/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?




2022-02-28 23:42:34.628784: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-28 23:42:42.899754: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/2/checkpoint

2022-03-01 00:35:23.937075: W tensorflow/core/util/tensor_slice_reader.cc:95] Could not open img_models/price/2/checkpoint: Failed precondition: img_models/price/2/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?




2022-03-01 00:36:26.427959: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-03-01 00:36:35.328546: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/3/checkpoint

2022-03-01 01:27:54.967324: W tensorflow/core/util/tensor_slice_reader.cc:95] Could not open img_models/price/3/checkpoint: Failed precondition: img_models/price/3/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?




2022-03-01 01:29:09.937629: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-03-01 01:29:18.448875: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint/assets
INFO:tensorflow:Assets written to: img_models/price/4/checkpoint

2022-03-01 02:20:04.778132: W tensorflow/core/util/tensor_slice_reader.cc:95] Could not open img_models/price/4/checkpoint: Failed precondition: img_models/price/4/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?




In [29]:
weight_list

[array([[0.14073993],
        [0.13005005],
        [0.10052551],
        [0.09928927],
        [0.15316834],
        [0.37622693]], dtype=float32),
 array([[ 0.22747917],
        [ 0.2016276 ],
        [ 0.13439326],
        [ 0.15231746],
        [ 0.28418252],
        [-0.        ]], dtype=float32),
 array([[0.19079134],
        [0.18551785],
        [0.1290203 ],
        [0.13198824],
        [0.23174208],
        [0.13094015]], dtype=float32),
 array([[0.20265608],
        [0.17804036],
        [0.13247567],
        [0.1359714 ],
        [0.23287763],
        [0.11797892]], dtype=float32),
 array([[ 0.22712818],
        [ 0.21414454],
        [ 0.1467258 ],
        [ 0.15238139],
        [ 0.25962007],
        [-0.        ]], dtype=float32)]

## Train final model

In [49]:
batch_size = 128
tf.random.set_seed(2)

X_train, X_test, X_val, y_train, y_test, y_val = load_data_fold(fold = 1 ,drop_id=False)

train_ids = X_train["id"]
test_ids = X_test["id"]
val_ids = X_val["id"]

X_train = pd.merge(train_ids, final_df, on = "id", how = "left")
X_train = X_train.drop(["id","log_price"], axis = 1)
X_train.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
filter = np.any(X_train.isna(), axis = 1).values
X_train = X_train[~filter]
y_train = y_train[~filter]

X_test = pd.merge(test_ids, final_df, on = "id", how = "left")
X_test = X_test.drop(["id","log_price"], axis = 1)
X_test.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
filter = np.any(X_test.isna(), axis = 1).values
X_test = X_test[~filter]
y_test = y_test[~filter]

X_val = pd.merge(val_ids, final_df, on = "id", how = "left")
X_val = X_val.drop(["id","log_price"], axis = 1)
X_val.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]#, "others"]
filter = np.any(X_val.isna(), axis = 1).values
X_val = X_val[~filter]
y_val = y_val[~filter]

data_train = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_train["bath"])),
                                                 "bed": np.squeeze(np.stack(X_train["bed"])),
                                                 "dining": np.squeeze(np.stack(X_train["dining"])),
                                                 "hall": np.squeeze(np.stack(X_train["hall"])),
                                                 "kitchen": np.squeeze(np.stack(X_train["kitchen"])),
                                                 "living": np.squeeze(np.stack(X_train["living"])),
                                                 "price": y_train})
data_train = data_train.cache()
data_train = data_train.shuffle(6000, seed = 13)
train_dataset = data_train.take(len(y_train))
train_dataset = train_dataset.map(transform)
train_dataset = train_dataset.batch(batch_size)

data_test = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_test["bath"])),
                                                 "bed": np.squeeze(np.stack(X_test["bed"])),
                                                 "dining": np.squeeze(np.stack(X_test["dining"])),
                                                 "hall": np.squeeze(np.stack(X_test["hall"])),
                                                 "kitchen": np.squeeze(np.stack(X_test["kitchen"])),
                                                 "living": np.squeeze(np.stack(X_test["living"])),
                                                 "price": y_test})
data_test = data_test.cache()
test_dataset = data_test.take(len(y_test))
test_dataset = test_dataset.map(transform)
test_dataset = test_dataset.batch(batch_size)

data_val = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(X_val["bath"])),
                                                 "bed": np.squeeze(np.stack(X_val["bed"])),
                                                 "dining": np.squeeze(np.stack(X_val["dining"])),
                                                 "hall": np.squeeze(np.stack(X_val["hall"])),
                                                 "kitchen": np.squeeze(np.stack(X_val["kitchen"])),
                                                 "living": np.squeeze(np.stack(X_val["living"])),
                                                 "price": y_val})
data_val = data_val.cache()
val_dataset = data_val.take(len(y_val))
val_dataset = val_dataset.map(transform)
val_dataset = val_dataset.batch(batch_size)

In [50]:
# track and save training
logdir = "logs/price/ensemble/final/log_price_atleast4"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = logdir+"/checkpoint.ckpt", monitor='val_R_squared', save_best_only=True, mode='max')
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

model = Img_model(dropout= 0.2, nodes1 = 64, nodes2 = 1, l2 = 0)
lr = tf.keras.optimizers.schedules.ExponentialDecay(0.01, decay_steps=100, decay_rate=0.9, staircase=False)

tf.random.set_seed(2)
model.compile(optimizer=keras.optimizers.Adam(lr),
                loss= "mse", metrics= R_squared)
model.fit(train_dataset, epochs = 300, validation_data = val_dataset, callbacks = [checkpoint_callback, tensorboard_callback])

2022-03-01 18:17:56.667871: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2022-03-01 18:17:56.668198: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2022-03-01 18:17:56.669885: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.


Epoch 1/300


2022-03-01 18:17:58.356278: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 2/22 [=>............................] - ETA: 5s - loss: 22.5865 - R_squared: -55.1857 

2022-03-01 18:17:59.578147: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2022-03-01 18:17:59.578176: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2022-03-01 18:17:59.715085: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-03-01 18:17:59.729837: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2022-03-01 18:17:59.739807: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: logs/price/ensemble/final/log_price_atleast4/train/plugins/profile/2022_03_01_18_17_59
2022-03-01 18:17:59.744309: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to logs/price/ensemble/final/log_price_atleast4/train/plugins/profile/2022_03_01_18_17_59/Air-von-Dominik.fritz.box.trace.json.gz
2022-03-01 18:17:59.761796: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creat



2022-03-01 18:18:05.487471: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-03-01 18:18:08.006465: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 8/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 9/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 10/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 11/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 12/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 13/300
INFO:tensorflow:Assets written to: logs/price/ensemble/final/log_price_atleast4/checkpoint.ckpt/assets
Epoch 14/300
INFO:tensorflow

<tensorflow.python.keras.callbacks.History at 0x3dab6d6d0>

In [None]:
# final weights to see the contribution - for last epoch
model.layers[-1].get_weights()[0]

In [8]:
# reload best epoch
model = Img_model(dropout= 0.2, nodes1 = 64, nodes2 = 1, l2 = 0)
model.compile(optimizer=keras.optimizers.Adam(0.1), loss= "mse", metrics= ["mae", R_squared])
#ckp = "logs/price/ensemble/final/log_price_atleast4/checkpoint"
ckp = "img_models/price/0/checkpoint"
model.load_weights(ckp)

2022-03-01 09:24:41.079609: W tensorflow/core/util/tensor_slice_reader.cc:95] Could not open img_models/price/0/checkpoint: Failed precondition: img_models/price/0/checkpoint; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2ab7570d0>

In [None]:
# final weights to see the contribution - for best epoch
model.layers[-1].get_weights()[0]

In [34]:
cols= ['weight']
index = ["bath", "bed", "dining", "hall", "kitchen", "living"]

weights_rooms = pd.DataFrame(model.layers[-1].get_weights()[0], index=index, columns=cols)

print(weights_rooms.to_latex())

\begin{tabular}{lr}
\toprule
{} &    weight \\
\midrule
bath    &  0.140740 \\
bed     &  0.130050 \\
dining  &  0.100526 \\
hall    &  0.099289 \\
kitchen &  0.153168 \\
living  &  0.376227 \\
\bottomrule
\end{tabular}



In [36]:
# evaluate test data
model.evaluate(test_dataset)

2022-03-01 07:46:00.915352: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




[0.2966732978820801, 0.4159461259841919, 0.24393852055072784]

# MUNICH

In [10]:
# load information about data path
img_df = pd.read_csv("munich/img_path.csv")
img_df.reset_index(drop=True, inplace = True)


In [11]:
# prices
url_listing = "http://data.insideairbnb.com/germany/bv/munich/2021-12-24/data/listings.csv.gz"
listings = pd.read_csv(url_listing)
urls = listings["listing_url"]
ids = listings["id"]
price = listings["price"]
price = price.str.replace("$","")
price = price.str.replace(",","")
price = price.astype(float)
listings["price"] = price
listings["log_price"] = np.log(price)
listings = listings[listings["price"]<500]

divide by zero encountered in log


In [12]:
# bring images and prices together
price_df = listings[["log_price", "id"]]
# merge image and price data together to get the corresponding price for each image
df = pd.merge(img_df, price_df, on = "id", how = "left")
df.drop(df.columns[0], axis =1, inplace = True)
df.head()

Unnamed: 0,id,img_path,label,log_price
0,3929265,3929265_4.png,1,3.663562
1,24351617,24351617_5.png,1,4.043051
2,32173477,32173477_2.png,5,5.257495
3,15117566,15117566_1.png,1,3.912023
4,38813685,38813685_1.png,5,5.393628


In [13]:
def data_generator(df):
    ''' Extracts the features from a Resnet50 '''
    counter_overall_dummy = 0
    counter_overall_img = 0

    def input_pipeline(room = 0, df = df):
        ids = []
        features = []
        
        # Filter df by room type
        df_room = df[df["label"] == room]
        
        # load resnet
        resnet = tf.keras.applications.resnet.ResNet50(include_top=False, weights='imagenet', pooling="avg", input_shape = (None,None,3))
        resnet_pre = keras.applications.resnet50.preprocess_input
        resnet.trainable = False
        
        # count dummies and images overall
        counter_dummy = 0
        counter_img = 0
        
        # for every id route all images of a room through the resnet50
        for id in tqdm_notebook(np.unique(df["id"])):
            filter = df_room["id"] == id  

            # if an error occurs just add a black images otherwise get resnet features
            try:
                l = []
                
                # if no image of the room is present add a black image and route it throug the resnet
                if filter.sum() == 0:
                    dummy_image = np.zeros((1,256,256,3))
                    dummy_image = resnet_pre(dummy_image)
                    dummy_image = resnet(dummy_image)
                    l.append(dummy_image)
                    counter_dummy += 1             
                
                # load image and get features
                else:
                    path_id = df_room["img_path"][filter]
                    for p in path_id:
                        img_tmp = cv2.imread("munich/images/"+p)
                        img_tmp = cv2.resize(img_tmp, dsize=(256, 256))
                        img_tmp = np.expand_dims(img_tmp, axis = 0)
                        img_tmp = resnet_pre(img_tmp.astype(np.float32))
                        img_tmp = resnet(img_tmp)
                        l.append(img_tmp)
                        counter_img += 1
                        
                # stack all images and compute maximum
                l = np.stack(l)
                l = np.max(l, axis = 0)
                features.append(l)
                ids.append(id)
                
            except:
                dummy_image = np.zeros((1,256,256,3))
                dummy_image = resnet_pre(dummy_image)
                dummy_image = resnet(dummy_image)
                l.append(dummy_image)
                l = np.stack(l)
                l = np.max(l, axis = 0)
                features.append(l)
                ids.append(id)
                continue
            
        # stack all features of the images and leave out zeros features
        features = np.squeeze(np.stack(features))
        filter = np.nonzero(features.sum(axis = 0))[0]
        features = features[:,filter]
        print(len(filter), " features are nonzero.")
        
        features = features.tolist()
        print(counter_dummy, "dummy images were added.")
        
        return features, ids, counter_dummy, counter_img

    basis_df = df[["id","log_price"]]
    basis_df = basis_df.drop_duplicates()
    
    features = []
    ids = []
    
    # loop over all room categories
    for i in tqdm_notebook(np.unique(df["label"])):
        
        feat_cat, ids_cat, counter_dummy, counter_img = input_pipeline(i,df)
        df_tmp = pd.DataFrame({"features_"+str(i): feat_cat, "id": ids_cat})
        basis_df = pd.merge(basis_df, df_tmp, on = "id", how = "left")
        counter_overall_dummy += counter_dummy
        counter_overall_img += counter_img
        
    print(counter_overall_dummy)
    print(counter_overall_img)
    
    return basis_df

In [14]:
# filter out all NAs and room category "Others"
filter = np.any(df.isna(), axis = 1)
df = df[~filter]

# drop "others" as we are only interested in the rooms
filter = df["label"] == 6.0
df = df[~filter]

# use only listings with images for 4 or categories 
bool = []
for id in df["id"]:
    tmp = df[df["id"] == id]
    if len(np.unique(tmp["label"])) >= 4:
        bool.append(True)
    else:
        bool.append(False)
df_munich = df[bool].copy()
df_munich["city"] = "Munich"

In [15]:
final_df_munich = data_generator(df_munich)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/3078 [00:00<?, ?it/s]

2048  features are nonzero.
70 dummy images were added.


  0%|          | 0/3078 [00:00<?, ?it/s]

2048  features are nonzero.
60 dummy images were added.


  0%|          | 0/3078 [00:00<?, ?it/s]

2048  features are nonzero.
1840 dummy images were added.


  0%|          | 0/3078 [00:00<?, ?it/s]

2048  features are nonzero.
1743 dummy images were added.


  0%|          | 0/3078 [00:00<?, ?it/s]

2048  features are nonzero.
187 dummy images were added.


  0%|          | 0/3078 [00:00<?, ?it/s]

2048  features are nonzero.
272 dummy images were added.
4172
38857


In [16]:
feat_munich = final_df_munich.dropna()

In [31]:
url_listing = "http://data.insideairbnb.com/ireland/leinster/dublin/2021-11-07/data/listings.csv.gz"
listings = pd.read_csv(url_listing)

# remove extreme prices
price = listings["price"]
price = price.str.replace("$","")
price = price.str.replace(",","")
price = price.astype(float)
filter = price < 500
price = price[filter.values]
log_price = np.log(price)


In [32]:
# bring the prices on the same scale
dub_mean = log_price.mean()
dub_sd = log_price.std()

label_munich = feat_munich["log_price"]
label_munich_tmp = (label_munich.values-label_munich.values.mean())/label_munich.values.std()
label_munich = (label_munich_tmp * dub_sd) + dub_mean

In [33]:
# define input of model a tuples
def transform(ds):
    bath = tf.unstack(ds["bath"])
    bed = tf.unstack(ds["bed"])
    dining = tf.unstack(ds["dining"])
    hall = tf.unstack(ds["hall"])
    kitchen = tf.unstack(ds["kitchen"])
    living = tf.unstack(ds["living"])

    prices = ds["price"]
    
    return (bath, bed, dining, hall, kitchen, living), prices

In [34]:
batch_size = 128

data = feat_munich.drop(["id","log_price"], axis = 1)
data.columns = ["bath", "bed", "dining", "hall", "kitchen", "living"]
filter = np.any(data.isna(), axis = 1)
data = data[~filter]
data = data[~filter]

data = tf.data.Dataset.from_tensor_slices({"bath": np.squeeze(np.stack(data["bath"])),
                                                "bed": np.squeeze(np.stack(data["bed"])),
                                                "dining": np.squeeze(np.stack(data["dining"])),
                                                "hall": np.squeeze(np.stack(data["hall"])),
                                                "kitchen": np.squeeze(np.stack(data["kitchen"])),
                                                "living": np.squeeze(np.stack(data["living"])),
                                                "price": label_munich})
data = data.cache()
data = data.take(len(label_munich))
data = data.map(transform)
data = data.batch(batch_size)

In [35]:
model.evaluate(data)



[0.4321860074996948, 0.5174143314361572, -0.025054719299077988]