In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers, Model, Input, applications
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

import torch

In [2]:
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# batch_size = 8
batch_size = 16

epochs = 60
# epochs = 30

In [4]:
main_directory = "/kaggle/input/nutrition5k/"

In [5]:
# folder and files names
folder = main_directory+"Nutritions5k/realsense_overhead"
types_of_photo = ["depth_color.png", "depth_raw.png", "rgb.png"]

# read files to understand which to put to train and which to test
with open(main_directory+"Nutritions5k/dish_ids/splits/rgb_train_ids.txt") as rgb_train_splits:
    rgb_train_splits = [i.replace('\n', '') for i in rgb_train_splits.readlines()]
    
with open(main_directory+"Nutritions5k/dish_ids/splits/rgb_test_ids.txt") as rgb_test_splits:
    rgb_test_splits = [i.replace('\n', '') for i in rgb_test_splits.readlines()]
    
# take csv files which contains output values per dish
real_column_names = ["dish_id", "total_calories", "total_mass", "total_fat", "total_carb", "total_protein"]
dish_metadata_cafe1 = pd.read_csv(main_directory+"Nutritions5k/metadata/dish_metadata_cafe1.csv", on_bad_lines='skip')
dish_metadata_cafe2 = pd.read_csv(main_directory+"Nutritions5k/metadata/dish_metadata_cafe2.csv", on_bad_lines='skip')
dish_metadata_cafe1 = dish_metadata_cafe1[dish_metadata_cafe1.columns[:6]]
dish_metadata_cafe2 = dish_metadata_cafe2[dish_metadata_cafe2.columns[:6]]

# columns it's actual values from the dataset, so put it inside of the DataFrame
# and then change columns to actual names of columns
dish_metadata_cafe1.loc[dish_metadata_cafe1.iloc[-1].name+1] = dish_metadata_cafe1.columns
dish_metadata_cafe2.loc[dish_metadata_cafe2.iloc[-1].name+1] = dish_metadata_cafe2.columns
dish_metadata_cafe1.columns = real_column_names
dish_metadata_cafe2.columns = real_column_names
    
    
rgb_imgs_train = []
rgb_imgs_metadata_train = []
rgb_imgs_test = []
rgb_imgs_metadata_test = []

depth_imgs_train = []
depth_imgs_test = []

# # parameters
image_size = (256,256)
# image_size = (384,384)
# image_size = (224,224)

# go through all photos
for dish_folder in os.listdir(folder):
    # read image and convert it to rgb
    img = cv2.imread(folder+'/'+dish_folder+'/'+'rgb.png')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, image_size)
    
    depth_color = cv2.imread(folder+'/'+dish_folder+'/'+'depth_color.png')
    depth_color = cv2.cvtColor(depth_color, cv2.COLOR_BGR2RGB)
    depth_color = cv2.resize(depth_color, image_size)
    
    # take nutrition values and mass of food
    if dish_folder in dish_metadata_cafe1["dish_id"].values:
        dish_metadata = dish_metadata_cafe1[dish_metadata_cafe1["dish_id"] == dish_folder].values[0][1:]
    elif dish_folder in dish_metadata_cafe2["dish_id"].values:
        dish_metadata = dish_metadata_cafe2[dish_metadata_cafe2["dish_id"] == dish_folder].values[0][1:]
    else:
        continue
    
    if dish_folder in rgb_train_splits:
        rgb_imgs_train.append(img)
        depth_imgs_train.append(depth_color)
        rgb_imgs_metadata_train.append(dish_metadata)
    elif dish_folder in rgb_test_splits:
        rgb_imgs_test.append(img)
        depth_imgs_test.append(depth_color)
        rgb_imgs_metadata_test.append(dish_metadata)
    
    
rgb_imgs_metadata_train = np.array([i.astype(np.float32) for i in rgb_imgs_metadata_train])
rgb_imgs_metadata_test = np.array([i.astype(np.float32) for i in rgb_imgs_metadata_test])

rgb_imgs_train = np.array(rgb_imgs_train)
rgb_imgs_test = np.array(rgb_imgs_test)

depth_imgs_train = np.array(depth_imgs_train)
depth_imgs_test = np.array(depth_imgs_test)

In [6]:
print("Train max -->", rgb_imgs_metadata_train.max(axis=0))
print("Test max -->", rgb_imgs_metadata_test.max(axis=0))
print()
print("Train min -->", rgb_imgs_metadata_train.min(axis=0))
print("Test min -->", rgb_imgs_metadata_test.min(axis=0))
print()
print("Train mean -->", rgb_imgs_metadata_train.mean(axis=0))
print("Test mean -->", rgb_imgs_metadata_test.mean(axis=0))
print()
print("Train median -->", np.median(rgb_imgs_metadata_train, axis=0))
print("Test median -->", np.median(rgb_imgs_metadata_test, axis=0))

Train max --> [3943.3252   3051.        106.343     844.5686    120.443954]
Test max --> [1050.5111  871.       84.153    85.81     84.645 ]

Train min --> [0. 5. 0. 0. 0.]
Test min --> [0. 1. 0. 0. 0.]

Train mean --> [229.09233  199.97179   11.444165  18.087877  15.380763]
Test mean --> [232.27643  182.58667   11.479245  19.038528  14.982078]

Train median --> [176.14001  161.         7.169495  13.860001   8.698   ]
Test median --> [185.83298   143.          6.9696765  14.597746    8.567883 ]


In [7]:
# outliers_upper = np.where(rgb_imgs_metadata_train[:, 0] >= np.percentile(rgb_imgs_metadata_train[:, 0], 99))[0]
# outliers_lower = np.where(rgb_imgs_metadata_train[:, 0] <= np.percentile(rgb_imgs_metadata_train[:, 0], 1))[0]

# indexes_without_outliers = [i for i in range(len(rgb_imgs_metadata_train)) if i not in outliers_upper.tolist()+outliers_lower.tolist()]

In [8]:
# # worser results
# rgb_imgs_train, depth_imgs_train, rgb_imgs_metadata_train = rgb_imgs_train[indexes_without_outliers],\
#                                                             depth_imgs_train[indexes_without_outliers],\
#                                                             rgb_imgs_metadata_train[indexes_without_outliers]

# Augmentation and division dataset to train, validation

In [9]:
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical", seed=42),
#     tf.keras.layers.RandomRotation(0.2, seed=42),
#     tf.keras.layers.RandomZoom(0.1, seed=42)
    tf.keras.layers.RandomRotation(0.1, seed=42),
    tf.keras.layers.RandomZoom(0.05, seed=42),
    tf.keras.layers.RandomContrast(0.1, seed=42),
    tf.keras.layers.RandomBrightness(0.1, seed=42),
    tf.keras.layers.RandomTranslation(height_factor=0.1, width_factor=0.1)
])


def augment_image(rgb_img, depth_img, threshold=0.5):
    image = np.concatenate([rgb_img, depth_img], axis=-1)
    # Randomly choose to apply augmentation based on the defined probability
    if tf.random.uniform([]) < threshold:
        image = data_augmentation(image)
        
    augmented_rgb = image[..., :3]  # First three channels for RGB
    augmented_depth = image[..., 3:]  # Last channel for depth
    
    return augmented_rgb, augmented_depth

In [10]:
# divide train to train and validation
train_indexes = [i for i in range(len(rgb_imgs_train))]
validation_len = int(len(train_indexes) * 0.1)

validation_indexes = np.random.choice(train_indexes, validation_len)
train_indexes = [i for i in train_indexes if i not in validation_indexes]

# take data only for validation
rgb_imgs_validation = rgb_imgs_train[validation_indexes]
depth_imgs_validation = depth_imgs_train[validation_indexes]
rgb_imgs_metadata_validation = rgb_imgs_metadata_train[validation_indexes]

# take data only for train
rgb_imgs_train = rgb_imgs_train[train_indexes]
depth_imgs_train = depth_imgs_train[train_indexes]
rgb_imgs_metadata_train = rgb_imgs_metadata_train[train_indexes]

In [11]:
augmented_imgs = np.array([augment_image(i, j) for i, j in zip(rgb_imgs_train, depth_imgs_train)]) / 255
rgb_imgs_train, depth_imgs_train = augmented_imgs[:, 0], augmented_imgs[:, 1]

# rgb_imgs_train, depth_imgs_train = rgb_imgs_train / 255, depth_imgs_train / 255

rgb_imgs_test = rgb_imgs_test / 255
rgb_imgs_validation = rgb_imgs_validation / 255


del augmented_imgs

In [12]:
depth_imgs_validation = depth_imgs_validation / 255
depth_imgs_test = depth_imgs_test / 255

In [13]:
if image_size == (384,384):
    indexes = [i for i in range(len(rgb_imgs_train))]
    np.random.shuffle(indexes)
    indexes = indexes[:1400]
    rgb_imgs_train = rgb_imgs_train[indexes]
    depth_imgs_train = depth_imgs_train[indexes]
    rgb_imgs_metadata_train = rgb_imgs_metadata_train[indexes]

# Model without depth

In [14]:
def get_callbacks():
    early_stopping = EarlyStopping(
        monitor='val_loss',       # Monitor validation loss
        patience=5,               # Number of epochs to wait before stopping
        min_delta=0.01,           # Minimum change to qualify as improvement
        mode='min',               # Stop when the metric is minimizing (val_loss is decreasing)
        restore_best_weights=True # Restore model weights from the epoch with the best value
    )
    return [early_stopping]

In [15]:
# "total_calories", "total_mass", "total_fat", "total_carb", "total_protein"
def custom_multitask_loss(y_true, y_pred):
    y_true_macro, y_true_cal, y_true_weight = y_true[:, 2:], y_true[:, 0], y_true[:, 1]
    y_pred_macro, y_pred_cal, y_pred_weight = y_pred[:, 2:], y_pred[:, 0], y_pred[:, 1]

    # lm
    macro_loss = tf.reduce_mean(tf.abs(y_pred_macro - y_true_macro), axis=1)
    # lc
    calorie_loss = tf.abs(y_pred_cal - y_true_cal)
    # lw
    weight_loss = tf.abs(y_pred_weight - y_true_weight)

#     total_loss = macro_loss + calorie_loss + weight_loss
    total_loss = tf.reduce_mean(macro_loss + calorie_loss + weight_loss)

    
    return total_loss

In [16]:
def geometric_mean_loss(y_true, y_pred):
    # Split ground truth and predicted values into separate tensors
    y_cal_true, y_mass_true, y_fat_true, y_carb_true, y_protein_true = tf.split(y_true, num_or_size_splits=5, axis=-1)
    y_cal_pred, y_mass_pred, y_fat_pred, y_carb_pred, y_protein_pred = tf.split(y_pred, num_or_size_splits=5, axis=-1)

    # Calculate L1 losses for each subtask
    L_cal = tf.reduce_mean(tf.abs(y_cal_true - y_cal_pred))
    L_mass = tf.reduce_mean(tf.abs(y_mass_true - y_mass_pred))
    L_fat = tf.reduce_mean(tf.abs(y_fat_true - y_fat_pred))
    L_carb = tf.reduce_mean(tf.abs(y_carb_true - y_carb_pred))
    L_protein = tf.reduce_mean(tf.abs(y_protein_true - y_protein_pred))

    # Combine losses using geometric mean
    L_total = (L_cal * L_mass * L_fat * L_carb * L_protein) ** (1/5)

    return L_total

In [17]:
# NOTE: delete after test
class ChannelAttention(layers.Layer):
    def __init__(self, ratio=8, **kwargs):
        super(ChannelAttention, self).__init__(**kwargs)
        self.ratio = ratio
    
    def build(self, input_shape):
        channel = input_shape[-1]
        self.shared_dense_one = layers.Dense(channel // self.ratio, activation='relu', use_bias=True)
        self.shared_dense_two = layers.Dense(channel, use_bias=True)
    
    def call(self, input_feature):
        avg_pool = layers.GlobalAveragePooling2D()(input_feature)
        avg_pool = layers.Reshape((1, 1, input_feature.shape[-1]))(avg_pool)
        avg_pool = self.shared_dense_one(avg_pool)
        avg_pool = self.shared_dense_two(avg_pool)
        
        max_pool = layers.GlobalMaxPooling2D()(input_feature)
        max_pool = layers.Reshape((1, 1, input_feature.shape[-1]))(max_pool)
        max_pool = self.shared_dense_one(max_pool)
        max_pool = self.shared_dense_two(max_pool)
        
        cbam_feature = layers.Add()([avg_pool, max_pool])
        cbam_feature = layers.Activation('sigmoid')(cbam_feature)
        
        return layers.Multiply()([input_feature, cbam_feature])

# Custom Layer for Spatial Attention
class SpatialAttention(layers.Layer):
    def __init__(self, **kwargs):
        super(SpatialAttention, self).__init__(**kwargs)
        self.conv = None
    
    def build(self, input_shape):
        self.conv = layers.Conv2D(filters=1, kernel_size=7, strides=1, padding='same', activation='sigmoid', use_bias=False)
    
    def call(self, input_feature):
        avg_pool = tf.reduce_mean(input_feature, axis=-1, keepdims=True)
        max_pool = tf.reduce_max(input_feature, axis=-1, keepdims=True)
        concat = layers.Concatenate(axis=-1)([avg_pool, max_pool])
        
        cbam_feature = self.conv(concat)
        
        return layers.Multiply()([input_feature, cbam_feature])

# CBAM Block using the Custom Layers
def cbam_block(cbam_input, ratio=8):
    x = ChannelAttention(ratio=ratio)(cbam_input)
    x = SpatialAttention()(x)
    return x
# NOTE: delete after test

In [18]:
def nutrition5k(image_size=(256,256,3), version="v2", n_outputs=5, loss="custom", original=False):
    # Build the model
    inputs = tf.keras.Input(shape=image_size)
    
    if version == "v2":
        # Load Inception V3 as the feature extractor, excluding the top layers
        model = applications.InceptionResNetV2(weights='imagenet', include_top=False, input_tensor=inputs, input_shape=image_size)
        model.trainable = False
        x = model.get_layer(model.layers[-1].name).output
       
    elif version == "v3":
        model = applications.InceptionV3(weights='imagenet', include_top=False, input_tensor=inputs, input_shape=image_size)
        model.trainable = False
        x = model.get_layer("mixed5").output
    else:
        raise TypeError("This version does not exist, please choose between 'v2', 'v3'")

    if not original:
        x = cbam_block(x)
        x = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(x)
        x = layers.GlobalAveragePooling2D(name="feature_extractor")(x)
        
        x_1 = layers.Dense(512)(x)
        x_2 = layers.Dense(512)(x)
        x_3 = layers.Dense(512)(x)
        x_4 = layers.Dense(512)(x)

        x = layers.Add()([x_1, x_2, x_3, x_4])

        
    if original:
        # Apply [3, 3] average pooling with stride 2 and valid padding
        x = layers.AveragePooling2D(pool_size=(3, 3), strides=2, padding='valid')(x)
        x = layers.Flatten()(x)

    # Fully connected layers
    x = layers.Dense(4096)(x)
    x = layers.Dense(4096)(x)
    x = layers.Dense(4096)(x)
    
    
    outputs = layers.Dense(n_outputs)(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    if original:
        # Compile the model with a suitable loss function for regression tasks
        optimizer = keras.optimizers.RMSprop(learning_rate=1e-4,
                                             momentum=0.9,
                                             epsilon=1.0,
                                             weight_decay=0.9)
    else:
        optimizer = keras.optimizers.Adam(learning_rate=1e-4)
    
    if loss == "custom":
        loss_func = custom_multitask_loss
    elif loss == "geometric":
        loss_func = geometric_mean_loss
    elif loss == "mse":
        loss_func = "mean_squared_error"
    else:
        loss_func = "mean_absolute_error"

    model.compile(optimizer=optimizer, loss=loss_func, metrics=["mae"])
    
    return model

In [19]:
def test_model(model, x, y_real):
    test_predicted = model.predict(x)

    test_errors = np.abs(test_predicted - y_real)

    for val_name, val in zip(real_column_names[1:], np.mean(test_errors, axis=0)):
        print("MAE Mean for", val_name, '-->', val)
        
    print("Total MAE -->", np.mean(test_errors))

In [20]:
def xgboost_test(model, inputs, outputs):
    rgb_imgs_train, rgb_imgs_validation, rgb_imgs_test = inputs
    rgb_imgs_metadata_train, rgb_imgs_metadata_validation, rgb_imgs_metadata_test = outputs

    feature_extractor = Model(inputs=model.input, outputs=model.get_layer("feature_extractor").output)

    # Extract features from training data
    train_features = feature_extractor.predict(rgb_imgs_train)
    val_features = feature_extractor.predict(rgb_imgs_validation)
    test_features = feature_extractor.predict(rgb_imgs_test)

    xgboost_params = {
                'objective': 'reg:squarederror',  # Use 'reg:squarederror' for regression tasks
                'n_estimators': 50,               # Start with a moderate number of trees
                'max_depth': 6,                   # Maximum depth of a tree
                'learning_rate': 0.1,             # Step size shrinkage
                'subsample': 0.8,                 # Subsample ratio of the training instance
                'colsample_bytree': 0.8,          # Subsample ratio of columns when constructing each tree
                'gamma': 0,                       # Minimum loss reduction required to make a further partition
                'alpha': 0,                       # L1 regularization term on weights
                'lambda': 1,                      # L2 regularization term on weights
                'random_state': 42                # For reproducibility
    }

    xgb_model = xgb.XGBRegressor(**xgboost_params)

    # Train the Random Forest regressor using extracted features
    xgb_model.fit(train_features, rgb_imgs_metadata_train)

    # Predict on validation features
    y_pred = xgb_model.predict(val_features)

    # Calculate Mean Absolute Error
    mae = mean_absolute_error(rgb_imgs_metadata_validation, y_pred)
    print(f'Validation Mean Absolute Error of: {mae}')
    
    test_model(xgb_model, test_features, rgb_imgs_metadata_test)

In [21]:
# model = nutrition5k(image_size=image_size+(3,), version="efficientnet", loss="geometric")

In [22]:
# history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
#                     validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation))

In [23]:
# test_predicted = model.predict(rgb_imgs_test)

# test_errors = np.abs(test_predicted - rgb_imgs_metadata_test)

# for val_name, val in zip(real_column_names[1:], np.mean(test_errors, axis=0)):
#     print("MAE Mean for", val_name, '-->', val)

# Original method

In [24]:
model = nutrition5k(image_size=image_size+(3,), version="v3", loss="geometric", original=True)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [25]:
callbacks = get_callbacks()

history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60


I0000 00:00:1725481813.993304     107 service.cc:145] XLA service 0x5ae59b18ce90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1725481813.993366     107 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1725481813.993371     107 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m  3/139[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7s[0m 55ms/step - loss: 46.5647 - mae: 100.5238

I0000 00:00:1725481821.277925     107 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 146ms/step - loss: 35.5905 - mae: 89.5624 - val_loss: 31.2327 - val_mae: 78.4133
Epoch 2/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 62ms/step - loss: 28.4275 - mae: 69.1316 - val_loss: 27.2249 - val_mae: 59.6656
Epoch 3/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - loss: 25.6026 - mae: 57.0992 - val_loss: 25.3877 - val_mae: 55.8062
Epoch 4/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 62ms/step - loss: 23.7892 - mae: 53.9529 - val_loss: 24.3237 - val_mae: 53.0404
Epoch 5/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - loss: 23.2920 - mae: 51.5532 - val_loss: 23.7381 - val_mae: 50.0684
Epoch 6/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 63ms/step - loss: 21.9018 - mae: 48.2976 - val_loss: 22.6517 - val_mae: 46.5611
Epoch 7/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [26]:
test_model(model, rgb_imgs_test, rgb_imgs_metadata_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 343ms/step
MAE Mean for total_calories --> 95.43458
MAE Mean for total_mass --> 54.10184
MAE Mean for total_fat --> 6.873353
MAE Mean for total_carb --> 8.130579
MAE Mean for total_protein --> 8.1826515
Total MAE --> 34.544605


# Modified method

In [27]:
# mse right now running
model = nutrition5k(image_size=image_size+(3,), version="v3", loss="geometric")

In [28]:
callbacks = get_callbacks()

history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60





[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 160ms/step - loss: 33.2733 - mae: 81.3726 - val_loss: 23.3158 - val_mae: 40.1044
Epoch 2/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - loss: 21.4933 - mae: 38.5806 - val_loss: 21.6233 - val_mae: 37.5447
Epoch 3/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 19.4730 - mae: 35.6824 - val_loss: 19.0353 - val_mae: 35.4190
Epoch 4/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - loss: 17.6811 - mae: 32.9818 - val_loss: 17.9714 - val_mae: 33.6163
Epoch 5/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - loss: 16.5782 - mae: 31.0487 - val_loss: 19.1419 - val_mae: 37.7152
Epoch 6/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 16.3685 - mae: 30.1677 - val_loss: 16.5385 - val_mae: 30.6327
Epoch 7/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [29]:
xgboost_test(model, inputs=[rgb_imgs_train, rgb_imgs_validation, rgb_imgs_test],
                          outputs=[rgb_imgs_metadata_train, rgb_imgs_metadata_validation, rgb_imgs_metadata_test])

# With last 4096 
# MAE Mean for total_calories --> 79.45708275180662
# MAE Mean for total_mass --> 53.23186666666662
# MAE Mean for total_fat --> 6.384834461182731
# MAE Mean for total_carb --> 8.153495717423656
# MAE Mean for total_protein --> 7.8782022629967985

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 729ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 176ms/step
Validation Mean Absolute Error of: 26.658559799194336
MAE Mean for total_calories --> 73.518524
MAE Mean for total_mass --> 46.35223
MAE Mean for total_fat --> 5.189673
MAE Mean for total_carb --> 7.0847692
MAE Mean for total_protein --> 6.931301
Total MAE --> 27.815296


In [30]:
test_model(model, rgb_imgs_test, rgb_imgs_metadata_test)

# With last 4096
# MAE Mean for total_calories --> 96.324486
# MAE Mean for total_mass --> 55.977676
# MAE Mean for total_fat --> 8.200258
# MAE Mean for total_carb --> 8.190553
# MAE Mean for total_protein --> 9.116547

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 380ms/step
MAE Mean for total_calories --> 70.07911
MAE Mean for total_mass --> 44.417343
MAE Mean for total_fat --> 5.121333
MAE Mean for total_carb --> 6.6875234
MAE Mean for total_protein --> 7.219619
Total MAE --> 26.704985


In [31]:
# for now the best
model = nutrition5k(version="v3", loss="mae")

In [32]:
callbacks = get_callbacks()

history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 119ms/step - loss: 60.7869 - mae: 60.7869 - val_loss: 43.8822 - val_mae: 43.8822
Epoch 2/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 38.8948 - mae: 38.8948 - val_loss: 39.2042 - val_mae: 39.2042
Epoch 3/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 36.7289 - mae: 36.7289 - val_loss: 35.9019 - val_mae: 35.9019
Epoch 4/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 34.1670 - mae: 34.1670 - val_loss: 33.9516 - val_mae: 33.9516
Epoch 5/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 31.7202 - mae: 31.7202 - val_loss: 33.7483 - val_mae: 33.7483
Epoch 6/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 29.8645 - mae: 29.8645 - val_loss: 32.1478 - val_mae: 32.1478
Epoch 7/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [33]:
xgboost_test(model, inputs=[rgb_imgs_train, rgb_imgs_validation, rgb_imgs_test],
                          outputs=[rgb_imgs_metadata_train, rgb_imgs_metadata_validation, rgb_imgs_metadata_test])

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 86ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 228ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 137ms/step
Validation Mean Absolute Error of: 29.752090454101562
MAE Mean for total_calories --> 69.40536
MAE Mean for total_mass --> 48.741177
MAE Mean for total_fat --> 5.389448
MAE Mean for total_carb --> 7.7314973
MAE Mean for total_protein --> 7.1848574
Total MAE --> 27.690468


In [34]:
test_model(model, rgb_imgs_test, rgb_imgs_metadata_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 262ms/step
MAE Mean for total_calories --> 70.35804
MAE Mean for total_mass --> 47.477623
MAE Mean for total_fat --> 5.6916804
MAE Mean for total_carb --> 7.8741155
MAE Mean for total_protein --> 7.565407
Total MAE --> 27.793375


In [35]:
model = nutrition5k(version="v2", loss="mae")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_resnet_v2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m219055592/219055592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [36]:
callbacks = get_callbacks()

history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 342ms/step - loss: 59.7897 - mae: 59.7897 - val_loss: 44.3012 - val_mae: 44.3012
Epoch 2/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 127ms/step - loss: 42.6345 - mae: 42.6345 - val_loss: 39.6240 - val_mae: 39.6240
Epoch 3/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 128ms/step - loss: 39.4358 - mae: 39.4358 - val_loss: 38.7455 - val_mae: 38.7455
Epoch 4/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 125ms/step - loss: 38.0157 - mae: 38.0157 - val_loss: 36.9419 - val_mae: 36.9419
Epoch 5/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 125ms/step - loss: 34.4632 - mae: 34.4632 - val_loss: 34.9269 - val_mae: 34.9269
Epoch 6/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 123ms/step - loss: 31.3134 - mae: 31.3134 - val_loss: 35.2291 - val_mae: 35.2291
Epoch 7/60
[1m139/139[0m [32m━━━━━━━━━━━━━━

In [37]:
xgboost_test(model, inputs=[rgb_imgs_train, rgb_imgs_validation, rgb_imgs_test],
                          outputs=[rgb_imgs_metadata_train, rgb_imgs_metadata_validation, rgb_imgs_metadata_test])

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 317ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step  
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 586ms/step
Validation Mean Absolute Error of: 32.047386169433594
MAE Mean for total_calories --> 84.76781
MAE Mean for total_mass --> 46.9249
MAE Mean for total_fat --> 6.2950993
MAE Mean for total_carb --> 9.108825
MAE Mean for total_protein --> 8.083182
Total MAE --> 31.03596


In [38]:
test_model(model, rgb_imgs_test, rgb_imgs_metadata_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 920ms/step
MAE Mean for total_calories --> 81.144104
MAE Mean for total_mass --> 45.9366
MAE Mean for total_fat --> 6.168474
MAE Mean for total_carb --> 8.63509
MAE Mean for total_protein --> 8.1956835
Total MAE --> 30.016


In [39]:
model = nutrition5k(version="v3", loss="custom")

In [40]:
callbacks = get_callbacks()

# , callbacks=callbacks
history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 120ms/step - loss: 307.3256 - mae: 66.7865 - val_loss: 191.3788 - val_mae: 42.6253
Epoch 2/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 182.0703 - mae: 40.2895 - val_loss: 174.4297 - val_mae: 38.9102
Epoch 3/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 169.7308 - mae: 37.6820 - val_loss: 170.8270 - val_mae: 38.1342
Epoch 4/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - loss: 163.0605 - mae: 36.2184 - val_loss: 166.9398 - val_mae: 37.2663
Epoch 5/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - loss: 156.8215 - mae: 34.8767 - val_loss: 162.4985 - val_mae: 36.3084
Epoch 6/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 152.6192 - mae: 33.9845 - val_loss: 156.7360 - val_mae: 35.1330
Epoch 7/60
[1m139/139[0m [32m━━━━━━━━━━━━

In [41]:
xgboost_test(model, inputs=[rgb_imgs_train, rgb_imgs_validation, rgb_imgs_test],
                          outputs=[rgb_imgs_metadata_train, rgb_imgs_metadata_validation, rgb_imgs_metadata_test])
    
    
# # WITH PREVIOUS augmentation parameters
# MAE Mean for total_calories --> 74.89289
# MAE Mean for total_mass --> 46.198105
# MAE Mean for total_fat --> 6.363183
# MAE Mean for total_carb --> 8.111093
# MAE Mean for total_protein --> 8.364978

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 85ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 237ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 135ms/step
Validation Mean Absolute Error of: 28.59465980529785
MAE Mean for total_calories --> 71.62666
MAE Mean for total_mass --> 43.012768
MAE Mean for total_fat --> 5.7270637
MAE Mean for total_carb --> 8.468128
MAE Mean for total_protein --> 7.6299496
Total MAE --> 27.292921


In [42]:
test_model(model, rgb_imgs_test, rgb_imgs_metadata_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 258ms/step
MAE Mean for total_calories --> 73.07089
MAE Mean for total_mass --> 43.871315
MAE Mean for total_fat --> 5.9916515
MAE Mean for total_carb --> 9.520895
MAE Mean for total_protein --> 8.655392
Total MAE --> 28.22203


In [43]:
model = nutrition5k(version="v2", loss="geometric")

In [44]:
callbacks = get_callbacks()

history = model.fit(rgb_imgs_train, rgb_imgs_metadata_train, batch_size=batch_size, epochs=epochs, 
                    validation_data=(rgb_imgs_validation, rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 303ms/step - loss: 33.3675 - mae: 81.3422 - val_loss: 22.2067 - val_mae: 40.3517
Epoch 2/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 124ms/step - loss: 21.6599 - mae: 40.5549 - val_loss: 22.4934 - val_mae: 39.5939
Epoch 3/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 128ms/step - loss: 20.1169 - mae: 37.1853 - val_loss: 21.1503 - val_mae: 37.9016
Epoch 4/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 126ms/step - loss: 18.6299 - mae: 34.5089 - val_loss: 19.3087 - val_mae: 35.3803
Epoch 5/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 125ms/step - loss: 17.7493 - mae: 32.9014 - val_loss: 19.2605 - val_mae: 34.5868
Epoch 6/60
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 122ms/step - loss: 16.5758 - mae: 30.7786 - val_loss: 23.6901 - val_mae: 39.5986
Epoch 7/60
[1m139/139[0m [32m━━━━━━━━━━━━━━

In [45]:
xgboost_test(model, inputs=[rgb_imgs_train, rgb_imgs_validation, rgb_imgs_test],
                          outputs=[rgb_imgs_metadata_train, rgb_imgs_metadata_validation, rgb_imgs_metadata_test])

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 311ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 744ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 448ms/step
Validation Mean Absolute Error of: 31.747610092163086
MAE Mean for total_calories --> 88.198006
MAE Mean for total_mass --> 52.59832
MAE Mean for total_fat --> 6.83967
MAE Mean for total_carb --> 9.811279
MAE Mean for total_protein --> 8.026792
Total MAE --> 33.094814


In [46]:
test_model(model, rgb_imgs_test, rgb_imgs_metadata_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 984ms/step
MAE Mean for total_calories --> 92.177124
MAE Mean for total_mass --> 55.843803
MAE Mean for total_fat --> 7.2581944
MAE Mean for total_carb --> 9.134695
MAE Mean for total_protein --> 8.742482
Total MAE --> 34.631264


# RGB-D model

In [47]:
class ConvolutionalAttentionBlock(layers.Layer):
    def __init__(self, filters):
        super(ConvolutionalAttentionBlock, self).__init__()
        self.conv1x1_first = layers.Conv2D(filters, (1, 1), padding='same', activation="relu")
        self.bn_1x1 = layers.BatchNormalization()
        self.relu_1x1 = layers.ReLU()
        
        self.conv3x3 = layers.Conv2D(filters, (3, 3), padding='same', activation="relu")
        self.bn_3x3 = layers.BatchNormalization()
        self.relu_3x3 = layers.ReLU()
        
        self.conv1x1_second = layers.Conv2D(filters, (1, 1), padding='same')
        self.sigmoid = layers.Activation('sigmoid')

    # , inputs
    def call(self, Ri, Di):
#         # architecture
#         Ri, Di = inputs
        
        # Element-wise addition of Ri and Di
        combined = layers.Add()([Ri, Di])
        
        # Global Average Pooling (GAP) for channel attention
        ca = layers.GlobalAveragePooling2D()(combined)
        ca = layers.Reshape((1, 1, -1))(ca)
        ca = self.conv1x1_first(ca)
        ca = self.bn_1x1(ca)
        ca = self.relu_1x1(ca)
        ca = self.sigmoid(ca)
        
        # Channel-wise multiplication for channel attention
        channel_attended_Ri = layers.Multiply()([Ri, ca])
        channel_attended_Di = layers.Multiply()([Di, ca])
        
        # Mean along the channel dimension for spatial attention
        mean_spatial = tf.reduce_mean(combined, axis=-1, keepdims=True)
        
        # Applying 3x3 Convolution for spatial attention
        sa = self.conv3x3(mean_spatial)
        sa = self.bn_3x3(sa)
        sa = self.relu_3x3(sa)
        sa = self.sigmoid(sa)
        
        # Spatial-wise multiplication for spatial attention
        spatial_attended_Ri = layers.Multiply()([Ri, sa])
        spatial_attended_Di = layers.Multiply()([Di, sa])
        
        # Concatenating enhanced features
        enhanced_Ri = layers.Multiply()([channel_attended_Ri, spatial_attended_Ri])
        enhanced_Di = layers.Multiply()([channel_attended_Di, spatial_attended_Di])
        concatenated_features = layers.Concatenate()([enhanced_Ri, enhanced_Di])
        
        # Applying final 1x1 Convolution
        output = self.conv1x1_second(concatenated_features)
        
        return output


class RGBDFusion(tf.keras.Model):
    def __init__(self, use_midas=False):
        super().__init__()
        
        self.use_midas = use_midas
        
        # architecture
        
        # rgb img
        # 7x7 S2, 64 and 3x3 Max Pool S2 for rgb
        self.first_conv_rgb = layers.Conv2D(64, (7,7), strides=(2,2))
        self.first_maxpool_rgb = layers.MaxPooling2D((3,3), strides=(2,2))
        
        self.resnet_rgb = tf.keras.applications.ResNet101(include_top=False, weights='imagenet')
        self.resnet_rgb.trainable = False
        
        # depth img
        # 7x7 S2, 64 and 3x3 Max Pool S2 for rgb
        self.first_conv_depth = layers.Conv2D(64, (7,7), strides=(2,2))
        self.first_maxpool_depth = layers.MaxPooling2D((3,3), strides=(2,2))
        
        self.resnet_depth = tf.keras.applications.ResNet101(include_top=False, weights='imagenet')
        self.resnet_depth.trainable = False
        
        # last union
        self.resnet_union = tf.keras.applications.ResNet101(include_top=False, weights="imagenet")
        self.resnet_union.trainable = False
        
        self.cab = ConvolutionalAttentionBlock(filters=64)
        self.gap = layers.GlobalAveragePooling2D()
        
        self.to_output = layers.Dense(5)
        
        
    def call(self, inputs):
        rgb_input, depth_input = inputs
        
        if self.use_midas:
            model_type = "DPT_Hybrid"
            midas = torch.hub.load("intel-isl/MiDaS", model_type, pretrained=True)
            midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
            transform = midas_transforms.dpt_transform
            
            print(rgb_input.__dir__())
            print(type(rgb_input))
            
            depth_input = [transform(i) for i in rgb_input.numpy()]
            depth_input = [midas(i) for i in depth_input]

            
        
        
        x_rgb = self.first_conv_rgb(rgb_input)
        x_rgb = self.first_maxpool_rgb(x_rgb)
        
        x_depth = self.first_conv_depth(depth_input)
        x_depth = self.first_maxpool_depth(x_depth)
        
    
        resnet_layer_names = ['conv1_relu', 'conv2_block3_out', 'conv3_block4_out', 'conv4_block23_out']

        
        rgb_features = []
        depth_features = []
        for layer_name in resnet_layer_names:
            layer = self.resnet_rgb.get_layer(layer_name)
            x_rgb = layer(x_rgb)
            rgb_features.append(x_rgb)
            
            layer = self.resnet_depth.get_layer(layer_name)
            x_depth = layer(x_depth)
            depth_features.append(x_depth)
        
        
        cab_outputs = [self.cab(r, d) for r, d in zip(rgb_features, depth_features)]
        cab_outputs.insert(0, self.cab(x_rgb, x_depth))
        
        
        layer = self.resnet_union.get_layer(resnet_layer_names[0])
        output = layer(cab_outputs[0])
        for cab_data, layer_name in zip(cab_outputs[1:], resnet_layer_names[1:]):
            layer = self.resnet_union.get_layer(layer_name)
            combined = layers.Add()([output, cab_data])
            output = layer(combined)
            
        combined = layers.Add()([output, cab_outputs[-1]])
        output = self.gap(combined)
        
        return self.to_output(output)

In [48]:
rgbd_fusion = RGBDFusion(use_midas=True)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet101_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m171446536/171446536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [49]:
optimizer = keras.optimizers.Adam()

# "mae", custom_multitask_loss, geometric_mean_loss
rgbd_fusion.compile(optimizer=optimizer, loss=geometric_mean_loss, metrics=["mae"])

In [50]:
callbacks = get_callbacks()

history = rgbd_fusion.fit([rgb_imgs_train.astype("float32"), depth_imgs_train.astype("float32")], rgb_imgs_metadata_train, 
                          batch_size=batch_size, epochs=epochs, 
                          validation_data=([rgb_imgs_validation.astype("float32"), depth_imgs_validation.astype("float32")], 
                                           rgb_imgs_metadata_validation), callbacks=callbacks)

Epoch 1/60


Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /root/.cache/torch/hub/master.zip
  model = create_fn(
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt" to /root/.cache/torch/hub/checkpoints/dpt_hybrid_384.pt
100%|██████████| 470M/470M [00:01<00:00, 373MB/s] 

KeyboardInterrupt



In [None]:
ind_sample = 1

print(rgbd_fusion.predict([np.array([rgb_imgs_test[ind_sample]]).astype("float32"),
                           np.array([depth_imgs_test[ind_sample]]).astype("float32")]))
print(rgb_imgs_metadata_test[ind_sample])

In [None]:
test_predicted = rgbd_fusion.predict([rgb_imgs_test.astype("float32"), depth_imgs_test.astype("float32")])

In [None]:
test_errors = np.abs(test_predicted - rgb_imgs_metadata_test)

for val_name, val in zip(real_column_names[1:], np.mean(test_errors, axis=0)):
    print("MAE Mean for", val_name, '-->', val)

In [None]:
model_type = "DPT_Hybrid"
midas = torch.hub.load("intel-isl/MiDaS", model_type, pretrained=True)
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.dpt_transform

[midas(transform(np.array(i.tolist()))) for i in rgb_imgs_train[:1000]]

# depth_input = [transform(i) for i in .numpy()]
# depth_input = [midas(i) for i in depth_input]

# ViT Regression

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset
# import timm

# # Define a sample dataset
# class RegressionDataset(Dataset):
#     def __init__(self, images, targets):
#         self.images = images  # List of image tensors
#         self.targets = targets  # List of target values

#     def __len__(self):
#         return len(self.images)

#     def __getitem__(self, idx):
#         image = self.images[idx]
#         target = self.targets[idx]
#         return image, target

# rgb_imgs_train = rgb_imgs_train.reshape(len(rgb_imgs_train), 3, 224, 224)
# rgb_imgs_test = rgb_imgs_test.reshape(len(rgb_imgs_test), 3, 224, 224)
# rgb_imgs_validation = rgb_imgs_validation.reshape(len(rgb_imgs_validation), 3, 224, 224)

# # Create the dataset and dataloader
# train_dataset = RegressionDataset(rgb_imgs_train, rgb_imgs_metadata_train)
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# # Load a pre-trained ViT model from timm
# model = timm.create_model('vit_base_patch16_224', pretrained=True)

# # Modify the model's head for regression
# num_features = model.head.in_features
# model.head = nn.Linear(num_features, 5)  # Change the output layer to a single neuron for regression

# # # Move model to GPU if available
# # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # model = model.to(device)

# # Define loss function and optimizer
# criterion = nn.MSELoss()  # Mean Squared Error for regression
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# num_epochs = 10
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for images, targets in train_loader:
#         images = images#.to(device)
#         targets = targets#.to(device)

#         # Forward pass
#         outputs = model(images)
#         loss = criterion(outputs, targets)

#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

# print("Training complete.")

In [None]:
# test_predicted = model.predict(rgb_imgs_test)

# test_errors = np.abs(test_predicted - rgb_imgs_metadata_test)

# for val_name, val in zip(real_column_names[1:], np.mean(test_errors, axis=0)):
#     print("MAE Mean for", val_name, '-->', val)