# Machine Learning Project - Pawpularity ##
### Armando Fortes (2021403383), David Pissarra (2021403381)

#### Imports ####

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models
from matplotlib import pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
# List the GPUs TensorFlow can see and report how many were found.
physical_devices = tf.config.list_physical_devices('GPU')
print(f'Number of GPUs available: {len(physical_devices)}')
# Turn on on-demand memory allocation for every visible GPU, not only the
# first one: TensorFlow expects the memory-growth setting to be the same on
# all GPUs. With no GPU the loop body simply never runs.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

#### Constants and Hyperparameters ####

In [None]:
# Root folder of the dataset, relative to this notebook.
dataset_dir = '../Dataset/'

# Image folders; the trailing slash lets file names be appended directly.
train_images_path = f'{dataset_dir}train/'
test_images_path = f'{dataset_dir}test/'

# Metadata tables.
train_meta_path = f'{dataset_dir}train.csv'
test_meta_path = f'{dataset_dir}test.csv'

In [None]:
# --- Training schedule ---
EPOCHS = 10            # epochs passed to model.fit for each fold
BATCH_SIZE = 64        # examples per batch in every tf.data pipeline
LEARNING_RATE = 0.001  # learning rate intended for the optimizer

# --- Input pipeline ---
IMAGE_DIM = 128        # images are resized to IMAGE_DIM x IMAGE_DIM
BUFFER_SIZE = 1024     # shuffle buffer size of the training dataset
AUTOTUNE = tf.data.experimental.AUTOTUNE  # let tf.data pick parallelism / prefetch depth

# --- Stratification ---
STRAT_SIZE = 21        # Pawpularity is integer-divided by this to get the stratum label

#### Image Dataset Preprocessing ####

In [None]:
def load_image(image_path):
    """Read a JPEG file and return it as a square float32 image tensor.

    The file is decoded with 3 channels, converted to float32, divided by 255
    and then resized to (IMAGE_DIM, IMAGE_DIM).

    Args:
        image_path: Path of the JPEG file to read.

    Returns:
        The resized image tensor.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Scale before resizing, in the same order as the pipeline has always used.
    scaled = tf.cast(pixels, tf.float32) / 255.
    return tf.image.resize(scaled, (IMAGE_DIM, IMAGE_DIM))

In [None]:
def map_image(image_path, label):
    """Turn a (path, label) pair into an (image, label) pair.

    The image is loaded with ``load_image``; the label is passed through
    unchanged.
    """
    image = load_image(image_path)
    return image, label

In [None]:
def attr_distribution(df, attr, value, color="dodgerblue"):
    """Plot the Pawpularity histogram of the rows where ``df[attr] == value``.

    Draws on the current matplotlib axes: a title, a 20-bin histogram, a
    dashed vertical line at the subset mean and a text label showing that mean.

    Args:
        df: DataFrame containing a 'Pawpularity' column and the ``attr`` column.
        attr: Name of the column to filter on.
        value: Value ``attr`` must equal for a row to be included.
        color: Colour of the histogram bars.
    """
    plt.title(f"{attr} = {value}")
    # One .loc call (row mask + column label) instead of chained indexing.
    scores = df.loc[df[attr] == value, 'Pawpularity']
    if scores.empty:
        # No matching rows: the mean would be NaN, so leave the titled axes
        # empty rather than drawing a NaN line and label.
        return
    scores.plot(kind='hist', bins=20, color=color)

    # Compute the mean once and reuse it for both the line and the label.
    mean_score = scores.mean()
    _, max_ylim = plt.ylim()
    plt.axvline(mean_score, color='k', linestyle='dashed', linewidth=1)
    plt.text(mean_score*1.1, max_ylim*0.9, 'Mean: {:.2f}'.format(mean_score))

#### Training and Validation Sets ####

In [None]:
# Load the training metadata ordered by image id and add a 'Strat' column:
# the Pawpularity score integer-divided by STRAT_SIZE, used as the
# stratification label for the K-fold split.
train_metadata = (
    pd.read_csv(train_meta_path)
    .sort_values(by='Id')
    .assign(Strat=lambda frame: frame['Pawpularity'] // STRAT_SIZE)
)

# Parallel arrays, all in the same row order as train_metadata.
images_names = (train_images_path + train_metadata['Id'] + '.jpg').values  # image file paths
images_paws = train_metadata['Pawpularity'].values                         # regression targets
images_strats = train_metadata['Strat'].values                             # stratum labels

In [None]:
# Columns to plot: every metadata column except the id, the target and the
# stratification label. Each gets two histograms (rows where it is 0, then 1).
plotted_attrs = [
    attr for attr in train_metadata.columns
    if attr not in ('Id', 'Pawpularity', 'Strat')
]

columns = 4
# Two subplots per attribute; round up so the grid always has enough slots
# instead of relying on a hard-coded row count.
rows = max(1, -(-2 * len(plotted_attrs) // columns))

fig = plt.figure(figsize=(13, 13))
# The slot index is derived from the loop position, which avoids a counter
# named `set` that would shadow the builtin for the rest of the session.
for pair_no, attr in enumerate(plotted_attrs):
    first_slot = 2 * pair_no + 1
    fig.add_subplot(rows, columns, first_slot)
    attr_distribution(train_metadata, attr, 0, color='orange')
    fig.add_subplot(rows, columns, first_slot + 1)
    attr_distribution(train_metadata, attr, 1)

plt.tight_layout()

#### Train Test Split ####

In [None]:
# train_images_names, valid_images_names, Y_train, Y_valid = train_test_split(
#     images_names,
#     images_strats,
#     test_size=0.1
#     )

In [None]:
# Convolutional part of the network: three 3x3 ReLU convolutions, with 2x2
# max pooling after the first two. The first layer fixes the input to
# IMAGE_DIM x IMAGE_DIM images with 3 channels.
conv_stack = [
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(IMAGE_DIM, IMAGE_DIM, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
]
model = models.Sequential(conv_stack)

model.summary()

In [None]:
# Regression head appended to the existing `model`: flatten, two ReLU dense
# layers each followed by 20% dropout, and a single linear output unit.
# NOTE: this appends to `model`, so running the cell twice adds the head twice.
head_layers = (
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1),
)
for head_layer in head_layers:
    model.add(head_layer)

model.summary()

In [None]:
# Compile for regression: mean squared error as the loss, RMSE as the metric.
# - The learning rate is taken from LEARNING_RATE so the constant defined in
#   the hyperparameter cell actually controls the optimizer.
# - `metrics` is passed as a list, which is the form Keras documents; a bare
#   metric object is not accepted by every Keras version.
model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
              loss=MeanSquaredError(),
              metrics=[RootMeanSquaredError()])

#### Stratified K-Fold ####

In [14]:
def make_dataset(image_paths, targets, training):
    """Build a batched, prefetched (image, target) dataset from file paths.

    Args:
        image_paths: Array of image file paths.
        targets: Array of targets aligned with ``image_paths``.
        training: When True, the examples are reshuffled on every iteration.

    Returns:
        A ``tf.data.Dataset`` yielding batches of (image, target).
    """
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, targets))
    if training:
        # Shuffle the (path, target) pairs before decoding, so the shuffle
        # buffer holds path strings rather than decoded image tensors.
        dataset = dataset.shuffle(buffer_size=BUFFER_SIZE, reshuffle_each_iteration=True)
    dataset = dataset.map(map_image, num_parallel_calls=AUTOTUNE)
    return dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)


# Folds are stratified on the binned Pawpularity label; the split is seeded.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

min_loss, fold_no = float('+inf'), 0
min_ds_train, min_ds_val, min_fold = None, None, 0
best_model = None
for train_index, val_index in skf.split(images_names, images_strats):
    fold_no += 1
    print(f'\n----------------- Fold {fold_no} -----------------\n')
    X_train, X_val = images_names[train_index], images_names[val_index]
    y_train, y_val = images_paws[train_index], images_paws[val_index]

    ds_train = make_dataset(X_train, y_train, training=True)
    ds_val = make_dataset(X_val, y_val, training=False)

    # Train a fresh copy of the architecture for every fold. Re-fitting the
    # same model instance would carry weights over from earlier folds, whose
    # training data contains this fold's validation images, so the validation
    # score would no longer measure held-out performance.
    fold_model = models.clone_model(model)
    fold_model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                       loss=MeanSquaredError(),
                       metrics=[RootMeanSquaredError()])

    log = fold_model.fit(ds_train, epochs=EPOCHS, validation_data=ds_val)
    # Folds are compared on the validation RMSE of the last epoch.
    fold_loss = log.history['val_root_mean_squared_error'][-1]
    if min_loss > fold_loss:
        min_ds_train, min_ds_val, min_fold = ds_train, ds_val, fold_no
        min_loss = fold_loss
        best_model = fold_model

print(f'Lowest loss occurs on Fold {min_fold}')

if best_model is not None:
    # Later cells use `model` and `ds_val`; point them at the best fold's
    # model and its own held-out split so that evaluation stays leak-free.
    model = best_model
    ds_train, ds_val = min_ds_train, min_ds_val
    

#### Test Set ####

In [None]:
# Load the test metadata ordered by image id, then build the matching array
# of image file paths (same row order as test_metadata).
test_metadata = pd.read_csv(test_meta_path)
test_metadata = test_metadata.sort_values(by='Id')
test_images_names = (test_images_path + test_metadata['Id'] + '.jpg').values

In [None]:
# Unlabelled test pipeline: paths -> decoded images -> batches. There is no
# shuffle, so batches follow the order of test_images_names.
ds_test = (
    tf.data.Dataset.from_tensor_slices((test_images_names,))
    .map(load_image, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# for x in ds_test:
#     plt.imshow(x[0].numpy())
#     plt.show()
#     break

#### Submission ####

In [None]:
# Predictions on the validation dataset `ds_val` (not the test set); the
# cells below compare them with the validation targets.
yhat = model.predict(x=ds_val, verbose=1)

In [None]:
# Show the raw predictions returned by model.predict.
print(yhat)

In [None]:
# test_predictions = pd.DataFrame()
# test_predictions['Id'] = test_metadata.Id
# test_predictions['Pawpularity'] = yhat
# test_predictions.to_csv('submission.csv', index=False)

In [None]:
# Collect the true targets from ds_val in iteration order, so they line up
# with the predictions in `yhat`.
# All batches are concatenated in one call instead of np.append inside the
# loop, which re-copies the whole array on every batch. The leading empty
# float64 array keeps the result float64 and makes an empty dataset yield an
# empty array rather than an error.
y_paw = np.concatenate(
    [np.empty((0,))] + [np.asarray(y).reshape(-1) for _, y in ds_val]
)
print(y_paw)
print(yhat.reshape(-1))

In [None]:
# Validation RMSE. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated and absent from
# newer scikit-learn releases. Arguments follow the (y_true, y_pred) order.
print(np.sqrt(mean_squared_error(y_paw, yhat.reshape(-1))))

In [None]:
# Build the test-set prediction table here so this cell works on a fresh
# kernel; `test_predictions` is otherwise only defined in commented-out code.
# ds_test is built from test_metadata without shuffling, so the predictions
# are in the same row order as the ids.
test_predictions = pd.DataFrame({
    'Id': test_metadata['Id'],
    'Pawpularity': model.predict(ds_test).reshape(-1),
})
test_predictions.head(8)