# Import libs

In [1]:
import os
import json
import random
import pickle
import numpy as np
import pandas as pd
from itertools import product
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from datetime import datetime

import utils
from VisionModels import CustomCNN, CustomEfficientNetB3
from VisionDatasets import ContactDataset, PreloadContactDataset

%load_ext autoreload
%autoreload 2

# Pre-settings

In [2]:
# Absolute locations of the two JSON settings files, resolved against the
# current working directory.
parameters_json, paths_json = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in ('settings/parameters.json', 'settings/paths.json')
)

In [6]:
# Parse the hyper-parameter settings into `params`.
with open(parameters_json, mode='r') as params_file:
    params = json.loads(params_file.read())

# Parse the data-location settings into `paths`.
with open(paths_json, mode='r') as paths_file:
    paths = json.loads(paths_file.read())

In [7]:
# Seed every random number generator used by the notebook with the same
# configured value.
random_seed = params["random_seed"]

# Environment variables: Python hash seed and the cuBLAS workspace setting.
os.environ.update({
    'PYTHONHASHSEED': str(random_seed),
    'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
})

# Python, NumPy and PyTorch (CPU, current GPU, all GPUs) generators.
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(random_seed)

# cuDNN: disable the auto-tuner and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [9]:
# Names of the recordings that make up the training split.
train_set = [
    'C_M1_T1_4',
    'C_M1_T1_7',
    'C_M1_T1_3',
    'C_M1_T1_6',
    'R2_M1_T1_6',
    'R2_M1_T1_7',
    'R2_M1_T1_3',
    'R2_M1_T1_5',
    'L2_M1_T1_3',
    'L2_M1_T1_4',
    'L2_M1_T1_5',
    'L2_M1_T1_7',
    'Z2_M1_T1_1',
    'M3_Z2_NF',
    'M5_Z2_NF',
    'M12_Z2_NF',
]

# Names of the recordings that make up the validation split.
val_set = [
    'C_M1_T1_1',
    'C_M1_T1_5',
    'R2_M1_T1_2',
    'R2_M1_T1_4',
    'L2_M1_T1_6',
    'L2_M1_T1_1',
    'Z2_M1_T1_2',
    'M7_Z2_NF',
]

# Names of the recordings that make up the test split.
test_set = [
    'C_M1_T1_8',
    'M1_NF',
    'M2_NF',
    'R1_M1_T1_1',
    'R1_M1_T1_2',
    'R3_M1_T1_1',
    'R3_M1_T1_2',
    'L1_M1_T1_1',
    'L1_M1_T1_2',
    'L3_M1_T1_1',
    'L3_M1_T1_2',
    'Z1_M1_T1_1',
    'Z3_M1_T1_1',
    'R2_M1_T1_8',
    'M3_R2_NF',
    'M5_R2_NF',
    'M5_L2_NF',
    'M3_L2_NF',
    'M7_L2_NF',
    'Z2_M1_T1_5',
    'M9_Z2_NF',
    'M8_Z2_NF',
]

In [10]:
# Key into `paths` selecting which data collection this run uses.
data_set = 'silicone'

# Architectures and label sources to evaluate.
model_set = ['CustomCNN', 'EfficientNet']
label_set = ['GT', 'MTurk']

# Every (architecture, label source) pair, architecture-major order.
model_combinations = [
    (architecture, label_source)
    for architecture in model_set
    for label_source in label_set
]

# Import Data

In [11]:
# Load the label table for the selected data set; the first row of the CSV
# is the header.
data = pd.read_csv(
    filepath_or_buffer=paths[data_set]['labels'],
    header=0,
)

In [12]:
# Keep only the rows whose 'dataset' value belongs to the respective split.
in_train_split = data['dataset'].isin(train_set)
in_val_split = data['dataset'].isin(val_set)

train_data = data.loc[in_train_split]
val_data = data.loc[in_val_split]

In [None]:
# Build, for every label source, a train/validation DataLoader pair and a
# class-weighted cross-entropy loss. All three dicts are keyed by label name
# so later cells can look them up per (model, label) combination.
train_dataloader = {}
val_dataloader = {}
loss_fn = {}

# Column of the label table that holds the target for each label source.
# An unknown label name raises KeyError instead of silently reusing the
# column chosen in a previous iteration.
label_columns = {'GT': 'GT', 'MTurk': 'MTurk_Label'}

# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to one worker instead of failing on the comparison.
num_workers = min(16, os.cpu_count() or 1)


def _image_paths(frame):
    """Return one path per row of `frame`: <image_set>/<dataset>/<image>."""
    image_root = paths[data_set]['image_set']
    return [
        os.path.join(image_root, dataset_name, image_name)
        for dataset_name, image_name in zip(frame['dataset'], frame['image'])
    ]


def _coords(frame):
    """Return the 'x' and 'y' columns of `frame` as a list of int pairs."""
    return list(zip(frame['x'].astype(int), frame['y'].astype(int)))


for label_name in label_set:
    label_col = label_columns[label_name]

    train_dataset = ContactDataset(
        images=_image_paths(train_data),
        labels=train_data[label_col].to_numpy(),
        coords=_coords(train_data),
        jitter=True)

    # The validation coordinates must come from the validation rows so that
    # every image is paired with its own (x, y) entry.
    val_dataset = ContactDataset(
        images=_image_paths(val_data),
        labels=val_data[label_col].to_numpy(),
        coords=_coords(val_data),
        jitter=False)

    # create DataLoaders from the datasets built above
    train_dataloader[label_name] = DataLoader(
        dataset=train_dataset,
        batch_size=params['batch_size'],
        num_workers=num_workers,
        pin_memory=True,
        shuffle=True)

    # NOTE(review): shuffling the validation loader is kept as in the
    # original; it is usually unnecessary -- confirm whether it is intended.
    val_dataloader[label_name] = DataLoader(
        dataset=val_dataset,
        batch_size=params['batch_size'],
        num_workers=num_workers,
        pin_memory=True,
        shuffle=True)

    # Weight the loss with the weights reported by the training dataset
    # (presumably per-class weights -- see ContactDataset.getWeights).
    weights = train_dataset.getWeights().to(device)
    loss_fn[label_name] = nn.CrossEntropyLoss(weight=weights)


In [None]:
# Sizes of the train and validation datasets left over from the last
# iteration of the loop above.
tuple(len(split) for split in (train_dataset, val_dataset))

# Training

In [None]:
# Train one network per (architecture, label source) pair, then persist its
# metrics and weights through the project helpers in `utils`.
for model_name, label_name in model_combinations:
    # instantiate the requested architecture
    if model_name == 'EfficientNet':
        model = CustomEfficientNetB3()
    elif model_name == 'CustomCNN':
        model = CustomCNN()

    # Adam configured with this architecture's hyper-parameters
    hyperparams = params[model_name]
    optimizer = optim.Adam(
        params=model.parameters(),
        lr=hyperparams['learning_rate'],
        weight_decay=hyperparams['weight_decay'],
    )

    # run training with the loaders and loss built for this label source
    utils.train(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn[label_name],
        dataloader=train_dataloader[label_name],
        val_dataloader=val_dataloader[label_name],
        epochs=params['epochs'],
        device=device,
        use_tqdm=True,
    )

    # persist the run under its (model, label) identity
    utils.save_metrics(model, model_name, label_name)
    utils.save_state_dict(model, model_name, label_name)

# Predict

In [29]:
# Predictions and ground truth, keyed by (model name, label name, set name).
results = dict()

In [None]:
# Run every trained model on every test recording and collect predictions
# together with the ground truth derived from the recording's label file.

# A frame is labelled 1 when the Euclidean norm of its label vector exceeds
# this threshold, otherwise 0 (loop-invariant, so defined once).
force_threshold = 0.2

# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to one worker instead of failing on the comparison.
num_workers = min(16, os.cpu_count() or 1)

for set_name in tqdm(test_set):
    # concat paths
    images_path = os.path.join(
        paths[data_set]['image_set'], set_name)
    label_path = os.path.join(images_path, 'labels_30hz.txt')
    keypoints_model = paths[data_set]['keypoints_model']
    coordinates_path = os.path.join(
        paths[data_set]['keypoints'], f"{set_name}_L_h264{keypoints_model}.h5")

    # load data files: columns 1..3 of the label file (presumably the force
    # components -- TODO confirm) and the 'Mid_1' keypoint x/y coordinates
    test_data = pd.read_csv(label_path, header=None).iloc[:, 1:4].to_numpy()
    coordinates = pd.read_hdf(coordinates_path).loc[:, [
        (keypoints_model, 'Mid_1', 'x'),
        (keypoints_model, 'Mid_1', 'y')]].to_numpy()

    # one image path and one binary label per row of the label file
    test_images = [
        os.path.join(images_path, f'img_{index}.jpg')
        for index in range(len(test_data))]
    test_labels = [
        1 if np.sqrt(row.dot(row)) > force_threshold else 0
        for row in test_data]

    # create dataset and dataloader
    # Note: switch to 'ContactDataset' if you do not have enough memory;
    # 'PreloadContactDataset' loads all images into memory to speed things up.
    test_dataset = PreloadContactDataset(
        images=test_images,
        labels=test_labels,
        coords=coordinates.astype(int).tolist())

    test_dataloader = DataLoader(
        dataset=test_dataset,
        batch_size=512,
        num_workers=num_workers,
        pin_memory=True)

    # predict for each model
    for model_name, label_name in model_combinations:
        # select the model
        if model_name == 'CustomCNN':
            model = CustomCNN()
        elif model_name == 'EfficientNet':
            model = CustomEfficientNetB3()

        # restore the weights saved for this (model, label) pair
        utils.load_state_dict(model, model_name, label_name)
        predictions, ground_truth = utils.predict(
            model=model,
            dataloader=test_dataloader,
            device=device)

        results[(model_name, label_name, set_name)] = {
            "Prediction": predictions,
            "Ground Truth": ground_truth
        }

In [None]:
# Persist all predictions to a timestamped pickle under 'labels/'.
# Create the output directory first so a long prediction run is not lost to
# a FileNotFoundError when the directory does not exist yet.
os.makedirs('labels', exist_ok=True)
with open(f'labels/{data_set}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pkl', 'wb') as file:
    pickle.dump(results, file)

# Evaluate

In [None]:
# Classification metrics for every (model, label source) combination,
# aggregated over all test recordings.
from sklearn.metrics import classification_report

# Note: replace with the name of your own results file (without extension).
# Kept in a variable because reusing the same quote character inside an
# f-string is a SyntaxError before Python 3.12.
results_name = 'silicone_yyyymmdd_hhmmss'

# CAUTION: pickle.load can execute arbitrary code; only load result files
# that were written by this notebook.
with open(f'labels/{results_name}.pkl', 'rb') as file:
    results = pickle.load(file)
print(data_set)
for model_name, label_name in model_combinations:
    binary_predictions = []
    y = []
    for ts in test_set:
        # read by key rather than relying on dict insertion order
        entry = results[(model_name, label_name, ts)]
        y.extend(entry["Ground Truth"])
        # threshold the stored predictions at 0.5 to get binary labels
        binary_predictions.extend((entry["Prediction"] > 0.5).astype(int))
    print((model_name, label_name))
    print(classification_report(y, binary_predictions))