# Dataset initialization
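This notebook builds the annotation DataFrames from the provided CSV files, splits the training data into train and validation sets, and writes the train/validation/test TFRecord files.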

In [None]:
import sys

import numpy as np
import os
from sklearn.model_selection import train_test_split

sys.path.insert(0, '../model')

from utils.utils import *

import pandas as pd

from tqdm import tqdm

%reload_ext autoreload
%autoreload 2

In [None]:
import cv2

def min_max_norm(df_to_update: pd.DataFrame, df_current: pd.DataFrame, col_min: str, col_max: str, max_val: int):
    """Clip one pair of box coordinates to the image extent and scale by ``max_val``.

    The rows of ``df_current`` are read, and the normalized values are written
    into the rows of ``df_to_update`` that share the same index labels.

    Args:
        df_to_update: Frame that receives the normalized values (modified in place).
        df_current: Rows to normalize; its index selects the rows to update.
        col_min: Column holding the lower coordinate; floored at 0.
        col_max: Column holding the upper coordinate; capped at ``max_val - 1``.
        max_val: Image size along this axis, used as the divisor.

    Returns:
        ``df_to_update``, for call chaining.
    """
    # Clip first and scale the *clipped* values. Scaling the raw values instead
    # would discard the clipping and let out-of-image coordinates through.
    lower = np.maximum(df_current[col_min].astype(float), 0)
    upper = np.minimum(df_current[col_max].astype(float), max_val - 1)

    df_to_update.loc[df_current.index, col_min] = lower / max_val
    df_to_update.loc[df_current.index, col_max] = upper / max_val

    return df_to_update

def get_df(path_csv: str, path_raw_csv: str, overwrite: bool = False, normalize: bool = False,):
    """Load the annotation DataFrame, building it from the raw file when needed.

    If ``path_csv`` already exists and ``overwrite`` is false, it is read back
    as-is. Otherwise ``path_raw_csv`` is parsed, optionally normalized, and the
    result is written to ``path_csv``.

    Each raw line is expected to look like ``filename,b0 b1 b2 b3``: a comma
    after the filename, then four space-separated box values.

    Args:
        path_csv: Location of the processed (cached) CSV.
        path_raw_csv: Location of the raw annotation file.
        overwrite: Rebuild from the raw file even if ``path_csv`` exists.
        normalize: Divide ``bbox-0``/``bbox-2`` by the image width and
            ``bbox-1``/``bbox-3`` by the image height, then clamp to [0, 1].
            Relies on the notebook-level ``images_path`` to locate the images.

    Returns:
        The annotation DataFrame.

    Raises:
        FileNotFoundError: If ``normalize`` is set and an image cannot be read.
    """
    if not overwrite and os.path.exists(path_csv):
        return pd.read_csv(path_csv)

    keys = ["filename", "bbox-0", "bbox-1", "bbox-2", "bbox-3"]
    bbox_cols = keys[1:]

    # Use a context manager so the handle is closed, and skip blank lines
    # instead of blindly dropping the last one: a file without a trailing
    # newline would otherwise lose its final annotation.
    with open(path_raw_csv, "r") as f:
        lines = [line for line in f.read().splitlines() if line.strip()]

    records = []
    for line in lines:
        parts = line.split(",")
        fields = [parts[0]] + parts[1].split(" ")
        records.append({key: fields[i] for i, key in enumerate(keys)})
    df = pd.DataFrame(records)

    if normalize:
        # Parse the pixel coordinates as integers, but store them as floats so
        # the normalized values can be written back without a dtype conflict.
        df[bbox_cols] = df[bbox_cols].astype(int).astype(float)

        for filename, df_current in tqdm(df.groupby('filename')):
            image_file = os.path.join(images_path, filename)
            image = cv2.imread(image_file)
            if image is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise FileNotFoundError(f"Could not read image: {image_file}")
            height, width = image.shape[:2]

            df = min_max_norm(df_to_update=df, df_current=df_current, col_min='bbox-0', col_max='bbox-2', max_val=width)
            df = min_max_norm(df_to_update=df, df_current=df_current, col_min='bbox-1', col_max='bbox-3', max_val=height)

        # Final safety clamp of every coordinate to the unit interval.
        df[bbox_cols] = df[bbox_cols].clip(0., 1.)

    # NOTE(review): the index is written too, so a later cached read carries an
    # extra unnamed index column that the freshly built frame does not have.
    df.to_csv(path_csv)

    return df

In [None]:
# Resolve the dataset layout: images, annotation CSVs and TFRecord outputs.
base_dir = get_dataset_path()
images_path = os.path.join(base_dir, 'images')
annotations_path = os.path.join(base_dir, "annotations")
dataset_path = os.path.join(base_dir, "dataset")
os.makedirs(dataset_path, exist_ok=True)

train_ds_path = os.path.join(dataset_path, "train_v3.tfrecord")
validation_ds_path = os.path.join(dataset_path, "validation_v4.tfrecord")
test_ds_path = os.path.join(dataset_path, "test_v3.tfrecord")

path_train_raw_csv = os.path.join(annotations_path, "100_percent_train.csv")
path_train_csv = os.path.join(annotations_path, "train_ds_all.csv")
path_test_raw_csv = os.path.join(annotations_path, "test.csv")
path_test_csv = os.path.join(annotations_path, "test_ds.csv")
path_validation_csv = os.path.join(annotations_path, "val_ds.csv")
path_train_split_csv = os.path.join(annotations_path, "train_ds.csv")

overwrite = False
df_train = get_df(path_csv=path_train_csv, path_raw_csv=path_train_raw_csv, overwrite=overwrite, normalize=True)
file_train = df_train['filename'].unique()

# Mirror get_df's caching rule: reuse the stored split only when both files
# exist, so a first run with overwrite=False creates the split instead of
# failing on the missing CSVs.
split_is_cached = os.path.exists(path_train_split_csv) and os.path.exists(path_validation_csv)

if overwrite or not split_is_cached:
    # Split by filename so all boxes of one image stay in the same subset.
    # No random_state is passed, so each regeneration yields a different split.
    file_train, file_validation = train_test_split(file_train, test_size=.2)
    df_validation = df_train[df_train['filename'].isin(file_validation)]
    df_train = df_train[df_train['filename'].isin(file_train)]
    df_validation.to_csv(path_validation_csv)
    df_train.to_csv(path_train_split_csv)
else:
    df_train = pd.read_csv(path_train_split_csv)
    df_validation = pd.read_csv(path_validation_csv)


df_test = get_df(path_csv=path_test_csv, path_raw_csv=path_test_raw_csv, overwrite=overwrite, normalize=True)

In [None]:
# Leakage check: print the filenames shared by each pair of splits
# (train/validation, test/validation, train/test). Each set should be empty.
for left_df, right_df in ((df_train, df_validation), (df_test, df_validation), (df_train, df_test)):
    left_files = set(left_df['filename'].unique())
    print(left_files.intersection(right_df['filename'].unique()))

In [None]:
# Write one TFRecord file per split; all three calls share the same overwrite flag.
overwrite = False
for split_df, split_output in (
    (df_train, train_ds_path),
    (df_validation, validation_ds_path),
    (df_test, test_ds_path),
):
    build_tfrecord(df=split_df, output_path=split_output, images_path=images_path, overwrite=overwrite)