## Imports

In [101]:
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import random

## Settings

In [102]:
# Input dataset root; images are read from <in_dir>/<subset>/<class>/
in_dir = '/home/drevital/obstacles_classification_datasets/base_dataset'
# Output root; generated triplet images and the sample-weights .csv are written here
out_dir = '/home/drevital/obstacles_classification_datasets/feb23_c_std_15_gamma_15'
# Folder holding one sub-folder per site; each sub-folder lists the images of that site
sites_dir = '/home/drevital/obstacles_classification_datasets/base_dataset/sites'
# Site names are the entries of sites_dir, in whatever order os.listdir returns them
sites = os.listdir(sites_dir)

# Parameters used in the diff_metric to diff_coef assignment function:
# diff_coef = 1 / (1 + exp(-(sigma_dist * alfa - beta) * gamma))
alfa = -3.5
beta = 2.0
gamma = 1.5
swc = 2.0 # sample weight coefficient: sample_weight = 1.0 + swc * diff_coef
diff_threshold = 50 # pixel threshold used to binarize the <ref, current> difference for diff_metric
std_threshold_dist = 1.5 # Distance from std to apply sample_weight correction

In [103]:
# Show the site names discovered under sites_dir
sites

['kfar_saba',
 'koki_factory',
 'new_factory_humid',
 'musashi_office',
 'shufersal',
 'new_factory',
 'neve_ilan',
 'unknown']

In [104]:
# Threshold used when an image cannot be attributed to any site
default_threshold = 50

# Per-site mask thresholds, grouped by value.
# Group order is kept so the resulting dict preserves the original key order.
_threshold_groups = (
    (55, ('neve_ilan', 'kfar_saba', 'shufersal')),
    (50, ('new_factory', 'new_factory_humid')),
    (40, ('musashi_office', 'koki_factory')),
    (50, ('unknown',)),
)
site_thresholds = {
    site_name: value
    for value, site_names in _threshold_groups
    for site_name in site_names
}

## Make dictionary for the image names of each site

In [105]:
# Map each site name to the list of image file names found in its folder.
# A defaultdict is kept so that indexing a site that was never listed yields an empty list.
site_images = defaultdict(
    list,
    {
        site_name: os.listdir(os.path.join(sites_dir, site_name))
        for site_name in sites
    },
)

In [106]:
# Show which sites have an entry in the site -> image names dictionary
site_images.keys()

dict_keys(['kfar_saba', 'koki_factory', 'new_factory_humid', 'musashi_office', 'shufersal', 'new_factory', 'neve_ilan', 'unknown'])

## A function to find the source site of a given image

In [107]:
def find_site_and_threshold(im_name):
    """Return the (site, threshold) pair that applies to an image file name.

    The module-level `sites` list is scanned in order and the first site whose
    `site_images` entry contains `im_name` wins.

    Parameters
    ----------
    im_name : str
        Image file name (no directory part), as listed in the site folders.

    Returns
    -------
    tuple
        (site, threshold). `threshold` is `site_thresholds[site]` when the site
        has a configured threshold, otherwise `default_threshold`.

    Side effects
    ------------
    An image that belongs to no site is appended to `site_images['unknown']`
    and reported as site 'unknown' with `default_threshold`.
    """
    for site in sites:
        if im_name in site_images[site]:
            # A site folder may exist without a configured threshold;
            # fall back to the default instead of raising KeyError.
            return site, site_thresholds.get(site, default_threshold)

    site = 'unknown'
    site_images[site].append(im_name)
    return site, default_threshold

## Define curve to assign diff_coef according to diff_metric

In [108]:
def diff_metric_to_diff_coef(sigma_dist):
    """Map a distance expressed in standard deviations to a correction coefficient.

    The curve is a sigmoid shaped by the module-level controls `alfa`, `beta`
    and `gamma` (explained at the beginning of the diff_coef_curve notebook):

        1 / (1 + exp(-(sigma_dist * alfa - beta) * gamma))

    The result lies between 0 and 1.
    """
    exponent = -(sigma_dist * alfa - beta) * gamma
    return 1 / (1 + np.exp(exponent))

## Calculate sample_weights

In [109]:
def _pair_diff_metric(pair, pixel_threshold):
    """Return the fraction of pixels whose <ref, current> difference exceeds pixel_threshold.

    `pair` is a side-by-side image: the left half is the reference and the
    right half is the current frame.
    """
    half_w = pair.shape[1] // 2
    ref = pair[:, :half_w]
    current = pair[:, half_w:half_w * 2]
    # NOTE(review): cv2.subtract saturates at 0, so only pixels where ref is brighter
    # than current contribute, while triplet_image_true_mask uses cv2.absdiff.
    # Kept as-is to preserve the existing metric -- confirm this asymmetry is intended.
    diff = cv2.subtract(ref, current)
    # Strongest difference across the three colour channels
    agg_rgb = np.stack((diff[:, :, 0], diff[:, :, 1], diff[:, :, 2])).max(0)
    _, mask = cv2.threshold(agg_rgb, pixel_threshold, 255, cv2.THRESH_BINARY)
    area = mask.shape[0] * mask.shape[1]
    # mask holds 0 or 255, so sum / 255 is the number of "different" pixels
    return (np.sum(mask) / 255) / area


# Per-image records for the train subset; 'out_name' is filled in later,
# when the triplet images are written.
train_dict = {'in_name': [],
              'out_name': [],
              'class_name': [],
              'diff_metric': [],
              'diff_coef': [],
              'sample_weight': []
             }
diff_metrics = {'no_obstacle': [], 'obstacle': []}
class_names = ['no_obstacle', 'obstacle']
subset_name = 'train'

# Pass 1: compute the diff_metric of every training image
for class_name in class_names:
    class_path = os.path.join(in_dir, subset_name, class_name)
    for im_name in tqdm(os.listdir(class_path)):
        im_path = os.path.join(class_path, im_name)
        pair = cv2.imread(im_path)
        if pair is None:
            # cv2.imread signals an unreadable / non-image file by returning None
            raise ValueError(f'Could not read image: {im_path}')

        diff_metric = _pair_diff_metric(pair, diff_threshold)

        # Update train dictionary
        train_dict['in_name'].append(im_name)
        train_dict['class_name'].append(class_name)
        train_dict['diff_metric'].append(diff_metric)
        diff_metrics[class_name].append(diff_metric)

# Per-class statistics of the diff_metric
mean = {name: np.mean(diff_metrics[name]) for name in class_names}
std = {name: np.std(diff_metrics[name]) for name in class_names}

# Pass 2: turn each diff_metric into a diff_coef and a sample_weight
for class_name, diff_metric in zip(train_dict['class_name'], train_dict['diff_metric']):
    # Following is to adjust the direction of distance from std and correction accordingly
    # For obstacle - a negative sigma means we are lower than threshold and need correction
    # For no obstacle a positive sigma means we are higher than threshold and need correction
    sigma_dist_sign = 1.0 if class_name == 'obstacle' else -1.0
    # Deliberately NOT named diff_threshold: that module-level setting is the pixel
    # threshold used in pass 1 and must not be overwritten (re-running the cell
    # would otherwise binarize the masks with a per-class float).
    class_diff_threshold = mean[class_name] + sigma_dist_sign * std_threshold_dist * std[class_name]
    sigma_dist = sigma_dist_sign * (diff_metric - class_diff_threshold) / std[class_name]
    diff_coef = diff_metric_to_diff_coef(sigma_dist)
    sample_weight = 1.0 + swc * diff_coef
    train_dict['diff_coef'].append(diff_coef)
    train_dict['sample_weight'].append(sample_weight)

100%|██████████| 6810/6810 [00:13<00:00, 513.43it/s]
100%|██████████| 7015/7015 [00:35<00:00, 199.70it/s]


## A function to generate <ref, current, true_mask> triplet from <ref, current> pair

In [110]:
def triplet_image_true_mask(pair, threshold):
    """Build a <ref, current, mask> triplet image from a <ref, current> pair.

    The left half of `pair` is the reference and the right half the current
    frame. The mask is the per-pixel absolute difference (strongest colour
    channel), binarized at `threshold` and cleaned up / grown by a fixed
    sequence of erosions and dilations.

    Returns the three images concatenated horizontally.
    """
    half_w = pair.shape[1] // 2
    ref, current = pair[:, :half_w], pair[:, half_w:half_w * 2]

    diff = cv2.absdiff(current, ref)
    agg_rgb = np.stack((diff[:, :, 0], diff[:, :, 1], diff[:, :, 2])).max(0)
    _, mask = cv2.threshold(agg_rgb, threshold, 255, cv2.THRESH_BINARY)

    # (operation, square kernel size, iterations), applied in order
    morph_steps = (
        # old morphological operations
        (cv2.erode, 3, 1),    # reduce noise
        (cv2.dilate, 7, 1),
        (cv2.erode, 5, 1),
        (cv2.dilate, 9, 1),
        # increase area to an object
        (cv2.dilate, 11, 2),
        (cv2.dilate, 13, 1),
        (cv2.erode, 11, 1),
        (cv2.erode, 5, 1),
    )
    for morph_op, ksize, n_iter in morph_steps:
        mask = morph_op(mask, np.ones((ksize, ksize), np.uint8), iterations=n_iter)

    # The single-channel mask is expanded to 3 channels so it can be concatenated
    mask_bgr = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
    return cv2.hconcat([ref, current, mask_bgr])

## A function to generate <ref, current, black_mask> triplet from <ref, current> pair

In [111]:
def triplet_image_black_mask(pair, threshold):
    """Build a <ref, current, black_mask> triplet image from a <ref, current> pair.

    The left half of `pair` is the reference and the right half the current
    frame; the third panel is an all-zero (black) 3-channel image of the same
    height and width as the reference. `threshold` is accepted for signature
    parity with triplet_image_true_mask and is not used.
    """
    half_w = pair.shape[1] // 2
    ref = pair[:, :half_w]
    current = pair[:, half_w:2 * half_w]
    black = np.zeros((ref.shape[0], ref.shape[1], 3), dtype=np.uint8)
    return cv2.hconcat([ref, current, black])

## Generate triplet images <ref, current, mask>

In [112]:
subset_names = ['train', 'eval']
class_names = ['no_obstacle', 'obstacle']
class_extensions = {'no_obstacle': 'noobs', 'obstacle': 'obs'}
# Fixed seed so the shuffled order (and therefore the .csv row order) is reproducible
shuffle_seed = 42

for subset_name in subset_names:
    Path(os.path.join(out_dir, subset_name)).mkdir(parents=True, exist_ok=True)

# Prepare indices for shuffling the images in dictionary, so obstacle/no_obstacle are mixed
# This is necessary for the training/validation from corresponding dataframe to work properly
n_train = len(train_dict['in_name'])
shuffled = list(range(n_train))
random.Random(shuffle_seed).shuffle(shuffled)
# Apply the same permutation to every populated column; empty columns
# ('out_name' at this point) stay empty and are filled in below.
shuffled_train_dict = {
    key: [values[j] for j in shuffled] if len(values) > 0 else []
    for key, values in train_dict.items()
}

# --- train: flat output folder, true diff mask, sample weight encoded in the file name
subset_name = 'train'
cur_out_dir = os.path.join(out_dir, subset_name)
train_rows = zip(shuffled_train_dict['in_name'],
                 shuffled_train_dict['class_name'],
                 shuffled_train_dict['sample_weight'])
for im_name, class_name, sample_weight in tqdm(train_rows, total=n_train):
    im_path = os.path.join(in_dir, subset_name, class_name, im_name)
    pair = cv2.imread(im_path)
    if pair is None:
        # cv2.imread signals an unreadable / non-image file by returning None
        raise ValueError(f'Could not read image: {im_path}')
    site, threshold = find_site_and_threshold(im_name)
    triplet = triplet_image_true_mask(pair, threshold)
    class_extension = class_extensions[class_name]
    out_im_name = '.'.join(im_name.split('.')[:-1])\
         + f'_{site}_{class_extension}_{sample_weight:.4f}_.jpg'
    shuffled_train_dict['out_name'].append(out_im_name)
    out_path = os.path.join(cur_out_dir, out_im_name)
    # cv2.imwrite reports failure through its return value instead of raising;
    # fail loudly so the .csv never references a file that was not written.
    if not cv2.imwrite(out_path, triplet):
        raise OSError(f'Failed to write image: {out_path}')

# --- eval: one output folder per class, black mask
subset_name = 'eval'
for class_name in class_names:
    class_path = os.path.join(in_dir, subset_name, class_name)
    cur_out_dir = os.path.join(out_dir, subset_name, class_name)
    Path(cur_out_dir).mkdir(parents=True, exist_ok=True)
    for im_name in tqdm(os.listdir(class_path)):
        im_path = os.path.join(class_path, im_name)
        pair = cv2.imread(im_path)
        if pair is None:
            raise ValueError(f'Could not read image: {im_path}')
        site, threshold = find_site_and_threshold(im_name)
        triplet = triplet_image_black_mask(pair, threshold)
        out_im_name = '.'.join(im_name.split('.')[:-1]) + f'_{site}_.jpg'
        out_path = os.path.join(cur_out_dir, out_im_name)
        if not cv2.imwrite(out_path, triplet):
            raise OSError(f'Failed to write image: {out_path}')

100%|██████████| 13825/13825 [01:51<00:00, 123.89it/s]
100%|██████████| 756/756 [00:03<00:00, 218.19it/s]
100%|██████████| 771/771 [00:07<00:00, 103.32it/s]


## Create Dataframe from sample_weights Dictionary

In [113]:
# Show the columns that will make up the DataFrame
shuffled_train_dict.keys()

dict_keys(['in_name', 'out_name', 'class_name', 'diff_metric', 'diff_coef', 'sample_weight'])

In [114]:
# Sanity check: every column list should have the same length
[len(shuffled_train_dict[k]) for k in shuffled_train_dict.keys()]

[13825, 13825, 13825, 13825, 13825, 13825]

In [115]:
# One row per training image; each dictionary key becomes a column
train_df = pd.DataFrame(shuffled_train_dict)

In [116]:
# Display the training DataFrame (pandas elides the middle rows in the rendered output)
train_df

Unnamed: 0,in_name,out_name,class_name,diff_metric,diff_coef,sample_weight
0,587_19057_0_shear_222.jpg,587_19057_0_shear_222_unknown_noobs_3.0000_.jpg,no_obstacle,0.247208,0.999983,2.999967
1,115_1964_0_rotate_475.jpg,115_1964_0_rotate_475_unknown_obs_3.0000_.jpg,obstacle,0.001034,0.999997,2.999994
2,58_1752_.96_2021-08-15--11-59-26_.jpg,58_1752_.96_2021-08-15--11-59-26__new_factory_...,obstacle,0.213763,0.974624,2.949247
3,458_obstacles_classification_10_2022-12-05T15-...,458_obstacles_classification_10_2022-12-05T15-...,obstacle,0.218325,0.969300,2.938600
4,293_obstacles_0_crop_136.jpg,293_obstacles_0_crop_136_unknown_noobs_2.9964_...,no_obstacle,0.130875,0.998219,2.996437
...,...,...,...,...,...,...
13820,282_obstacles_classification_10_2022-10-25T14-...,282_obstacles_classification_10_2022-10-25T14-...,obstacle,0.099875,0.999804,2.999609
13821,1_171_.79_2021-08-15--11-39-59_.jpg,1_171_.79_2021-08-15--11-39-59__new_factory_ob...,obstacle,0.292219,0.569260,2.138519
13822,1904_obstacles_0_shear_3737.jpg,1904_obstacles_0_shear_3737_unknown_noobs_3.00...,no_obstacle,0.443400,1.000000,3.000000
13823,1382_obstacles_0_shear_4367.jpg,1382_obstacles_0_shear_4367_unknown_noobs_2.99...,no_obstacle,0.105325,0.995044,2.990088


## Store the Dataframe in a .csv file

In [117]:
# The .csv is named after the last component of out_dir and stored inside out_dir
out_dir_name = out_dir.rsplit('/', 1)[-1]
csv_name = f'{out_dir_name}.csv'
csv_path = os.path.join(out_dir, csv_name)
train_df.to_csv(csv_path)

In [118]:
# Mean and standard deviation of diff_metric over the 'obstacle' training images
mean['obstacle'], std['obstacle']

(0.18519772001032608, 0.12224598107311017)

In [119]:
# Mean and standard deviation of diff_metric over the 'no_obstacle' training images
mean['no_obstacle'], std['no_obstacle']

(0.09469074593042251, 0.13068741788982025)

In [120]:
# Report how many image names are registered per site, followed by the grand total
site_counts = {site_name: len(names) for site_name, names in site_images.items()}
for site_name, count in site_counts.items():
    print(f'{site_name}: {count}')
tot = sum(site_counts.values())
print(f'========== total: {tot}')

kfar_saba: 1539
koki_factory: 437
new_factory_humid: 378
musashi_office: 3171
shufersal: 1165
new_factory: 7649
neve_ilan: 3486
unknown: 7644
