## Imports

In [1]:
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import random

## Settings

In [2]:
# Dataset locations.
# src_folder: one '<site>_rgb_data' folder per site, read below as
#   <src_folder>/<site>_rgb_data/all_data/<class>/<image>
# annotated_folder: annotated images, one sub-folder per class
# in_folder: input <ref, current> pair images, laid out as <subset>/<class>/<image>
# out_folder: destination of the generated triplet images and of the .csv file
src_folder = '/home/drevital/obstacles_classification_datasets/obstacle_classification_RGB_data'
annotated_folder = '/home/drevital/obstacles_classification_datasets/rgb_6/annotated'
in_folder = '/home/drevital/obstacles_classification_datasets/test_rgb_6'
out_folder = '/home/drevital/obstacles_classification_datasets/test_compare_boosts'
# Site names: every entry of src_folder with its last two '_'-separated tokens
# dropped (e.g. 'israel_rgb_data' -> 'israel')
sites = ['_'.join(s.split('_')[:-2]) for s in os.listdir(src_folder)]

# Parameters used in the diff_metric to diff_coef assignment function:
#   diff_coef = 1 / (1 + exp(-(sigma_dist * alfa - beta) * gamma))
alfa = -3.5
beta = 2.0
gamma = 8
swc = 1.0 # sample weight coefficient: sample_weight = 1.0 + swc * diff_coef
# Pixel threshold applied to the max-over-channels (ref - current) difference
# when building the binary diff mask
diff_threshold = 50
std_threshold_dist = 1.5 # Distance (in stds) from the class mean at which the sample_weight correction threshold is placed

In [3]:
sites

['musashi_office',
 'koki_factory',
 'israel',
 'new_factory',
 'new_factory_humid']

In [4]:
# Threshold used for images whose source site cannot be determined
default_threshold = 50

# Per-site thresholds, keyed by site name
site_thresholds = {
    'israel': 55,
    'new_factory': 50,
    'new_factory_humid': 50,
    'musashi_office': 40,
    'koki_factory': 40,
}

## Make dictionary for the image names of each site

In [5]:
# site name -> flat list of image file names gathered from every class folder
# under <src_folder>/<site>_rgb_data/all_data/
site_images = defaultdict(list)

for site in sites:
    site_folder = os.path.join(src_folder, f'{site}_rgb_data', 'all_data')
    class_folders = os.listdir(site_folder)
    for cls in class_folders:
        cls_folder = os.path.join(site_folder, cls)
        site_images[site].extend(os.listdir(cls_folder))

## List images not found in any site

In [6]:
class_folders = ['no_obstacle', 'obstacle']

# Gather every known source image name once. Set membership is a hash lookup,
# whereas testing each annotated name against every site's list rescans the
# whole list for every image (and again for its alt_name).
known_images = {name for site in sites for name in site_images[site]}

# Print every annotated image that cannot be matched to any site
for class_folder in class_folders:
    annotated = os.listdir(os.path.join(annotated_folder, class_folder))
    for a in annotated:
        # alt_name is the same name with the last character before the
        # extension dropped (i.e. ignoring one '_' at the end of the stem)
        alt_name = a.rpartition('.')[0][:-1] + '.jpg'
        if a not in known_images and alt_name not in known_images:
            print(f'{class_folder}: {a}')

obstacle: 43_1561__reversed.jpg
obstacle: 43_1697__reversed.jpg
obstacle: 1_1235_1_reversed.jpg
obstacle: 43_1589__reversed.jpg
obstacle: 1_1195_1_reversed.jpg
obstacle: 43_1665__reversed.jpg
obstacle: 1_1031_1_reversed.jpg
obstacle: 43_1625__reversed.jpg
obstacle: 43_1525__reversed.jpg
obstacle: 1_725__reversed.jpg
obstacle: 1_1131_1_reversed.jpg
obstacle: 1_1027_1_reversed.jpg
obstacle: 43_1689__reversed.jpg
obstacle: 1_1035_1_reversed.jpg
obstacle: 43_1485__reversed.jpg
obstacle: 1_1111_1_reversed.jpg
obstacle: 1_1135_1_reversed.jpg
obstacle: 1_1159_1_reversed.jpg
obstacle: 43_1645__reversed.jpg
obstacle: 43_1509__reversed.jpg
obstacle: 1_1071_1_reversed.jpg
obstacle: 1_1147_1_reversed.jpg
obstacle: 43_1541__reversed.jpg
obstacle: 43_1661__reversed.jpg
obstacle: 43_1669__reversed.jpg
obstacle: 43_1677__reversed.jpg
obstacle: 1_1175_1_reversed.jpg
obstacle: 43_1569__reversed.jpg
obstacle: 43_1649__reversed.jpg
obstacle: 1_745__reversed.jpg
obstacle: 1_1263_1_reversed.jpg
obstacle: 1_

## A function to find the source site (and its threshold) of a given image

In [7]:
def find_site_and_threshold(im_name):
    """Return the source site of an image and the threshold to use for it.

    Sites are checked in the order of the notebook-level ``sites`` list; the
    first site whose ``site_images`` entry contains ``im_name`` wins.

    Parameters
    ----------
    im_name : str
        Image file name to look up (compared verbatim against the stored names).

    Returns
    -------
    tuple
        ``(site, threshold)``. ``site`` is ``'unknown'`` when no site lists the
        image. ``threshold`` is the site's entry in ``site_thresholds``, or
        ``default_threshold`` when the image is unknown or the site has no
        configured threshold.
    """
    for site in sites:
        if im_name in site_images[site]:
            # A site missing from site_thresholds falls back to the default
            # instead of raising KeyError.
            return site, site_thresholds.get(site, default_threshold)

    return 'unknown', default_threshold

## Define curve to assign diff_coef according to diff_metric

In [8]:
def diff_metric_to_diff_coef(sigma_dist):
    """Map a signed sigma distance to a correction coefficient in (0, 1).

    The curve is a sigmoid, 1 / (1 + exp(-x)), evaluated at
    x = (sigma_dist * alfa - beta) * gamma, where alfa, beta and gamma are the
    notebook-level curve parameters.
    """
    exponent = (sigma_dist*alfa - beta)*gamma
    denominator = 1 + np.exp(-exponent)

    return 1/denominator

## Calculate sample_weights

In [9]:
# Per-image bookkeeping for the train subset, one list entry per image.
# 'out_name' stays empty here; it is filled when the triplet images are written.
train_dict = {'in_name': [],
              'out_name': [],
              'class_name': [],
              'diff_metric': [],
              'diff_coef': [],
              'sample_weight': []
             }
diff_metrics = {'no_obstacle': [], 'obstacle': []}
class_names = ['no_obstacle', 'obstacle']
subset_name = 'train'

# Pass 1: diff_metric of every train image = fraction of pixels whose
# max-over-channels (ref - current) difference exceeds diff_threshold.
for class_name in class_names:
    class_path = os.path.join(in_folder, subset_name, class_name)
    im_names = os.listdir(class_path)
    for im_name in tqdm(im_names):
        im_path = os.path.join(class_path, im_name)
        pair = cv2.imread(im_path)

        # Generate diff mask (left half of the pair is ref, right half is current)
        w = pair.shape[1]
        ref = pair[:, :w//2]
        current = pair[:, w//2:w//2+w//2]
        diff = cv2.subtract(ref, current)
        agg_rgb = np.stack((diff[:, :, 0], diff[:, :, 1], diff[:, :, 2])).max(0)
        _, mask = cv2.threshold(agg_rgb, diff_threshold, 255, cv2.THRESH_BINARY)

        # Mask area, used to normalize the count of "on" pixels
        h = mask.shape[0]
        w = mask.shape[1]
        area = h * w

        # Update train dictionary
        train_dict['in_name'].append(im_name)
        train_dict['class_name'].append(class_name)
        diff_metric = (np.sum(mask)/255)/area
        train_dict['diff_metric'].append(diff_metric)
        diff_metrics[class_name].append(diff_metric)

# Per-class statistics of the diff metric
mean = {'no_obstacle': np.mean(diff_metrics['no_obstacle']),
        'obstacle': np.mean(diff_metrics['obstacle'])}
std = {'no_obstacle': np.std(diff_metrics['no_obstacle']),
       'obstacle': np.std(diff_metrics['obstacle']) }

# Pass 2: turn every diff_metric into a diff_coef and a sample_weight
for i, diff_metric in enumerate(train_dict['diff_metric']):
    class_name = train_dict['class_name'][i]
    # The sign adjusts the direction of the distance per class, so that for
    # both classes a negative sigma_dist marks the side that needs correction:
    # obstacle -> metric below its threshold, no_obstacle -> metric above it.
    sigma_dist_sign = 1.0 if class_name == 'obstacle' else -1.0
    # Uses the std of the sample's own class (was hard-coded to 'obstacle').
    # Kept under its own name: assigning to `diff_threshold` here would
    # overwrite the pixel threshold from the settings cell and corrupt every
    # later (or repeated) mask computation.
    metric_threshold = mean[class_name] + sigma_dist_sign * std_threshold_dist * std[class_name]
    sigma_dist = sigma_dist_sign * (diff_metric - metric_threshold)/std[class_name]
    diff_coef = diff_metric_to_diff_coef(sigma_dist)
    sample_weight = 1.0 + swc * diff_coef
    train_dict['diff_coef'].append(diff_coef)
    train_dict['sample_weight'].append(sample_weight)

100%|██████████| 4/4 [00:00<00:00, 56.12it/s]
100%|██████████| 18/18 [00:00<00:00, 387.91it/s]


In [10]:
# Debug leftover: shows the last file name left in `im_name` by the
# sample-weight loop above (relies on the loop variable leaking into the
# notebook namespace).
im_name

'253_3265_0.10_.46_2022-01-13--16-20-59__unknown_.jpg'

## A function to generate <ref, current, mask> triplet from <ref, current> pair

In [11]:
def triplet_image(pair, threshold):
    """Build a <ref, current, mask> triplet from a side-by-side <ref, current> pair.

    Parameters
    ----------
    pair : numpy.ndarray
        Image whose left half is the reference frame and whose right half is
        the current frame. Must have at least three channels; an odd trailing
        column is ignored so both halves have equal width.
    threshold : int or float
        Pixel threshold: a mask pixel is set to 255 where the per-pixel
        maximum over the first three channels of (ref - current) exceeds it.

    Returns
    -------
    numpy.ndarray
        ``ref``, ``current`` and the 3-channel mask concatenated horizontally.

    Raises
    ------
    ValueError
        If ``pair`` is None (e.g. the image file could not be read).
    """
    if pair is None:
        raise ValueError('triplet_image got no image (pair is None); was the file readable?')

    half = pair.shape[1] // 2
    ref = pair[:, :half]
    current = pair[:, half:2 * half]

    # cv2.subtract saturates instead of wrapping around, so for the uint8
    # images cv2.imread returns only pixels where ref exceeds current survive.
    diff = cv2.subtract(ref, current)
    agg_rgb = np.stack((diff[:, :, 0], diff[:, :, 1], diff[:, :, 2])).max(0)

    # Use the caller's threshold. The previous code read the notebook-level
    # `diff_threshold`, so the per-site threshold passed in was ignored.
    _, mask = cv2.threshold(agg_rgb, threshold, 255, cv2.THRESH_BINARY)
    mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

    return cv2.hconcat([ref, current, mask])

## Generate triplet images <ref, current, mask>

In [12]:
subset_names = ['train', 'eval']
class_names = ['no_obstacle', 'obstacle']
class_extensions = {'no_obstacle': 'noobs', 'obstacle': 'obs'}

# Make sure the per-subset output folders exist
for subset_name in subset_names:
    cur_out_folder = os.path.join(out_folder, subset_name)
    Path(cur_out_folder).mkdir(parents=True, exist_ok=True)

# Shuffle the dictionary entries so obstacle/no_obstacle samples are mixed.
# This is necessary for the training/validation from the corresponding
# dataframe to work properly. The same permutation is applied to every
# non-empty column; empty columns (e.g. 'out_name') stay empty.
shuffled = list(range(len(train_dict['in_name'])))
random.shuffle(shuffled)
shuffled_train_dict = {
    key: [values[j] for j in shuffled] if values else []
    for key, values in train_dict.items()
}

# Train subset: all triplets go into one flat folder; site, class tag and
# sample weight are encoded in the output file name.
subset_name = 'train'
cur_out_folder = os.path.join(out_folder, subset_name)
for i, im_name in enumerate(tqdm(shuffled_train_dict['in_name'])):
    class_name = shuffled_train_dict['class_name'][i]
    sample_weight = shuffled_train_dict['sample_weight'][i]
    class_extension = class_extensions[class_name]

    im_path = os.path.join(in_folder, subset_name, class_name, im_name)
    pair = cv2.imread(im_path)
    site, threshold = find_site_and_threshold(im_name)
    triplet = triplet_image(pair, threshold)

    stem = im_name.rpartition('.')[0]
    out_im_name = f'{stem}_{site}_{class_extension}_{sample_weight:.4f}_.jpg'
    shuffled_train_dict['out_name'].append(out_im_name)
    out_path = os.path.join(cur_out_folder, out_im_name)
    cv2.imwrite(out_path, triplet)

# Eval subset: triplets keep the per-class folder layout; only the site is
# appended to the file name.
subset_name = 'eval'
for class_name in class_names:
    class_path = os.path.join(in_folder, subset_name, class_name)
    cur_out_folder = os.path.join(out_folder, subset_name, class_name)
    Path(cur_out_folder).mkdir(parents=True, exist_ok=True)
    im_names = os.listdir(class_path)
    for im_name in tqdm(im_names):
        pair = cv2.imread(os.path.join(class_path, im_name))
        site, threshold = find_site_and_threshold(im_name)
        triplet = triplet_image(pair, threshold)
        stem = im_name.rpartition('.')[0]
        out_im_name = stem + f'_{site}_.jpg'
        out_path = os.path.join(cur_out_folder, out_im_name)
        cv2.imwrite(out_path, triplet)

100%|██████████| 22/22 [00:00<00:00, 142.35it/s]
100%|██████████| 5/5 [00:00<00:00, 442.99it/s]
100%|██████████| 5/5 [00:00<00:00, 167.09it/s]


## Create Dataframe from sample_weights Dictionary

In [13]:
shuffled_train_dict.keys()

dict_keys(['in_name', 'out_name', 'class_name', 'diff_metric', 'diff_coef', 'sample_weight'])

In [14]:
[len(shuffled_train_dict[k]) for k in shuffled_train_dict.keys()]

[22, 22, 22, 22, 22, 22]

In [15]:
# One dataframe column per dictionary key, one row per (shuffled) train image
train_df = pd.DataFrame.from_dict(shuffled_train_dict, orient='columns')

In [16]:
train_df

Unnamed: 0,in_name,out_name,class_name,diff_metric,diff_coef,sample_weight
0,167_1943_0.10_.66_2021-08-16--09-54-07__israel...,167_1943_0.10_.66_2021-08-16--09-54-07__israel...,obstacle,0.332149,0.9999999,2.0
1,1_1565_0.52_.95_2021-08-15--17-22-56__new_fact...,1_1565_0.52_.95_2021-08-15--17-22-56__new_fact...,obstacle,0.0479,1.0,2.0
2,101_2031_0.24_.39_2021-08-15--15-04-10__new_fa...,101_2031_0.24_.39_2021-08-15--15-04-10__new_fa...,obstacle,0.082285,1.0,2.0
3,8_1690_0.73_1_reversed_unknown_.jpg,8_1690_0.73_1_reversed_unknown__unknown_noobs_...,no_obstacle,0.372236,1.0,2.0
4,5_120_0.82_.11_2021-08-17--18-08-54__musashi_o...,5_120_0.82_.11_2021-08-17--18-08-54__musashi_o...,no_obstacle,0.343559,1.0,2.0
5,121_1578_0.32_.05_2021-08-16--15-31-02__israel...,121_1578_0.32_.05_2021-08-16--15-31-02__israel...,obstacle,0.188447,1.0,2.0
6,46_1066_0.43_.17_2022-01-13--16-03-33__unknown...,46_1066_0.43_.17_2022-01-13--16-03-33__unknown...,obstacle,0.546214,7.633371e-10,1.0
7,46_319_0.79_.86_2021-08-15--16-29-44__new_fact...,46_319_0.79_.86_2021-08-15--16-29-44__new_fact...,no_obstacle,0.395592,1.0,2.0
8,106_3228_0.33_.09_2021-08-16--10-36-22__israel...,106_3228_0.33_.09_2021-08-16--10-36-22__israel...,obstacle,0.121572,1.0,2.0
9,17_1049_0.25_.05_2021-08-17--14-57-27__koki_fa...,17_1049_0.25_.05_2021-08-17--14-57-27__koki_fa...,obstacle,0.266126,1.0,2.0


## Store the Dataframe in a .csv file

In [17]:
# The .csv is named after the last component of out_folder and stored inside it
out_folder_name = out_folder.rsplit('/', 1)[-1]
csv_name = f'{out_folder_name}.csv'
csv_path = os.path.join(out_folder, csv_name)
train_df.to_csv(csv_path)

In [18]:
mean['obstacle'], std['obstacle']

(0.2728643451268964, 0.16286995844053484)

In [19]:
mean['no_obstacle'], std['no_obstacle']

(0.34728438403160966, 0.04417328406668616)