## Imports

In [1]:
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

## Settings

In [2]:
src_folder = '/home/drevital/obstacles_classification_datasets/obstacle_classification_RGB_data'
annotated_folder = '/home/drevital/obstacles_classification_datasets/rgb_6/annotated'
in_folder = '/home/drevital/obstacles_classification_datasets/test_rgb_6'
out_folder = '/home/drevital/obstacles_classification_datasets/7_channels_weights'
sites = ['_'.join(s.split('_')[:-2]) for s in os.listdir(src_folder)]

# Parameters used in the diff_metric to diff_coef assignent function
alfa = 3
beta = 3.7
gamma = 8
swc = 1.0 # sample weight coefficient
diff_threshold = 50

In [3]:
sites

['musashi_office',
 'koki_factory',
 'israel',
 'new_factory',
 'new_factory_humid']

In [4]:
site_thresholds = {'israel': 55, 'new_factory': 50, 'new_factory_humid': 50, 'musashi_office': 40, 'koki_factory': 40}
default_threshold = 50

## Make dictionary for the image names of each site

In [5]:
site_images = defaultdict(list)

for site in sites:
    site_folder = os.path.join(src_folder, site + '_rgb_data','all_data')
    class_folders = os.listdir(site_folder)
    for cls in class_folders:
        site_images[site] += [f for f in os.listdir(os.path.join(site_folder,cls))]

## List images not found in any site

In [6]:
class_folders = ['no_obstacle', 'obstacle']

for class_folder in class_folders:
    annotated = os.listdir(os.path.join(annotated_folder, class_folder))
    for a in annotated:
        # alt_name takes into account the same name with ignoring one _ at the end
        alt_name = '.'.join(a.split('.')[:-1])[:-1] + '.jpg'
        found_states = [a in site_images[site] for site in sites]
        found = any(found_states)
        alt_found = any([alt_name in site_images[site] for site in sites])
        found = found or alt_found
        if not found:
            print(f'{class_folder}: {a}')

obstacle: 43_1561__reversed.jpg
obstacle: 43_1697__reversed.jpg
obstacle: 1_1235_1_reversed.jpg
obstacle: 43_1589__reversed.jpg
obstacle: 1_1195_1_reversed.jpg
obstacle: 43_1665__reversed.jpg
obstacle: 1_1031_1_reversed.jpg
obstacle: 43_1625__reversed.jpg
obstacle: 43_1525__reversed.jpg
obstacle: 1_725__reversed.jpg
obstacle: 1_1131_1_reversed.jpg
obstacle: 1_1027_1_reversed.jpg
obstacle: 43_1689__reversed.jpg
obstacle: 1_1035_1_reversed.jpg
obstacle: 43_1485__reversed.jpg
obstacle: 1_1111_1_reversed.jpg
obstacle: 1_1135_1_reversed.jpg
obstacle: 1_1159_1_reversed.jpg
obstacle: 43_1645__reversed.jpg
obstacle: 43_1509__reversed.jpg
obstacle: 1_1071_1_reversed.jpg
obstacle: 1_1147_1_reversed.jpg
obstacle: 43_1541__reversed.jpg
obstacle: 43_1661__reversed.jpg
obstacle: 43_1669__reversed.jpg
obstacle: 43_1677__reversed.jpg
obstacle: 1_1175_1_reversed.jpg
obstacle: 43_1569__reversed.jpg
obstacle: 43_1649__reversed.jpg
obstacle: 1_745__reversed.jpg
obstacle: 1_1263_1_reversed.jpg
obstacle: 1_

## A funciton to find the source site of a given image

In [7]:
def find_site_and_threshold(im_name):
    found_states = [im_name in site_images[site] for site in sites]
    
    if any(found_states):
        site = sites[np.argmax(found_states)]
        threshold = site_thresholds[site]
    else:
        site = 'unknown'
        threshold = default_threshold
        
    return site, threshold

## Define curve to assign diff_coef according to diff_metric

In [8]:
def diff_metric_to_diff_coef(sigma_dist):
    
    # Based on Sigmoid
    # adding alpha, beta and gamma controls, as explained at the
    # beginning of this notebook
    
    return 1/(1 + np.exp(-(sigma_dist*alfa-beta)*gamma))

## Calculate sample_weights

In [9]:
train_dict = {'in_name': [],
              'out_name': [],
              'class_name': [],
              'diff_metric': [],
              'diff_coef': [],
              'sample_weight': []
             }
diff_metrics = {'no_obstacle': [], 'obstacle': []}
class_names = ['no_obstacle', 'obstacle']
subset_name = 'train'

for class_name in class_names:
    class_path = os.path.join(in_folder, subset_name, class_name)
    im_names = os.listdir(class_path)
    for im_name in tqdm(im_names):
        im_path = os.path.join(class_path, im_name)
        pair = cv2.imread(im_path)

        # Generate diff mask            
        w = pair.shape[1]
        ref = pair[:, :w//2]
        current = pair[:, w//2:]
        diff = cv2.subtract(ref, current)
        agg_rgb = np.stack((diff[:, :, 0], diff[:, :, 1], diff[:, :, 2])).max(0)
        _, mask = cv2.threshold(agg_rgb, diff_threshold, 255, cv2.THRESH_BINARY)

        # Calculate diff_coeff
        h = mask.shape[0]
        w = mask.shape[1]
        area = h * w

        # Update train dictionary
        train_dict['in_name'].append(im_name)
        train_dict['class_name'].append(class_name)
        diff_metric = (np.sum(mask)/255)/area
        if class_name == 'obstacle':
            diff_metric = 1.0 - diff_metric
        train_dict['diff_metric'].append(diff_metric)    
        diff_metrics[class_name].append(diff_metric)
            
mean = {'no_obstacle': np.mean(diff_metrics['no_obstacle']),
        'obstacle': np.mean(diff_metrics['obstacle'])}
std = {'no_obstacle': np.std(diff_metrics['no_obstacle']),
       'obstacle': np.std(diff_metrics['obstacle']) }

for i, diff_metric in enumerate(train_dict['diff_metric']):
    class_name = train_dict['class_name'][i]
    sigma_dist = abs(diff_metric - mean[class_name])/std[class_name]
    diff_coef = diff_metric_to_diff_coef(sigma_dist)
    sample_weight = 1.0 + swc * diff_coef
    train_dict['diff_coef'].append(diff_coef)
    train_dict['sample_weight'].append(sample_weight)

100%|██████████| 5/5 [00:00<00:00, 1527.53it/s]
100%|██████████| 5/5 [00:00<00:00, 384.91it/s]


## A function to generate <ref, current, mask> triplet from <ref, current> pair

In [10]:
def triplet_image(pair, threshold):
    w = pair.shape[1]
    ref = pair[:, :w//2]
    current = pair[:, w//2:]
    diff = cv2.absdiff(current, ref)
    agg_rgb = np.stack((diff[:, :, 0], diff[:, :, 1], diff[:, :, 2])).max(0)
    _, mask = cv2.threshold(agg_rgb, threshold, 255, cv2.THRESH_BINARY)

    # old morphological operations
    copyImg = cv2.erode(mask, np.ones((3, 3), np.uint8), iterations=1)  # reduce noise
    copyImg = cv2.dilate(copyImg, np.ones((7, 7), np.uint8), iterations=1)
    copyImg = cv2.erode(copyImg, np.ones((5, 5), np.uint8), iterations=1)
    copyImg = cv2.dilate(copyImg, np.ones((9, 9), np.uint8), iterations=1)
    kernel = np.ones((11, 11), np.uint8)  # kernel for dilation

    # increase area to an object
    copyImg = cv2.dilate(copyImg, kernel, iterations=2)
    copyImg = cv2.dilate(copyImg, np.ones((13, 13), np.uint8), iterations=1)
    copyImg = cv2.erode(copyImg, np.ones((11, 11), np.uint8), iterations=1)
    copyImg = cv2.erode(copyImg, np.ones((5, 5), np.uint8), iterations=1)

    mask = copyImg 
    mask = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)  
    
    return cv2.hconcat([ref, current, mask])

## Generate triplet images <ref, current, mask>

In [11]:
subset_names = ['train', 'eval']
class_names = ['no_obstacle', 'obstacle']
class_extensions = {'no_obstacle': 'noobs', 'obstacle': 'obs'}

for subset_name in subset_names:
    cur_out_folder = os.path.join(out_folder, subset_name)
    Path(cur_out_folder).mkdir(parents=True, exist_ok=True)

subset_name = 'train'
i = 0
for im_name in tqdm(train_dict['in_name']):
    class_name = train_dict['class_name'][i]
    class_path = os.path.join(in_folder, subset_name, class_name)
    im_path = os.path.join(class_path, im_name)
    pair = cv2.imread(im_path)
    site, threshold = find_site_and_threshold(im_name)
    triplet = triplet_image(pair, threshold)
    class_extension = class_extensions[class_name]
    sample_weight = train_dict['sample_weight'][i]
    out_im_name = '.'.join(im_name.split('.')[:-1])\
         + f'_{site}_{class_extension}_{sample_weight:.4f}_.jpg'
    train_dict['out_name'].append(out_im_name)
    cur_out_folder = os.path.join(out_folder, subset_name)
    out_path = os.path.join(cur_out_folder, out_im_name)
    cv2.imwrite(out_path, triplet)
    i += 1
    
subset_name = 'eval'
for class_name in class_names:
    class_path = os.path.join(in_folder, subset_name, class_name)
    im_names = os.listdir(class_path)
    cur_out_folder = os.path.join(out_folder, subset_name, class_name)
    Path(cur_out_folder).mkdir(parents=True, exist_ok=True)
    for im_name in tqdm(im_names):
        im_path = os.path.join(class_path, im_name)
        pair = cv2.imread(im_path)
        site, threshold = find_site_and_threshold(im_name)
        triplet = triplet_image(pair, threshold)
        out_im_name = '.'.join(im_name.split('.')[:-1]) + f'_{site}_.jpg'
        out_path = os.path.join(cur_out_folder, out_im_name)
        cv2.imwrite(out_path, triplet)    

100%|██████████| 10/10 [00:00<00:00, 213.98it/s]
100%|██████████| 5/5 [00:00<00:00, 504.32it/s]
100%|██████████| 5/5 [00:00<00:00, 174.26it/s]


In [12]:
out_path

'/home/drevital/obstacles_classification_datasets/7_channels_weights/eval/obstacle/1_506_.91_2021-08-15--15-13-35__new_factory_.jpg'

## Create Dataframe from sample_weights Dictionary

In [13]:
train_dict.keys()

dict_keys(['in_name', 'out_name', 'class_name', 'diff_metric', 'diff_coef', 'sample_weight'])

In [14]:
[len(train_dict[k]) for k in train_dict.keys()]

[10, 10, 10, 10, 10, 10]

In [15]:
train_df = pd.DataFrame.from_dict(train_dict)

In [16]:
train_df

Unnamed: 0,in_name,out_name,class_name,diff_metric,diff_coef,sample_weight
0,1_167_.06_2021-08-15--18-10-02_.jpg,1_167_.06_2021-08-15--18-10-02__new_factory_no...,no_obstacle,0.074704,9.299552e-10,1.0
1,1_42_.74_2021-08-15--18-13-17_.jpg,1_42_.74_2021-08-15--18-13-17__new_factory_noo...,no_obstacle,0.045747,3.900233e-07,1.0
2,1_167_.35_2021-08-15--17-47-27_.jpg,1_167_.35_2021-08-15--17-47-27__new_factory_no...,no_obstacle,0.133643,4.564874e-12,1.0
3,1_49_.52_2021-08-15--16-48-27_.jpg,1_49_.52_2021-08-15--16-48-27__new_factory_noo...,no_obstacle,0.330209,0.9999997,2.0
4,1_173_.02_2021-08-17--18-19-26_.jpg,1_173_.02_2021-08-17--18-19-26__musashi_office...,no_obstacle,0.0003,0.005070587,1.005071
5,1_197_.81_2021-08-15--17-28-35_.jpg,1_197_.81_2021-08-15--17-28-35__new_factory_ob...,obstacle,0.827262,4.588846e-10,1.0
6,1_70_.98_2021-08-17--18-08-50_.jpg,1_70_.98_2021-08-17--18-08-50__musashi_office_...,obstacle,0.953931,0.9999971,1.999997
7,1_204_1.00_2021-08-17--18-15-29_.jpg,1_204_1.00_2021-08-17--18-15-29__musashi_offic...,obstacle,0.843114,8.315621e-13,1.0
8,1_231_.97_2021-08-15--11-40-03_.jpg,1_231_.97_2021-08-15--11-40-03__new_factory_ob...,obstacle,0.767507,0.908568,1.908568
9,1_107_.99_2021-08-15--17-28-28_.jpg,1_107_.99_2021-08-15--17-28-28__new_factory_ob...,obstacle,0.846159,2.472571e-13,1.0


## Store the Dataframe in a .csv file

In [17]:
csv_name = out_folder.split('/')[-1] + '.csv'
csv_path = os.path.join(out_folder, csv_name)
train_df.to_csv(csv_path)