# Notebook Details

*   Author: Esther Vogt
*   Creation Date: 25.05.2021
*   Purpose: Get a first understanding of the image data

# Imports / Settings / User Input

In [None]:
# make the Google Drive files reachable from this Colab runtime
from google.colab import drive

# local mount point under which the Drive content appears
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# copy the helper module data_munching_ev.py from the mounted Drive into the
# current working directory so that it can be imported (as `dm`) further below
!cp '/content/gdrive/MyDrive/Computer Vision Project/notebooks/data_munching_ev.py' .

In [None]:
# load packages
from PIL import Image
import matplotlib.pyplot as plt
import torch
import os
import pandas as pd
import numpy as np
import re

# import own functions
import data_munching_ev as dm

# Find match: img + annotation

In [None]:
# root folder of the part04 annotation data on the mounted drive
part04_path = '/content/gdrive/MyDrive/Computer Vision Project/data/raw/raijrr_sugarbeets_2016_annotations_part04'

# collect every directory entry whose name contains the 'CKA_' marker
# (one entry per recording day), in the order os.listdir returns them
part04_cka_lst = []
for entry_name in os.listdir(part04_path):
  if 'CKA_' in entry_name:
    part04_cka_lst.append(entry_name)

# report how many day folders were found and what they are called
print('#cka (=day) folders: ', len(part04_cka_lst))
print('names of day folders:\n', part04_cka_lst)

#cka (=day) folders:  20
names of day folders:
 ['CKA_160510', 'CKA_160511', 'CKA_160527', 'CKA_160504', 'CKA_160517', 'CKA_160518', 'CKA_160503', 'CKA_weeds', 'CKA_160427', 'CKA_160502', 'CKA_160421', 'CKA_160428', 'CKA_160426', 'CKA_160429', 'CKA_160523', 'CKA_160512', 'CKA_160513', 'CKA_160505', 'CKA_160509', 'CKA_160506']


In [None]:
# Build an availability overview: for every day (cka) folder, match the nir/rgb
# images and the colorCleaned/iMapCleaned annotations by file name and record
# which of the four variants exists for each image id.
# Result: img_availability_df with columns
#   nir, rgb, id, colorCleaned, iMapCleaned, cka

# collect one matching df per day folder; they are concatenated once after the
# loop instead of growing a DataFrame with pd.concat in every iteration
cka_match_dfs = []

# get total #items to evaluate
print('#cka: ',len(part04_cka_lst))

# loop over all day folders
# (for a quick test, iterate over a short list such as ['CKA_160510','CKA_160511'])
for cka in part04_cka_lst:

  print(cka)

  # absolute path of the current day folder
  cka_path = part04_path + '/' + cka

  # generate list of all files in directory
  cka_l1_lst = dm.getListOfFiles(cka_path)

  # initialize list for nir vs. rgb images
  cka_images_nir = []
  cka_images_rgb = []

  # initialize list for colorCleaned vs. iMapCleaned images
  cka_anno_colorCleaned = []
  cka_anno_iMapCleaned = []

  # (sub-folder marker, target list) pairs; the order matters because the
  # first marker contained in a path decides the category
  category_buckets = [
      ('images/nir', cka_images_nir),
      ('images/rgb', cka_images_rgb),
      ('annotations/dlp/colorCleaned', cka_anno_colorCleaned),
      ('annotations/dlp/iMapCleaned', cka_anno_iMapCleaned),
  ]

  # sort images into according specification (img vs. annotation, nir vs. rgb)
  for file in cka_l1_lst:
    # strip the day-folder prefix with a literal replace: the path must not be
    # interpreted as a regular expression (it may contain '.', '+', '(' ...)
    rel_name = file.replace(cka_path, '').replace('.png', '')

    for marker, bucket in category_buckets:
      if marker in rel_name:
        # keep only the bare file name (without sub-folder and extension)
        bucket.append(rel_name.replace('/' + marker + '/', ''))
        break

  # find match: images - nir vs. rgb
  cka_match_img_df = pd.DataFrame(cka_images_nir, columns=['nir']).merge(
      pd.DataFrame(cka_images_rgb, columns=['rgb']),
      left_on='nir',
      right_on='rgb',
      how='outer')
  # one id per row, derived from the nir/rgb names by the project helper
  cka_match_img_df['id'] = cka_match_img_df.apply(
      lambda x: dm.generate_id_col(x,'nir','rgb'), axis=1)

  # find match: annotations - colorCleaned vs. iMapCleaned
  cka_match_anno_df = pd.DataFrame(cka_anno_colorCleaned, columns=['colorCleaned']).merge(
      pd.DataFrame(cka_anno_iMapCleaned, columns=['iMapCleaned']),
      left_on='colorCleaned',
      right_on='iMapCleaned',
      how='outer')
  cka_match_anno_df['id'] = cka_match_anno_df.apply(
      lambda x: dm.generate_id_col(x,'colorCleaned','iMapCleaned'), axis=1)

  # find match: images vs. annotations
  cka_match_df = cka_match_img_df.merge(cka_match_anno_df,
                                        on='id',
                                        how='outer')
  cka_match_df['cka'] = cka

  # remember cka matching result for the overall availability df
  cka_match_dfs.append(cka_match_df)

# overall availability df across all day folders; without any day folder this
# stays an empty DataFrame, exactly as the initial value used to be
img_availability_df = pd.concat(cka_match_dfs) if cka_match_dfs else pd.DataFrame()

# inspect final df
display(img_availability_df.head())

# print summary statistics w.r.t. missing values
print('#images per category:\n',
      img_availability_df.groupby('cka').count(),'\n')
print('#missing images per category:\n',
      img_availability_df.groupby('cka').agg(lambda x: x.isnull().sum()).reset_index(),'\n')

# per day folder: count how often each availability pattern occurs
# (1 = file name present, 0 = missing in that column)
print('******************* missing images across folders per cka *******************')
for cka in list(img_availability_df['cka'].unique()):
  print(f'#missing images across {cka}:\n',
        (img_availability_df[img_availability_df['cka']==cka].notna() * 1).value_counts(),'\n')

#cka:  20
CKA_160510
CKA_160511
CKA_160527
CKA_160504
CKA_160517
CKA_160518
CKA_160503
CKA_weeds
CKA_160427
CKA_160502
CKA_160421
CKA_160428
CKA_160426
CKA_160429
CKA_160523
CKA_160512
CKA_160513
CKA_160505
CKA_160509
CKA_160506


Unnamed: 0,nir,rgb,id,colorCleaned,iMapCleaned,cka
0,bonirob_2016-05-10-11-29-18_13_frame187,,bonirob_2016-05-10-11-29-18_13_frame187,bonirob_2016-05-10-11-29-18_13_frame187,bonirob_2016-05-10-11-29-18_13_frame187,CKA_160510
1,bonirob_2016-05-10-11-39-31_15_frame201,,bonirob_2016-05-10-11-39-31_15_frame201,bonirob_2016-05-10-11-39-31_15_frame201,bonirob_2016-05-10-11-39-31_15_frame201,CKA_160510
2,bonirob_2016-05-10-11-34-24_14_frame278,,bonirob_2016-05-10-11-34-24_14_frame278,bonirob_2016-05-10-11-34-24_14_frame278,bonirob_2016-05-10-11-34-24_14_frame278,CKA_160510
3,bonirob_2016-05-10-11-39-31_15_frame206,,bonirob_2016-05-10-11-39-31_15_frame206,bonirob_2016-05-10-11-39-31_15_frame206,bonirob_2016-05-10-11-39-31_15_frame206,CKA_160510
4,bonirob_2016-05-10-11-34-24_14_frame207,,bonirob_2016-05-10-11-34-24_14_frame207,bonirob_2016-05-10-11-34-24_14_frame207,bonirob_2016-05-10-11-34-24_14_frame207,CKA_160510


#images per category:
              nir  rgb    id  colorCleaned  iMapCleaned
cka                                                   
CKA_160421   823    0   823           600          600
CKA_160426   306  306   306           296          296
CKA_160427   881    2   881           881          881
CKA_160428   300    0   300           300          300
CKA_160429   614    0   624           612          602
CKA_160502   302    0   302           302          302
CKA_160503  1570  288  1570          1556         1556
CKA_160504   301    0   301           301          301
CKA_160505   963    0   963           963          963
CKA_160506   301    0   301           301          301
CKA_160509   912    0   912           912          912
CKA_160510   867    0   869           869          867
CKA_160511   401    1   401           289          289
CKA_160512   304    0   304           304          304
CKA_160513   301    0   332           319          314
CKA_160517   307    0   315           315 

In [None]:
# write results to xlsx for further evaluation
target_path = '/content/gdrive/MyDrive/Computer Vision Project/notebooks/results/relations/'

# create the results folder on first use so the export does not fail on a
# missing directory after the (long-running) scan above
os.makedirs(target_path, exist_ok=True)
img_availability_df.to_excel(target_path + '20210529_EV_img_availability_df.xlsx')