In [3]:
import pandas as pd
import glob
import os
from utils.utils import remap, cats_to_list
import yaml
pd.options.display.max_rows = 100

In [4]:
def generate_inference_df(input_path='runs/detect/predict/labels/',
                          conf_threshold=0.25,
                          mapper=None):
    '''Generates dataframe of information from infrence output files.
    Args:
        input_path: (string) path to prediction labels files
        conf_thrreshold: (float) minimum confidence threshold for valid detection
    Returns:
        df: (pd.DataFrame) dataframe of inference output
    '''
    out = {}
    filelist = glob.glob(input_path + '*.txt')

    cat_df = pd.read_json('../category_key65.json')
    shallow = cat_df[cat_df.shallow == True]['index'].to_list()
    if mapper == None:
        mapper = cat_df[['id', 'index']].to_dict()['id']
    else:
        mapper = mapper
    # cat_df = pd.read_json(category_file_path).reset_index().rename(columns={'index': 'id'})
    # shallow = cat_df[cat_df.shallow_species == True]['id'].to_list()
    # if mapper == None:
    #     mapper = cat_df[['id', 'category_id']].set_index('id').to_dict()['category_id']

    for i, file in enumerate(filelist):
        with open(file, 'r') as f:

            cats = []
            conf = []
            location = []
            weak_shallow = 0
            strong_shallow = 0
            no_detection = 0

            for line in f.readlines():

                category, x, y, w, h, conf_value = line.split(' ')
                category = int(category)
                conf_value = float(conf_value)
                loc = tuple([float(i) for i in [x, y, w, h]])

                if category in shallow:
                    weak_shallow = 1    # weakly shallow if there is a shallow detection at any confidence
                    if conf_value >= conf_threshold:
                        strong_shallow = 1  # strongly shallow if a high conf shallow detection

                if (category not in cats) and (conf_value >= conf_threshold): # dedup and add to list
                    cats.append(category)
                    conf.append(conf_value)
                    location.append(loc)
            
            cats = remap(cats, mapper)

            if len(cats) == 0:
                no_detection = 1

        out[i] = {'id': os.path.basename(file)[:-4],
                  'categories': cats,
                  'location': location,
                  'conf': conf,
                  'weak_shallow': weak_shallow,
                  'strong_shallow': strong_shallow,
                  'no_detection': no_detection,
                  #   'osd': osd,
                  }

    df = pd.DataFrame.from_dict(out, orient='index')
    return df

In [5]:
def generate_inference_df_sup(input_path='runs/detect/predict_superL/labels/', conf_threshold=0.5):
    '''Generates dataframe of information from infrence output files.
    Args:
        input_path: (string) path to prediction labels files
        conf_thrreshold: (float) minimum confidence threshold for valid detection
    Returns:
        df: (pd.DataFrame) dataframe of inference output
    '''
    out = {}
    filelist = glob.glob(input_path + '*.txt')

    scat_df = pd.read_json('../supercat_key.json')
    cat_df = pd.read_json('../shallow_species.json')
    shallow = cat_df[cat_df.shallow == True]['category_id'].to_list()
    mapper = scat_df[['top_category_id', 'supercat_id']].to_dict()['top_category_id']
    mapper = {str(k): mapper[k] for k in mapper.keys()}

    for i, file in enumerate(filelist):
        with open(file, 'r') as f:

            cats = []
            supercats = []
            conf = []
            location = []
            weak_shallow = 0
            strong_shallow = 0
            no_detection = 0

            for line in f.readlines():

                supercat, x, y, w, h, conf_value = line.split(' ')
                category = remap(supercat, mapper)[0]
                supercat = int(supercat)
                conf_value = float(conf_value)
                loc = tuple([float(i) for i in [x, y, w, h]])

                if category in shallow:
                    weak_shallow = 1    # weakly shallow if there is a shallow detection at any confidence
                    if conf_value >= conf_threshold:
                        strong_shallow = 1  # strongly shallow if a high conf shallow detection

                if (category not in cats) and (conf_value >= conf_threshold): # dedup and add to list
                    cats.append(category)
                    supercats.append(supercat)
                    conf.append(conf_value)
                    location.append(loc)
            

            if len(cats) == 0:
                no_detection = 1

        out[i] = {'id': os.path.basename(file)[:-4],
                  'supercategory': supercats,
                  'categories_s': cats,
                  'location_s': location,
                  'conf_s': conf,
                  'weak_shallow_s': weak_shallow,
                  'strong_shallow_s': strong_shallow,
                  'no_detection_s': no_detection
                  }

    df = pd.DataFrame.from_dict(out, orient='index')
    return df

In [6]:
def mapper_generator_65():
    # Load the YAML file
    with open('/data/dataset65.yaml', 'r') as file:
        data = yaml.load(file, Loader=yaml.FullLoader)

    # Convert the YAML data to a Pandas DataFrame
    dataset = pd.DataFrame.from_dict(data)
    dataset = dataset.reset_index().rename(columns={'index':'train_id', 'names': 'name'})[['train_id','name']]
    cat_key = pd.read_csv('../category_key.csv')
    mapper = cat_key.merge(dataset, on='name')[['train_id', 'id']]
    mapper = mapper.set_index('train_id').to_dict()['id']
    if mapper[0] == 160:
        print('Mapping appears to be correct')
    return mapper

In [7]:
# import yaml

# # Load the YAML file
# with open('/data/dataset65.yaml', 'r') as file:
#     data = yaml.load(file, Loader=yaml.FullLoader)

# # Convert the YAML data to a Pandas DataFrame
# dataset = pd.DataFrame.from_dict(data)

# # Print the result
# dataset

In [8]:
# dataset = dataset.reset_index().rename(columns={'index':'train_id', 'names': 'name'})[['train_id','name']]
# dataset.head()

In [9]:
# cat_key = pd.read_csv('../category_key.csv')
# cat_key.head()

In [10]:
# mapper = cat_key.merge(dataset, on='name')[['train_id', 'id']]
# mapper = mapper.set_index('train_id').to_dict()['id']
# mapper[0]

In [11]:
# df_cat = pd.read_json('runs/predict40m.json')

In [12]:
# pd.read_json('../shallow_counts.json')

In [13]:
# scat_df = pd.read_json('../supercat_key.json')
# scat_df

In [14]:
mapper = mapper_generator_65()

Mapping appears to be correct


In [15]:
df_sup = generate_inference_df_sup(conf_threshold=0.25)
df_sup.head()

Unnamed: 0,id,supercategory,categories_s,location_s,conf_s,weak_shallow_s,strong_shallow_s,no_detection_s
0,eb16e21f-8b15-4238-a7b7-813008334fb6,[6],[203],"[(0.51016, 0.607006, 0.0760153, 0.120715)]",[0.250197],0,0,0
1,4bafb8ca-47f8-4b6a-89aa-607fd5decd98,[1],[37],"[(0.877733, 0.5, 0.224283, 1.0)]",[0.915119],0,0,0
2,f6340ca2-8792-4221-8f6b-a3efcef97d2a,[5],[125],"[(0.800092, 0.8849, 0.197056, 0.217626)]",[0.932757],0,0,0
3,8fb88010-f618-4e3e-aeae-94c8988d0d2d,[],[],[],[],0,0,1
4,1e297e1d-1ddb-4c9e-af61-30dbbdb64dfe,"[14, 6]","[37, 203]","[(0.561126, 0.702819, 0.326502, 0.565539), (0....","[0.831957, 0.253548]",0,0,0


In [30]:
df_cat = generate_inference_df('runs/detect/predict65l/labels/', conf_threshold=0.5, mapper=mapper)
df_cat.head()

Unnamed: 0,id,categories,location,conf,weak_shallow,strong_shallow,no_detection
0,eb16e21f-8b15-4238-a7b7-813008334fb6,[],[],[],0,0,1
1,4bafb8ca-47f8-4b6a-89aa-607fd5decd98,[],[],[],0,0,1
2,f6340ca2-8792-4221-8f6b-a3efcef97d2a,"[125, 120]","[(0.80192, 0.890364, 0.195178, 0.218471), (0.4...","[0.581854, 0.519401]",0,0,0
3,8fb88010-f618-4e3e-aeae-94c8988d0d2d,[],[],[],0,0,1
4,1e297e1d-1ddb-4c9e-af61-30dbbdb64dfe,[242],"[(0.561873, 0.709742, 0.344029, 0.568275)]",[0.63316],0,0,0


The following should show a category of 160

In [31]:
df_cat[df_cat.id == '67d55379-18ca-40ec-b9da-6aa7117e4e1a']

Unnamed: 0,id,categories,location,conf,weak_shallow,strong_shallow,no_detection
4654,67d55379-18ca-40ec-b9da-6aa7117e4e1a,[160],"[(0.818127, 0.389969, 0.106611, 0.0946825)]",[0.836619],0,0,0


In [32]:
def simple_osd(row):
    shallow = (row.strong_shallow, row.weak_shallow, row.no_detection)
    if shallow == (0,0,1):
        row['osd'] = 1.0
        row['categories'] = [52]
    if shallow == (0,0,0):
        row['osd'] = 1.0
    if shallow == (0,1,1):
        row['osd'] = 0.5
        row['categories'] = [52]
    if shallow == (0,1,0):
        row['osd'] = 0.5
    if shallow == (1,1,0):
        row['osd'] = 0.0

    return row

In [33]:
def detect_osd(row):
    shallow = (row.strong_shallow, row.strong_shallow_s, row.weak_shallow, row.weak_shallow_s)
    # no detections from either model - must be osd
    if row.no_detection and row.no_detection_s:
        row['osd'] = 1.0
        row['categories'] = [52] # setting to the most common deep object

    if row.no_detection and not row.no_detection_s:
        row['categories'] = row.categories_s
    
    # nothing detected by cat, something detected by super
    if shallow == (0,0,0,0):
        row['osd'] = 0.9
    if shallow == (0,0,0,1):
        row['osd'] = 0.7
    if shallow == (0,1,0,1):
        row['osd'] = 0.5
    if shallow == (0,0,1,0):
        row['osd'] = 0.4
    if shallow == (0,0,1,1):
        row['osd'] = 0.3
    if shallow == (1,0,1,0):
        row['osd'] = 0.2
    if shallow == (0,1,1,1):
        row['osd'] = 0.1
    if shallow == (1,0,1,1):
        row['osd'] = 0.1
    if shallow == (1,1,1,1):
        row['osd'] = 0.0
        

    return row

In [34]:
# out_df = df_cat.apply(simple_osd, axis=1)

In [35]:
df = df_cat.merge(df_sup, on='id')
df

Unnamed: 0,id,categories,location,conf,weak_shallow,strong_shallow,no_detection,supercategory,categories_s,location_s,conf_s,weak_shallow_s,strong_shallow_s,no_detection_s
0,eb16e21f-8b15-4238-a7b7-813008334fb6,[],[],[],0,0,1,[6],[203],"[(0.51016, 0.607006, 0.0760153, 0.120715)]",[0.250197],0,0,0
1,4bafb8ca-47f8-4b6a-89aa-607fd5decd98,[],[],[],0,0,1,[1],[37],"[(0.877733, 0.5, 0.224283, 1.0)]",[0.915119],0,0,0
2,f6340ca2-8792-4221-8f6b-a3efcef97d2a,"[125, 120]","[(0.80192, 0.890364, 0.195178, 0.218471), (0.4...","[0.581854, 0.519401]",0,0,0,[5],[125],"[(0.800092, 0.8849, 0.197056, 0.217626)]",[0.932757],0,0,0
3,8fb88010-f618-4e3e-aeae-94c8988d0d2d,[],[],[],0,0,1,[],[],[],[],0,0,1
4,1e297e1d-1ddb-4c9e-af61-30dbbdb64dfe,[242],"[(0.561873, 0.709742, 0.344029, 0.568275)]",[0.63316],0,0,0,"[14, 6]","[37, 203]","[(0.561126, 0.702819, 0.326502, 0.565539), (0....","[0.831957, 0.253548]",0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10739,dccd858c-29aa-42c3-a6eb-0e20bb44a164,[205],"[(0.453976, 0.222439, 0.0513436, 0.144833)]",[0.542295],0,0,0,"[6, 5, 9]","[203, 125, 211]","[(0.453741, 0.220785, 0.0553431, 0.149002), (0...","[0.431345, 0.319091, 0.278475]",1,0,0
10740,7b2886e9-2759-4e1b-baaa-3ff1e6324993,"[52, 211]","[(0.585722, 0.688239, 0.0734121, 0.0861248), (...","[0.848346, 0.679121]",0,0,0,"[1, 9]","[37, 211]","[(0.586036, 0.691505, 0.0741134, 0.0897324), (...","[0.832152, 0.382505]",1,0,0
10741,2d6cbd12-1b68-4532-8ec9-e6237b8449a5,"[160, 259, 51]","[(0.905147, 0.438632, 0.0538639, 0.0586498), (...","[0.741732, 0.669353, 0.572185]",0,0,0,[0],[160],"[(0.903091, 0.441384, 0.0564308, 0.0602842)]",[0.515961],1,1,0
10742,312910a6-a5b6-4dce-814c-65ae49d58fd8,"[211, 52, 173]","[(0.492866, 0.71624, 0.0332447, 0.0508029), (0...","[0.600669, 0.529275, 0.52315]",0,0,0,"[9, 1]","[211, 37]","[(0.492291, 0.71495, 0.0341467, 0.0518731), (0...","[0.756543, 0.511233]",1,0,0


In [36]:
out_df = df.apply(detect_osd, axis=1)

Checking to make sure there are no nulls left

In [37]:
out_df[out_df['osd'].isnull()]

Unnamed: 0,id,categories,location,conf,weak_shallow,strong_shallow,no_detection,supercategory,categories_s,location_s,conf_s,weak_shallow_s,strong_shallow_s,no_detection_s,osd


In [38]:
out_df.no_detection_s.value_counts()

0    8190
1    2554
Name: no_detection_s, dtype: int64

In [39]:
out_df.osd.value_counts()

0.5    5156
0.9    4423
0.7     663
0.1     242
0.0     127
0.4      80
0.2      27
0.3      26
Name: osd, dtype: int64

In [40]:
# def select_top(lst):
#     return f'[{lst[0]}]'

In [41]:
def format_cat(lst):
    if len(lst) == 1:
        return f'[{lst[0]}]'
    else:
        return ' '.join([str(x) for x in lst])

In [42]:
out = out_df.copy()[['id','categories', 'osd']]

In [43]:
# out.categories = out.categories.apply(select_top)
# out.categories = out.categories.apply(format_cat)
out

Unnamed: 0,id,categories,osd
0,eb16e21f-8b15-4238-a7b7-813008334fb6,[203],0.9
1,4bafb8ca-47f8-4b6a-89aa-607fd5decd98,[37],0.9
2,f6340ca2-8792-4221-8f6b-a3efcef97d2a,125 120,0.9
3,8fb88010-f618-4e3e-aeae-94c8988d0d2d,[52],0.9
4,1e297e1d-1ddb-4c9e-af61-30dbbdb64dfe,[242],0.9
...,...,...,...
10739,dccd858c-29aa-42c3-a6eb-0e20bb44a164,[205],0.7
10740,7b2886e9-2759-4e1b-baaa-3ff1e6324993,52 211,0.7
10741,2d6cbd12-1b68-4532-8ec9-e6237b8449a5,160 259 51,0.5
10742,312910a6-a5b6-4dce-814c-65ae49d58fd8,211 52 173,0.7


In [44]:
out.to_csv('../submissions/submission_36.csv', index=False)