In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

### Load initial class labels

In [2]:
# Read the raw label table and discard the leftover index column that was
# written into the CSV as 'Unnamed: 0'.
vienna = (
    pd.read_csv('/workdir/data/tz/vienna_labels.csv')
    .drop(columns=['Unnamed: 0'])
)

In [3]:
# Preview the first rows of the freshly loaded label table.
vienna.head()

Unnamed: 0,image_path,label,exists
0,TMN/IMG/01/376/01376840.TIF,27.05.08С 09.01.10 18.05.01 21.01.25 14.03.15 ...,True
1,TMN/IMG/01/197/01197742.TIF,20.05.14 24.01.03 24.01.19 03.01.22 03.01.23 0...,True
2,TMN/IMG/01/270/01270620.TIF,27.05.15 26.04.02 26.04.18 26.04.24 28.11,True
3,TMN/IMG/01/254/01254043.TIF,03.01.04 03.01.16 28.19 03.01.16 03.01.24 26.0...,True
4,TMN/IMG/01/215/01215623.TIF,29.01.12 27.05.11 09.01.10 27.05.23 28.05 29.0...,True


### Count unique classes and filter out the infrequent ones

In [6]:
# Flatten every whitespace-separated class code from every label string
# into one long list (duplicates kept, so occurrences can be counted later).
classes = [
    class_code
    for label in vienna['label']
    for class_code in label.split()
]

In [7]:
# Count how many times each distinct class code occurs across all label strings.
unique_classes, classes_counts = np.unique(classes, return_counts=True)
# Compare the number of distinct classes with the number occurring more than 100 times.
unique_classes.shape, unique_classes[classes_counts > 100].shape

((4343,), (1743,))

In [8]:
# Keep only the classes that occur more than 100 times.
# NOTE: this overwrites `unique_classes`, so the cell cannot be re-run
# without first re-running the np.unique cell above.
unique_classes = unique_classes[classes_counts > 100]

# Position in the filtered array -> class name, and the inverse mapping.
idx2name = {idx: name for idx, name in enumerate(unique_classes)}
name2idx = {name: idx for idx, name in idx2name.items()}

### Convert the hierarchical class tree into index-range levels

In [9]:
def build_class_levels(name2idx):
    """
    Convert a flat class-name -> index mapping into per-class hierarchy ranges.

    Class names encode their place in the hierarchical class tree with
    '.'-separated components (e.g. 'AA.BB.CC' is a child of 'AA.BB', which is
    a child of 'AA').

    Parameters
    ----------
    name2idx : dict
        Maps class name (str) to its integer index in the full-length
        class vector.

    Returns
    -------
    dict
        Same keys as `name2idx`. Each value is a list of K inclusive
        (min_index, max_index) tuples, where K is the depth of that class in
        the hierarchy:
        - element 0 is the class itself, i.e. (idx, idx);
        - each following element is the min/max index over all classes that
          share one name component less (parent, then grandparent, ...), so
          later elements cover broader ranges.

    Notes
    -----
    A range is a contiguous sector of the class vector only when indexes
    were assigned in sorted name order; the function itself just reports the
    min and max index found for each level.
    """
    # Materialize the items once; they are scanned again for every level.
    entries = [(name, int(idx)) for name, idx in name2idx.items()]

    name2levels = {}

    for cl_name, cl_idx in entries:
        # Deepest level: the class on its own.
        levels = [(cl_idx, cl_idx)]

        split_pos = cl_name.rfind('.')
        while split_pos > 0:
            parent = cl_name[:split_pos]
            # Match whole name components only: a bare startswith(parent)
            # would let prefix '01.01' also capture '01.011.xx'.
            child_prefix = parent + '.'
            member_indexes = [
                idx for name, idx in entries
                if name == parent or name.startswith(child_prefix)
            ]
            # Never empty: cl_name itself always starts with child_prefix.
            levels.append((min(member_indexes), max(member_indexes)))
            split_pos = parent.rfind('.')

        name2levels[cl_name] = levels

    return name2levels

In [10]:
# Compute the per-level index ranges for every retained class.
name2levels = build_class_levels(name2idx)

In [11]:
# Show the first 20 classes together with their hierarchy ranges.
for i, (cl_name, cl_levels) in enumerate(name2levels.items(), start=1):
    print(cl_name, cl_levels)
    if i >= 20:
        break

01.01.01 [(0, 0), (0, 14), (0, 95)]
01.01.02 [(1, 1), (0, 14), (0, 95)]
01.01.03 [(2, 2), (0, 14), (0, 95)]
01.01.04 [(3, 3), (0, 14), (0, 95)]
01.01.05 [(4, 4), (0, 14), (0, 95)]
01.01.08 [(5, 5), (0, 14), (0, 95)]
01.01.09 [(6, 6), (0, 14), (0, 95)]
01.01.10 [(7, 7), (0, 14), (0, 95)]
01.01.12 [(8, 8), (0, 14), (0, 95)]
01.01.13 [(9, 9), (0, 14), (0, 95)]
01.01.15 [(10, 10), (0, 14), (0, 95)]
01.01.17 [(11, 11), (0, 14), (0, 95)]
01.01.19 [(12, 12), (0, 14), (0, 95)]
01.01.20 [(13, 13), (0, 14), (0, 95)]
01.01.25 [(14, 14), (0, 14), (0, 95)]
01.03.01 [(15, 15), (15, 29), (0, 95)]
01.03.02 [(16, 16), (15, 29), (0, 95)]
01.03.06 [(17, 17), (15, 29), (0, 95)]
01.03.07 [(18, 18), (15, 29), (0, 95)]
01.03.08 [(19, 19), (15, 29), (0, 95)]


In [12]:
# Example: this class name has 3 levels of hierarchy. Reading the name from
# right to left, the 1st (deepest) level is '08С', the 2nd is '05' and the
# 3rd (top) level is '27'. The returned ranges come in the same order:
# the higher a level sits in the hierarchy, the broader the index range it covers.
name2levels['27.05.08С']

[(1495, 1495), (1456, 1690), (1436, 1706)]

### Save the class hierarchy for use with HierarchicalMultilabelClassificationLoss

In [352]:
# Persist class name -> hierarchy ranges (tuples are serialized as JSON arrays).
with open('/workdir/data/tz/vienna_1743_classname2levels.json', 'w') as levels_file:
    json.dump(name2levels, fp=levels_file)

In [353]:
# Persist class name -> index in the class vector.
with open('/workdir/data/tz/vienna_1743_classname2index.json', 'w') as index_file:
    json.dump(name2idx, fp=index_file)

In [13]:
# Look at the string labels once more before they are replaced with class indexes.
vienna.head()

Unnamed: 0,image_path,label,exists
0,TMN/IMG/01/376/01376840.TIF,27.05.08С 09.01.10 18.05.01 21.01.25 14.03.15 ...,True
1,TMN/IMG/01/197/01197742.TIF,20.05.14 24.01.03 24.01.19 03.01.22 03.01.23 0...,True
2,TMN/IMG/01/270/01270620.TIF,27.05.15 26.04.02 26.04.18 26.04.24 28.11,True
3,TMN/IMG/01/254/01254043.TIF,03.01.04 03.01.16 28.19 03.01.16 03.01.24 26.0...,True
4,TMN/IMG/01/215/01215623.TIF,29.01.12 27.05.11 09.01.10 27.05.23 28.05 29.0...,True


In [15]:
# Replace each label string with the list of indexes of its retained classes;
# codes that were filtered out (absent from name2idx) are silently dropped.
# NOTE: the column is overwritten in place, so this cell cannot be re-run
# without reloading `vienna`.
vienna['label'] = vienna['label'].map(
    lambda label: [
        name2idx[class_code]
        for class_code in label.split()
        if class_code in name2idx
    ]
)
vienna.head()

Unnamed: 0,image_path,label,exists
0,TMN/IMG/01/376/01376840.TIF,"[1495, 632, 950, 1065, 791, 1064, 1624, 1712]",True
1,TMN/IMG/01/197/01197742.TIF,"[1046, 1124, 1136, 209, 210, 221, 791, 1124, 1...",True
2,TMN/IMG/01/270/01270620.TIF,"[1514, 1351, 1367, 1373, 1712]",True
3,TMN/IMG/01/254/01254043.TIF,"[196, 203, 1728, 203, 211, 1298, 1300, 1318, 1...",True
4,TMN/IMG/01/215/01215623.TIF,"[1739, 1510, 632, 1624, 1709, 1737, 1739]",True


In [16]:
# Swap the last four characters of every path (the '.TIF' extension in the
# rows shown above) for '.jpg'.
vienna['image_path'] = vienna['image_path'].map(
    lambda path: path[:-4] + '.jpg'
)

In [378]:
# Save the full table (index-encoded labels, .jpg paths) without the DataFrame index.
vienna.to_csv('/workdir/data/tz_jpg/vienna_1743_all.csv', index=False)

### Train/valid split

In [373]:
from sklearn.model_selection import train_test_split

In [374]:
# Shuffled 80/20 train/validation split; no stratification is requested,
# and the fixed random_state keeps the split the same on every run.
train, valid = train_test_split(vienna, test_size=0.2, shuffle=True, random_state=123)

In [377]:
# Write both splits next to the full table, again without the DataFrame index.
train.to_csv('/workdir/data/tz_jpg/vienna_1743_train.csv', index=False)
valid.to_csv('/workdir/data/tz_jpg/vienna_1743_valid.csv', index=False)