In [1]:
import os, sys
import logging
import math
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# datadir = Path("/Volumes/Extreme_Pro/data/hpa-single-cell-image-classification")
datadir = Path("/dlab/ldrive/CBT/USLJ-DSDE_DATA-I10008/BenchmarkDatasets/hpa-single-cell-image-classification")
train_imglist = sorted(datadir.joinpath('train').glob('*.png'))
df_train = pd.read_csv(datadir.joinpath('train.csv'))
assert len(train_imglist)/4 == len(df_train)
print(f'Amount of total images: {len(train_imglist)}')
print(f'Amount of total objects: {len(df_train)}')

Amount of total images: 87224
Amount of total objects: 21806


In [3]:
def df_sample_column(df, column, sample_count = 1000):
    labels = df[column].unique()
    counts = df[column].value_counts(normalize=True)
    probs = df[column].map(counts)
    df_select = df.sample(n=sample_count, weights=probs, random_state=42)
    
    print(f'Amount of unique labels: {len(labels)}')
    print(f'Sample Count: {len(df_select)}')
    print(f'Sample rate per label: {round(sample_count/len(df_train), 3)}')
    print(f'Amount of total samples: {len(df_select)}')
    return df_select

sample_count = 1000
df_select_1K = df_sample_column(df_train, column = 'Label', sample_count = sample_count)

print('Save selected train data to csv file...')
opdfpath = datadir.joinpath(f'train_select_1K.csv')
print(f'Output path: {opdfpath}')
df_select_1K.to_csv(opdfpath, index=False)

Amount of unique labels: 432
Sample Count: 1000
Sample rate per label: 0.046
Amount of total samples: 1000
Save selected train data to csv file...
Output path: /dlab/ldrive/CBT/USLJ-DSDE_DATA-I10008/BenchmarkDatasets/hpa-single-cell-image-classification/train_select_1K.csv


In [6]:
labelid_repres_path = datadir.joinpath('label_representation.csv')
if not labelid_repres_path.is_file():
    list_cat = [
        'Nucleoplasm',  
        'Nuclear membrane',   
        'Nucleoli',
        'Nucleoli fibrillar center',   
        'Nuclear speckles',
        'Nuclear bodies',
        'Endoplasmic reticulum',
        'Golgi apparatus',
        'Intermediate filaments',
        'Actin filaments',
        'Microtubules',
        'Mitotic spindle',
        'Centrosome',
        'Plasma membrane',
        'Mitochondria',
        'Aggresome',
        'Cytosol',
        'Vesicles and punctate cytosolic patterns',
        'Negative',
    ]
    list_lblid = list(range(len(list_cat)))
    df_labelid_repres = pd.DataFrame({'Label_ID': list_lblid, 
                                      'Label': list_cat})
    df_labelid_repres.to_csv(labelid_repres_path, index=False)
else:
    df_labelid_repres = pd.read_csv(labelid_repres_path)

In [7]:
def get_summary(df):
    df_train_unique_label = []
    for idx, row in df.iterrows():
        for label_id in row['Label'].split('|'):
            df_train_unique_label.append([row['ID'], label_id])
    df_train_unique_label = pd.DataFrame(df_train_unique_label, columns=['ID', 'Label_ID'])
    df_train_unique_label = df_train_unique_label.drop_duplicates()
    df_train_unique_label = df_train_unique_label.reset_index(drop=True)
    # print(df_train_unique_label.head(10))

    df_label_count = []
    for label_id in df_train_unique_label['Label_ID'].unique():
        df_tmp = df_train_unique_label[df_train_unique_label['Label_ID'] == label_id]
        df_label_count.append([label_id, len(df_tmp)])
    df_label_count = pd.DataFrame(df_label_count, columns=['Label_ID', 'Amount'], index = None)
    df_label_count['Label_ID'] = df_label_count['Label_ID'].astype('int64')

    df_label_count = df_label_count.merge(df_labelid_repres, on='Label_ID', how='left')
    df_label_count = df_label_count.sort_values(by=['Label_ID'], ascending=True)
    df_label_count = df_label_count.reset_index(drop=True)
    print(df_label_count)

    df_train_unique_label_count = df_train_unique_label.groupby(['ID']).count()
    df_train_unique_label_count.reset_index(inplace=True)
    df_multilabel_count = df_train_unique_label_count.groupby(['Label_ID']).count()
    df_multilabel_count.reset_index(inplace=True)
    rename_dict = {
        'ID': 'Image Count',
        'Label_ID': 'Count_of_Label(s)'
    }
    df_multilabel_count = df_multilabel_count.rename(columns=rename_dict)

    print("Amount of unique labels per image:")
    print(df_multilabel_count)
    return df_label_count, df_multilabel_count

df_train_label_count, df_train_multilabel_count = get_summary(df_train)

    Label_ID  Amount                                     Label
0          0    8797                               Nucleoplasm
1          1    1095                          Nuclear membrane
2          2    2451                                  Nucleoli
3          3    1262                 Nucleoli fibrillar center
4          4    1425                          Nuclear speckles
5          5    1792                            Nuclear bodies
6          6     775                     Endoplasmic reticulum
7          7    1846                           Golgi apparatus
8          8     964                    Intermediate filaments
9          9     998                           Actin filaments
10        10     818                              Microtubules
11        11      78                           Mitotic spindle
12        12    1734                                Centrosome
13        13    3111                           Plasma membrane
14        14    2013                              Mitoc

In [8]:
df_select_label_count, df_select_multilabel_count = get_summary(df_train)

    Label_ID  Amount                                     Label
0          0    8797                               Nucleoplasm
1          1    1095                          Nuclear membrane
2          2    2451                                  Nucleoli
3          3    1262                 Nucleoli fibrillar center
4          4    1425                          Nuclear speckles
5          5    1792                            Nuclear bodies
6          6     775                     Endoplasmic reticulum
7          7    1846                           Golgi apparatus
8          8     964                    Intermediate filaments
9          9     998                           Actin filaments
10        10     818                              Microtubules
11        11      78                           Mitotic spindle
12        12    1734                                Centrosome
13        13    3111                           Plasma membrane
14        14    2013                              Mitoc