In [90]:
import pandas as pd
import numpy as np
from pathlib import Path
from imet.dataset import DATA_ROOT
from imet.utils import mean_df, binarize_prediction
from imet.make_submission import get_classes

In [91]:
ZOO_ROOT = Path('zoo')
threshold = 0.10

In [92]:
train_df = pd.read_csv(DATA_ROOT / 'train.csv')

In [93]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109237 entries, 0 to 109236
Data columns (total 2 columns):
id               109237 non-null object
attribute_ids    109237 non-null object
dtypes: object(2)
memory usage: 1.7+ MB


In [94]:
model = 'se_resnet50'

In [95]:
df = None
for fold in range(5):
    tmp_df = pd.read_hdf(ZOO_ROOT / f'model_{model}_fold_{fold}' / 'val.h5', index_col='id')
    if df is None:
        df = tmp_df
    else:
        df = df.append(tmp_df)

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109237 entries, 1000fe2e667721fe to ffffbf00586b8e37
Columns: 1103 entries, 0 to 1102
dtypes: float32(1103)
memory usage: 460.5+ MB


In [97]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102
1000fe2e667721fe,3.595478e-10,2.001243e-07,8.389334e-11,1.412256e-16,1.517376e-08,7.516258e-11,2.209236e-12,2.529898e-14,6.287883e-11,1.478659e-09,...,0.00364,1.915561e-06,4.8e-05,2.741867e-05,3.4e-05,0.00022,0.000173,7.857336e-07,8.537265e-08,2.449569e-08
1001614cb89646ee,5.416808e-11,4.216242e-08,1.414673e-09,3.112024e-14,5.806891e-12,7.575936e-14,1.801419e-13,2.422e-16,1.004877e-10,3.669721e-10,...,0.000487,8.608234e-07,4e-06,1.077061e-07,3e-06,2.5e-05,0.000244,2.994125e-10,8.488938e-08,7.529623e-10
100543a032517972,6.347986e-11,7.718303e-07,1.437212e-10,1.334118e-13,6.692447e-11,6.45597e-13,1.516377e-11,3.661043e-11,6.223931e-11,1.279924e-08,...,1.9e-05,4.261061e-10,3e-06,1.170131e-05,1.1e-05,2e-06,0.012463,1.571251e-06,8.318973e-05,1.187139e-06
1007057734dba6df,1.359862e-11,1.413056e-07,2.745828e-09,7.70444e-12,2.934613e-08,2.207639e-11,2.157106e-09,5.200446e-13,1.254165e-09,1.929548e-07,...,0.006804,8.697235e-08,0.001325,4.409771e-05,0.002568,2e-05,0.002665,1.183781e-07,3.87753e-08,5.322497e-07
100b45b7c4020f5d,2.767166e-12,0.002299853,1.536137e-05,2.139133e-10,0.0001149309,3.98147e-05,6.424694e-07,1.485794e-07,0.0001413228,1.669286e-07,...,0.001094,8.994305e-10,2.7e-05,0.0001441483,1.4e-05,0.000185,0.01345,2.066237e-05,0.001623935,0.005762186


In [98]:
df.rename_axis(['id'], inplace=True)

In [99]:
df[:] = binarize_prediction(df.values, threshold=threshold)
df = df.apply(get_classes, axis=1)
df.name = 'attribute_ids'

In [100]:
df.to_csv('pseudo_train.csv', header=True)

In [101]:
df = pd.read_csv('pseudo_train.csv')

In [103]:
pseudo_df = pd.merge(train_df, df, on='id')

In [115]:
pseudo_df

Unnamed: 0,id,attribute_ids_x,attribute_ids_y
0,1000483014d91860,147 616 813,147 420 813 952 1093
1,1000fe2e667721fe,51 616 734 813,51 156 501 573 776 813
2,1001614cb89646ee,776,483 690 1046
3,10041eb49b297c08,51 671 698 813 1092,51 492 616 813 1092
4,100501c227f8beea,13 404 492 903 1093,13 405 896 903 1092
5,10050ed12fbad46d,189 279 774 800 1051,189 279 378 721 800 953
6,100543a032517972,188 1034,188 369 1034
7,1006665c0aad488,1010 1053,179 194 253 335 557 1010 1034 1053 1059
8,1007057734dba6df,189 541 542 993,70 189 541 542 813 906 993 1012 1092
9,1008abd71f3ed5bc,70 776 794 813 1046 1092,70 111 676 776 1046


In [122]:
def merge_attributes(row):
    row['attribute_ids'] = ' '.join(set(row['attribute_ids'].split()))
    return row

In [119]:
pseudo_df['attribute_ids'] = pseudo_df['attribute_ids_x'] + ' ' + pseudo_df['attribute_ids_y']

In [124]:
pseudo_df = pseudo_df.apply(merge_attributes, axis=1)

In [128]:
pseudo_df.drop(['attribute_ids_x', 'attribute_ids_y'], inplace=True, axis=1)

In [129]:
pseudo_df

Unnamed: 0,id,attribute_ids
0,1000483014d91860,147 420 813 1093 952 616
1,1000fe2e667721fe,501 156 734 51 813 776 573 616
2,1001614cb89646ee,776 483 1046 690
3,10041eb49b297c08,51 698 671 492 813 1092 616
4,100501c227f8beea,405 1092 404 896 492 903 1093 13
5,10050ed12fbad46d,189 953 800 279 378 721 774 1051
6,100543a032517972,369 188 1034
7,1006665c0aad488,1059 194 1034 1053 557 179 1010 335 253
8,1007057734dba6df,189 70 1012 542 993 906 541 813 1092
9,1008abd71f3ed5bc,70 1046 676 794 111 813 1092 776


In [133]:
pseudo_df.to_csv('pseudo_train.csv', index=None)

In [135]:
pd.read_csv('pseudo_train.csv')

Unnamed: 0,id,attribute_ids
0,1000483014d91860,147 420 813 1093 952 616
1,1000fe2e667721fe,501 156 734 51 813 776 573 616
2,1001614cb89646ee,776 483 1046 690
3,10041eb49b297c08,51 698 671 492 813 1092 616
4,100501c227f8beea,405 1092 404 896 492 903 1093 13
5,10050ed12fbad46d,189 953 800 279 378 721 774 1051
6,100543a032517972,369 188 1034
7,1006665c0aad488,1059 194 1034 1053 557 179 1010 335 253
8,1007057734dba6df,189 70 1012 542 993 906 541 813 1092
9,1008abd71f3ed5bc,70 1046 676 794 111 813 1092 776
