In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from imet.dataset import DATA_ROOT
from imet.utils import mean_df, binarize_prediction
from imet.make_submission import get_classes

In [2]:
ZOO_ROOT = Path('zoo')
threshold = 0.10

In [3]:
train_df = pd.read_csv(DATA_ROOT / 'train.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109237 entries, 0 to 109236
Data columns (total 2 columns):
id               109237 non-null object
attribute_ids    109237 non-null object
dtypes: object(2)
memory usage: 1.7+ MB


In [5]:
model = 'se_resnext50_32x4d'

In [6]:
df = None
for fold in range(5):
    tmp_df = pd.read_hdf(ZOO_ROOT / f'model_{model}_fold_{fold}' / 'val.h5', index_col='id')
    if df is None:
        df = tmp_df
    else:
        df = df.append(tmp_df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109237 entries, 10041eb49b297c08 to fffdae8164c9cfff
Columns: 1103 entries, 0 to 1102
dtypes: float32(1103)
memory usage: 460.5+ MB


In [8]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102
10041eb49b297c08,1.723427e-10,1.704131e-08,1.454852e-09,5.83591e-12,9.421575e-09,1.872938e-09,5.125733e-10,5.414047e-13,3.235597e-11,3.278363e-10,...,0.011062,5.909484e-07,0.000786,7.322479e-05,0.002841,0.0001,0.001887,2.247792e-08,2.991703e-08,2.68025e-05
1007057734dba6df,1.455984e-10,4.148521e-07,1.922179e-10,6.008871e-10,1.873635e-09,6.875585e-10,9.077812e-10,3.69594e-10,3.435334e-09,2.566108e-07,...,0.026436,3.095358e-07,0.00207,5.612769e-05,0.005297,0.000116,0.000678,1.204266e-06,1.692224e-08,1.100435e-07
100a58282c6584bf,2.503627e-11,5.098246e-08,4.157186e-11,1.213419e-13,4.982188e-11,1.634059e-10,4.242457e-11,4.214602e-13,3.827386e-11,5.455774e-11,...,0.000564,5.482831e-07,0.013106,0.0004580785,0.002063,1.7e-05,0.002073,7.649877e-08,2.538395e-07,5.30874e-07
100b45b7c4020f5d,1.589207e-09,5.966633e-05,7.230917e-05,3.190559e-10,7.631556e-08,3.042795e-06,1.421423e-08,5.688424e-08,1.241836e-06,8.693595e-07,...,0.00059,2.509743e-07,2.6e-05,0.000819398,0.000122,0.000641,0.003939,1.450188e-05,0.0003727955,0.000130341
100e1e65a6d7850e,1.662035e-12,2.997671e-09,2.298329e-10,3.267391e-14,3.370566e-14,2.734924e-13,6.473043e-15,4.096836e-15,1.670104e-11,4.519342e-10,...,0.001607,1.528915e-07,3e-06,2.354377e-08,6e-06,6e-06,6e-05,1.586041e-10,6.811095e-08,2.727214e-10


In [9]:
df.rename_axis(['id'], inplace=True)

In [10]:
df[:] = binarize_prediction(df.values, threshold=threshold)
df = df.apply(get_classes, axis=1)
df.name = 'attribute_ids'

In [11]:
df.to_csv('pseudo_train.csv', header=True)

In [12]:
df = pd.read_csv('pseudo_train.csv')

In [13]:
pseudo_df = pd.merge(train_df, df, on='id')

In [14]:
pseudo_df

Unnamed: 0,id,attribute_ids_x,attribute_ids_y
0,1000483014d91860,147 420 813 1093 952 616,147 813 952 1093
1,1000fe2e667721fe,501 156 734 51 813 776 573 616,51 519 616 734 813 1092
2,1001614cb89646ee,776 483 1046 690,483 737 1046
3,10041eb49b297c08,51 698 671 492 813 1092 616,51 616 698 813 1092
4,100501c227f8beea,405 1092 404 896 492 903 1093 13,13 404 405 492 698 813 903 1092
5,10050ed12fbad46d,189 953 800 279 378 721 774 1051,133 189 269 279 369 378 774 800
6,100543a032517972,369 188 1034,188 1034
7,1006665c0aad488,1059 194 1034 1053 557 179 1010 335 253,1010 1035
8,1007057734dba6df,189 70 1012 542 993 906 541 813 1092,70 189 259 541 813 993 1012 1092
9,1008abd71f3ed5bc,70 1046 676 794 111 813 1092 776,70 111 420 738 776 1046


In [15]:
def merge_attributes(row):
    row['attribute_ids'] = ' '.join(set(row['attribute_ids'].split()))
    return row

In [16]:
pseudo_df['attribute_ids'] = pseudo_df['attribute_ids_x'] + ' ' + pseudo_df['attribute_ids_y']

In [17]:
pseudo_df = pseudo_df.apply(merge_attributes, axis=1)

In [18]:
pseudo_df.drop(['attribute_ids_x', 'attribute_ids_y'], inplace=True, axis=1)

In [19]:
pseudo_df

Unnamed: 0,id,attribute_ids
0,1000483014d91860,147 813 1093 616 420 952
1,1000fe2e667721fe,573 813 734 501 1092 519 776 51 616 156
2,1001614cb89646ee,1046 737 776 690 483
3,10041eb49b297c08,813 616 671 1092 51 698 492
4,100501c227f8beea,404 813 896 903 1093 13 1092 698 405 492
5,10050ed12fbad46d,953 369 721 378 189 1051 133 800 269 774 279
6,100543a032517972,1034 369 188
7,1006665c0aad488,557 194 1059 1053 335 1035 253 179 1010 1034
8,1007057734dba6df,259 1012 813 70 542 993 541 189 1092 906
9,1008abd71f3ed5bc,813 676 1046 70 1092 776 111 420 738 794


In [20]:
pseudo_df.to_csv('pseudo_train.csv', index=None)

In [21]:
pd.read_csv('pseudo_train.csv')

Unnamed: 0,id,attribute_ids
0,1000483014d91860,147 813 1093 616 420 952
1,1000fe2e667721fe,573 813 734 501 1092 519 776 51 616 156
2,1001614cb89646ee,1046 737 776 690 483
3,10041eb49b297c08,813 616 671 1092 51 698 492
4,100501c227f8beea,404 813 896 903 1093 13 1092 698 405 492
5,10050ed12fbad46d,953 369 721 378 189 1051 133 800 269 774 279
6,100543a032517972,1034 369 188
7,1006665c0aad488,557 194 1059 1053 335 1035 253 179 1010 1034
8,1007057734dba6df,259 1012 813 70 542 993 541 189 1092 906
9,1008abd71f3ed5bc,813 676 1046 70 1092 776 111 420 738 794
