In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Directory that holds every input/output file used by this notebook.
data_path = '/home/linux/kaggle/statoil/dataset/'

# Resolve all file locations against data_path in one pass:
#   train_file       - labelled training archive (read for img / label / ID)
#   test_true_file   - test archive (read for img / ID)
#   pesudo_file      - output archive written after pseudo-labelling
#   submit_true_file - CSV providing the is_iceberg value per test id
train_file, test_true_file, pesudo_file, submit_true_file = (
    os.path.join(data_path, file_name)
    for file_name in (
        'train.npz',
        'test-true.npz',
        'train-pesudo.npz',
        'submit-true.csv',
    )
)

In [None]:
# Load the test archive into a per-sample table: one row per sample holding
# its id plus the two image channels (band1 = channel 0, band2 = channel 1 of
# the last axis of 'img').
# The context manager closes the .npz handle deterministically instead of
# relying on `del f` / garbage collection to release it.
with np.load(test_true_file) as f:
    df = pd.DataFrame({'id': f['ID']})
    band = f['img']

# Iterating over the first axis yields one per-sample array per list entry,
# so every DataFrame cell ends up holding that sample's own image.
band1_lst = list(band[..., 0])
band2_lst = list(band[..., 1])
df['band1'] = band1_lst
df['band2'] = band2_lst

In [None]:
# Attach the is_iceberg column from the CSV to the sample table.
# The two id columns must agree row-for-row before the values are copied,
# because the assignment below relies on matching row positions/index.
df_label = pd.read_csv(submit_true_file)
ids_agree = (df_label['id'] == df['id']).all()
assert ids_agree
df['is_iceberg'] = df_label['is_iceberg']
del df_label, ids_agree

In [None]:
# Histogram (100 bins) of the is_iceberg values merged in from submit_true_file.
# The trailing semicolon suppresses the textual repr of the returned tuple.
plt.hist(df['is_iceberg'], bins=100);

In [None]:
# confident samples: is_iceberg within `gap` of either end of [0, 1]
gap = 0.001
df_assume_iceberg = df.loc[df['is_iceberg'].gt(1 - gap)]  # close to 1
df_assume_ship = df.loc[df['is_iceberg'].lt(gap)]         # close to 0
print(len(df_assume_iceberg), len(df_assume_ship))

In [None]:
# samples around decision boundary
# NOTE: here `gap` is the upper bound of the band and 1 - gap the lower bound,
# i.e. the open interval (1 - gap, gap) centred on 0.5.
gap = 0.55
df_margin = df.loc[df['is_iceberg'].gt(1 - gap) & df['is_iceberg'].lt(gap)]
print(len(df_margin))

In [None]:
# concat pseudo-labelled test set to train set
#
# Test samples selected as confident (df_assume_iceberg / df_assume_ship) are
# appended to the labelled training arrays with hard labels 1.0 / 0.0, then
# everything is shuffled with a fixed seed and written to pesudo_file.
id_iceberg = df_assume_iceberg['id']
id_ship = df_assume_ship['id']

# Pull each array out of the archive once (an f[key] lookup loads the array
# from the file on access) and close the handle when done.
with np.load(test_true_file) as f:
    img_test, id_test = f['img'], f['ID']

# Images, labels and IDs are all derived from the same boolean mask, so the
# three stay aligned row-for-row by construction even if an id were
# duplicated or missing in the archive.
mask_iceberg = np.isin(id_test, id_iceberg)
mask_ship = np.isin(id_test, id_ship)
img_iceberg, img_ship = img_test[mask_iceberg], img_test[mask_ship]
label_iceberg = np.ones(int(mask_iceberg.sum()))
label_ship = np.zeros(int(mask_ship.sum()))
del img_test  # the selected copies above are all that is needed

with np.load(train_file) as f:
    img, label, ID = f['img'], f['label'], f['ID']
img = np.concatenate([img, img_iceberg, img_ship])
label = np.concatenate([label, label_iceberg, label_ship])
# IDs come from the archive array itself (same mask, same order as the images).
ID = np.concatenate([ID, id_test[mask_iceberg], id_test[mask_ship]])
assert img.shape[0] == label.shape[0] == ID.shape[0]

# Fixed seed so the shuffled order, and therefore the saved file, is
# reproducible. One permutation is applied to all three arrays.
np.random.seed(17)
idx = np.arange(img.shape[0])
np.random.shuffle(idx)
img = img[idx].astype(np.float32)
label = label[idx].astype(np.float32)
ID = ID[idx]
np.savez(pesudo_file, img=img, label=label, ID=ID)

In [None]:
def show_imgs(df, title=None):
    """Tile the samples of ``df`` side by side in one grayscale figure.

    Each sample occupies one column of the mosaic: its ``band1`` image on
    top and its ``band2`` image directly underneath.

    Args:
        df: DataFrame with ``band1`` and ``band2`` columns whose cells are
            2-D image arrays of a common shape.
        title: Optional figure title; skipped when falsy.

    Returns:
        None. The figure is drawn with matplotlib as a side effect. An empty
        ``df`` draws nothing.
    """
    if df.shape[0] == 0:
        # Nothing to tile; do not create an empty figure.
        return

    # Stack the images as they are, so the mosaic follows the actual image
    # size instead of a hard-coded 75x75 geometry.
    top_row = np.hstack(list(df['band1'].values))
    bottom_row = np.hstack(list(df['band2'].values))
    # Cast to float64 to match the dtype of the previously pre-allocated buffer.
    buf = np.vstack([top_row, bottom_row]).astype(np.float64)

    _, ax = plt.subplots(figsize=(80, 10))
    ax.imshow(buf, cmap='gray')
    if title:
        ax.set_title(title, fontsize=30)

def show_rand5(df, title=None):
    """Print the is_iceberg values of five random rows of ``df`` and plot them.

    Rows are drawn with ``np.random.randint``, i.e. with replacement, so the
    same row may be shown more than once.

    Args:
        df: DataFrame with ``is_iceberg``, ``band1`` and ``band2`` columns.
        title: Optional figure title forwarded to ``show_imgs``.

    Returns:
        None. Prints the selected values and draws a figure as side effects;
        for an empty ``df`` only a short notice is printed.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        # randint(0, 0, ...) raises ValueError; a filtered subset can be
        # empty, so report that instead of crashing the cell.
        print('no samples to show')
        return

    idx = np.random.randint(0, n_rows, 5)
    picked = df.iloc[idx]
    print(picked['is_iceberg'].values)
    show_imgs(picked, title)

In [None]:
# Five random rows from the high-value subset (is_iceberg > 1 - gap, gap = 0.001).
show_rand5(df_assume_iceberg)

In [None]:
# Five random rows from the low-value subset (is_iceberg < gap, gap = 0.001).
show_rand5(df_assume_ship)

In [None]:
# Five random rows from the boundary subset (1 - gap < is_iceberg < gap, gap = 0.55).
show_rand5(df_margin)