In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import fbeta_score
from tqdm import tqdm
import os
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# --- Run configuration -------------------------------------------------
data_dir = '../input'
model_name = 'model-0.14135'
special_str = ''
batch_method = 0

# Pickle with the raw predictions on the training/validation split.
# It is written by keras_train.py via
#   pickle.dump((ypred_train, ypred_valid, ytrain, yvalid), f)
raw_prediction_on_train_set = 'raw_pred_{}.pkl'.format(model_name)

# CSV with the raw test-set predictions; lives in ../output relative to data_dir.
prediction_filename = os.path.join(
    data_dir,
    '../output/keras_pred_{}{}_BM{}.csv'.format(model_name, special_str, str(batch_method)),
)

In [3]:
# The test-set pickle shares the CSV's base name, with a .pkl extension.
# It unpacks to the image file names and the raw score matrix.
test_pickle_path = prediction_filename.replace('.csv', '.pkl')
with open(test_pickle_path, 'rb') as f:
    test_filenames, ytest = pickle.load(f)
print(ytest.shape)

(61191, 17)


In [4]:
# Raw predictions and ground truth for the training and validation splits.
with open(raw_prediction_on_train_set, 'rb') as f:
    ypred_train, ypred_valid, ytrain, yvalid = pickle.load(f)

print(*[split.shape for split in (ypred_train, ypred_valid, ytrain, yvalid)])

# Pool both splits row-wise; the threshold search below runs on the pooled data.
# NOTE(review): training-split predictions may be optimistic - consider tuning
# on the validation split only.
y_pred = np.concatenate([ypred_train, ypred_valid], axis=0)
y_true = np.concatenate([ytrain, yvalid], axis=0)

(16000, 17) (4000, 17) (16000, 17) (4000, 17)


In [5]:
#y_pred_i = (y_pred > .3).astype(int)
#score = fbeta_score(y_true, y_pred_i, beta=2, average='samples')
#print(score)

In [6]:
# Random starting thresholds in [0, 1), one per label column (17 labels).
# Drawn from a seeded generator so that Restart & Run All reproduces the same
# starting point instead of a fresh random vector on every run.
thresholds = np.random.RandomState(0).rand(17)
# thresholds

In [7]:
def proba_to_int(yproba, thresh):
    """Binarise a score matrix with per-column thresholds.

    Parameters
    ----------
    yproba : array-like, shape (n_samples, n_labels)
        Predicted scores, one column per label. The input is not modified.
    thresh : float or sequence of float
        Either a single cut-off applied to every column, or one cut-off per
        column. Entries beyond the last column are ignored.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples, n_labels)
        1 where the score is strictly greater than its column's threshold,
        0 otherwise.

    Raises
    ------
    IndexError
        If a sequence with fewer thresholds than columns is supplied.
    """
    scores = np.asarray(yproba)
    n_labels = scores.shape[1]

    # Keep the comparison in the scores' own floating precision, which is what
    # a column-vs-Python-float comparison does; non-float scores use float64.
    cutoff_dtype = scores.dtype if np.issubdtype(scores.dtype, np.floating) else float
    cutoffs = np.asarray(thresh, dtype=cutoff_dtype)

    if cutoffs.ndim > 0:
        if cutoffs.shape[0] < n_labels:
            raise IndexError(
                'got {} thresholds for {} label columns'.format(cutoffs.shape[0], n_labels)
            )
        cutoffs = cutoffs[:n_labels]

    # One broadcast comparison replaces the per-column Python loop and the
    # temporary float copy of the whole matrix.
    return (scores > cutoffs).astype(int)


def fbeta_with_thresholds(thresh, y_true=None, y_pred=None):
    """Negated sample-averaged F2 score of thresholded predictions.

    Parameters
    ----------
    thresh : scalar or sequence of float
        A single cut-off (any Python or NumPy scalar) applied to every label,
        or one cut-off per label column of ``y_pred``.
    y_true : array-like, shape (n_samples, n_labels)
        Ground-truth binary label matrix.
    y_pred : array-like, shape (n_samples, n_labels)
        Predicted scores, binarised with ``proba_to_int`` before scoring.

    Returns
    -------
    float
        ``-fbeta_score(..., beta=2, average='samples')``. The sign is flipped
        so that a lower value is better, which is how the threshold search in
        this notebook compares candidates.
    """
    # Detect scalars by dimensionality, not `isinstance(..., float)`, so ints
    # and NumPy scalars such as np.float32 are expanded too. The label count is
    # read from the data rather than fixed at 17.
    if np.ndim(thresh) == 0:
        thresh = np.full(np.shape(y_pred)[1], thresh, dtype=float)

    y_pred_i = proba_to_int(y_pred, thresh)

    score = fbeta_score(y_true, y_pred_i, beta=2, average='samples')
    return -score

In [8]:
# fbeta_with_thresholds(0.3, y_true, y_pred)
# -0.88047779458491238

In [9]:
#fbeta_with_thresholds(thresholds, y_true, y_pred)
# -0.7625872868255914

In [10]:
#from scipy.optimize import minimize

In [11]:
#minimize(fbeta_with_thresholds, x0=thresholds, args=(y_true, y_pred), tol=1e-9)

In [12]:
# Greedy coordinate search: starting from 0.2 for every label, sweep the labels
# first to last and, for each one, try every value on a fixed grid while the
# other thresholds stay put. A candidate is kept only if it lowers the negated
# F2 score returned by fbeta_with_thresholds.
n_labels = y_pred.shape[1]  # label count taken from the data, not hard-coded
threshold_grid = np.arange(0.01, .9, .011)  # loop-invariant, so built once

opt_thresh = list(np.ones(n_labels)*.2)
opt_score = fbeta_with_thresholds(opt_thresh, y_true, y_pred)
for i in tqdm(range(n_labels)):
    for t in threshold_grid:
        tmp_thresh = opt_thresh.copy()
        tmp_thresh[i] = t
        new_score = fbeta_with_thresholds(tmp_thresh, y_true, y_pred)
        if new_score < opt_score:
            opt_thresh[i] = t
            opt_score = new_score

  'precision', 'predicted', average, warn_for)
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:32<00:00,  1.89s/it]


In [13]:
print(opt_score)
print(opt_thresh)

-0.887820732767
[0.30699999999999994, 0.075999999999999984, 0.17499999999999996, 0.08699999999999998, 0.20000000000000001, 0.31799999999999995, 0.17499999999999996, 0.11999999999999997, 0.31799999999999995, 0.30699999999999994, 0.13099999999999998, 0.21899999999999997, 0.21899999999999997, 0.31799999999999995, 0.20799999999999996, 0.25199999999999995, 0.22999999999999995]


In [14]:
# Second sweep over the labels, last to first, as a convergence check:
# if the first pass already settled, few thresholds should change here.
ilist = list(range(len(opt_thresh)))
for label_idx in tqdm(list(reversed(ilist))):
    for candidate in np.arange(0.01, .9, .011):
        trial_thresh = list(opt_thresh)
        trial_thresh[label_idx] = candidate
        trial_score = fbeta_with_thresholds(trial_thresh, y_true, y_pred)
        if not trial_score < opt_score:
            continue
        # Strict improvement: keep this candidate and its score.
        opt_thresh[label_idx] = candidate
        opt_score = trial_score

  'precision', 'predicted', average, warn_for)
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:31<00:00,  1.88s/it]


In [15]:
print(opt_score)
print(opt_thresh)

-0.887825809243
[0.30699999999999994, 0.075999999999999984, 0.17499999999999996, 0.08699999999999998, 0.20000000000000001, 0.39499999999999991, 0.17499999999999996, 0.11999999999999997, 0.31799999999999995, 0.30699999999999994, 0.13099999999999998, 0.21899999999999997, 0.21899999999999997, 0.31799999999999995, 0.20799999999999996, 0.25199999999999995, 0.22999999999999995]


In [16]:
# Binarise the raw test-set scores with the tuned per-label thresholds.
ytest_i = proba_to_int(yproba=ytest, thresh=opt_thresh)

In [17]:
# Echo the binarised test predictions as a quick visual sanity check.
ytest_i

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ..., 
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [18]:
from tag_translation import train_label, tags_to_vec, map_predictions

In [19]:
# Turn each binary row into its tag names, then into one space-separated
# string per image, and pair the strings with the test image names.
predicted_labels = map_predictions(ytest_i)
predicted_labels_str = list(map(' '.join, predicted_labels))
df = pd.DataFrame({
    'image_name': test_filenames,
    'tags': predicted_labels_str,
})

In [21]:
# Write the submission under its own variable name instead of rebinding
# `prediction_filename`. The cell that loads the test pickle derives its path
# from `prediction_filename`, so overwriting it here would make a later re-run
# of that cell look for a non-existent '*_opt.pkl' file.
opt_prediction_filename = os.path.join(data_dir, '../output/keras_pred_{}_opt.csv'.format(model_name))
df.to_csv(opt_prediction_filename, index=False)