In [10]:
import os
import sys
import json
import datetime
import shutil
import time
import math
from collections import Counter
from  IPython.display import clear_output


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import trange
from tqdm import tqdm
from tqdm import tqdm_notebook
%matplotlib inline


import tensorflow as tf



############################################
import constants
from constants import *
import utils

# Notebook-level switches. Neither is referenced by the cells visible in this
# part of the notebook -- presumably debugging knobs; confirm before removing.
SIZE = None
RUN_DEBUG = False

# Class names, one per prediction column. The position in this list is what
# the ensembling functions use to turn a column index into a label string,
# so the order must not be changed.
LABELS = [
    'Hi-hat',
    'Saxophone',
    'Trumpet',
    'Glockenspiel',
    'Cello',
    'Knock',
    'Gunshot_or_gunfire',
    'Clarinet',
    'Computer_keyboard',
    'Keys_jangling',
    'Snare_drum',
    'Writing',
    'Laughter',
    'Tearing',
    'Fart',
    'Oboe',
    'Flute',
    'Cough',
    'Telephone',
    'Bark',
    'Chime',
    'Bass_drum',
    'Bus',
    'Squeak',
    'Scissors',
    'Harmonica',
    'Gong',
    'Microwave_oven',
    'Burping_or_eructation',
    'Double_bass',
    'Shatter',
    'Fireworks',
    'Tambourine',
    'Cowbell',
    'Electric_piano',
    'Meow',
    'Drawer_open_or_close',
    'Applause',
    'Acoustic_guitar',
    'Violin_or_fiddle',
    'Finger_snapping',
]

___

In [26]:
def ensemble_from_folders(
        predictions_folders, name_result, n_folds,
        generate_predicted_labels=False,
        name_result_for_generated_labels=None):
    """Geometric-mean ensemble of per-fold test predictions stored in folders.

    For every folder, the files ``test_predictions_<i>.npy`` with ``i`` in
    ``range(n_folds)`` are loaded. The element-wise geometric mean of all
    loaded arrays is saved with ``np.save`` to
    ``constants.PREDICTIONS/name_result``.

    Args:
        predictions_folders: sequence of folder paths holding the fold files.
        name_result: file name (inside ``constants.PREDICTIONS``) for the
            averaged prediction array.
        n_folds: either a single integer applied to every folder, or a
            list/tuple with one integer per folder.
        generate_predicted_labels: when true, also write a CSV with the three
            highest-scoring ``LABELS`` per row, space separated.
        name_result_for_generated_labels: CSV file name (inside
            ``constants.PREDICTIONS``); required when
            ``generate_predicted_labels`` is true.

    Returns:
        ``DataFrame.head()`` of the written CSV when
        ``generate_predicted_labels`` is true, otherwise ``None``.

    Raises:
        ValueError: on inconsistent arguments, when there is nothing to
            ensemble, or when the loaded arrays differ in shape.
    """
    if generate_predicted_labels and name_result_for_generated_labels is None:
        raise ValueError(
            'name_result_for_generated_labels is required when '
            'generate_predicted_labels is True')
    if not isinstance(n_folds, (list, tuple)):
        n_folds = [n_folds] * len(predictions_folders)
    if len(n_folds) != len(predictions_folders):
        raise ValueError(
            'got {} fold counts for {} prediction folders'.format(
                len(n_folds), len(predictions_folders)))
    # Validate every entry (not only the first) without indexing, so an empty
    # folder list does not crash with IndexError.
    if not all(isinstance(count, (int, np.integer)) for count in n_folds):
        raise ValueError('n_folds must contain only integers')

    pred_list = []
    for predictions_folder, n_folds_now in zip(predictions_folders, n_folds):
        for i in range(n_folds_now):
            pred_list.append(np.load(os.path.join(
                predictions_folder, 'test_predictions_%d.npy' % i)))
    if not pred_list:
        raise ValueError('no prediction files to ensemble')
    expected_shape = pred_list[0].shape
    for pred in pred_list[1:]:
        # Mismatched shapes could otherwise broadcast silently.
        if pred.shape != expected_shape:
            raise ValueError(
                'prediction arrays differ in shape: {} vs {}'.format(
                    pred.shape, expected_shape))
    print('count model in enseble: {}'.format(len(pred_list)))

    # Geometric mean. The root is applied to each factor instead of to the
    # final product: multiplying many small probabilities first can underflow
    # to zero (notably in float32) and destroy the ranking.
    exponent = 1.0 / len(pred_list)
    prediction = np.ones_like(pred_list[0])
    for pred in pred_list:
        prediction = prediction * pred ** exponent
    np.save(os.path.join(constants.PREDICTIONS, name_result), prediction)

    if generate_predicted_labels:
        # Column indices of the three largest scores per row, best first.
        top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
        predicted_labels = [' '.join(row) for row in top_3]
        test = pd.read_csv(os.path.join(constants.DATA, 'sample_submission.csv'))
        test['label'] = predicted_labels
        labels_path = os.path.join(
            constants.PREDICTIONS, name_result_for_generated_labels)
        test[['fname', 'label']].to_csv(labels_path, index=False)

        # for debugging:
        return pd.read_csv(labels_path).head()
    
    
def ensemble_from_files(predictions_files, name_result, weights):
    """Weighted geometric ensemble of saved prediction arrays, written as a CSV.

    Each file is loaded from ``constants.PREDICTIONS``, raised element-wise to
    its weight and the results are multiplied together. For every row the
    three highest-scoring ``LABELS`` are joined with spaces and stored in the
    ``label`` column of ``sample_submission.csv`` (read from
    ``constants.DATA``); the ``fname`` and ``label`` columns are written to
    ``constants.PREDICTIONS/name_result``.

    Args:
        predictions_files: file names of ``.npy`` arrays inside
            ``constants.PREDICTIONS``.
        name_result: file name of the CSV to write.
        weights: one exponent per file; must sum to 1 (tolerance 1e-6).

    Raises:
        ValueError: if the number of weights differs from the number of
            files, the weights do not sum to 1, or the arrays differ in shape.
    """
    if len(predictions_files) != len(weights):
        raise ValueError(
            'got {} weights for {} prediction files'.format(
                len(weights), len(predictions_files)))
    weight_total = sum(weights)
    if abs(weight_total - 1) > 1e-6:
        raise ValueError(
            'weights must sum to 1, got {}'.format(weight_total))

    # The two checks above guarantee at least one file from here on.
    pred_list = [
        np.load(os.path.join(constants.PREDICTIONS, predictions_file))
        for predictions_file in predictions_files]
    expected_shape = pred_list[0].shape
    for pred in pred_list[1:]:
        # Mismatched shapes could otherwise broadcast silently.
        if pred.shape != expected_shape:
            raise ValueError(
                'prediction arrays differ in shape: {} vs {}'.format(
                    pred.shape, expected_shape))

    prediction = np.ones_like(pred_list[0])
    for pred, weight in zip(pred_list, weights):
        prediction = prediction * (pred ** weight)

    # Column indices of the three largest scores per row, best first.
    top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
    predicted_labels = [' '.join(row) for row in top_3]
    test = pd.read_csv(os.path.join(constants.DATA, 'sample_submission.csv'))
    test['label'] = predicted_labels
    test[['fname', 'label']].to_csv(
        os.path.join(constants.PREDICTIONS, name_result), index=False)

In [27]:
def make_two_stage_submission_file(
        predictions_folders_1d,
        predictions_folders_2d,
        name_result,
        weights,
        n_folds_1d=10,
        n_folds_2d=10):
    """Build a submission CSV from a two-stage ensemble.

    Stage one calls ``ensemble_from_folders`` separately on the 1d and the 2d
    folders, each writing an intermediate ``.npy`` file into
    ``constants.PREDICTIONS``. Stage two calls ``ensemble_from_files`` on
    those two files with ``weights`` and writes ``name_result``. The
    intermediate files are removed afterwards, also when a step fails.

    Args:
        predictions_folders_1d: folders passed to the first
            ``ensemble_from_folders`` call.
        predictions_folders_2d: folders passed to the second
            ``ensemble_from_folders`` call.
        name_result: file name of the CSV inside ``constants.PREDICTIONS``.
        weights: two weights, ``[weight_1d, weight_2d]``, summing to 1.
        n_folds_1d: fold count(s) for the 1d folders.
        n_folds_2d: fold count(s) for the 2d folders.

    Returns:
        ``DataFrame.head()`` of the written CSV (for debugging).

    Raises:
        ValueError: if ``weights`` does not hold exactly two values summing
            to 1, or if one of the ensembling steps rejects its arguments.
    """
    # Fail fast: the same conditions are enforced by ensemble_from_files, but
    # only after both first-stage ensembles have already been computed.
    if len(weights) != 2:
        raise ValueError(
            'expected 2 weights (1d, 2d), got {}'.format(len(weights)))
    if abs(sum(weights) - 1) > 1e-6:
        raise ValueError(
            'weights must sum to 1, got {}'.format(sum(weights)))

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    name_predictions_for_1d = '1d_{}_{}'.format(timestamp, '.npy')
    name_predictions_for_2d = '2d_{}_{}'.format(timestamp, '.npy')
    predictions_files = [name_predictions_for_1d, name_predictions_for_2d]

    try:
        ensemble_from_folders(
            predictions_folders_1d,
            name_predictions_for_1d,
            n_folds=n_folds_1d)
        ensemble_from_folders(
            predictions_folders_2d,
            name_predictions_for_2d,
            n_folds=n_folds_2d)
        ensemble_from_files(
            predictions_files,
            name_result=name_result,
            weights=weights)
    finally:
        # Remove whichever intermediate files were created, even if a later
        # step raised, so failed runs do not litter the predictions folder.
        for name_to_remove in predictions_files:
            path_to_remove = os.path.join(constants.PREDICTIONS, name_to_remove)
            if os.path.exists(path_to_remove):
                os.remove(path_to_remove)

    # for debugging:
    return pd.read_csv(os.path.join(constants.PREDICTIONS, name_result)).head()

___

# Make two-stage submission

In [58]:
# Second-stage weights for the 1d and the 2d ensembles.
weight_1d = 0.5
weight_2d = 0.5

# One folder of per-fold test predictions for each model family.
predictions_folders_1d = [os.path.join(constants.PREDICTIONS, 'get_1d_conv_model_1d_final')]
predictions_folders_2d = [os.path.join(constants.PREDICTIONS, 'get_general_2d_conv_model_2d_final')]

# The weights are embedded in the file name of the submission.
name_result = '1d_2d_final_two_stage_{}_{}.csv'.format(weight_1d, weight_2d)

# The 1d folder contributes 9 fold predictions, the 2d folder 10.
make_two_stage_submission_file(
    predictions_folders_1d, predictions_folders_2d, name_result,
    weights=[weight_1d, weight_2d], n_folds_1d=9, n_folds_2d=10)
print(name_result)

# Drop the cell-local names; weight_1d / weight_2d stay defined.
del predictions_folders_1d, predictions_folders_2d, name_result

count model in enseble: 9
count model in enseble: 10
1d_2d_final_two_stage_0.5_0.5.csv
