In [23]:

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
from h2o.transforms.decomposition import H2OPCA
import tempfile
import os
from os import listdir
from os import path
from os.path import isfile, join, getsize
import pandas as pd
import numpy as np
from pycaret.classification import *
import seaborn as sns
import pickle
import tqdm

In [24]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])



In [25]:
def compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))

    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap

def ap_per_class(tp, conf, pred_cls, target_cls):
    """ Compute the average precision, given the recall and precision curves.
    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
    # Arguments
        tp:    True positives (list).
        conf:  Objectness value from 0-1 (list).
        pred_cls: Predicted object classes (list).
        target_cls: True object classes (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """

    # Sort by objectness
    i = np.argsort(-conf)
    tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]

    # Find unique classes
    unique_classes = np.unique(target_cls)

    # Create Precision-Recall curve and compute AP for each class
    ap, p, r = [], [], []
    for c in tqdm.tqdm(unique_classes, desc="Computing AP"):
        i = pred_cls == c
        n_gt = (target_cls == c).sum()  # Number of ground truth objects
        n_p = i.sum()  # Number of predicted objects

        if n_p == 0 and n_gt == 0:
            continue
        elif n_p == 0 or n_gt == 0:
            ap.append(0)
            r.append(0)
            p.append(0)
        else:
            # Accumulate FPs and TPs
            fpc = (1 - tp[i]).cumsum()
            tpc = (tp[i]).cumsum()

            # Recall
            recall_curve = tpc / (n_gt + 1e-16)
            r.append(recall_curve[-1])

            # Precision
            precision_curve = tpc / (tpc + fpc)
            p.append(precision_curve[-1])

            # AP from recall-precision curve
            ap.append(compute_ap(recall_curve, precision_curve))

    # Compute F1 score (harmonic mean of precision and recall)
    p, r, ap = np.array(p), np.array(r), np.array(ap)
    f1 = 2 * p * r / (p + r + 1e-16)

    return p, r, ap, f1, unique_classes.astype("int32")

In [26]:
h2o.init(max_mem_size="128G", nthreads = 31)
#nthreads = 30

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,3 hours 17 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,H2O_from_python_jupyter_chandler_vaughn_7gc1ic
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,63.32 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,30


In [27]:
os.chdir('/home/jupyter-chandler.vaughn/data/')
infile = open('merged_training.pickle','rb')
merge_pairs_plus_random = pickle.load(infile)
infile.close()
merge_pairs_plus_random = merge_pairs_plus_random.drop(columns=['pair', 'song_1', 'song_2', 'pair_index', 'sim_distance', 'sim_distance_argmin'])
merge_pairs_plus_random.reset_index(drop=True, inplace=True)


from sklearn.decomposition import PCA
x = merge_pairs_plus_random.drop(['target'], axis=1)
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['PC1', 'PC2', 'PC3', 'PC4'])
finalDf = pd.concat([principalDf, merge_pairs_plus_random[['target']]], axis = 1)

# create an export checkpoints directory
checkpoints_dir = tempfile.mkdtemp()

dataset = h2o.H2OFrame(finalDf)
#dataset = h2o.import_file('final_dataset_new.csv')
dataset.shape

Parse progress: |█████████████████████████████████████████████████████████| 100%


(314, 5)

In [28]:
# Identify predictors and response
x = dataset.columns
y = "target"
x.remove(y)

#dataset[y].unique()
dataset[y] = dataset[y].asfactor()

In [29]:
#impute any na's to zero
dataset[x].impute(column = 0, values = [0 for c in range(dataset[x].ncol)])
dataset[x].isna().any()

False

In [30]:
#subset the data so that we can work with it
#eventually we need to do all data for modeling
#using splitframe so we get random rows
#subset_dataset_train, subset_dataset_test, therest = dataset.split_frame(ratios=[0.04, 0.04])

In [31]:
print("Rows To Process: " + str(dataset.nrows))
print("Dataframe Shape: " + str(dataset.shape))

Rows To Process: 314
Dataframe Shape: (314, 5)


In [32]:
#subset the data so that we can work with it
#eventually we need to do all data for modeling
#subset_dataset = dataset[range(0,2000,1),:]
dataset[y] = dataset[y].asfactor()
dataset[x] = dataset[x].asnumeric() 
train,test,valid = dataset.split_frame(ratios=[.7, .15])

In [52]:
#max_runtime_secs=21600
#max_models=200
aml = H2OAutoML(max_runtime_secs=21600, seed=1, balance_classes = False, 
                include_algos = ["DeepLearning"], stopping_metric='mean_per_class_error')
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [None]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows) 

In [53]:
aml.leader

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  DeepLearning_grid__2_AutoML_20200620_202346_model_101


Status of Neuron Layers: predicting target, 2-class classification, bernoulli distribution, CrossEntropy loss, 2,902 weights/biases, 38.8 KB, 1,398,400 training samples, mini-batch size 1


Unnamed: 0,Unnamed: 1,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
0,,1,4,Input,0.0,,,,,,,,,
1,,2,50,RectifierDropout,10.0,0.0,0.0,0.000338899,0.000347299,0.0,-0.0391809,0.258679,0.490578,0.3974
2,,3,50,RectifierDropout,10.0,0.0,0.0,0.00359266,0.0162937,0.0,-0.0017249,0.251445,0.889371,0.264194
3,,4,2,Softmax,,0.0,0.0,0.000827469,0.000929339,0.0,-0.0490634,0.986468,0.00163964,0.0625591




ModelMetricsBinomial: deeplearning
** Reported on train data. **

MSE: 0.057015744295865714
RMSE: 0.23877969824896278
LogLoss: 0.20120301564872475
Mean Per-Class Error: 0.09208048369511346
AUC: 0.9544057884825058
AUCPR: 0.9256281611147821
Gini: 0.9088115769650116

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.47926553771499214: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,168.0,3.0,0.0175,(3.0/171.0)
1,1,13.0,46.0,0.2203,(13.0/59.0)
2,Total,181.0,49.0,0.0696,(16.0/230.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.4792655,0.851852,48.0
1,max f2,0.2259523,0.879479,70.0
2,max f0point5,0.6632867,0.92511,41.0
3,max accuracy,0.5892133,0.930435,44.0
4,max precision,0.9999999,1.0,0.0
5,max recall,0.009524104,1.0,156.0
6,max specificity,0.9999999,1.0,0.0
7,max absolute_mcc,0.5892133,0.814542,44.0
8,max min_per_class_accuracy,0.2259523,0.900585,70.0
9,max mean_per_class_accuracy,0.2259523,0.90792,70.0



Gains/Lift Table: Avg response rate: 25.65 %, avg score: 25.95 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.013043,0.9999996,3.898305,3.898305,1.0,1.0,1.0,1.0,0.050847,0.050847,289.830508,289.830508
1,,2,0.021739,0.9999992,3.898305,3.898305,1.0,1.0,1.0,1.0,0.033898,0.084746,289.830508,289.830508
2,,3,0.030435,0.9999985,3.898305,3.898305,1.0,0.999999,1.0,0.999999,0.033898,0.118644,289.830508,289.830508
3,,4,0.043478,0.9999978,3.898305,3.898305,1.0,0.999998,1.0,0.999999,0.050847,0.169492,289.830508,289.830508
4,,5,0.052174,0.9999951,3.898305,3.898305,1.0,0.999996,1.0,0.999999,0.033898,0.20339,289.830508,289.830508
5,,6,0.1,0.9998837,3.898305,3.898305,1.0,0.999966,1.0,0.999983,0.186441,0.389831,289.830508,289.830508
6,,7,0.152174,0.985892,3.898305,3.898305,1.0,0.996168,1.0,0.998675,0.20339,0.59322,289.830508,289.830508
7,,8,0.2,0.5028392,3.189522,3.728814,0.818182,0.756347,0.956522,0.940727,0.152542,0.745763,218.952234,272.881356
8,,9,0.3,0.2273553,1.525424,2.99435,0.391304,0.370747,0.768116,0.750734,0.152542,0.898305,52.542373,199.435028
9,,10,0.4,0.1331901,0.169492,2.288136,0.043478,0.184727,0.586957,0.609232,0.016949,0.915254,-83.050847,128.813559




ModelMetricsBinomial: deeplearning
** Reported on cross-validation data. **

MSE: 0.12662047577769714
RMSE: 0.35583770988710167
LogLoss: 0.5788194232035933
Mean Per-Class Error: 0.18733273862622668
AUC: 0.8727326791555159
AUCPR: 0.8220296816642518
Gini: 0.7454653583110318

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.193498854730452: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,170.0,1.0,0.0058,(1.0/171.0)
1,1,23.0,36.0,0.3898,(23.0/59.0)
2,Total,193.0,37.0,0.1043,(24.0/230.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.1934989,0.75,36.0
1,max f2,0.004272386,0.752089,122.0
2,max f0point5,0.1934989,0.869565,36.0
3,max accuracy,0.1934989,0.895652,36.0
4,max precision,0.9999973,1.0,0.0
5,max recall,1.208715e-05,1.0,217.0
6,max specificity,0.9999973,1.0,0.0
7,max absolute_mcc,0.1934989,0.718311,36.0
8,max min_per_class_accuracy,0.0200143,0.783626,83.0
9,max mean_per_class_accuracy,0.08529909,0.812667,48.0



Gains/Lift Table: Avg response rate: 25.65 %, avg score: 12.18 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.013043,0.9999465,3.898305,3.898305,1.0,0.999978,1.0,0.999978,0.050847,0.050847,289.830508,289.830508
1,,2,0.021739,0.9998606,3.898305,3.898305,1.0,0.999915,1.0,0.999953,0.033898,0.084746,289.830508,289.830508
2,,3,0.030435,0.9993183,3.898305,3.898305,1.0,0.999728,1.0,0.999888,0.033898,0.118644,289.830508,289.830508
3,,4,0.043478,0.9986043,3.898305,3.898305,1.0,0.99903,1.0,0.999631,0.050847,0.169492,289.830508,289.830508
4,,5,0.052174,0.995945,3.898305,3.898305,1.0,0.996958,1.0,0.999186,0.033898,0.20339,289.830508,289.830508
5,,6,0.1,0.4499939,3.898305,3.898305,1.0,0.754634,1.0,0.882226,0.186441,0.389831,289.830508,289.830508
6,,7,0.152174,0.2119476,3.573446,3.786925,0.916667,0.325595,0.971429,0.691381,0.186441,0.576271,257.344633,278.692494
7,,8,0.2,0.09424565,1.417565,3.220339,0.363636,0.133253,0.826087,0.557916,0.067797,0.644068,41.756549,222.033898
8,,9,0.3,0.03584797,1.016949,2.485876,0.26087,0.061942,0.637681,0.392591,0.101695,0.745763,1.694915,148.587571
9,,10,0.4,0.01313631,0.677966,2.033898,0.173913,0.023288,0.521739,0.300265,0.067797,0.813559,-32.20339,103.389831




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.9,0.058735896,0.95652175,0.8913044,0.9130435,0.8043478,0.9347826
1,auc,0.8897708,0.02972728,0.9191176,0.92156863,0.87990195,0.85294116,0.87532467
2,aucpr,0.8434743,0.05842215,0.9212291,0.83064234,0.8524537,0.758405,0.8546413
3,err,0.1,0.058735896,0.04347826,0.10869565,0.08695652,0.19565217,0.06521739
4,err_count,4.6,2.7018511,2.0,5.0,4.0,9.0,3.0
5,f0point5,0.819633,0.12957004,0.96153843,0.7638889,0.86538464,0.625,0.88235295
6,f1,0.8217816,0.0733181,0.90909094,0.8148148,0.8181818,0.7096774,0.85714287
7,f2,0.8330352,0.038267616,0.86206895,0.8730159,0.7758621,0.8208955,0.8333333
8,lift_top_group,3.9030304,0.15584716,3.8333333,3.8333333,3.8333333,3.8333333,4.181818
9,logloss,0.5788194,0.15781341,0.45173493,0.69135916,0.44515795,0.79776156,0.5080835



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-06-20 22:37:24,0.000 sec,,0.0,0,0.0,,,,,,,
1,,2020-06-20 22:37:24,1:33:16.103,57500 obs/sec,10.0,1,2300.0,0.387516,0.452511,0.212615,0.870949,0.791015,3.898305,0.121739
2,,2020-06-20 22:37:29,1:33:21.121,71261 obs/sec,1560.0,156,358800.0,0.270124,0.252703,0.617409,0.925067,0.879601,3.898305,0.082609
3,,2020-06-20 22:37:34,1:33:26.125,78090 obs/sec,3400.0,340,782000.0,0.291417,0.359227,0.554716,0.940034,0.902516,3.898305,0.091304
4,,2020-06-20 22:37:39,1:33:31.129,82531 obs/sec,5380.0,538,1237400.0,0.247219,0.212563,0.679542,0.954009,0.923139,3.898305,0.078261
5,,2020-06-20 22:37:41,1:33:32.841,83801 obs/sec,6080.0,608,1398400.0,0.23878,0.201203,0.701047,0.954406,0.925628,3.898305,0.069565



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,PC1,1.0,1.0,0.287766
1,PC3,0.870651,0.870651,0.250543
2,PC4,0.819241,0.819241,0.235749
3,PC2,0.785159,0.785159,0.225942




In [55]:
pred = aml.predict(test)
pred

deeplearning prediction progress: |███████████████████████████████████████| 100%


predict,p0,p1
0,0.669362,0.330638
0,0.991225,0.00877483
1,5.72902e-07,0.999999
0,0.915819,0.0841812
0,0.996292,0.00370796
0,0.997668,0.00233168
0,0.986685,0.0133151
0,0.708707,0.291293
0,0.987735,0.0122647
0,1.0,4.74451e-07




In [56]:
results = pd.concat([test.as_data_frame(), pred.as_data_frame()], axis=1)
results['rank'] = results['p1'].rank(ascending=False)

In [57]:
#results_sorted = results.sort_values(by=['p1'], ascending=False)
results_sorted = results.sort_values(by=['rank'], ascending=True)
results_sorted

Unnamed: 0,PC1,PC2,PC3,PC4,target,predict,p0,p1,rank
2,1.08589,-0.242713,-0.179633,-0.350191,1,1,5.729025e-07,0.9999994,1.0
28,0.77736,-0.160065,-0.210795,0.056239,0,1,0.04491268,0.9550873,2.0
18,0.268486,-0.182519,0.532577,-0.621284,0,1,0.3038913,0.6961087,3.0
25,0.274101,-0.27984,0.107978,0.641529,0,1,0.398168,0.601832,4.0
30,0.048259,0.089509,-0.086303,-0.410545,0,0,0.5801156,0.4198844,5.0
0,0.388517,-0.748904,-0.213507,0.071134,1,0,0.6693625,0.3306375,6.0
7,0.007021,-0.133931,0.059231,0.45574,0,0,0.7087068,0.2912932,7.0
16,0.084197,-0.100914,-0.058635,0.030348,0,0,0.7273774,0.2726226,8.0
24,0.245256,-0.068457,0.137635,-0.050295,0,0,0.7621686,0.2378314,9.0
20,0.222615,-0.262921,-0.232746,-0.010079,0,0,0.8131605,0.1868395,10.0


In [58]:
#find Average Precision
tp = results_sorted['target'].values
conf = results_sorted['p1'].values
pred_cls = results_sorted['predict'].values
target_cls = [results_sorted['target'].values]

p, r, ap, f1, unique_classes = ap_per_class(tp, conf, pred_cls, target_cls)

Computing AP: 100%|██████████| 2/2 [00:00<00:00, 1272.74it/s]


In [59]:
#MAP
np.mean(ap)

0.1371808143547274

In [60]:
perf = aml.leader.model_performance(valid)
perf


ModelMetricsBinomial: deeplearning
** Reported on test data. **

MSE: 0.1961803579670313
RMSE: 0.44292251914644315
LogLoss: 0.8455157875622393
Mean Per-Class Error: 0.1947004608294931
AUC: 0.8294930875576038
AUCPR: 0.7915889953041572
Gini: 0.6589861751152075

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9928983040061599: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,30.0,1.0,0.0323,(1.0/31.0)
1,1,5.0,9.0,0.3571,(5.0/14.0)
2,Total,35.0,10.0,0.1333,(6.0/45.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.9928983,0.75,9.0
1,max f2,0.001920026,0.769231,34.0
2,max f0point5,0.9999742,0.833333,6.0
3,max accuracy,0.9928983,0.866667,9.0
4,max precision,1.0,1.0,0.0
5,max recall,0.001920026,1.0,34.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.9928983,0.679934,9.0
8,max min_per_class_accuracy,0.3031061,0.709677,18.0
9,max mean_per_class_accuracy,0.9928983,0.8053,9.0



Gains/Lift Table: Avg response rate: 31.11 %, avg score: 37.97 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.022222,0.9999999,3.214286,3.214286,1.0,1.0,1.0,1.0,0.071429,0.071429,221.428571,221.428571
1,,2,0.022222,0.9999999,0.0,3.214286,0.0,0.0,1.0,1.0,0.0,0.071429,-100.0,221.428571
2,,3,0.044444,0.9999997,3.214286,3.214286,1.0,1.0,1.0,1.0,0.071429,0.142857,221.428571,221.428571
3,,4,0.044444,0.9999994,0.0,3.214286,0.0,0.0,1.0,1.0,0.0,0.142857,-100.0,221.428571
4,,5,0.066667,0.9999992,3.214286,3.214286,1.0,0.999999,1.0,1.0,0.071429,0.214286,221.428571,221.428571
5,,6,0.111111,0.9999907,3.214286,3.214286,1.0,0.999995,1.0,0.999998,0.142857,0.357143,221.428571,221.428571
6,,7,0.155556,0.9999563,3.214286,3.214286,1.0,0.999982,1.0,0.999993,0.142857,0.5,221.428571,221.428571
7,,8,0.2,0.9942503,1.607143,2.857143,0.5,0.999801,0.888889,0.999951,0.071429,0.571429,60.714286,185.714286
8,,9,0.311111,0.7381155,0.642857,2.066327,0.2,0.893396,0.642857,0.961896,0.071429,0.642857,-35.714286,106.632653
9,,10,0.4,0.3055163,0.0,1.607143,0.0,0.466273,0.5,0.851757,0.0,0.642857,-100.0,60.714286







In [61]:
#Lets look at Plagerized data
plagerize_unseen = pd.read_pickle('merged_unseen.pickle')

In [62]:
plagerize_unseen['sim_distance_bass'] = np.log(plagerize_unseen['sim_distance_bass']+0.0001)
plagerize_unseen['sim_distance_drums'] = np.log(plagerize_unseen['sim_distance_drums']+0.0001)
plagerize_unseen['sim_distance_vocals'] = np.log(plagerize_unseen['sim_distance_vocals']+0.0001)
plagerize_unseen['sim_distance_other'] = np.log(plagerize_unseen['sim_distance_other']+0.0001)
plagerize_unseen['sim_distance_argmin_combined'] = plagerize_unseen[['sim_distance','sim_distance_argmin']].min(axis=1)
plagerize_unseen['sim_distance_argmin_combined'] = np.log(plagerize_unseen['sim_distance_argmin_combined']+0.0001)

In [63]:
plagerize_unseen = plagerize_unseen.drop(columns=['pair_x', 'song_1', 'song_2', 'pair_y', 'sim_distance', 'sim_distance_argmin'])
plagerize_unseen.reset_index(drop=True, inplace=True)
plagerize_unseen = h2o.H2OFrame(plagerize_unseen)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [64]:
pred_unseen = aml.predict(plagerize_unseen)

deeplearning prediction progress: | (failed)


OSError: Job with key $03017f00000132d4ffffffff$_967bf646b9999e642e2b9c6feb614d02 failed with an exception: java.lang.IllegalArgumentException: Test/Validation dataset has no columns in common with the training set
stacktrace: 
java.lang.IllegalArgumentException: Test/Validation dataset has no columns in common with the training set
	at hex.Model.adaptTestForTrain(Model.java:1377)
	at hex.Model.adaptTestForTrain(Model.java:1216)
	at hex.Model.score(Model.java:1503)
	at water.api.ModelMetricsHandler$1.compute2(ModelMetricsHandler.java:396)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1557)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:974)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1477)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


In [133]:
pred_unseen

predict,p0,p1
0,0.645903,0.354097
0,0.891783,0.108217
1,0.612703,0.387297
1,0.565323,0.434677
1,0.404694,0.595306
0,0.62433,0.37567
0,0.834967,0.165033
0,0.756806,0.243194
1,0.14575,0.85425
1,0.157077,0.842923




In [134]:
results_unseen = pd.concat([plagerize_unseen.as_data_frame(), pred_unseen.as_data_frame()], axis=1)
results_unseen['rank'] = results_unseen['p1'].rank(ascending=False)

results_unseen = results_unseen.sort_values(by=['rank'], ascending=True)
results_unseen

Unnamed: 0,sim_distance_bass,sim_distance_drums,sim_distance_vocals,sim_distance_other,sim_distance_argmin_combined,predict,p0,p1,rank
13,-3.377752,-3.296034,-3.686204,-3.564177,-3.7115,1,0.115353,0.884647,1.0
8,-3.176397,-3.252703,-2.836602,-3.49097,-3.49097,1,0.14575,0.85425,2.0
9,-3.880038,-2.531039,-2.979819,-3.406609,-3.880038,1,0.157077,0.842923,3.0
4,-1.731434,-3.445135,-3.127537,-3.144894,-3.445135,1,0.404694,0.595306,4.0
15,-3.130439,-3.375734,-2.162241,-3.175864,-3.529704,1,0.414288,0.585712,5.0
14,-3.180531,-3.027164,-2.616983,-3.170255,-3.276949,1,0.441112,0.558888,6.0
20,-2.514196,-3.412679,-2.862315,-3.117153,-3.412679,1,0.536015,0.463985,7.0
25,-3.110523,-3.140508,-3.007707,-3.255698,-3.287768,1,0.563437,0.436563,8.0
3,-2.921536,-3.036883,-2.442322,-3.098474,-3.098474,1,0.565323,0.434677,9.0
2,-3.021577,-3.274226,-3.318774,-2.75368,-3.318774,1,0.612703,0.387297,10.0


In [157]:
aml2 = H2OAutoML(max_models=100, seed=1, balance_classes = True, exclude_algos = ["GBM"], stopping_metric='logloss')
aml2.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [158]:
# View the AutoML Leaderboard
lb2 = aml2.leaderboard
lb2.head(rows=lb2.nrows) 

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
DeepLearning_grid__1_AutoML_20200618_010541_model_6,0.905902,1.47577,0.853138,0.173189,0.47103,0.22187
DeepLearning_grid__3_AutoML_20200618_010541_model_2,0.902356,1.10437,0.801165,0.153179,0.501927,0.251931
DeepLearning_grid__3_AutoML_20200618_010541_model_7,0.898556,0.763057,0.836082,0.183827,0.327623,0.107337
DeepLearning_grid__2_AutoML_20200618_010541_model_7,0.897416,0.319443,0.809103,0.16806,0.311305,0.0969107
DeepLearning_grid__3_AutoML_20200618_010541_model_1,0.89615,2.63214,0.786621,0.185094,0.664917,0.442114
DeepLearning_grid__2_AutoML_20200618_010541_model_5,0.895517,1.29603,0.796332,0.168503,0.465807,0.216976
DeepLearning_grid__3_AutoML_20200618_010541_model_10,0.892857,1.40145,0.803036,0.160841,0.445919,0.198844
DeepLearning_grid__2_AutoML_20200618_010541_model_8,0.890198,1.03643,0.824637,0.173189,0.459556,0.211191
DeepLearning_grid__1_AutoML_20200618_010541_model_5,0.883739,1.88994,0.782815,0.163817,0.571476,0.326584
StackedEnsemble_BestOfFamily_AutoML_20200618_010541,0.883359,0.287005,0.816748,0.154445,0.283025,0.0801029




In [159]:
aml2.leader

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  DeepLearning_grid__1_AutoML_20200618_010541_model_6


Status of Neuron Layers: predicting target, 2-class classification, bernoulli distribution, CrossEntropy loss, 4,002 weights/biases, 56.1 KB, 1,323,840 training samples, mini-batch size 1


Unnamed: 0,Unnamed: 1,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
0,,1,5,Input,5.0,,,,,,,,,
1,,2,500,RectifierDropout,30.0,0.0,0.0,0.0948417,0.190474,0.0,0.130583,0.648805,-0.4535,0.454669
2,,3,2,Softmax,,0.0,0.0,0.116843,0.203498,0.0,-0.287446,1.7959,-13.9339,0.856533




ModelMetricsBinomial: deeplearning
** Reported on train data. **

MSE: 0.003606874034523682
RMSE: 0.06005725630199636
LogLoss: 0.014922674337819183
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9649643070089776: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,168.0,0.0,0.0,(0.0/168.0)
1,1,0.0,168.0,0.0,(0.0/168.0)
2,Total,168.0,168.0,0.0,(0.0/336.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.9649643,1.0,13.0
1,max f2,0.9649643,1.0,13.0
2,max f0point5,0.9649643,1.0,13.0
3,max accuracy,0.9649643,1.0,13.0
4,max precision,1.0,1.0,0.0
5,max recall,0.9649643,1.0,13.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.9649643,1.0,13.0
8,max min_per_class_accuracy,0.9649643,1.0,13.0
9,max mean_per_class_accuracy,0.9649643,1.0,13.0



Gains/Lift Table: Avg response rate: 50.00 %, avg score: 50.75 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.282738,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.565476,0.565476,100.0,100.0
1,,2,0.303571,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.041667,0.607143,100.0,100.0
2,,3,0.401786,0.9999955,2.0,2.0,1.0,0.9999995,1.0,1.0,0.196429,0.803571,100.0,100.0
3,,4,0.5,0.9484203,2.0,2.0,1.0,0.9917329,1.0,0.998376,0.196429,1.0,100.0,100.0
4,,5,0.60119,0.001839669,0.0,1.663366,0.0,0.08147605,0.831683,0.844046,0.0,1.0,-100.0,66.336634
5,,6,0.699405,1.835358e-06,0.0,1.429787,0.0,0.0003492801,0.714894,0.72557,0.0,1.0,-100.0,42.978723
6,,7,0.800595,1.184478e-13,0.0,1.249071,0.0,1.48103e-07,0.624535,0.633862,0.0,1.0,-100.0,24.907063
7,,8,0.89881,1.21385e-36,0.0,1.112583,0.0,4.083212e-15,0.556291,0.564599,0.0,1.0,-100.0,11.258278
8,,9,1.0,1.609982e-129,0.0,1.0,0.0,5.756976e-38,0.5,0.507467,0.0,1.0,-100.0,0.0




ModelMetricsBinomial: deeplearning
** Reported on cross-validation data. **

MSE: 0.22186969017548958
RMSE: 0.4710304556772201
LogLoss: 1.4757744421742018
Mean Per-Class Error: 0.1310157041540021
AUC: 0.9059017223910841
AUCPR: 0.8531381910495553
Gini: 0.8118034447821683

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9999975788658066: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,167.0,1.0,0.006,(1.0/168.0)
1,1,16.0,31.0,0.3404,(16.0/47.0)
2,Total,183.0,32.0,0.0791,(17.0/215.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.9999976,0.78481,3.0
1,max f2,0.9873063,0.809717,30.0
2,max f0point5,1.0,0.889571,0.0
3,max accuracy,0.9999976,0.92093,3.0
4,max precision,1.0,1.0,0.0
5,max recall,5.989936000000001e-28,1.0,159.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.9999976,0.758979,3.0
8,max min_per_class_accuracy,0.9873063,0.851064,30.0
9,max mean_per_class_accuracy,0.9873063,0.868984,30.0



Gains/Lift Table: Avg response rate: 21.86 %, avg score: 41.87 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.088372,1.0,4.574468,4.574468,1.0,1.0,1.0,1.0,0.404255,0.404255,357.446809,357.446809
1,,2,0.102326,1.0,4.574468,4.574468,1.0,1.0,1.0,1.0,0.06383,0.468085,357.446809,357.446809
2,,3,0.153488,0.9999863,3.742747,4.297228,0.818182,0.9999986,0.939394,1.0,0.191489,0.659574,274.274662,329.72276
3,,4,0.2,0.9983247,0.914894,3.510638,0.2,0.9995114,0.767442,0.999886,0.042553,0.702128,-8.510638,251.06383
4,,5,0.302326,0.9604002,1.455513,2.815057,0.318182,0.9881589,0.615385,0.995917,0.148936,0.851064,45.551257,181.505728
5,,6,0.4,0.6461868,0.217832,2.180851,0.047619,0.8776886,0.476744,0.967047,0.021277,0.87234,-78.216819,118.085106
6,,7,0.502326,0.0405363,0.415861,1.821316,0.090909,0.3044863,0.398148,0.832081,0.042553,0.914894,-58.413926,82.1316
7,,8,0.6,0.0004895304,0.0,1.524823,0.0,0.007802093,0.333333,0.697896,0.0,0.914894,-100.0,52.48227
8,,9,0.697674,1.66208e-06,0.217832,1.341844,0.047619,8.387798e-05,0.293333,0.600202,0.021277,0.93617,-78.216819,34.184397
9,,10,0.8,9.841058e-16,0.20793,1.196809,0.045455,2.186325e-07,0.261628,0.523432,0.021277,0.957447,-79.206963,19.680851




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.9209302,0.072802216,1.0,0.88372093,0.81395346,0.95348835,0.95348835
1,auc,0.92717767,0.08168107,1.0,0.8090909,0.875817,0.9836601,0.96732026
2,aucpr,0.86196584,0.13269758,1.0,0.72241664,0.71788657,0.95133847,0.9181876
3,err,0.07906977,0.072802216,0.0,0.11627907,0.18604651,0.046511628,0.046511628
4,err_count,3.4,3.130495,0.0,5.0,8.0,2.0,2.0
5,f0point5,0.8293923,0.15820979,1.0,0.7894737,0.5797101,0.8888889,0.8888889
6,f1,0.83006537,0.13956966,1.0,0.7058824,0.6666667,0.8888889,0.8888889
7,f2,0.8400779,0.13616619,1.0,0.63829786,0.78431374,0.8888889,0.8888889
8,lift_top_group,4.5866666,0.26168966,4.3,4.3,4.7777777,4.7777777,4.7777777
9,logloss,1.4757744,0.77236855,1.1861598,2.56315,1.8991042,1.1776481,0.5528102



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-06-18 01:09:47,0.000 sec,,0.0,0,0.0,,,,,,,
1,,2020-06-18 01:09:47,3 min 52.485 sec,45405 obs/sec,10.0,1,3360.0,0.683058,4.984687,-0.866272,0.89066,0.881439,2.0,0.22619
2,,2020-06-18 01:09:52,3 min 57.509 sec,100813 obs/sec,1520.0,152,510720.0,0.300829,0.385952,0.638007,0.997732,0.997631,2.0,0.017857
3,,2020-06-18 01:09:57,4 min 2.518 sec,116104 obs/sec,3470.0,347,1165920.0,0.177339,0.096239,0.874204,0.999858,0.999857,2.0,0.002976
4,,2020-06-18 01:09:58,4 min 3.700 sec,118422 obs/sec,3940.0,394,1323840.0,0.060057,0.014923,0.985573,1.0,1.0,2.0,0.0



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,sim_distance_other,1.0,1.0,0.278376
1,sim_distance_argmin_combined,0.829031,0.829031,0.230782
2,sim_distance_vocals,0.75423,0.75423,0.209959
3,sim_distance_bass,0.532916,0.532916,0.148351
4,sim_distance_drums,0.476091,0.476091,0.132532




In [160]:
pred2 = aml2.predict(test)
pred2

deeplearning prediction progress: |███████████████████████████████████████| 100%


predict,p0,p1
1,3.38781e-50,1.0
1,4.89301e-109,1.0
1,1.65899e-37,1.0
1,0.0059793,0.994021
0,1.0,1.10701e-63
0,1.0,1.1377500000000001e-34
1,5.88561e-172,1.0
0,0.99669,0.00330997
0,0.999994,6.28546e-06
0,1.0,2.36223e-19




In [161]:
results2 = pd.concat([test.as_data_frame(), pred2.as_data_frame()], axis=1)
results2['rank'] = results2['p1'].rank(ascending=False)

In [162]:
results2_sorted = results2.sort_values(by=['rank'], ascending=True)
results2_sorted

Unnamed: 0,sim_distance_bass,sim_distance_drums,sim_distance_vocals,sim_distance_other,target,sim_distance_argmin_combined,predict,p0,p1,rank
0,-3.396255,-3.318984,-3.726317,-3.307305,1,-3.752329,1,3.3878059999999995e-50,1.0,3.5
11,-2.226876,-3.649815,-3.593604,-3.713786,1,-3.862291,1,8.85373e-65,1.0,3.5
6,-3.748001,-3.524138,-4.008976,-4.432945,1,-4.432945,1,5.885614e-172,1.0,3.5
14,-3.601015,-3.368839,-3.177789,-3.862225,1,-4.050557,1,1.759876e-74,1.0,3.5
2,-3.217733,-2.925328,-2.535029,-3.505658,1,-3.812027,1,1.65899e-37,1.0,3.5
1,-3.793328,-3.451294,-3.797983,-3.908221,1,-4.110113,1,4.893007e-109,1.0,3.5
25,-2.437577,-2.955856,-2.886427,-3.238617,0,-3.365747,1,9.909716e-05,0.9999009,7.0
15,-2.782669,-3.015499,-2.328491,-2.815056,0,-3.015499,1,0.0004013327,0.9995987,8.0
37,-2.994229,-3.466758,-3.234972,-3.114258,0,-3.466758,1,0.002885612,0.9971144,9.0
3,-2.739924,-3.098317,-3.356697,-3.304992,1,-3.356697,1,0.0059793,0.9940207,10.0


In [163]:
#find Average Precision
tp = results2_sorted['target'].values
conf = results2_sorted['p1'].values
pred_cls = results2_sorted['predict'].values
target_cls = [results2_sorted['target'].values]

p, r, ap, f1, unique_classes = ap_per_class(tp, conf, pred_cls, target_cls)

Computing AP: 100%|██████████| 2/2 [00:00<00:00, 1278.36it/s]


In [164]:
#MAP
np.mean(ap)

0.2719764142287931

In [165]:
perf2 = aml2.leader.model_performance(valid)
perf2


ModelMetricsBinomial: deeplearning
** Reported on test data. **

MSE: 0.27584510106617555
RMSE: 0.5252095782315623
LogLoss: 3.128556380625003
Mean Per-Class Error: 0.22857142857142865
AUC: 0.7904761904761906
AUCPR: 0.6650018748993816
Gini: 0.5809523809523811

Confusion Matrix (Act/Pred) for max f1 @ threshold = 4.00081380260415e-08: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,19.0,16.0,0.4571,(16.0/35.0)
1,1,0.0,15.0,0.0,(0.0/15.0)
2,Total,19.0,31.0,0.32,(16.0/50.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,4.000814e-08,0.652174,26.0
1,max f2,4.000814e-08,0.824176,26.0
2,max f0point5,1.0,0.714286,0.0
3,max accuracy,1.0,0.8,0.0
4,max precision,1.0,1.0,0.0
5,max recall,4.000814e-08,1.0,26.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,4.000814e-08,0.512516,26.0
8,max min_per_class_accuracy,0.002627828,0.657143,17.0
9,max mean_per_class_accuracy,4.000814e-08,0.771429,26.0



Gains/Lift Table: Avg response rate: 30.00 %, avg score: 30.66 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.1,1.0,3.333333,3.333333,1.0,1.0,1.0,1.0,0.333333,0.333333,233.333333,233.333333
1,,2,0.1,1.0,0.0,3.333333,0.0,0.0,1.0,1.0,0.0,0.333333,-100.0,233.333333
2,,3,0.16,0.9999998,0.0,2.083333,0.0,0.9999999,0.625,1.0,0.0,0.333333,-100.0,108.333333
3,,4,0.2,0.999996,1.666667,2.0,0.5,0.9999982,0.6,1.0,0.066667,0.4,66.666667,100.0
4,,5,0.3,0.6023977,1.333333,1.777778,0.4,0.949924,0.533333,0.983308,0.133333,0.533333,33.333333,77.777778
5,,6,0.4,0.005393055,0.0,1.333333,0.0,0.1142841,0.4,0.766052,0.0,0.533333,-100.0,33.333333
6,,7,0.5,7.557687e-06,2.666667,1.6,0.8,0.002187024,0.48,0.613279,0.266667,0.8,166.666667,60.0
7,,8,0.6,7.764053e-07,1.333333,1.555556,0.4,4.06695e-06,0.466667,0.511066,0.133333,0.933333,33.333333,55.555556
8,,9,0.7,2.811431e-14,0.666667,1.428571,0.2,1.248289e-08,0.428571,0.438057,0.066667,1.0,-33.333333,42.857143
9,,10,0.8,2.81288e-31,0.0,1.25,0.0,3.7245890000000003e-22,0.375,0.3833,0.0,1.0,-100.0,25.0







In [None]:
h2o.cluster().shutdown()

In [None]:
# define the range of hyper-parameters for DNN grid search
# 81 combinations in total
# define the criteria for full grid search
search_criteria = {'strategy': "Cartesian"}
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ODeepLearningEstimator
hyper_params = {'activation': ['tanh', 'rectifier', 'maxout'],
                'hidden': [[100,100,100,100], [50,50], [50,50,50]],
                'l1': [0, 1e-3, 1e-5],
                'l2': [0, 1e-3, 1e-5]}
# Set up DNN grid search
# Add a seed for reproducibility
dnn_rand_grid = H2OGridSearch(
                    H2ODeepLearningEstimator(
                        model_id = 'dnn_rand_grid', 
                        seed = 1234,
                        epochs = 20,   
                        nfolds = 5,
                        fold_assignment = "Modulo",                
                        keep_cross_validation_predictions = True), 
                    search_criteria = search_criteria, 
                    hyper_params = hyper_params)
# Use .train() to start the grid search
dnn_rand_grid.train(x = x, 
                    y = y, 
                    training_frame = train)
dnn_rand_grid_sorted = dnn_rand_grid.get_grid(sort_by='mse', decreasing=False)
# Extract the best model from random grid search
best_dnn_model_id = dnn_rand_grid_sorted.model_ids[0]
best_dnn_from_rand_grid = h2o.get_model(best_dnn_model_id)