In [None]:
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from hit_prediction_code.analytics import get_results_as_dataframe
from hit_prediction_code import analytics

# SQL-style filter selecting which plan files (experiments) are loaded.
# Alternative patterns used previously:
#   'sourcefile LIKE \'plans/hspd/regression_%%.py\''
#   'sourcefile LIKE \'plans/ordinal%%\''
sourcefile_filter = 'sourcefile LIKE \'plans/hspd/regression_cleaned%%_%%all_yanghitscore.py\''

# Fetch the stored experiment results as a DataFrame.
results = get_results_as_dataframe(
    project_name='hit-prediction-ismir2020',
    table_name='hit_prediction',
    filter_git_dirty=True,
    date_filter='> \'2021-08-04 16:20:00\'',
    columns=['id', 'date', 'sourcefile', 'outcome'],
    filters=[sourcefile_filter],
)

# Both helpers are called only for their effect on `results` (their return
# values are ignored). NOTE(review): presumably they add the 'approach'
# column and the per-split / mean evaluator outcome columns used by the
# cells below -- confirm against hit_prediction_code.analytics.
for enrich in (
        analytics.add_approach_to_df,
        analytics.add_cv_epoch_evaluator_outcome_to_df,
):
    enrich(results)

# Order the runs by plan file so positional lookups are stable across runs.
results = results.sort_values(by='sourcefile')
display(len(results))

In [None]:
# Build one single-row frame per run: the run's own fields plus, for every
# metric except the confusion matrix, the absolute value of the row-wise
# maximum of the run's 'mean' table.
# NOTE(review): the columns of run['mean'] are presumably epochs, so the
# maximum picks the best epoch per metric -- confirm.
per_run_frames = []
for _, run in results.iterrows():
    mean_table = run['mean']
    scalar_metrics = [
        name for name in mean_table.index if name not in ['confusion_matrix']
    ]
    best_per_metric = mean_table.loc[scalar_metrics].max(axis=1).abs()
    combined = pd.concat([run, best_per_metric], axis=0)
    per_run_frames.append(pd.DataFrame(combined).T)

# The 'neg_*' scores were made positive by .abs() above, so they can be
# exposed directly as error values; RMSE is the square root of the MSE.
best_results = (
    pd.concat(per_run_frames)
    .assign(
        mae=lambda frame: frame['neg_mean_absolute_error'],
        meae=lambda frame: frame['neg_median_absolute_error'],
        rmse=lambda frame: frame['neg_mean_squared_error'].pow(1. / 2),
    )
    .sort_values(by=['sourcefile'])
)

In [None]:
# Combine the per-split outcomes of every run using np.average.
# NOTE(review): presumably this yields one averaged value per metric and
# epoch -- confirm against analytics.aggregate_splits_per_epoch.
metric_avg = analytics.aggregate_splits_per_epoch(
    results['outcome'],
    np.average,
)

# Optional follow-up (disabled): best epoch per metric.
# display(analytics.aggregate_epochs(metric_avg, np.max).abs())

In [None]:
# Shorten the approach labels ('mean_std' -> 'ms') so they stay readable in
# the table and on the bar-chart axis.
best_results['approach'] = [
    label.replace('mean_std', 'ms') for label in best_results['approach']
]

# Full overview (disabled):
# display(best_results[['date', 'approach', 'mae', 'rmse', 'pearsonr', 'spearmanr', 'kendalltau']])
display(best_results[['approach', 'spearmanr', 'kendalltau']])

# (metric column, chart title) pairs; uncomment an entry to plot it as well.
bar_charts = [
    # ('mae', 'Wide and Deep MAE'),
    # ('rmse', 'Wide and Deep RMSE'),
    # ('pearsonr', 'Wide and Deep Pearson Correlation'),
    ('spearmanr', 'Spearman Correlation'),
    ('kendalltau', 'Kendall Correlation'),
]
for metric_column, chart_title in bar_charts:
    chart_data = best_results[['approach', metric_column]]
    display(
        chart_data.plot.bar(
            x='approach',
            y=metric_column,
            title=chart_title,
            figsize=(12, 6),
        )
    )

In [None]:
import seaborn as sns

# --- Configuration -----------------------------------------------------------
# NOTE(review): the two runs are picked by *position* in `results` and by a
# hard-coded epoch key. Positions depend on the sort order and on the filter
# used when loading `results` -- confirm they still select the intended runs.
metric = 'spearmanr'
run1_position, run1_epoch = 1, '1'
run2_position, run2_epoch = 4, '300'

num_samples = 10**4   # number of bootstrap resamples per run
sample_size = 50      # number of scores drawn (with replacement) per resample
bootstrap_seed = 0    # fixed seed so the resampling is reproducible


def split_scores(run, split_names, epoch, metric):
    """Collect one score per cross-validation split for a single run.

    Args:
        run: one row of `results`.
        split_names: labels of the per-split entries of `run`.
        epoch: key selecting the epoch inside each split entry.
        metric: label of the metric to read.

    Returns:
        A list with `run[split][epoch].loc[metric]` for every split.
    """
    return [run[split][epoch].loc[metric] for split in split_names]


def bootstrap_means(values, num_samples, sample_size, rng):
    """Return the means of `num_samples` bootstrap resamples of `values`.

    Args:
        values: 1-D sequence of numeric scores.
        num_samples: how many resamples to draw.
        sample_size: how many scores each resample contains.
        rng: a `numpy.random.Generator` used for the draws.

    Returns:
        A 1-D array of length `num_samples` holding one mean per resample.
    """
    values = np.asarray(values, dtype=float)
    # One vectorised draw instead of a Python loop: each row is one resample.
    resamples = rng.choice(values, size=(num_samples, sample_size), replace=True)
    return resamples.mean(axis=1)


# The per-split entries are the columns of `results` whose label starts with
# 'split-'. Reading them from `results` keeps the cell runnable on a fresh
# kernel (no dependency on variables that only exist in a stale session).
split_names = [column for column in results.columns if column.startswith('split-')]
if not split_names:
    raise ValueError("no 'split-*' columns found in results")

splits1 = split_scores(results.iloc[run1_position], split_names, run1_epoch, metric)
splits2 = split_scores(results.iloc[run2_position], split_names, run2_epoch, metric)

rng = np.random.default_rng(bootstrap_seed)
sample_means1 = bootstrap_means(splits1, num_samples, sample_size, rng)
sample_means2 = bootstrap_means(splits2, num_samples, sample_size, rng)

print(splits1)
print(splits2)

# NOTE(review): hard-coded reference values, presumably the mean scores of
# the two runs from an earlier execution -- confirm or compute them instead.
print('0.4326', '0.4393', '', sep='\n')
print(np.mean(sample_means1))
print(np.mean(sample_means2))

# A single sns.set call: a second call would reset the options of the first.
sns.set(color_codes=True, rc={'figure.figsize': (20, 10)})

fig, ax = plt.subplots(figsize=(20, 10))
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(
    sample_means1, kde=True, stat='density', color='C0', ax=ax,
    label='run at position {} (epoch {})'.format(run1_position, run1_epoch),
)
sns.histplot(
    sample_means2, kde=True, stat='density', color='C1', ax=ax,
    label='run at position {} (epoch {})'.format(run2_position, run2_epoch),
)
ax.set_title('Bootstrap', fontsize=25)
ax.set_xlabel('Score', fontsize=20)
ax.set_ylabel('Score Frequency', fontsize=20)
ax.legend()
plt.show()