In [1]:
%matplotlib inline

from IPython.display import display, HTML
from ipywidgets import interactive
import matplotlib.pyplot as plt
# import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics
# import tensorflow as tf
# import tensorflow_probability as tfp
from tqdm.notebook import tqdm

from collections import namedtuple, defaultdict, Counter
import functools
from glob import glob
import itertools
import os
import pickle
import re
import socket
import subprocess
import sys
from time import sleep

# Plot styling. matplotlib 3.6 renamed the bundled seaborn style sheets to
# 'seaborn-v0_8*' (the old 'seaborn' alias was deprecated and later removed),
# so prefer the new name when this matplotlib provides it and fall back to the
# legacy name on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
np.set_printoptions(suppress=True, precision=2, threshold=20)
# Make the project sources importable; guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '../src/' not in sys.path:
    sys.path.append('../src/')

# Provenance banner: host name, last git commit ('none.' when git prints
# nothing) and the conda environment name parsed from the interpreter path
# (the full interpreter path is shown when no 'envs/<name>/' segment is found).
display(HTML("<h3>Runned on %s</h3><h4>Last commit: %s</h4><h4>Conda env: %s</h4>" % (
    socket.gethostname(),
    subprocess.run(["git","log", "-1", "--pretty=%h %s"],
        check=False, capture_output=True).stdout.decode() or 'none.',
    # Raw string: same pattern as before (path separator may be '\' or '/'),
    # but without the invalid '\/' escape that newer Pythons warn about.
    (re.findall(r".*envs[\\/](.*?)[\\/]", sys.executable) or [sys.executable])[0]
)))


In [2]:
# Load the exported experiment runs and show which git commits produced them.
df_all = pd.read_csv("../data/oabm-experiments/main-experiments-original-abm.csv")
print(set(df_all["tags.mlflow.source.git.commit"].values))

# Keep only runs whose status is FINISHED and, when the 'tags.crashed' column
# exists, whose crashed tag is empty.
keep_run = df_all["status"] == "FINISHED"
if 'tags.crashed' in df_all.columns:
    keep_run &= df_all['tags.crashed'].isna()
df_all = df_all[keep_run]

# Strip the "params." prefix from parameter columns and remember their names.
param_prefix = "params."
params = [
    col[len(param_prefix):]
    for col in df_all.columns
    if col.startswith(param_prefix)
]
df_all = df_all.rename(
    columns=lambda col: col[len(param_prefix):] if col.startswith(param_prefix) else col
)
print(len(df_all))

{'2a4ecb25997a2822cc9b10841572ca436a908aa3'}
92


In [3]:
# Distinct values of the `num_considered_Dbs` parameter among the loaded runs.
df_all["num_considered_Dbs"].unique()

array([256])

In [4]:
# Parameter values and trace numbers that the following cells select on.
deltas = (1 / 16,)
stddevs = [1.0]
num_Dbs = [256]
# traces = list(range(1, 21))
traces = [*range(11, 21)]

In [5]:
# Runs from this specific commit on traces 14 and 18 are dropped.
excluded_runs = (
    df_all['tags.mlflow.source.git.commit'].eq("1487a8a39b37170cd13a04face00f1060f3098f4")
    & df_all['trace_num'].isin((14, 18))
)

# Restrict to the learning rate and to the parameter values selected above.
selected_runs = (
    df_all['learning_rate'].eq(0.001)
    & df_all['delta'].isin(deltas)
    & df_all['num_deals_error_stddev'].isin(stddevs)
    & df_all['num_considered_Dbs'].isin(num_Dbs)
    & ~excluded_runs
)
df_all = df_all[selected_runs].copy()
len(df_all)

92

In [6]:
# Sanity check: every (stddev, num_Dbs, trace, seed) combination should appear
# exactly once in df_all; report combinations that are absent or repeated.
seeds = df_all['seed'].unique()
for num_deals_error_stddev, num_considered_Dbs, trace_num, seed in itertools.product(
        stddevs, num_Dbs, traces, seeds):
    exp_string = f"{num_considered_Dbs}, {num_deals_error_stddev}: trace {trace_num}, seed {seed}"
    matches = (
        (df_all['num_deals_error_stddev'] == num_deals_error_stddev)
        & (df_all['num_considered_Dbs'] == num_considered_Dbs)
        & (df_all['trace_num'] == trace_num)
        & (df_all['seed'] == seed)
    )
    df_experiment = df_all[matches]
    num_experiments = len(df_experiment)
    if num_experiments > 1:
        # For duplicates also show which commits produced the repeated runs.
        print("Duplicate:", exp_string)
        print(set(df_experiment["tags.mlflow.source.git.commit"].values))
    elif num_experiments == 0:
        print("MISSING:", exp_string)

MISSING: 256, 1.0: trace 14, seed 51
MISSING: 256, 1.0: trace 15, seed 51
MISSING: 256, 1.0: trace 16, seed 51
MISSING: 256, 1.0: trace 17, seed 51
MISSING: 256, 1.0: trace 18, seed 51
MISSING: 256, 1.0: trace 19, seed 51
MISSING: 256, 1.0: trace 20, seed 51


In [7]:
# Distinct values taken by each parameter column across the selected runs.
param_values = {name: set(df_all[name]) for name in params}

# Split parameters into those that vary between runs and those that do not.
params_not_unique = {
    name: found for name, found in param_values.items() if len(found) > 1
}
params_unique = {
    name: found for name, found in param_values.items() if not len(found) > 1
}

display(HTML("<h1>Parameters Not Unique</h1>"))
varying_table = pd.DataFrame(
    index=params_not_unique.keys(), data=params_not_unique.values())
display(varying_table.fillna(''))

display(HTML("<h1>Unique Parameters</h1>"))
constant_table = pd.DataFrame(
    index=params_unique.keys(), data=params_unique.values(), columns=["Value"])
display(constant_table.fillna(''))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
trace_num,10,11,12,13,14,15,16,17,18,19.0,20.0
seed,43,44,45,46,47,48,49,50,51,,


Unnamed: 0,Value
use_relative_error,False
price_error_stddev,1.0
Gammak,[0.5 0.4 0.1]
delta,0.0625
epochs,5
num_restarts,1
N,1000
num_deals_error_stddev,1.0
Q,500
num_considered_Dbs,256


In [8]:
# For every (trace, stddev, num_Dbs) configuration keep the single run with
# the lowest 'metrics.total_loss'.
best_run_frames = []
for trace_num, num_deals_error_stddev, num_considered_Dbs in itertools.product(
        traces, stddevs, num_Dbs):
    in_configuration = (
        (df_all['trace_num'] == trace_num)
        & (df_all['num_deals_error_stddev'] == num_deals_error_stddev)
        & (df_all['num_considered_Dbs'] == num_considered_Dbs)
    )
    df_trace = df_all[in_configuration]
    # Report configurations that do not have exactly 10 runs.
    if len(df_trace) != 10:
        print(f" {trace_num} {num_deals_error_stddev} {num_considered_Dbs} has "
             f"{len(df_trace)} experiments")
    total_loss = df_trace['metrics.total_loss']
    df_best_run = df_trace[total_loss == total_loss.min()]
    # Exactly one run must attain the minimum (fails on ties or on no runs).
    assert len(df_best_run) == 1
    best_run_frames.append(df_best_run)
df_best_runs = pd.concat(best_run_frames)

 11 1.0 256 has 9 experiments
 12 1.0 256 has 9 experiments
 13 1.0 256 has 9 experiments
 14 1.0 256 has 8 experiments
 15 1.0 256 has 8 experiments
 16 1.0 256 has 8 experiments
 17 1.0 256 has 8 experiments
 18 1.0 256 has 8 experiments
 19 1.0 256 has 8 experiments
 20 1.0 256 has 8 experiments


In [9]:
# Add combined metrics: the plain average of the corresponding M and P columns.
averaged_metrics = {
    'metrics.pearson_MP': ('metrics.pearson_M', 'metrics.pearson_P'),
    'metrics.r2_score_MP': ('metrics.r2_score_M', 'metrics.r2_score_P'),
}
for combined_col, (m_col, p_col) in averaged_metrics.items():
    df_best_runs[combined_col] = (df_best_runs[m_col] + df_best_runs[p_col]) / 2

In [10]:
# plt.rcParams.update({'font.size': 20})

# metric = 'metrics.pearson_M'

# validation_traces = list(range(1, 11))

# results = np.full((len(num_Dbs), len(stddevs)), np.nan)

# for j, num_deals_error_stddev in enumerate(stddevs):
#     for i, num_Db in enumerate(num_Dbs):
#         results[i, j] = df_best_runs[
#             (df_best_runs['num_deals_error_stddev'] == num_deals_error_stddev) &
#             (df_best_runs['num_considered_Dbs'] == num_Db) &
#             (df_best_runs['trace_num'].isin(validation_traces))
#         ][metric].mean()

# fig, ax = plt.subplots()
# # plt.title(metric.replace('metrics.', ''))
# vmin = 0. # np.floor(np.nanmin(results) * 10) / 10 - 0.1
# vmax = 0.8 # np.ceil(np.nanmax(results) * 10) / 10
# ax.matshow(results, cmap='Blues', vmin=vmin, vmax=vmax)
# for (i, j), z in np.ndenumerate(results):
#     z = np.floor(z * 100) / 100
#     ax.text(j, i, '{:0.2f}'.format(z), ha='center', va='center', color='w')
# plt.grid()
# plt.xticks(np.arange(len(stddevs)), stddevs,fontsize=18)
# plt.xlabel('$\sigma_D$',fontsize=20)
# plt.yticks(np.arange(len(num_Dbs)), num_Dbs,fontsize=18)
# plt.ylabel('Num. of $D^B$ samples',fontsize=20)
# # plt.gca().xaxis.set_label_position('top')
# ax.xaxis.set_ticks_position('bottom')
# plt.title(r'$\rho(M, \hat{M})$' if metric == 'metrics.pearson_M' else metric,fontsize=20)

# for ext in ("pdf", "png"):
#     plt.savefig(f"../paper/plots/hyperparam-{metric.split('.')[1]}.{ext}",
#                         bbox_inches='tight', dpi=200)
# plt.show()

In [11]:
# Configuration whose runs are reported below; these match the only values
# listed in `num_Dbs` and `stddevs`.
best_num_considered_Dbs = 256
best_num_deals_error_stddev = 1.0

In [12]:
test_traces = traces # sorted(set(df_best_runs['trace_num'].unique()) - set(validation_traces))
assert test_traces == list(range(11, 21))

# Best runs of the chosen configuration, restricted to the test traces.
is_test_run = (
    (df_best_runs['num_deals_error_stddev'] == best_num_deals_error_stddev)
    & (df_best_runs['num_considered_Dbs'] == best_num_considered_Dbs)
    & df_best_runs['trace_num'].isin(test_traces)
)
df_plot = df_best_runs[is_test_run].copy()

# Print the mean over the selected runs of every metric column of df_all,
# in alphabetical order.
metric_columns = sorted(col for col in df_all.columns if 'metrics.' in col)
for f in metric_columns:
    print(f"{f:40}: {df_plot[f].mean():.4f}")

metrics.explained_variance_score_Db     : 0.7427
metrics.explained_variance_score_Db_m1  : 0.6664
metrics.explained_variance_score_M      : 0.2829
metrics.explained_variance_score_M0     : 0.2633
metrics.explained_variance_score_Nd     : 0.7360
metrics.explained_variance_score_P      : 0.9722
metrics.explained_variance_score_avg_M  : 0.2605
metrics.explained_variance_score_avg_M0 : 0.2346
metrics.final_est_num_agents            : 5000.0000
metrics.final_num_agents                : 5000.0000
metrics.max_error_Db                    : 33.2359
metrics.max_error_Db_m1                 : 48.3376
metrics.max_error_M                     : 577.8966
metrics.max_error_M0                    : 577.8966
metrics.max_error_Nd                    : 20.5559
metrics.max_error_P                     : 4.4891
metrics.max_error_avg_M                 : 589.6049
metrics.max_error_avg_M0                : 589.4629
metrics.mean_absolute_error_Db          : 5.3246
metrics.mean_absolute_error_Db_m1       : 6.5311
met

In [13]:
# Absolute distance of each run's Pearson correlation on M from the median
# over the plotted runs.
pearson_M = df_plot['metrics.pearson_M']
df_plot['dist_to_median'] = pearson_M.sub(pearson_M.median()).abs()

In [14]:
# Runs ordered from closest to farthest from the median Pearson correlation.
ranking_columns = ['run_id', 'trace_num', 'metrics.pearson_M', 'dist_to_median']
df_plot.loc[:, ranking_columns].sort_values(by='dist_to_median')

Unnamed: 0,run_id,trace_num,metrics.pearson_M,dist_to_median
86,c70e839e9f124225910569fd91984f27,16,0.546428,0.029372
60,9f9ce588b0a444ec9f13c74b10faefe8,20,0.487684,0.029372
54,1bb58dfadba7462fa3df5d95c66d4ec9,15,0.479444,0.037612
57,88cd56216aa34c0089c0aa68d7e39a96,12,0.430069,0.086987
52,9c0b7116da774426b4e00d5bf8d897cf,17,0.632932,0.115876
88,56eacfdd2c9f4d6ab544fb81063b4f8e,14,0.398635,0.118421
51,27928f6df6db40baa1f923f09cf0d0e4,18,0.721024,0.203968
3,3b81e577424e4b69ac4b26eda1fc3d4a,11,0.302964,0.214092
56,562eac25520b4140832de9522b391bf9,13,0.737249,0.220193
61,2691fe570cc9453ca055072ec0c30e8c,19,0.875822,0.358766


In [15]:
# K = 3
# L = 5
# T = 20

# pearson_Dbs = []

# assert sorted(df_plot['trace_num']) == test_traces
# for trace_num in df_plot['trace_num']:
#     est_path = ("../data/original-abm-experiments/best_estimates/"
#                 f"trace_{trace_num}_estimate.pickle")
#     with open(est_path, 'rb') as f:
#         est = pickle.load(f)
#     real_Db = np.genfromtxt(f"../data/old_abm/traces/buyers{trace_num}.tsv")
    
#     assert np.all(real_Db[0] == 0)
#     real_Db = real_Db[1:T]
#     assert real_Db.shape == (T - 1, L * K)
#     real_Db = real_Db.reshape(T - 1, L, K)
#     assert est.Db.shape == (T - 1, L, K)

#     pearson_Db = scipy.stats.pearsonr(real_Db.flatten(), est.Db.flatten())[0]
#     pearson_Dbs.append(pearson_Db)
    
# df_plot['metrics.pearson_Db'] = pearson_Dbs

In [16]:
# Label every exported row with the model name.
df_plot = df_plot.assign(Model="original")

In [17]:
# Write the selected runs to CSV without the DataFrame index.
df_plot.to_csv(
    path_or_buf="../data/main-figure-experiments.csv",
    index=False,
)