# Picking the reference scenario // Parallel Axis Plots
-> Parallel Axis Plots for data from 'no policy'-exploration

In [1]:
import pandas as pd

from dmdu.exploration.ref_scenario_parcoords_utils import *

### 1. Loading the data
For each of the three models, the results from the 'no policy'-exploration (produced by perform_experiments_replicator_model.py) are loaded and split into the experiments (DataFrame) and outcomes (dict).

In [2]:
# Loading the data
n_scenarios = 200
n_replications = 30

deffuant_exp, deffuant_out  = get_results('DEFFUANT', scenarios=n_scenarios, replications=n_replications)
sample_exp, sample_out  = get_results('SAMPLE', scenarios=n_scenarios, replications=n_replications)
sit_exp, sit_out  = get_results('SIT', scenarios=n_scenarios, replications=n_replications)


### 2. Transforming from array dicts
The use of the ReplicatorModel has ArrayOutcomes (not ScalarOutcomes). Because the function for the parallel axis plots of the ema_workbench requires the data input to be DataFrames, the outcomes will be reshaped. Furthermore, to combine the outcomes from all three models in one plot, the outcomes are combined into a single DataFrame.

In [3]:
# Transforming from array dicts (due to ReplicatorModel with ArrayOutcome) to DataFrames
deffuant_out = arraydict_to_df(deffuant_out)
sample_out = arraydict_to_df(sample_out)
sit_out = arraydict_to_df(sit_out)


In [4]:
# Combine outcomes-data from all models (for combined parcoords)
deffuant_out['belief_update_fn'] = 'DEFFUANT'
sample_out['belief_update_fn'] = 'SAMPLE'
sit_out['belief_update_fn'] = 'SIT'

out = pd.concat([deffuant_out, sample_out, sit_out])
out = out.reset_index()  # to not have 'triplicate' indices
out = out.drop(columns=['index'])
out

Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort,belief_update_fn
0,650.0,242.26,373.457,0.039683,55.32,DEFFUANT
1,647.0,260.70,371.624,0.036624,62.23,DEFFUANT
2,551.0,297.18,515.109,0.033089,63.12,DEFFUANT
3,708.0,249.08,436.553,0.028944,58.78,DEFFUANT
4,764.0,209.53,396.218,0.021778,66.19,DEFFUANT
...,...,...,...,...,...,...
17995,640.0,915.65,178.123,0.120195,28.74,SIT
17996,566.0,912.87,185.321,0.141249,32.57,SIT
17997,607.0,919.52,189.791,0.137223,28.16,SIT
17998,659.0,908.41,198.743,0.124952,33.40,SIT


In [5]:
# Also combine the experiment-DataFrames the same way -> indeces of outcomes & experiments dataframes align

exp = pd.concat([deffuant_exp, sample_exp, sit_exp])
exp = exp.reset_index()  # to not have 'triplicate' indices
exp = exp.drop(columns=['index'])
exp


Unnamed: 0,belief_metric_threshold,deffuant_mu,high_media_lit,mean_disinformer,mean_normal_user,n_edges,n_posts_estimate_similarity,ratio_normal_user,sampling_p_update,mlit_select,del_t,rank_punish,rank_t,strikes_t,scenario,policy,model
0,76,0.010177,0.347936,9,2,3,12,0.989923,0.014639,0,0,0,0,0,200,all off,MisinfoPy
1,75,0.026042,0.324773,11,1,2,5,0.986406,0.022632,0,0,0,0,0,201,all off,MisinfoPy
2,76,0.026171,0.270825,8,2,3,5,0.980470,0.027951,0,0,0,0,0,202,all off,MisinfoPy
3,78,0.018171,0.285280,8,1,3,8,0.988295,0.020174,0,0,0,0,0,203,all off,MisinfoPy
4,77,0.010828,0.252359,12,2,2,14,0.991354,0.016223,0,0,0,0,0,204,all off,MisinfoPy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,75,0.020276,0.308778,10,0,2,5,0.994205,0.010955,0,0,0,0,0,595,all off,MisinfoPy
596,77,0.011447,0.261904,8,1,3,13,0.993374,0.016519,0,0,0,0,0,596,all off,MisinfoPy
597,80,0.025131,0.251584,8,2,2,12,0.989375,0.027652,0,0,0,0,0,597,all off,MisinfoPy
598,78,0.021480,0.332444,8,0,3,7,0.982338,0.026114,0,0,0,0,0,598,all off,MisinfoPy


### Plotting the Parallel Axis Plot
Below, the resulting parallel axis plot is depicted.

In [6]:
# Parallel Axis Plot
plot_again_despite_it_taking_forever = False

if plot_again_despite_it_taking_forever:
    parcoords_color_by(out, 'belief_update_fn')

In [7]:
out.describe()

Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort
count,18000.0,18000.0,18000.0,18000.0,18000.0
mean,675.828778,540.604033,363.298552,0.09392,49.467557
std,124.777892,212.526816,171.447652,0.042783,22.680147
min,22.0,30.16,92.469,0.000985,12.85
25%,592.0,379.8475,230.273,0.063286,31.56
50%,669.0,535.83,331.274,0.08962,45.43
75%,764.0,693.1725,464.16475,0.118281,64.24
max,991.0,996.85,1081.285,0.271471,140.81


In [8]:
out

Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort,belief_update_fn
0,650.0,242.26,373.457,0.039683,55.32,DEFFUANT
1,647.0,260.70,371.624,0.036624,62.23,DEFFUANT
2,551.0,297.18,515.109,0.033089,63.12,DEFFUANT
3,708.0,249.08,436.553,0.028944,58.78,DEFFUANT
4,764.0,209.53,396.218,0.021778,66.19,DEFFUANT
...,...,...,...,...,...,...
17995,640.0,915.65,178.123,0.120195,28.74,SIT
17996,566.0,912.87,185.321,0.141249,32.57,SIT
17997,607.0,919.52,189.791,0.137223,28.16,SIT
17998,659.0,908.41,198.743,0.124952,33.40,SIT


In [9]:
# Picking a representative scenario as ref_scenario
medians = {}
column_headers = [e for e in out.columns if e != 'belief_update_fn']

for m_name in column_headers:
    median = out[m_name].median()
    medians[m_name] = median


# get specified ranges around the medians of each metric
envelope = 0.065  # 6.5% of the median-value -> range: [lowerbound=6.5% above median, upperbound=6.5% below the median]
ranges = {}

for m_name, median in medians.items():
    lowerbound = median * (1 - envelope)
    upperbound = median * (1 + envelope)
    ranges[m_name] = (lowerbound, upperbound)

for k, v in ranges.items():
    print(v, k)

# Ideas:
# - calculate median for each metric, then get a scenario that is close to those values?
# - visually pick ranges, get subset of scenarios that fulfills them all?
# - add kde or sth like that over each metric-axis, use that for picking the ranges, get subset of scenarios that fulfills them all?

(625.515, 712.485) n_agents_above_belief_threshold
(501.0010500000001, 570.65895) polarization_variance
(309.74119, 352.80681) engagement
(0.0837950961622267, 0.09544575124360581) free_speech_constraint
(42.477050000000006, 48.382949999999994) avg_user_effort


In [10]:
# more programmatic attempt
# metric_subsets = {}
#
# for m_name, range in ranges.items():
#     lowerbound, upperbound = range
#     m_scenarios = out[(out[m_name] >= lowerbound) & (out[m_name] <= upperbound)]
#     metric_subsets[m_name] = m_scenarios
#
# for k, v in metric_subsets.items():
#     print(k, v)

In [11]:
# Filtering to find scenarios closest to median values -> check whether each metric's values are within the specified ranges (i.e., within 'envelope'-many percent above/below the median)
median_scenarios = out[((out['n_agents_above_belief_threshold'] >= ranges['n_agents_above_belief_threshold'][0]) & (out['n_agents_above_belief_threshold'] <= ranges['n_agents_above_belief_threshold'][1])) &
                       ((out['polarization_variance'] >= ranges['polarization_variance'][0]) & (out['polarization_variance'] <= ranges['polarization_variance'][1])) &
                       ((out['engagement'] >= ranges['engagement'][0]) & (out['engagement'] <= ranges['engagement'][1])) &
                       ((out['free_speech_constraint'] >= ranges['free_speech_constraint'][0]) & (out['free_speech_constraint'] <= ranges['free_speech_constraint'][1])) &
                       ((out['avg_user_effort'] >= ranges['avg_user_effort'][0]) & (out['avg_user_effort'] <= ranges['avg_user_effort'][1])) ]

median_scenarios

Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort,belief_update_fn
10916,691.0,547.96,310.602,0.087049,47.12,SAMPLE


In [12]:
##

In [13]:

out_avg_over_seeds = []
out

Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort,belief_update_fn
0,650.0,242.26,373.457,0.039683,55.32,DEFFUANT
1,647.0,260.70,371.624,0.036624,62.23,DEFFUANT
2,551.0,297.18,515.109,0.033089,63.12,DEFFUANT
3,708.0,249.08,436.553,0.028944,58.78,DEFFUANT
4,764.0,209.53,396.218,0.021778,66.19,DEFFUANT
...,...,...,...,...,...,...
17995,640.0,915.65,178.123,0.120195,28.74,SIT
17996,566.0,912.87,185.321,0.141249,32.57,SIT
17997,607.0,919.52,189.791,0.137223,28.16,SIT
17998,659.0,908.41,198.743,0.124952,33.40,SIT


In [13]:
# PLAN
# Get first 30 rows (for first seed)
# Get new 'row'-dict: avg over each column (axis = 0)
# Add row dict to out_avg_over_seeds list
# list -> df

# then filter the 600 scenarios for proximity to the median values

In [None]:
# PLAN
# Get rows of index n*200 (for first seed) for n in range (30, n_seeds)
# Get new 'row'-dict: avg over each column (axis = 0)
# Add row dict to out_avg_over_seeds list
# list -> df

# then filter the 600 scenarios for proximity to the median values

In [58]:
# n_replications = n_seeds = 30
# n_scenarios = 600

replic_ids = list(range(n_replications))  # 30
# print(replic_ids)

# scenario_ids = list(range(n_scenarios))  # 200
scenario_ids = list(range(n_scenarios))  # 200(scenarios) * 3(models) = 600

# print(scenario_ids)
summarized = []
data = out

def get_avg_metrics_per_scenario(data, n_replications, n_scenarios):

    replic_ids = list(range(n_replications))  # 30
    scenario_ids = list(range(n_scenarios))  # 200
    summarized = []

    for n in scenario_ids:
        # Row indeces of 1 seed/replication
        row_ids = [n + (idx * n_scenarios) for idx in replic_ids]
        # print(row_ids, len(row_ids))
        # Get subset DataFrame, which only contains data from that scenario
        subset = data.iloc[row_ids]
        avgs = subset.mean(axis=0)
        summarized.append(avgs)

    # each element in summarized becomes a row of the final df
    avg_per_scenario = pd.concat(summarized, axis=1)
    avg_per_scenario = avg_per_scenario.transpose()

    return avg_per_scenario


# outcomes data per belief_update_fn, each with only 1 entry per scenario
deffuant = get_avg_metrics_per_scenario(deffuant_out, n_replications, n_scenarios)
sample = get_avg_metrics_per_scenario(sample_out, n_replications, n_scenarios)
sit = get_avg_metrics_per_scenario(sit_out, n_replications, n_scenarios)


combined = pd.concat([deffuant, sample, sit])
combined = combined.reset_index()  # to not have 'triplicate' indices
combined = combined.drop(columns=['index'])
combined

# TODO: CONTINUE HERE. Now from 0 to 6000. I.e., for 1 belief update fn  -> do it not for the combined out, but for each separate out
    # seed_data = out.iloc[]

  avgs = subset.mean(axis=0)


Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort
0,627.033333,341.206333,255.522033,0.090894,34.029667
1,604.866667,355.905333,261.803167,0.085726,33.231333
2,605.066667,351.885333,245.431533,0.077292,33.917000
3,622.266667,347.855667,242.300467,0.073062,32.589667
4,615.800000,353.621000,236.858533,0.084807,33.969333
...,...,...,...,...,...
595,716.300000,756.091000,477.317400,0.114040,62.640667
596,689.266667,769.753333,487.999567,0.115670,63.632667
597,724.966667,749.615667,455.757033,0.113233,64.107333
598,749.200000,734.638000,447.081633,0.111342,61.849667


In [65]:
# Now the actual filtering
medians = {}
column_headers = [e for e in combined.columns if e != 'belief_update_fn']

for m_name in column_headers:
    median = combined[m_name].median()
    medians[m_name] = median


# get specified ranges around the medians of each metric
envelope = 0.03  # 6.5% of the median-value -> range: [lowerbound=6.5% above median, upperbound=6.5% below the median]
ranges = {}

for m_name, median in medians.items():
    lowerbound = median * (1 - envelope)
    upperbound = median * (1 + envelope)
    ranges[m_name] = (lowerbound, upperbound)

for k, v in ranges.items():
    print(v, k)

(670.6094999999999, 712.0904999999999) n_agents_above_belief_threshold
(516.8067850000001, 548.774215) polarization_variance
(372.0714198333334, 395.0861468333334) engagement
(0.09174974347304087, 0.09742498533735268) free_speech_constraint
(50.99791166666667, 54.15242166666667) avg_user_effort


In [66]:
# Filtering to find scenarios closest to median values -> check whether each metric's values are within the specified ranges (i.e., within 'envelope'-many percent above/below the median)
out = combined

median_scenarios = out[((out['n_agents_above_belief_threshold'] >= ranges['n_agents_above_belief_threshold'][0]) & (out['n_agents_above_belief_threshold'] <= ranges['n_agents_above_belief_threshold'][1])) &
                       ((out['polarization_variance'] >= ranges['polarization_variance'][0]) & (out['polarization_variance'] <= ranges['polarization_variance'][1])) &
                       ((out['engagement'] >= ranges['engagement'][0]) & (out['engagement'] <= ranges['engagement'][1])) &
                       ((out['free_speech_constraint'] >= ranges['free_speech_constraint'][0]) & (out['free_speech_constraint'] <= ranges['free_speech_constraint'][1])) &
                       ((out['avg_user_effort'] >= ranges['avg_user_effort'][0]) & (out['avg_user_effort'] <= ranges['avg_user_effort'][1])) ]

median_scenarios

Unnamed: 0,n_agents_above_belief_threshold,polarization_variance,engagement,free_speech_constraint,avg_user_effort
310,698.733333,528.525333,374.188767,0.095943,51.641667


In [66]:
# -> idx 310
# =>  sample, idx 110  (310 - 200)



In [80]:
# Get that scenario from sample_exp
ref_scenario = sample_exp.iloc[110, :]
uncertainties = ['belief_metric_threshold', 'deffuant_mu', 'high_media_lit', 'mean_disinformer', 'mean_normal_user', 'n_edges', 'n_posts_estimate_similarity', 'ratio_normal_user', 'sampling_p_update']
ref_scenario = ref_scenario.loc[uncertainties]
ref_scenario

belief_metric_threshold              80
deffuant_mu                    0.018566
high_media_lit                 0.285126
mean_disinformer                      8
mean_normal_user                      0
n_edges                               3
n_posts_estimate_similarity           5
ratio_normal_user              0.993085
sampling_p_update              0.027595
Name: 110, dtype: object

In [81]:
# double check whether matches sample idx 110
sample.iloc[110, :]

n_agents_above_belief_threshold    698.733333
polarization_variance              528.525333
engagement                         374.188767
free_speech_constraint               0.095943
avg_user_effort                     51.641667
Name: 110, dtype: float64

In [82]:
# Save ref_scenario
import os
from dmdu.utils_dmdu import make_sure_path_exists

# path
os.getcwd()
dir_path = os.path.join(os.getcwd(), 'data')
make_sure_path_exists(dir_path)
path = os.path.join(dir_path, 'ref_scenario.csv')

ref_scenario.to_csv(path)

In [89]:
# Make metric names the indeces of the df -> nope, is already, just not saved/loaded again like it

# indeces = []
#
# for idx, row in ref_scenario.iteritems():
#     indeces.append(idx)
#
# ref_scenario.index

In [99]:
# TODO: Getting ref_scenario into params form, then make Scenario out of it
from ema_workbench import Scenario

params = {}

# Path  # TODO: adjust if move to different file
os.getcwd()
dir_path = os.path.join(os.getcwd(), 'data')
make_sure_path_exists(dir_path)
path = os.path.join(dir_path, 'ref_scenario.csv')

ref_scenario_df = pd.read_csv(path)
ref_scenario_df
#
for idx, row in ref_scenario_df.iterrows():
    metric = row[0]
    value = row[1]
    params[metric] = value



# the reference scenario!!
ref_scenario = Scenario('reference', **params)
ref_scenario

Scenario({'belief_metric_threshold': 80.0, 'deffuant_mu': 0.0185664008066397, 'high_media_lit': 0.2851257267625852, 'mean_disinformer': 8.0, 'mean_normal_user': 0.0, 'n_edges': 3.0, 'n_posts_estimate_similarity': 5.0, 'ratio_normal_user': 0.9930851855394072, 'sampling_p_update': 0.027594598317021})