In [None]:
import pandas as pd
import os
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error 
import plotly.graph_objects as go
import plotly.graph_objs as go
from datetime import timedelta
# Show up to 999 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 999)

# --- Paths ---
# The notebook's working directory and the folder one level above it.
current_directory = os.getcwd()
parent_directory = os.path.split(current_directory)[0]

# --- Plot styling ---
# Ten gray tones running from light ('#AAAAAA') to dark ('#111111'); every
# colour code is a single hex digit repeated six times.
shades_of_gray = ['#' + digit * 6 for digit in 'A987654321']

# Position of the next gray tone to hand out
shade_index = 0

In [None]:
def calculate_similarity(df1, df2):
    """Compare two equally shaped numeric sequences element by element.

    The inputs are paired by position. pandas objects are converted to plain
    NumPy arrays first, so their index labels are ignored: subtracting two
    Series directly aligns them by label, which yields NaN (or silently pairs
    the wrong rows) when the two Series come from different runs or frames,
    while ``np.corrcoef`` always pairs by position.

    Parameters
    ----------
    df1, df2 : array-like
        Numeric sequences (Series, ndarray, list, ...) of identical shape.

    Returns
    -------
    tuple
        ``(correlation, mean_absolute_error, root_mean_square_error)``.
        Positions where the difference is NaN are skipped for the two error
        measures (the behaviour pandas' ``mean`` has by default); the
        correlation is NaN if either input contains NaN.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    first = np.asarray(df1, dtype=float)
    second = np.asarray(df2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            f"inputs must have the same shape, got {first.shape} and {second.shape}"
        )

    correlation = np.corrcoef(first, second)[0, 1]

    # Local names deliberately differ from sklearn's mean_absolute_error,
    # which the notebook imports at the top.
    difference = first - second
    mae = np.nanmean(np.abs(difference))
    rmse = np.sqrt(np.nanmean(difference ** 2))
    return correlation, mae, rmse

In [None]:
def find_best(df, min_step=64):
    '''Find the RunId in a sweep with the lowest RMSE against the target.

    For every distinct value in ``df['RunId']`` the rows with
    ``Step >= min_step`` are kept (first row per Step only) and the
    'normalized_bevs' column is compared with 'normalized_bev_target' via
    ``calculate_similarity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'RunId', 'Step', 'normalized_bevs' and
        'normalized_bev_target'.
    min_step : int, optional
        Rows with a smaller 'Step' are ignored. Defaults to 64.

    Returns
    -------
    tuple
        ``(best_run, score, rmses)``: the RunId with the smallest RMSE, that
        RMSE, and the list of RMSEs of all runs in order of first appearance.
        If no run produces a comparable (non-NaN) RMSE the result is
        ``(None, inf, rmses)``.
    '''
    best_run = None
    # Start from +inf rather than an arbitrary cap so that a best run is found
    # however large the errors are.
    score = float('inf')
    rmses = []

    # The step filter does not depend on the run, so apply it only once.
    late_rows = df[df['Step'] >= min_step]

    for run_id in df['RunId'].unique():
        run_rows = late_rows[late_rows['RunId'] == run_id].drop_duplicates(subset=['Step'])

        _, _, rmse = calculate_similarity(run_rows['normalized_bevs'], run_rows['normalized_bev_target'])
        rmses.append(rmse)

        # A NaN rmse never compares as smaller, so such runs are never selected.
        if rmse < score:
            score = rmse
            best_run = run_id

    return best_run, score, rmses

In [None]:
def find_interrun_corr(df, min_step=64):
    '''Compute the correlation between every pair of runs in a sweep.

    For each distinct value in ``df['RunId']`` the rows with
    ``Step >= min_step`` are kept (first row per Step only) and their
    'normalized_bevs' values are extracted once. Every unordered pair of runs
    is then correlated with ``np.corrcoef``, pairing values by position.

    NOTE(review): positional pairing assumes every run lists its steps in the
    same order - TODO confirm against the model output.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'RunId', 'Step' and 'normalized_bevs'.
    min_step : int, optional
        Rows with a smaller 'Step' are ignored. Defaults to 64.

    Returns
    -------
    list
        One correlation coefficient per pair of runs, in the order produced by
        ``itertools.combinations`` over the RunIds in order of first
        appearance. Empty when the sweep has fewer than two runs.

    Raises
    ------
    ValueError
        Raised by ``np.corrcoef`` when two runs keep a different number of rows.
    '''
    late_rows = df[df['Step'] >= min_step]

    # Extract each run's values once instead of re-filtering for every pair.
    run_values = [
        late_rows[late_rows['RunId'] == run_id]
        .drop_duplicates(subset=['Step'])['normalized_bevs']
        .to_numpy()
        for run_id in df['RunId'].unique()
    ]

    return [
        np.corrcoef(first, second)[0, 1]
        for first, second in itertools.combinations(run_values, 2)
    ]

In [None]:
# Choose the sweep to analyse by leaving exactly one `name, size` assignment
# uncommented. `name` is the sweep's folder under 'Output data'. The trailing
# comments look like previously recorded results for that sweep - TODO confirm.
#name, size = '01 - Baseline all 1.0 params 90k 8it', 62278 # Average: 24.3921, Corr: 0.9999
#name, size = '01 - Baseline all 1.0 params 150k 8it', 100946 # Average: 24.6198,  Corr: 0.9999
#name, size = '01 - Baseline all 1.0 params full pop 8it', 506959 # Average: 24.5670 Corr: 0.9999

#name, size = '02 - Baseline 1.0 params 3.5 thresh 90k 8it', 62278 # Average: 0.1602, Corr: 0.9999, Fig 5.13

#name, size = '03 - Optimized Params 90k 8it', 62278 # Average: 0.1148, Corr: 0.9985, Fig 5.14
#name, size = '03 - Optimized Params 90k 100it', 62278 # Average: 0.1081
#name, size = '03 - Optimized Params NO NEWS 90k 8it', 62278 # Average: 0.1602, Corr: 0.9987

name, size = '03 - Optimized Params 83x83 full pop 8it', 506959 # Average: 0.0915

#name, size = 'Rogers Best Params 90k 8it', 62278 # Average: 0.1191, Corr: 0.9987

#name, size = 'rogers best current params 90k 40it', 62278

#name, size = 'test', 62278

# Location of the selected sweep's CSV output
file_path = os.path.join(parent_directory, 'Output data', name, 'FairfaxABM_Data.csv')

# Read the sweep output and discard the 'Unnamed: 0' column
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])

In [None]:
# Express both BEV count columns as a percentage of `size`
for source_column, percent_column in (
    ('bev_target', 'normalized_bev_target'),
    ('bevs', 'normalized_bevs'),
):
    df[percent_column] = df[source_column].div(size).mul(100)

In [None]:
# find_best returns a tuple: (best RunId, its RMSE, list with every run's RMSE)
result = find_best(df)

In [None]:
# Unpack the find_best result and report the best run next to the sweep average
best_run_id, best_rmse, all_rmses = result
average_rmse = sum(all_rmses) / len(all_rmses)
print(f"Best run: {best_run_id}, Best run RMSE: {best_rmse:.4f}, Average RMSE: {average_rmse:.4f}")

In [None]:
# Pairwise correlations between the runs of the sweep, summarised by their mean
inter_run_corrs = find_interrun_corr(df)
average_inter_run_corr = sum(inter_run_corrs) / len(inter_run_corrs)
print('Average corr between runs:', average_inter_run_corr)

<h3>All runs and average</h3>

In [None]:
# 1) Filter the dataframe: keep only the rows from step 64 onwards.
# .copy() makes the result an independent frame, so adding columns to it later
# neither touches `df` nor triggers pandas' SettingWithCopyWarning.
filtered_df = df[df['Step'] >= 64].copy()

In [None]:
# Function to convert ticks to datetime
def tick_to_date(tick, start_month=5, start_year=2014, start_tick=64):
    """Convert a model step into a calendar date, one step per month.

    Parameters
    ----------
    tick : int
        Step number to convert.
    start_month, start_year : int, optional
        Month and year that ``start_tick`` corresponds to (default: May 2014).
    start_tick : int, optional
        Step that maps onto the first day of the start month. Defaults to 64.

    Returns
    -------
    pandas.Timestamp
        The first day of the month lying ``tick - start_tick`` months after
        (or, when negative, before) the start month.
    """
    start_date = pd.Timestamp(year=start_year, month=start_month, day=1)
    return start_date + pd.DateOffset(months=tick - start_tick)

# Apply the conversion function to the 'Step' column.
# assign() builds a new frame instead of writing a column into what may be a
# slice of `df`, which avoids pandas' SettingWithCopyWarning.
filtered_df = filtered_df.assign(Date=filtered_df['Step'].apply(tick_to_date))

# 2) Plot 'bev_target'
fig = go.Figure()
fig.add_trace(go.Scatter(x=filtered_df['Date'], y=filtered_df['normalized_bev_target'], mode='markers', name='Actual', marker=dict(size=5)))

# 3) Plot 'bevs' for each run, cycling through the shades of gray.
# The counter is reset first so that re-running this cell always produces the
# same colours instead of continuing from wherever the last run stopped.
shade_index = 0
for run_id, group in filtered_df.groupby('RunId'):
    shade = shades_of_gray[shade_index % len(shades_of_gray)]
    fig.add_trace(go.Scatter(x=group['Date'], y=group['normalized_bevs'], mode='lines', name=f'Run {run_id}', line=dict(color=shade), opacity=0.4, showlegend=False))
    shade_index += 1

# 4) Plot the average of 'bevs' for all runs (mean over runs at every step)
average_bevs = filtered_df.groupby('Step')['normalized_bevs'].mean()
average_dates = pd.Series(average_bevs.index).apply(tick_to_date)
fig.add_trace(go.Scatter(x=average_dates, y=average_bevs.values, mode='lines', name='Simulated', line=dict(color='red')))

# Update layout: titles plus the legend position, in a single call
fig.update_layout(
    title=name,
    xaxis_title='Date',
    yaxis_title='Percentage of BEVs',
    legend=dict(
        yanchor="bottom",
        y=0.85,
        xanchor="right",
        x=0.135
    ),
)

# Show plot
fig.show()

In [None]:
# Rows of the best run; result[0] is the RunId returned by find_best
is_best_run = filtered_df['RunId'] == result[0]
best = filtered_df.loc[is_best_run]
best

<h3>Inter-run correlation</h3>

In [None]:
def _load_run_output(run_name):
    """Return (csv path, DataFrame without 'Unnamed: 0') for one sweep folder."""
    csv_path = os.path.join(parent_directory, 'Output data', run_name, 'FairfaxABM_Data.csv')
    return csv_path, pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])


# First sweep to compare
r1 = '03 - Optimized Params 90k 8it'
fp1, df1 = _load_run_output(r1)

# Second sweep to compare
r2 = '03 - Optimized Params 83x83 full pop 8it'
fp2, df2 = _load_run_output(r2)

In [None]:
# Compare the raw 'bevs' columns of the two sweeps (all runs, all steps).
# NOTE(review): the sweep list earlier in the notebook pairs these two folders
# with different `size` values (62278 vs 506959), so on un-normalised counts
# only the correlation is scale-independent; mae and rmse are presumably not
# meaningful here - confirm before using them.
corr, mae, rmse = calculate_similarity(df1['bevs'], df2['bevs'])

In [None]:
# Display the correlation between the 'bevs' columns of the two sweeps
corr