In [None]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Default every figure to a 16x9 canvas
sns.set(rc={"figure.figsize": (16, 9)})

# Read the active figure size back so that lmplot calls, which take
# height/aspect instead of figsize, can be given matching dimensions
global_fig_width, global_fig_height = plt.rcParams["figure.figsize"]
global_aspect_ratio = global_fig_width / global_fig_height

In [None]:
# Folder that holds the JSON report files
json_folder = 'json_reports'

# One flattened record per report file
data = []

for filename in os.listdir(json_folder):
    # Anything that is not a .json file is ignored
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(json_folder, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Keep only the 'value' entry of every top-level field
    # (assumes each top-level field is a dict-like object -- TODO confirm)
    flattened_data = {key: value.get('value') for key, value in json_data.items()}

    # Remember which file the record came from
    flattened_data['filename'] = filename
    data.append(flattened_data)

# Build the DataFrame in one go from the collected records
df = pd.DataFrame(data)

In [None]:
# Show the assembled DataFrame as this cell's output
df

In [None]:
# Column names, dtypes and non-null counts
df.info()
# First record, transposed so each column is shown on its own row
df.head(1).T

In [None]:
# Interpret the start time as whole seconds since the Unix epoch
start_seconds = df['post_analysis_start_time'].astype(int)
df['post_analysis_date'] = pd.to_datetime(start_seconds, unit='s')

# Look at the first record again, transposed, now that the date column exists
df.head(1).T

In [None]:
# Average individual-prediction error of every report over time, coloured by race
sns.scatterplot(
    data=df,
    x='post_analysis_date',
    y='average_error_of_individual_prediction_minutes',
    hue='race_id',
)

In [None]:
# Execution timestamps of every report whose average error column exceeds 30
error_too_high = df['average_error_of_individual_prediction_minutes'] > 30
bad_execution_timestamps = df.loc[error_too_high, 'execution_timestamp'].values

In [None]:
# Rows that share an execution timestamp with any flagged report
bad_execution_timestamp_mask = df['execution_timestamp'].isin(bad_execution_timestamps)

# Drop the flagged executions, keep only reports after the cutoff date, and
# order the rows by race and then by date.
# Alternative cutoff kept for reference: "2023-06-18"
clean_df = (
    df.loc[~bad_execution_timestamp_mask]
    .copy()
    .query('post_analysis_date > "2024-05-23"')
    .sort_values(by=['race_id', 'post_analysis_date'])
)
def get_first_and_last(group):
    """Return the first and the last row of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group, already in the order in which "first" and
        "last" should be read.

    Returns
    -------
    pandas.DataFrame
        Two rows (first, then last) when the group has at least two rows,
        the single row when it has exactly one, and an empty frame when the
        group is empty.
    """
    # With fewer than two rows, iloc[[0, -1]] would either return the only
    # row twice (double-counting it in downstream statistics) or raise
    # IndexError on an empty frame; slicing handles both cases.
    if len(group) < 2:
        return group.iloc[:1]
    return group.iloc[[0, -1]]

# Apply get_first_and_last to every race and stack the results under a fresh index
per_race = clean_df.groupby('race_id', group_keys=False)
first_last_df = per_race.apply(get_first_and_last).reset_index(drop=True)

# Summary statistics of the selected rows
first_last_df.describe()

In [None]:
# Average individual-prediction error over time, one line per race
sns.lineplot(
    data=first_last_df,
    x='post_analysis_date',
    y='average_error_of_individual_prediction_minutes',
    hue='race_id',
)

In [None]:
# individual_interval_prediction_is_wrong over time, one line per race
sns.lineplot(
    data=first_last_df,
    x='post_analysis_date',
    y='individual_interval_prediction_is_wrong',
    hue='race_id',
)

In [None]:
# relay_interval_prediction_wrong over time, one line per race
sns.lineplot(
    data=first_last_df,
    x='post_analysis_date',
    y='relay_interval_prediction_wrong',
    hue='race_id',
)

In [None]:
def _diff_from_first_to_last(col):
    """Display how ``col`` changed between each race's first and last value.

    Reads the module-level ``clean_df``, groups it by ``race_id`` and takes
    the 'first' and 'last' aggregate of ``col`` per race, so the result
    depends on the row order of ``clean_df``. Two things are displayed: a
    string with the mean difference across races (three decimals) and a
    table of ``race_id`` / ``diff`` sorted by ``diff`` ascending.

    Parameters
    ----------
    col : str
        Name of the ``clean_df`` column to compare. The column must support
        subtraction.

    Returns
    -------
    None
        Output is produced through ``display`` only.
    """
    result = clean_df.groupby('race_id').agg(
        first_time=(col, 'first'),
        last_time=(col, 'last')).reset_index()

    result['diff'] = result['last_time'] - result['first_time']
    # Double quotes inside the single-quoted f-string: reusing the outer quote
    # character in a replacement field is a SyntaxError before Python 3.12.
    display(f'{col}: {result["diff"].mean():.3f}')
    display(result[['race_id', 'diff']].sort_values('diff'))





In [None]:
# First-to-last change per race for the average individual-prediction error
_diff_from_first_to_last('average_error_of_individual_prediction_minutes')

In [None]:
# First-to-last change per race for individual_interval_prediction_is_wrong
_diff_from_first_to_last('individual_interval_prediction_is_wrong')
#sns.lineplot(data=clean_df, x='post_analysis_start_time', y='individual_interval_prediction_is_wrong', hue='race_id')

In [None]:
# First-to-last change per race for relay_interval_prediction_wrong
_diff_from_first_to_last('relay_interval_prediction_wrong')

In [None]:
# Select columns with numeric data types
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# The grouping is the same for every column, so build it once
races = clean_df.groupby('race_id')

# For each numeric column, display the mean change between every race's
# first and last value in clean_df
for col in numeric_cols:
    result = races.agg(
        first_time=(col, 'first'),
        last_time=(col, 'last')).reset_index()

    result['diff'] = result['last_time'] - result['first_time']
    # Double quotes inside the single-quoted f-string: reusing the outer quote
    # character in a replacement field is a SyntaxError before Python 3.12.
    display(f'{col}: {result["diff"].mean():.3f}')
