In [1]:
import os
import json
import pandas as pd

In [2]:
def generate_file_dicts(dir):
    """Map every ``.json`` file directly inside ``dir`` to its path.

    Args:
        dir: Directory to scan (not recursive).

    Returns:
        dict: ``{filename without the trailing ".json": os.path.join(dir, filename)}``
        for each entry whose name ends with ``.json``. Keys are inserted
        shortest filename first; names of equal length are ordered
        alphabetically.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``dir`` does not exist).
    """
    suffix = ".json"
    # os.listdir returns entries in arbitrary order, so break length ties by
    # name to keep the resulting key order reproducible across runs/machines.
    sorted_filenames = sorted(os.listdir(dir), key=lambda name: (len(name), name))

    json_files = {}
    for filename in sorted_filenames:
        if filename.endswith(suffix):
            # Strip only the trailing extension: str.replace would also remove
            # any earlier ".json" inside the name and could make keys collide.
            key_name = filename[:-len(suffix)]
            json_files[key_name] = os.path.join(dir, filename)

    return json_files

In [4]:
def _iter_pred_label_pairs(json_file, target):
    """Yield ``(pred, label)`` for each entry of ``json_file`` that has both.

    The file is expected to hold a JSON object whose values are objects;
    each of those may contain ``target`` mapping to an object with ``'pred'``
    and ``'label'`` keys. Entries where the target, the prediction or the
    label is missing (or null) are skipped.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for value in data.values():
        # `or {}` also covers a target explicitly stored as null, which a
        # plain .get(target, {}) would hand back as None and then crash on.
        target_entry = value.get(target) or {}
        pred = target_entry.get('pred')
        label = target_entry.get('label')
        if pred is None or label is None:
            continue
        yield pred, label


def calculate_error(json_file, target):
    """Return the signed errors ``pred - label`` for ``target``.

    Args:
        json_file: Path to a JSON file of per-entry predictions.
        target: Key of the task to read inside each entry.

    Returns:
        list: One value per entry that has both a prediction and a label,
        in file order, each rounded to 3 decimals.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    return [
        round((pred - label), 3)
        for pred, label in _iter_pred_label_pairs(json_file, target)
    ]


def calculate_abs_error(json_file, target):
    """Return the absolute errors ``|pred - label|`` for ``target``.

    Args:
        json_file: Path to a JSON file of per-entry predictions.
        target: Key of the task to read inside each entry.

    Returns:
        list: One value per entry that has both a prediction and a label,
        in file order, each rounded to 3 decimals.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    return [
        round(abs(pred - label), 3)
        for pred, label in _iter_pred_label_pairs(json_file, target)
    ]


In [6]:
def numerical_task_error(classifiers):
    """Build per-classifier error tables for the purity and FGA tasks.

    Args:
        classifiers: Mapping of classifier name -> path of its JSON results
            file. Names containing ``'Purity'`` are evaluated on the
            ``"purity"`` target, names containing ``'FGA'`` on
            ``"FRACTION_GENOME_ALTERED"``; a name containing both lands in
            both tables, and other names are ignored.

    Returns:
        tuple: ``(purity_df, fga_df)`` — two DataFrames with one column per
        matching classifier and one row per position in that classifier's
        error list. Columns shorter than the longest one are padded with NaN.
    """
    def to_frame(errors_by_classifier):
        # calculate_error drops entries lacking a pred/label, so the lists can
        # differ in length; a dict of raw lists would make pd.DataFrame raise
        # ValueError. Series are aligned on their index and NaN-padded instead.
        return pd.DataFrame({
            name: pd.Series(errors, dtype='float64')
            for name, errors in errors_by_classifier.items()
        })

    purity_errors = {}
    fga_errors = {}

    for classifier_name, json_file in classifiers.items():
        if 'Purity' in classifier_name:
            purity_errors[classifier_name] = calculate_error(json_file, "purity")
        if 'FGA' in classifier_name:
            fga_errors[classifier_name] = calculate_error(json_file, "FRACTION_GENOME_ALTERED")

    return to_frame(purity_errors), to_frame(fga_errors)

In [7]:
def categorical_task_error(classifiers):
    """Collect per-entry misclassification flags for the categorical tasks.

    NOTE(review): the metric intended here is not visible in this notebook.
    This implementation assumes the results files share the layout read by
    ``calculate_error`` (``{entry: {target: {'pred': ..., 'label': ...}}}``)
    and that the JSON target key equals the substring matched in the
    classifier name — confirm both against the actual data.

    Args:
        classifiers: Mapping of classifier name -> path of its JSON results
            file. Names containing ``'lung-cancer-subtyping'`` feed the first
            result, names containing
            ``'AJCC_PATHOLOGIC_TUMOR_STAGE_reduced'`` the second; other names
            are ignored.

    Returns:
        tuple: ``(subtyping, staging)`` — two dicts mapping classifier name to
        a list with one flag per entry that has both a prediction and a
        label: ``1`` when they differ, ``0`` when they match. Both dicts are
        empty when no classifier name matches.

    Raises:
        OSError: If a matching results file cannot be opened.
        json.JSONDecodeError: If a matching results file is not valid JSON.
    """
    def misclassification_flags(json_file, target):
        with open(json_file, 'r') as f:
            data = json.load(f)

        flags = []
        for value in data.values():
            # `or {}` treats a missing or null target the same way.
            entry = value.get(target) or {}
            pred = entry.get('pred')
            label = entry.get('label')
            if pred is None or label is None:
                continue
            flags.append(int(pred != label))
        return flags

    # Plain dicts keyed by classifier name: the classifiers mapping only holds
    # file paths, so the results have to be read from the files themselves.
    subtyping = {}
    staging = {}

    for classifier_name, json_file in classifiers.items():
        if 'lung-cancer-subtyping' in classifier_name:
            subtyping[classifier_name] = misclassification_flags(
                json_file, 'lung-cancer-subtyping')
        if 'AJCC_PATHOLOGIC_TUMOR_STAGE_reduced' in classifier_name:
            staging[classifier_name] = misclassification_flags(
                json_file, 'AJCC_PATHOLOGIC_TUMOR_STAGE_reduced')

    return subtyping, staging

In [9]:
# Run the categorical-task pass over every JSON file found under 'data'.
# The bare name on the last line makes the notebook display the subtyping result.
subtyping, staging = categorical_task_error(
    classifiers=generate_file_dicts(dir='data'),
)
subtyping

{}

In [None]:
# Compute the per-classifier numerical errors and export them as CSV files.
json_dir = 'data'
json_files = generate_file_dicts(json_dir)

purity_errors, fga_errors = numerical_task_error(json_files)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('error', exist_ok=True)

# NOTE(review): the index written under 'Sample' is the positional row index
# of each error list, not an identifier taken from the JSON — confirm that
# this is what downstream consumers expect.
purity_errors.to_csv('error/purity.csv', index_label='Sample')
fga_errors.to_csv('error/fga.csv', index_label='Sample')