In [2]:
import os
import json
import csv
from collections import Counter
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
import numpy as np


ModuleNotFoundError: No module named 'sklearn'

## Read files

In [32]:
def generate_file_dicts(dir):
    json_files = {}
    sorted_filenames = sorted(os.listdir(dir), key= lambda x: len(x))
    for filename in sorted_filenames:
        if filename.endswith(".json"):
            key_name = filename.replace('.json', '')
            file_path = os.path.join(dir, filename)
            json_files[key_name] = file_path
    
    return json_files       

In [33]:
json_dir = 'data'
json_files = generate_file_dicts(json_dir)
json_files

{'FGA': 'data\\FGA.json',
 'Purity': 'data\\Purity.json',
 'Staging': 'data\\Staging.json',
 'Subtyping': 'data\\Subtyping.json',
 'Purity-FGA': 'data\\Purity-FGA.json',
 'Staging-FGA': 'data\\Staging-FGA.json',
 'Subtyping-FGA': 'data\\Subtyping-FGA.json',
 'Staging-Purity': 'data\\Staging-Purity.json',
 'Subtyping-Purity': 'data\\Subtyping-Purity.json',
 'Subtyping-Staging': 'data\\Subtyping-Staging.json',
 'Staging-Purity-FGA': 'data\\Staging-Purity-FGA.json',
 'Subtyping-Purity-FGA': 'data\\Subtyping-Purity-FGA.json',
 'Subtyping-Staging-FGA': 'data\\Subtyping-Staging-FGA.json',
 'Subtyping-Staging-Purity': 'data\\Subtyping-Staging-Purity.json',
 'Subtyping-Staging-Purity-FGA': 'data\\Subtyping-Staging-Purity-FGA.json'}

## Regression

In [34]:
def calculate_signed_error(json_file, target):
    with open(json_file, 'r') as f:
        data = json.load(f)

    errors = []

    for _, value in data.items():
        pred = value.get(target, {}).get('pred')
        label = value.get(target, {}).get('label')
        if pred is None or label is None:
            continue
        error = round((pred - label), 3)
        errors.append(error)   
    return errors

def calculate_abs_error(json_file, target):
    with open(json_file, 'r') as f:
        data = json.load(f)

    abs_error = []

    for _, value in data.items():
        pred = value.get(target, {}).get('pred')
        label = value.get(target, {}).get('label')
        if pred is None or label is None:
            continue
        error = round(abs(pred - label), 3)
        abs_error.append(error)   
    return abs_error


In [35]:
def numerical_task(classifiers, error_type):
    purity_errors = {}
    fga_errors = {}
    
    if (error_type == 'abs'):
        for classifier_name, json_file in classifiers.items():
            if 'Purity' in classifier_name:
                purity_errors[classifier_name] = calculate_abs_error(json_file, "purity")
            if 'FGA' in classifier_name:
                fga_errors[classifier_name] = calculate_abs_error(json_file, "FRACTION_GENOME_ALTERED")
    
    elif (error_type == 'signed'):
        for classifier_name, json_file in classifiers.items():
            if 'Purity' in classifier_name:
                purity_errors[classifier_name] = calculate_signed_error(json_file, "purity")
            if 'FGA' in classifier_name:
                fga_errors[classifier_name] = calculate_signed_error(json_file, "FRACTION_GENOME_ALTERED")    

    return pd.DataFrame(purity_errors), pd.DataFrame(fga_errors)

In [36]:
purity_errors, fga_errors = numerical_task(json_files, 'abs')
purity_errors.to_csv('calculation/abs-errors/purity.csv', index_label='Sample')
fga_errors.to_csv('calculation/abs-errors/fga.csv', index_label='Sample')

In [37]:
purity_errors, fga_errors = numerical_task(json_files, 'signed')
purity_errors.to_csv('calculation/signed-errors/purity.csv', index_label='Sample')
fga_errors.to_csv('calculation/signed-errors/fga.csv', index_label='Sample')

## Classification

### Staging

In [38]:
def report_accuracy(json_file, target):
    with open(json_file, 'r') as f:
        data = json.load(f)

    labels = []
    preds = []
    unique_labels = set()

    for _, value in data.items():
        pred = value.get(target, {}).get('pred')
        label = value.get(target, {}).get('label')
        if pred is None or label is None:
            continue
        labels.append(label)
        preds.append(pred)   
        unique_labels.add(label)
    
    return labels, preds

In [97]:
def categorical_task_staging(classifiers):
    staging_labels = []
    staging_preds = []
    classification_reports = {}
    confusion_matrices = {}
    results = []

    for classifier_name, json_file in classifiers.items():
        if 'Staging' in classifier_name:
            staging_labels, staging_preds = report_accuracy(json_file, "AJCC_PATHOLOGIC_TUMOR_STAGE_reduced")
            
            classification_reports[classifier_name] = classification_report(staging_labels, staging_preds, target_names=['Early Stage', 'Late Stage'])
            confusion_matrices[classifier_name] = confusion_matrix(staging_labels, staging_preds, labels=['Early Stage', 'Late Stage']).tolist()
            print(confusion_matrix(staging_labels, staging_preds))

            f1_macro = f1_score(staging_labels, staging_preds, average="macro")
            f1_scores_per_class = f1_score(staging_labels, staging_preds, average=None)
            f1_std_dev = np.std(f1_scores_per_class)
            precision = precision_score(staging_labels, staging_preds, average="weighted")
            recall = recall_score(staging_labels, staging_preds, average='weighted')

            results.append({
                'Model': classifier_name,  
                'F1 Score': round(f1_macro, 2),
                'F1 StdDev': round(f1_std_dev, 2), 
                'Precision': round(precision, 2),
                'Recall': round(recall, 2)
            })

    value_counts = Counter(staging_labels)
    for value, count in value_counts.items():
        print(f'{value}: {count}')

    print()
        
    return classification_reports, confusion_matrices, results

In [98]:
staging_reports, staging_matrices, results = categorical_task_staging(json_files)

staging_csv = 'calculation\classification\staging.csv'
with open(staging_csv, mode='w', newline='\n') as file:
    writer = csv.DictWriter(file, fieldnames=['Model', 'F1 Score', 'F1 StdDev', 'Precision', 'Recall', 'FPR', 'TPR', 'Threshold', 'AUC'])
    writer.writeheader()
    for row in results:
        writer.writerow(row)
    
json_file = 'calculation\classification\confusion_staging.json'
with open(json_file, "w") as file:
    for i, (model_name, matrix) in enumerate(staging_matrices.items()):
        if i > 0:
            file.write("\n")
        json.dump({model_name: matrix}, file)



for classifier_name, json_file in json_files.items():
    if 'Staging' in classifier_name:
        print(classifier_name + "\n")
        print(staging_reports[classifier_name])
        print(staging_matrices[classifier_name])

[[241  17]
 [ 43   6]]
[[257   1]
 [ 47   2]]
[[233  25]
 [ 40   9]]
[[227  31]
 [ 42   7]]
[[249   9]
 [ 47   2]]
[[229  29]
 [ 41   8]]
[[243  15]
 [ 45   4]]
[[239  19]
 [ 41   8]]
Early Stage: 258
Late Stage: 49

Staging

              precision    recall  f1-score   support

 Early Stage       0.85      0.93      0.89       258
  Late Stage       0.26      0.12      0.17        49

    accuracy                           0.80       307
   macro avg       0.55      0.53      0.53       307
weighted avg       0.75      0.80      0.77       307

[[241, 17], [43, 6]]
Staging-FGA

              precision    recall  f1-score   support

 Early Stage       0.85      1.00      0.91       258
  Late Stage       0.67      0.04      0.08        49

    accuracy                           0.84       307
   macro avg       0.76      0.52      0.50       307
weighted avg       0.82      0.84      0.78       307

[[257, 1], [47, 2]]
Staging-Purity

              precision    recall  f1-score   supp

### Subtyping

In [99]:
def categorical_task_subtyping(classifiers):
    subtyping_labels = []
    subtyping_preds = []
    classification_reports = {}
    confusion_matrices = {}
    results = []
    for classifier_name, json_file in classifiers.items():
        if 'Subtyping' in classifier_name:
            subtyping_labels, subtyping_preds = report_accuracy(json_file, "lung-cancer-subtyping")
            classification_reports[classifier_name] = classification_report(subtyping_labels, subtyping_preds, target_names=['normal', 'luad', 'lusc'])
            confusion_matrices[classifier_name] = confusion_matrix(subtyping_labels, subtyping_preds, labels=['normal', 'luad', 'lusc']).tolist()

            f1_macro = f1_score(subtyping_labels, subtyping_preds, average="macro")
            f1_scores_per_class = f1_score(subtyping_labels, subtyping_preds, average=None)
            f1_std_dev = np.std(f1_scores_per_class)
            precision = precision_score(subtyping_labels, subtyping_preds, average="weighted")
            recall = recall_score(subtyping_labels, subtyping_preds, average='weighted')

            results.append({
                'Model': classifier_name,  
                'F1 Score': round(f1_macro, 2),
                'F1 StdDev': round(f1_std_dev, 2), 
                'Precision': round(precision, 2),
                'Recall': round(recall, 2)
            })

    value_counts = Counter(subtyping_labels)
    for value, count in value_counts.items():
        print(f'{value}: {count}')

    print()       

    return classification_reports, confusion_matrices, results

In [100]:
subtyping_reports, subtyping_matrices, results = categorical_task_subtyping(json_files)
subtyping_csv = 'calculation\classification\subtyping.csv'
with open(subtyping_csv, mode='w', newline='\n') as file:
    writer = csv.DictWriter(file, fieldnames=['Model', 'F1 Score', 'F1 StdDev', 'Precision', 'Recall'])
    writer.writeheader()
    for row in results:
        writer.writerow(row)

json_file = 'calculation\classification\confusion_subtyping.json'
with open(json_file, "w") as file:
    for i, (model_name, matrix) in enumerate(subtyping_matrices.items()):
        if i > 0:
            file.write("\n")
        json.dump({model_name: matrix}, file)

for classifier_name, json_file in json_files.items():
    if 'Subtyping' in classifier_name:
        print(classifier_name + "\n")
        print(subtyping_reports[classifier_name])
        print(subtyping_matrices[classifier_name])

lusc: 166
normal: 92
luad: 146

Subtyping

              precision    recall  f1-score   support

      normal       0.72      0.82      0.77       146
        luad       0.85      0.72      0.78       166
        lusc       0.88      0.95      0.91        92

    accuracy                           0.80       404
   macro avg       0.82      0.83      0.82       404
weighted avg       0.81      0.80      0.80       404

[[87, 4, 1], [7, 119, 20], [5, 42, 119]]
Subtyping-FGA

              precision    recall  f1-score   support

      normal       0.71      0.82      0.76       146
        luad       0.86      0.71      0.78       166
        lusc       0.87      0.93      0.90        92

    accuracy                           0.80       404
   macro avg       0.81      0.82      0.81       404
weighted avg       0.81      0.80      0.80       404

[[86, 5, 1], [8, 120, 18], [5, 43, 118]]
Subtyping-Purity

              precision    recall  f1-score   support

      normal       0.77  