In [1]:
import glob
import os
import pickle
import sys

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.cm import ScalarMappable

# The project helper package lives outside the notebook directory; extend the
# module search path before importing from it.
sys.path.append("../../Utils")
from loaders import HNSCCFeatureHandler

# Input locations, all relative to the notebook's working directory.
METADATA_PATH = "../../Supplementary_Tables/ST1/RAW_HNSCC_METADATA_NEW.csv"
VALID_IDS_PATH = "../../Utils/Lists/cv_ids.txt"
HOLD_IDS_PATH = '../../Utils/Lists/holdout_ids.txt'

# Build the project feature handler and run its steps in order:
# load -> z-score normalize -> merge with metadata -> batch correct.
# NOTE(review): the meaning of the positional arguments `0, 4` is defined by
# HNSCCFeatureHandler.load_feature_to_dataframe and is not visible here —
# presumably column indices in the TSV files; confirm against `loaders`.
# NOTE(review): `z` is assigned twice, so only the batch_correct() result is
# kept; neither `hc_data` nor `z` is read by the cells visible in this section.
hc_data = HNSCCFeatureHandler(METADATA_PATH, VALID_IDS_PATH, HOLD_IDS_PATH)
_ = hc_data.load_feature_to_dataframe("../../Data/*.hg38.frag.interval_mds.tsv", 0, 4)
z = hc_data.normalize_zscore()
_ = hc_data.merge_feature_metadata()
z = hc_data.batch_correct()

def get_tumor_fraction(id):
    """Read one sample's tumor fraction from its params file.

    The value is taken from ``../../Data/{id}/{id}.params.txt``: the second
    tab-separated field of the second line, parsed as a float.
    (Presumably the first line is a header row — TODO confirm against the
    tool that writes these files.)

    Args:
        id: Sample identifier, used as both the directory name and the
            file-name prefix.

    Returns:
        The tumor fraction as a float, or ``None`` when the file cannot be
        read, has fewer than two lines or two fields, or the field is not
        numeric. A message naming the file is printed in those cases.
    """
    file_path = f"../../Data/{id}/{id}.params.txt"
    try:
        with open(file_path, 'r') as file:
            # Only the second line is needed, so avoid loading the whole file.
            file.readline()
            second_line = file.readline().strip()
        # A missing second line yields '' here, which fails the [1] lookup
        # below and is reported like any other malformed file.
        return float(second_line.split('\t')[1])
    except (OSError, IndexError, ValueError) as e:
        # Catch only the expected failure modes (unreadable file, too few
        # lines/fields, non-numeric value) so unrelated programming errors are
        # not silently swallowed. Include the path so the failing sample can be
        # identified from the notebook output.
        print(f"Error reading tumor fraction from {file_path}: {e}")
        return None

In [2]:
# Sample IDs to look up, one per line.
valid_ids_file = "../../Utils/Lists/valid_ids.txt"
with open(valid_ids_file, 'r') as file:
    # Skip blank lines (e.g. a trailing newline) so no lookup is attempted
    # for an empty ID.
    valid_ids = [line.strip() for line in file if line.strip()]

# Map sample ID -> tumor fraction, keeping only samples for which a value
# could be read (get_tumor_fraction returns None otherwise).
# The loop variable is deliberately not named `id`: at notebook scope that
# would rebind the builtin `id` for every later cell.
tumor_fractions = {}
for sample_id in valid_ids:
    tumor_fraction = get_tumor_fraction(sample_id)
    if tumor_fraction is not None:
        tumor_fractions[sample_id] = tumor_fraction

# One row per sample, indexed by sample ID.
df = pd.DataFrame(list(tumor_fractions.items()), columns=['ID', 'Tumor Fraction'])
df = df.set_index('ID')

print(df)

           Tumor Fraction
ID                       
Pilot2_3          0.00000
Pilot2_4          0.04471
Pilot2_5          0.00000
Pilot2_6          0.02241
Pilot2_7          0.03978
...                   ...
Pilot_252         0.00000
Pilot_253         0.06985
Pilot_254         0.18080
Pilot_292         0.00000
Pilot_293         0.02069

[176 rows x 1 columns]


In [3]:
# Keep only the metadata columns needed to attach patient and response
# information to each sample.
metadata_df = pd.read_csv(METADATA_PATH).loc[
    :, ['ID', 'Patient Number', 'Treatment Response']
]

# Left join: every sample in `df` (indexed by sample ID) is kept and matched
# against the metadata 'ID' column. The result carries the metadata frame's
# row labels and gains an 'ID' column.
# NOTE: this overwrites `df`, so re-run the previous cell before re-running
# this one.
df = pd.merge(df, metadata_df, how='left', left_index=True, right_on='ID')

print(df)

     Tumor Fraction         ID  Patient Number Treatment Response
0           0.00000   Pilot2_3               1      Non-Responder
1           0.04471   Pilot2_4               1      Non-Responder
2           0.00000   Pilot2_5               1      Non-Responder
3           0.02241   Pilot2_6               2      Non-Responder
4           0.03978   Pilot2_7               2      Non-Responder
..              ...        ...             ...                ...
138         0.00000  Pilot_252              52      Non-Responder
139         0.06985  Pilot_253              52      Non-Responder
140         0.18080  Pilot_254              52      Non-Responder
178         0.00000  Pilot_292              66          Responder
179         0.02069  Pilot_293              66          Responder

[176 rows x 4 columns]


In [7]:
# Collapse the per-sample values to one mean tumor fraction per patient.
grouped_df = df.groupby('Patient Number', as_index=False)['Tumor Fraction'].mean()

# Split patients at the cohort median of those per-patient means; a patient
# exactly at the median falls in the low group (strict '>').
median_tf = grouped_df['Tumor Fraction'].median()
grouped_df['Predicted Treatment Response'] = (
    grouped_df['Tumor Fraction']
    .gt(median_tf)
    .map({True: 'High Tumor Fraction', False: 'Low Tumor Fraction'})
)

In [8]:
# Plain dict: patient number -> 'High Tumor Fraction' / 'Low Tumor Fraction'.
predicted = (
    grouped_df
    .set_index('Patient Number')
    .loc[:, 'Predicted Treatment Response']
    .to_dict()
)

In [11]:
# Persist the patient -> tumor-fraction-group mapping so it can be reused
# outside this notebook. `pickle` is imported with the other modules in the
# notebook's first cell.
with open('tumor_fraction_from_median.pkl', 'wb') as f:
    pickle.dump(predicted, f)