In [40]:
import pandas as pd
import numpy as np

# Load somatic mutation data
mut_df = pd.read_csv(
    "../ucec_tcga_pan_can_atlas_2018/data_mutations.txt",
    sep="\t",
    comment="#",   # skip header comments if present
    low_memory=False
)

# Check columns available
print(mut_df.columns[:20])  

# Standard TCGA mutation files use Hugo_Symbol (gene) and Tumor_Sample_Barcode (sample ID)
gene_col = "Hugo_Symbol"
sample_col = "Tumor_Sample_Barcode"

# Create binary mutation matrix (1 if gene mutated in sample, else 0)
binary_mut_matrix = (
    mut_df[[gene_col, sample_col]]
    .drop_duplicates()
    .assign(MUT=1)
    .pivot(index=sample_col, columns=gene_col, values="MUT")
    .fillna(0)
    .astype(int)
)

print(binary_mut_matrix.shape)  # samples × genes
print(binary_mut_matrix.head())


Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome',
       'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Tumor_Sample_Barcode',
       'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1',
       'Match_Norm_Seq_Allele2'],
      dtype='object')
(515, 21737)
Hugo_Symbol           A1BG  A1BG-AS1  A1CF  A2M  A2M-AS1  A2ML1  A2ML1-AS1  \
Tumor_Sample_Barcode                                                         
TCGA-2E-A9G8-01          0         0     0    0        0      0          0   
TCGA-4E-A92E-01          0         0     0    0        0      0          0   
TCGA-5B-A90C-01          0         0     0    0        0      0          0   
TCGA-5S-A9Q8-01          0         0     0    0        0      0          0   
TCGA-A5-A0G1-01          1         0     0    1        0      1          0

In [9]:
data_timeline_treatment_df = pd.read_csv(
    "../ucec_tcga_pan_can_atlas_2018/data_timeline_treatment.txt",
    sep="\t",
    comment="#",   # skip header comments if present
    low_memory=False
)

def find_recurrence_in_values(df):
    results = {}
    for col in df.columns:
        count = df[col].astype(str).str.lower().str.contains("recurrence", na=False).sum()
        if count > 0:  # only print columns where recurrence actually shows up
            results[col] = count
            print(f"Column: {col}, Count of 'recurrence': {count}")
    return results

find_recurrence_in_values(data_timeline_treatment_df)


Column: REGIMEN_INDICATION, Count of 'recurrence': 56
Column: ANATOMIC_TREATMENT_SITE, Count of 'recurrence': 26


{'REGIMEN_INDICATION': np.int64(56), 'ANATOMIC_TREATMENT_SITE': np.int64(26)}

In [12]:
def print_value_counts(df, column_name):
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in DataFrame.")
        return
    
    value_counts = df[column_name].value_counts(dropna=False)  # includes NaN
    for value, count in value_counts.items():
        print(f"{value}: {count}")

print_value_counts(data_timeline_treatment_df, "ANATOMIC_TREATMENT_SITE")
print()
print_value_counts(data_timeline_treatment_df, "REGIMEN_INDICATION")


nan: 435
Primary Tumor Field: 170
Regional Site: 75
Distant Recurrence: 14
Local Recurrence: 12
Distant Site: 3

Adjuvant: 343
nan: 265
Recurrence: 56
Progression: 24
Other, Specify In Notes: 13
Palliative: 8


In [23]:
num_unique = data_timeline_treatment_df.iloc[:, 0].nunique()
print(f"Number of distinct values in first column: {num_unique}")


Number of distinct values in first column: 278


In [19]:
def count_recurrence_rows(df):
    # Make everything lowercase string for safe matching
    df_str = df.astype(str).apply(lambda col: col.str.lower())

    # Check each cell for 'recurrence'
    recurrence_mask = df_str.apply(lambda col: col.str.contains("recurrence", na=False))

    # Count how many columns per row contain 'recurrence'
    recurrence_counts = recurrence_mask.sum(axis=1)

    # Total number of rows with at least one 'recurrence'
    total_rows_with_recurrence = (recurrence_counts > 0).sum()
    total_rows_with_2_recurrence = (recurrence_counts > 1).sum()

    print(f"Total rows with at least one 'recurrence': {total_rows_with_recurrence}")
    print(f"Total rows with at 2 'recurrence': {total_rows_with_2_recurrence}")
    return recurrence_counts

recurrence_counts = count_recurrence_rows(data_timeline_treatment_df)


Total rows with at least one 'recurrence': 73
Total rows with at 2 'recurrence': 9


In [None]:
def count_recurrence_overlap(df, col1, col2):
    col1_contains = df[col1].astype(str).str.strip().str.lower().str.contains("recurrence", na=False)
    col2_contains = df[col2].astype(str).str.strip().str.lower().str.contains("recurrence", na=False)

    both = col1_contains & col2_contains
    only_col1 = col1_contains & ~col2_contains
    only_col2 = col2_contains & ~col1_contains

    print(f"Rows with recurrence in BOTH {col1} and {col2}: {both.sum()}")
    print(f"Rows with recurrence ONLY in {col1}: {only_col1.sum()}")
    print(f"Rows with recurrence ONLY in {col2}: {only_col2.sum()}")
    print(f"Rows with recurrence in EITHER column: {(both | only_col1 | only_col2).sum()}")

count_recurrence_overlap(data_timeline_treatment_df, "ANATOMIC_TREATMENT_SITE", "REGIMEN_INDICATION")


Rows with recurrence in BOTH ANATOMIC_TREATMENT_SITE and REGIMEN_INDICATION: 9
Rows with recurrence ONLY in ANATOMIC_TREATMENT_SITE: 17
Rows with recurrence ONLY in REGIMEN_INDICATION: 47
Rows with recurrence in EITHER column: 73


In [22]:
# Just used for looking at the data for the labels
pair_counts = data_timeline_treatment_df.groupby(["ANATOMIC_TREATMENT_SITE", 'REGIMEN_INDICATION'], dropna=False).size().reset_index(name='Count')

# Print the pairings and the count
print(pair_counts)



   ANATOMIC_TREATMENT_SITE       REGIMEN_INDICATION  Count
0       Distant Recurrence  Other, Specify In Notes      2
1       Distant Recurrence               Palliative      6
2       Distant Recurrence              Progression      3
3       Distant Recurrence               Recurrence      2
4       Distant Recurrence                      NaN      1
5             Distant Site               Palliative      1
6             Distant Site                      NaN      2
7         Local Recurrence                 Adjuvant      3
8         Local Recurrence               Palliative      1
9         Local Recurrence              Progression      1
10        Local Recurrence               Recurrence      7
11     Primary Tumor Field                 Adjuvant    100
12     Primary Tumor Field               Recurrence      2
13     Primary Tumor Field                      NaN     68
14           Regional Site                 Adjuvant     52
15           Regional Site  Other, Specify In Notes     

In [24]:
def count_unique_recurrence_patients(df, id_col="PATIENT_ID"):
    # Normalize strings
    df_str = df.astype(str).apply(lambda col: col.str.strip().str.lower())

    # Masks for recurrence conditions
    anat_mask = df_str["ANATOMIC_TREATMENT_SITE"].isin(["local recurrence", "distant recurrence"])
    regimen_mask = df_str["REGIMEN_INDICATION"] == "recurrence"

    # Get patient IDs
    anat_patients = set(df.loc[anat_mask, id_col])
    regimen_patients = set(df.loc[regimen_mask, id_col])

    # Union of both sets
    all_recurrence_patients = anat_patients | regimen_patients

    print(f"Unique patients with recurrence in ANATOMIC_TREATMENT_SITE: {len(anat_patients)}")
    print(f"Unique patients with recurrence in REGIMEN_INDICATION: {len(regimen_patients)}")
    print(f"Total unique patients with recurrence in either: {len(all_recurrence_patients)}")

    return anat_patients, regimen_patients, all_recurrence_patients


anat_patients, regimen_patients, all_recurrence_patients = count_unique_recurrence_patients(data_timeline_treatment_df)


Unique patients with recurrence in ANATOMIC_TREATMENT_SITE: 18
Unique patients with recurrence in REGIMEN_INDICATION: 29
Total unique patients with recurrence in either: 39


In [37]:
def get_recurrence_patients_from_timeline_treatment(filepath, id_col="PATIENT_ID"):
    # Load file
    df = pd.read_csv(filepath, sep="\t", comment="#", low_memory=False)

    # Masks for recurrence conditions
    anat_mask = df["ANATOMIC_TREATMENT_SITE"].isin(["Local Recurrence", "Distant Recurrence"])
    regimen_mask = df["REGIMEN_INDICATION"] == "Recurrence"
    
    # Collect patient IDs
    anat_patients = set(df.loc[anat_mask, id_col])
    regimen_patients = set(df.loc[regimen_mask, id_col])
    
    # Union of both
    all_patients = anat_patients | regimen_patients
    return all_patients

def get_locoregional_recurrence_patients(filepath, id_col=0, status_col="STATUS"):
    # Load file
    df = pd.read_csv(filepath, sep="\t", comment="#", low_memory=False)

    # Normalize STATUS values for safe matching
    mask = df[status_col].astype(str).str.strip().str.lower() == "locoregional recurrence"

    # Grab patient IDs from the first column (id_col=0 by default)
    patient_ids = df.iloc[mask.values, id_col].unique().tolist()

    return patient_ids

timeline_treatment_recur = get_recurrence_patients_from_timeline_treatment("../ucec_tcga_pan_can_atlas_2018/data_timeline_treatment.txt")
timeline_status_recur = get_locoregional_recurrence_patients("../ucec_tcga_pan_can_atlas_2018/data_timeline_status.txt")

In [38]:
print(len(timeline_treatment_recur))
print(len(timeline_status_recur))

39
24


In [39]:
# Combine both lists and remove duplicates by converting to a set, then back to a list
recur_patients = list(set(timeline_treatment_recur) | set(timeline_status_recur))
print(f"Number of unique recurrence patients: {len(recur_patients)}")

Number of unique recurrence patients: 61


In [None]:
import pandas as pd
import numpy as np

# Load clinical data
clinical_df = pd.read_csv(
    "../ucec_tcga_pan_can_atlas_2018/data_clinical_patient.txt",
    sep="\t",
    comment="#",
    low_memory=False
)
clinical_df = clinical_df.set_index('PATIENT_ID')

def assign_label(row):
    """
    Given a row, assigns:
    1 for recurrence,
    0 for no recurrence,
    None if recurrence information is missing.
    Uses NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT first, then DFS_STATUS if missing.
    """
    if row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'Yes':
        return 1
    elif row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] == 'No':
        return 0
    elif pd.isna(row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT']):
        if row['DFS_STATUS'] == '1:Recurred/Progressed':
            return 1
        elif row['DFS_STATUS'] == '0:DiseaseFree':
            return 0
        else:
            return None

# Apply the labeling function
labels = clinical_df.apply(assign_label, axis=1)

# Get the patient IDs labeled as 1 (recurrence)
recurrence_patient_ids = labels[labels == 1].index.tolist()

print(f"Number of patients labeled as recurrence: {len(recurrence_patient_ids)}")


Number of patients labeled as recurrence: 84
Example patient IDs: ['TCGA-2E-A9G8', 'TCGA-5B-A90C', 'TCGA-A5-A0R6', 'TCGA-A5-A2K4', 'TCGA-A5-A7WK', 'TCGA-AJ-A23M', 'TCGA-AJ-A2QK', 'TCGA-AJ-A3BF', 'TCGA-AJ-A3BG', 'TCGA-AJ-A3BI']


In [42]:
# Convert both lists to sets for easy comparison
set_clinical = set(recurrence_patient_ids)
set_timeline = set(recur_patients)  # assuming recur_patients is your other list

# Overlap
overlap = set_clinical & set_timeline

# Unique to each
only_clinical = set_clinical - set_timeline
only_timeline = set_timeline - set_clinical

print(f"Number of patients in BOTH lists: {len(overlap)}")
print(f"Number of patients only in clinical_df list: {len(only_clinical)}")
print(f"Number of patients only in timeline list: {len(only_timeline)}")

# Optional: print example patient IDs
print(f"Example overlap IDs: {list(overlap)[:10]}")
print(f"Example only clinical IDs: {list(only_clinical)[:10]}")
print(f"Example only timeline IDs: {list(only_timeline)[:10]}")


Number of patients in BOTH lists: 48
Number of patients only in clinical_df list: 36
Number of patients only in timeline list: 13
Example overlap IDs: ['TCGA-EO-A3AY', 'TCGA-B5-A1MW', 'TCGA-EY-A1GQ', 'TCGA-2E-A9G8', 'TCGA-DI-A1NN', 'TCGA-AP-A0L8', 'TCGA-AJ-A3BF', 'TCGA-BS-A0TA', 'TCGA-BS-A0UM', 'TCGA-B5-A11X']
Example only clinical IDs: ['TCGA-BG-A0MT', 'TCGA-B5-A1MU', 'TCGA-D1-A179', 'TCGA-AX-A2IN', 'TCGA-AX-A2H7', 'TCGA-DF-A2L0', 'TCGA-KP-A3W4', 'TCGA-EO-A1Y7', 'TCGA-AX-A1CN', 'TCGA-DF-A2KS']
Example only timeline IDs: ['TCGA-A5-A0G3', 'TCGA-AP-A052', 'TCGA-BS-A0TE', 'TCGA-AX-A1C7', 'TCGA-BS-A0T9', 'TCGA-B5-A0K7', 'TCGA-AP-A0LL', 'TCGA-AP-A1DQ', 'TCGA-AX-A2HJ', 'TCGA-B5-A0JZ']


In [44]:
def compare_recurrence_labels(clinical_df, timeline_df, 
                              clinical_ids, timeline_ids,
                              clinical_id_col="PATIENT_ID",
                              status_col="NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT",
                              dfs_col="DFS_STATUS",
                              anat_col="ANATOMIC_TREATMENT_SITE",
                              regimen_col="REGIMEN_INDICATION"):
    """
    Compare clinical-based and timeline-based recurrence labels.
    
    Returns DataFrames of patients labeled only in one source and shows why.
    """
    # Convert lists to sets for comparison
    set_clinical = set(clinical_ids)
    set_timeline = set(timeline_ids)
    
    # Intersection and differences
    overlap = set_clinical & set_timeline
    only_clinical = set_clinical - set_timeline
    only_timeline = set_timeline - set_clinical
    
    print(f"Overlap patients: {len(overlap)}")
    print(f"Only in clinical labels: {len(only_clinical)}")
    print(f"Only in timeline labels: {len(only_timeline)}")
    
    # Examine why clinical-only patients were missing in timeline
    clinical_missing = timeline_df[timeline_df['PATIENT_ID'].isin(only_clinical)]
    
    # Examine why timeline-only patients were missing in clinical
    timeline_missing = clinical_df.loc[list(only_timeline), [status_col, dfs_col]]
    
    return {
        "overlap": overlap,
        "clinical_only": clinical_missing,
        "timeline_only": timeline_missing
    }

# Assuming:
# clinical_df has index PATIENT_ID and columns NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT, DFS_STATUS
# data_timeline_treatment_df is your timeline DataFrame
results = compare_recurrence_labels(
    clinical_df,
    data_timeline_treatment_df.reset_index(),  # need PATIENT_ID column, not index
    recurrence_patient_ids,
    recur_patients
)

print("Clinical-only patients (why missing in timeline):")
print(results["clinical_only"].head())

print("Timeline-only patients (why missing in clinical):")
print(results["timeline_only"].head())



Overlap patients: 48
Only in clinical labels: 36
Only in timeline labels: 13
Clinical-only patients (why missing in timeline):
    index    PATIENT_ID  START_DATE  STOP_DATE EVENT_TYPE     TREATMENT_TYPE  \
34     34  TCGA-A5-A0R6          82      183.0  Treatment       Chemotherapy   
35     35  TCGA-A5-A0R6         309      330.0  Treatment  Radiation Therapy   
85     85  TCGA-AJ-A2QK         643        NaN  Treatment       Chemotherapy   
86     86  TCGA-AJ-A2QK         653        NaN  Treatment       Chemotherapy   
87     87  TCGA-AJ-A2QK          48       62.0  Treatment  Radiation Therapy   

   TREATMENT_SUBTYPE        AGENT  NUMBER_OF_CYCLES PRESCRIBED_DOSE  ...  \
34               NaN   Paclitaxel               5.0             295  ...   
35               NaN  Radiation 1               NaN             NaN  ...   
85               NaN  Carboplatin               NaN             NaN  ...   
86               NaN   Paclitaxel               NaN             NaN  ...   
87          

In [45]:
# Convert your patient ID lists to sets
set_clinical = set(recurrence_patient_ids)   # patients labeled as recurrence from clinical
set_timeline = set(recur_patients)           # patients labeled as recurrence from timeline

# Patients labeled as recurrence in both sources
overlap = set_clinical & set_timeline

# Patients labeled as recurrence in only one source
only_clinical = set_clinical - set_timeline
only_timeline = set_timeline - set_clinical

# Print summary
print(f"Patients labeled as recurrence in BOTH files: {len(overlap)}")
print(f"Patients labeled as recurrence ONLY in clinical file: {len(only_clinical)}")
print(f"Patients labeled as recurrence ONLY in timeline file: {len(only_timeline)}")
print(f"Total unique patients labeled as recurrence in at least one file: {len(overlap | only_clinical | only_timeline)}")


Patients labeled as recurrence in BOTH files: 48
Patients labeled as recurrence ONLY in clinical file: 36
Patients labeled as recurrence ONLY in timeline file: 13
Total unique patients labeled as recurrence in at least one file: 97


In [None]:
# Convert patient ID lists to sets
set_clinical = set(recurrence_patient_ids)  # clinical recurrence labels
set_timeline = set(recur_patients)          # timeline recurrence labels

# Patients present in both datasets
patients_in_both = set(clinical_df.index) & set(data_timeline_treatment_df["PATIENT_ID"])

# Disagreements
clinical_only_disagree = (set_clinical - set_timeline) & patients_in_both
timeline_only_disagree = (set_timeline - set_clinical) & patients_in_both

print(f"Number of patients labeled recurrence in clinical but NOT in timeline: {len(clinical_only_disagree)}")
print(f"Number of patients labeled recurrence in timeline but NOT in clinical: {len(timeline_only_disagree)}")


Number of patients labeled recurrence in clinical but NOT in timeline: 23
Number of patients labeled recurrence in timeline but NOT in clinical: 13


In [61]:
import pandas as pd

# Make sure your timeline DataFrame has PATIENT_ID as a column (not index)
timeline_df = data_timeline_treatment_df.copy()
if "PATIENT_ID" not in timeline_df.columns:
    timeline_df = timeline_df.reset_index()

# Columns to display
clinical_cols = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"]
timeline_cols = ["ANATOMIC_TREATMENT_SITE", "REGIMEN_INDICATION"]

def create_disagreement_table(patient_ids, clinical_df, timeline_df):
    """
    Returns a DataFrame for the given patient_ids showing relevant columns
    from both clinical and timeline data.
    """
    # Clinical data (some columns may be missing, handle with get)
    clinical_part = clinical_df.loc[list(patient_ids), clinical_cols].copy()
    
    # Timeline data (merge on PATIENT_ID)
    timeline_part = timeline_df[timeline_df["PATIENT_ID"].isin(patient_ids)][["PATIENT_ID"] + timeline_cols].copy()
    
    # Merge clinical and timeline info
    merged = clinical_part.reset_index().merge(timeline_part, left_on="PATIENT_ID", right_on="PATIENT_ID", how="outer")
    
    return merged

# Clinical-only disagreements
clinical_only_table = create_disagreement_table(list(clinical_only_disagree), clinical_df, timeline_df)
print("Patients labeled recurrence in clinical but not in timeline:")
print(clinical_only_table)

# Timeline-only disagreements
timeline_only_table = create_disagreement_table(timeline_only_disagree, clinical_df, timeline_df)
print("\nPatients labeled recurrence in timeline but not in clinical:")
print(timeline_only_table)


Patients labeled recurrence in clinical but not in timeline:
      PATIENT_ID NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT  \
0   TCGA-A5-A0R6                                     Yes   
1   TCGA-A5-A0R6                                     Yes   
2   TCGA-AJ-A2QK                                     Yes   
3   TCGA-AJ-A2QK                                     Yes   
4   TCGA-AJ-A2QK                                     Yes   
..           ...                                     ...   
60  TCGA-FI-A2EX                                     Yes   
61  TCGA-FI-A2EX                                     Yes   
62  TCGA-FI-A2EX                                     Yes   
63  TCGA-FI-A2EY                                     Yes   
64  TCGA-FI-A2EY                                     Yes   

   ANATOMIC_TREATMENT_SITE REGIMEN_INDICATION  
0                      NaN           Adjuvant  
1      Primary Tumor Field           Adjuvant  
2                      NaN                NaN  
3                      NaN

In [62]:
import pandas as pd

# Make sure your timeline DataFrame has PATIENT_ID as a column (not index)
timeline_df = data_timeline_treatment_df.copy()
if "PATIENT_ID" not in timeline_df.columns:
    timeline_df = timeline_df.reset_index()

# Columns to display
clinical_cols = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"]
timeline_cols = ["ANATOMIC_TREATMENT_SITE", "REGIMEN_INDICATION"]

def create_disagreement_table(patient_ids, clinical_df, timeline_df):
    """
    Returns a DataFrame for the given patient_ids showing relevant columns
    from both clinical and timeline data.
    """
    # Clinical data (some columns may be missing, handle with get)
    clinical_part = clinical_df.loc[list(patient_ids), clinical_cols].copy()
    
    # Timeline data (merge on PATIENT_ID)
    timeline_part = timeline_df[timeline_df["PATIENT_ID"].isin(patient_ids)][["PATIENT_ID"] + timeline_cols].copy()
    
    # Merge clinical and timeline info
    merged = clinical_part.reset_index().merge(timeline_part, on="PATIENT_ID", how="outer")
    
    return merged

# Clinical-only disagreements
clinical_only_table = create_disagreement_table(list(clinical_only_disagree), clinical_df, timeline_df)
clinical_only_table.to_csv("clinical_only_disagreements.csv", index=False)
print("Saved clinical-only disagreements to 'clinical_only_disagreements.csv'")

# Timeline-only disagreements
timeline_only_table = create_disagreement_table(list(timeline_only_disagree), clinical_df, timeline_df)
timeline_only_table.to_csv("timeline_only_disagreements.csv", index=False)
print("Saved timeline-only disagreements to 'timeline_only_disagreements.csv'")


Saved clinical-only disagreements to 'clinical_only_disagreements.csv'
Saved timeline-only disagreements to 'timeline_only_disagreements.csv'


In [66]:
import pandas as pd

# Make sure your timeline DataFrame has PATIENT_ID as a column
timeline_df = data_timeline_treatment_df.copy()
if "PATIENT_ID" not in timeline_df.columns:
    timeline_df = timeline_df.reset_index()

# Columns to display
clinical_cols = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"]
timeline_cols = ["ANATOMIC_TREATMENT_SITE", "REGIMEN_INDICATION"]

def create_disagreement_table(patient_ids, clinical_df, timeline_df):
    """Return a merged, sorted, human-readable DataFrame for given patient_ids."""
    patient_ids = list(patient_ids)
    
    # Clinical data
    clinical_part = clinical_df.loc[patient_ids, clinical_cols].copy()
    
    # Timeline data
    timeline_part = timeline_df[timeline_df["PATIENT_ID"].isin(patient_ids)][["PATIENT_ID"] + timeline_cols].copy()
    
    # Merge
    merged = clinical_part.reset_index().merge(timeline_part, on="PATIENT_ID", how="outer")
    
    # Reorder and rename columns for readability
    merged = merged[["PATIENT_ID"] + clinical_cols + timeline_cols]
    merged = merged.rename(columns={
        "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT": "Clinical_New_Tumor_Event",
        "ANATOMIC_TREATMENT_SITE": "Timeline_Anatomic_Treatment_Site",
        "REGIMEN_INDICATION": "Timeline_Regimen_Indication"
    })
    
    # Sort by PATIENT_ID
    merged = merged.sort_values("PATIENT_ID").reset_index(drop=True)
    
    return merged

# Clinical-only disagreements
clinical_only_table = create_disagreement_table(list(clinical_only_disagree), clinical_df, timeline_df)
clinical_only_table.to_excel("clinical_only_disagreements.xlsx", index=False)
print("Saved clinical-only disagreements to 'clinical_only_disagreements.xlsx'")

# Timeline-only disagreements
timeline_only_table = create_disagreement_table(list(timeline_only_disagree), clinical_df, timeline_df)
timeline_only_table.to_excel("timeline_only_disagreements.xlsx", index=False)
print("Saved timeline-only disagreements to 'timeline_only_disagreements.xlsx'")


ModuleNotFoundError: No module named 'openpyxl'

In [68]:
import pandas as pd

# Ensure timeline DataFrame has PATIENT_ID as a column
timeline_df = data_timeline_treatment_df.copy()
if "PATIENT_ID" not in timeline_df.columns:
    timeline_df = timeline_df.reset_index()

# Columns to display
clinical_cols = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"]
timeline_cols = ["ANATOMIC_TREATMENT_SITE", "REGIMEN_INDICATION"]

def create_disagreement_table(patient_ids, clinical_df, timeline_df):
    """Return a merged, sorted, human-readable DataFrame for given patient_ids."""
    patient_ids = list(patient_ids)
    
    # Clinical data
    clinical_part = clinical_df.loc[patient_ids, clinical_cols].copy()
    
    # Timeline data
    timeline_part = timeline_df[timeline_df["PATIENT_ID"].isin(patient_ids)][["PATIENT_ID"] + timeline_cols].copy()
    
    # Merge
    merged = clinical_part.reset_index().merge(timeline_part, on="PATIENT_ID", how="outer")
    
    # Reorder and rename columns for readability
    merged = merged[["PATIENT_ID"] + clinical_cols + timeline_cols]
    merged = merged.rename(columns={
        "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT": "Clinical_New_Tumor_Event",
        "ANATOMIC_TREATMENT_SITE": "Timeline_Anatomic_Treatment_Site",
        "REGIMEN_INDICATION": "Timeline_Regimen_Indication"
                })
    
    # Sort by PATIENT_ID
    merged = merged.sort_values("PATIENT_ID").reset_index(drop=True)
    
    return merged

def save_disagreement_table_text(df, filename):
    """Save a DataFrame to a nicely formatted text file."""
    with open(filename, "w") as f:
        f.write(df.to_string(index=False))
    print(f"Saved nicely formatted table to '{filename}'")

# Clinical-only disagreements
clinical_only_table = create_disagreement_table(list(clinical_only_disagree), clinical_df, timeline_df)
save_disagreement_table_text(clinical_only_table, "clinical_only_disagreements.txt")

# Timeline-only disagreements
timeline_only_table = create_disagreement_table(list(timeline_only_disagree), clinical_df, timeline_df)
save_disagreement_table_text(timeline_only_table, "timeline_only_disagreements.txt")


Saved nicely formatted table to 'clinical_only_disagreements.txt'
Saved nicely formatted table to 'timeline_only_disagreements.txt'


In [69]:
import pandas as pd

# Ensure timeline DataFrame has PATIENT_ID as a column
timeline_df = data_timeline_treatment_df.copy()
if "PATIENT_ID" not in timeline_df.columns:
    timeline_df = timeline_df.reset_index()

# Columns to display
clinical_cols = ["NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT", "DFS_STATUS"]
timeline_cols = ["ANATOMIC_TREATMENT_SITE", "REGIMEN_INDICATION"]

def create_full_label_table(clinical_df, timeline_df, clinical_set, timeline_set):
    """
    Create a merged table for all patients with recurrence info and disagreement markers.
    """
    # Merge clinical and timeline info
    clinical_part = clinical_df[clinical_cols].copy().reset_index()
    merged = clinical_part.merge(timeline_df[["PATIENT_ID"] + timeline_cols], on="PATIENT_ID", how="outer")
    
    # Marker columns
    merged["Labeled_Clinical"] = merged["PATIENT_ID"].apply(lambda x: 1 if x in clinical_set else 0)
    merged["Labeled_Timeline"] = merged["PATIENT_ID"].apply(lambda x: 1 if x in timeline_set else 0)
    
    # Sort by patient ID
    merged = merged.sort_values("PATIENT_ID").reset_index(drop=True)
    
    # Rename columns for readability
    merged = merged.rename(columns={
        "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT": "Clinical_New_Tumor_Event",
        "DFS_STATUS": "Clinical_DFS_Status",
        "ANATOMIC_TREATMENT_SITE": "Timeline_Anatomic_Treatment_Site",
        "REGIMEN_INDICATION": "Timeline_Regimen_Indication"
    })
    
    return merged

def save_table_as_text(df, filename):
    """Save DataFrame as nicely formatted text file."""
    with open(filename, "w") as f:
        f.write(df.to_string(index=False))
    print(f"Saved full patient table to '{filename}'")

# Create the full table
full_table = create_full_label_table(
    clinical_df,
    timeline_df,
    clinical_set=set(recurrence_patient_ids),
    timeline_set=set(recur_patients)
)

# Save as a text file
save_table_as_text(full_table, "all_patients_labels.txt")


Saved full patient table to 'all_patients_labels.txt'
