In [None]:
import re

import numpy as np
import pandas as pd

def get_drug_columns(df, verbose=True):
    """Return the names of the columns of ``df`` that refer to drug records.

    A column qualifies when its name contains the literal text
    ``drugs.drug`` anywhere in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are scanned.
    verbose : bool, default True
        When true, print every matching column name as it is found.

    Returns
    -------
    list
        The matching column names, in the order they appear in
        ``df.columns``. Empty when nothing matches.
    """
    drug_columns = []

    for col_name in df.columns:
        # Column labels are not guaranteed to be strings (integer or NaN
        # headers are possible); re.search raises TypeError on those, and
        # they cannot contain the pattern anyway, so skip them.
        if not isinstance(col_name, str):
            continue
        if re.search(r'drugs\.drug', col_name):
            if verbose:
                print(col_name)
            drug_columns.append(col_name)

    return drug_columns

## Load existing overview table

In [None]:
# Location of the overview table; later cells write the enriched table back to
# this same path.
# Alternative input: r"Z:\HiWi\Popp\TCGA_Breast_2022\TCGA_BRCA_clinical.csv"
path = r"Z:\HiWi\Popp\TCGA_NSCLC_2022\LUNG\TCGA_LUNG_overview_table.csv"
# index_col=0: the first CSV column is used as the row index.
df_overview = pd.read_csv(path, index_col=0)

df_overview

## Load the second clinical dataframe with therapy information

In [None]:
# Excel export of the LUAD Firehose clinical data.
# Alternative raw source (tab-separated text):
# r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\Clinic\gdac.broadinstitute.org_LUAD.Merge_Clinical.Level_1.2016012800.0.0\LUAD.clin.merged.firehose.txt'
url = r"Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\Clinic\Select_Clinical_data_LUAD_firehose.xlsx"

# header=5 takes the sixth sheet row as the column header.
# The 'patient.bcr_patient_barcode' column becomes the index, the frame is
# transposed, and reset_index() moves the former column labels into a regular
# column named 'index'.
# NOTE(review): this presumably turns a "one column per patient" sheet into
# "one row per patient" - confirm against the workbook.
df_med = (
    pd.read_excel(url, header=5)
    .set_index('patient.bcr_patient_barcode')
    .T
    .reset_index()
)

# Build the key used for merging with the overview table: the upper-cased
# 'index' value followed by the '-01A' suffix.
df_med['Sample_ID'] = df_med['index'].str.upper() + '-01A'

df_med

In [None]:
# Names of all df_med columns whose name contains 'drugs.drug'
# (the helper prints each match as it finds it).
drug_cols = get_drug_columns(df_med)

In [None]:
def _join_row_as_text(row):
    """Render every value of one row as text and join them with single spaces."""
    return ' '.join(str(value) for value in row.values)


# One free-text field per row holding the content of all drug columns; missing
# values end up as the text 'nan'. Later cells search this field for keywords.
df_med['drugs'] = df_med[drug_cols].apply(_join_row_as_text, axis=1)

In [None]:
# True where the respective column holds exactly 'yes'; anything else,
# including missing values, counts as "no".
radiation_initial = df_med['patient.radiation_therapy'].eq('yes')
radiation_followup = df_med['patient.follow_ups.follow_up.radiation_therapy'].eq('yes')

df_med['radiation_therapy_first'] = radiation_initial.astype('int64')
df_med['radiation_therapy_follow'] = radiation_followup.astype('int64')

# Overall flag: 1 as soon as either of the two sources reports radiation.
df_med['radiationtherapy'] = (radiation_initial | radiation_followup).astype('int64')

In [None]:
# Three independent indicators for targeted therapy: the two yes/no columns
# (exact match on 'yes') and the keyword 'targeted' in the joined drug text.
targeted_initial = df_med['patient.targeted_molecular_therapy'].eq('yes')
targeted_followup = df_med['patient.follow_ups.follow_up.targeted_molecular_therapy'].eq('yes')
targeted_in_drugs = df_med['drugs'].str.contains('targeted', regex=False)

df_med['targeted_therapy_first'] = targeted_initial.astype('int64')
df_med['targeted_therapy_follow'] = targeted_followup.astype('int64')
df_med['targeted_therapy_combo'] = targeted_in_drugs.astype('int64')

# Overall flag: 1 when at least one of the three indicators is set.
df_med['targeted_therapy'] = (
    targeted_initial | targeted_followup | targeted_in_drugs
).astype('int64')

In [None]:
# 1 when the joined drug text mentions 'chemotherapy' (plain substring match), else 0.
mentions_chemotherapy = df_med['drugs'].str.contains('chemotherapy', regex=False)
df_med['chemotherapy'] = mentions_chemotherapy.astype('int64')

In [None]:
# 1 when the joined drug text mentions 'immunotherapy' (plain substring match), else 0.
mentions_immunotherapy = df_med['drugs'].str.contains('immunotherapy', regex=False)
df_med['immunotherapy'] = mentions_immunotherapy.astype('int64')

In [None]:
# Keep only the merge key and the four therapy flags derived above; this is the
# frame that gets merged into the overview table.
df_therapy = df_med[['Sample_ID', 'immunotherapy', 'radiationtherapy', 'chemotherapy', 'targeted_therapy']]

df_therapy

## Merge therapy information into the overview table

In [None]:
# Remove therapy flags already present in the loaded overview table so the
# merge below does not produce duplicated (_x/_y) columns.
# errors='ignore' keeps the cell re-runnable: it no longer raises KeyError when
# the columns are absent (fresh table, or the cell was already executed).
df_overview = df_overview.drop(
    columns=['radiationtherapy', 'immunotherapy', 'chemotherapy', 'targeted_therapy'],
    errors='ignore',
)

In [None]:
# Left merge: every overview row is kept; samples without a match in
# df_therapy get NaN in the four therapy flag columns.
combined_df = df_overview.merge(df_therapy, on='Sample_ID', how='left')

# Surgery: 1 when either additional-surgery column contains 'YES'.
# na=False makes missing entries count as "no" instead of putting NaN into the
# boolean mask.
locoregional_surgery = combined_df['additional_surgery_locoregional_procedure'].str.contains('YES', na=False)
metastatic_surgery = combined_df['additional_surgery_metastatic_procedure'].str.contains('YES', na=False)
combined_df['surgical_therapy'] = 0
combined_df.loc[locoregional_surgery | metastatic_surgery, 'surgical_therapy'] = 1

# Radiation: combine the merged flag with the overview table's own
# 'additional_radiation_therapy' column (exact match on 'YES').
combined_df['add_radio'] = combined_df['additional_radiation_therapy'].eq('YES').astype('int64')
# fillna(0) is essential: for unmatched samples 'radiationtherapy' is NaN, the
# sum would be NaN, and a plain "!= 0" test would then flag those samples as
# irradiated although nothing reports radiation for them.
radiation_reports = combined_df['radiationtherapy'].fillna(0) + combined_df['add_radio']
combined_df['radiation_therapy'] = radiation_reports.ne(0).astype('int64')

combined_df

In [None]:
# Number of therapy types recorded per sample. DataFrame.sum skips NaN by
# default, so a missing flag contributes 0.
therapy_flag_columns = ['surgical_therapy', 'immunotherapy', 'radiation_therapy', 'chemotherapy', 'targeted_therapy']
combined_df['Sum'] = combined_df[therapy_flag_columns].sum(axis=1)

# Exactly one therapy -> 'Mono_therapy', more than one -> 'Combination_therapy',
# none -> missing. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0.
combined_df['therapy_classification'] = [
    'Mono_therapy' if n_therapies == 1
    else 'Combination_therapy' if n_therapies > 1
    else np.nan
    for n_therapies in combined_df['Sum']
]

# Blank out both derived columns for LUSC samples.
# NOTE(review): presumably because the therapy source loaded above is the LUAD
# file only - confirm.
combined_df.loc[combined_df['disease_code'] == 'LUSC', ['Sum', 'therapy_classification']] = np.nan
combined_df['therapy_classification']

## Add non-smoker flag

In [None]:
# 1 where 'tobacco_smoking_history' equals 1, otherwise 0 (missing values
# included).
# NOTE(review): presumably category 1 encodes "never smoked" - confirm against
# the data dictionary.
is_category_one = combined_df['tobacco_smoking_history'].eq(1)
combined_df['non-smoker'] = is_category_one.astype('int64')

In [None]:
# Save back to the overview table: this overwrites the CSV at `path`, the same
# file the overview table was loaded from. to_csv writes the DataFrame index as
# the first column by default.
combined_df.to_csv(path)

## Add residual tumor status

In [None]:
# bamboolib is not referenced by name in this cell.
# NOTE(review): presumably imported for its interactive DataFrame display -
# confirm it is still needed.
import bamboolib

# Merged Firehose clinical file (tab-separated text).
url = r'Z:\HiWi\Popp\TCGA_NSCLC_2022\LUAD\Clinic\gdac.broadinstitute.org_LUAD.Merge_Clinical.Level_1.2016012800.0.0\LUAD.clin.merged.firehose.txt'

# The 'admin.batch_number' column becomes the index, the frame is transposed,
# and reset_index() turns the former column labels into a regular column.
# Note that this replaces the df_med built from the Excel file earlier.
df_med = (
    pd.read_csv(url, sep='\t')
    .set_index('admin.batch_number')
    .T
    .reset_index()
)

# .copy() gives df_res its own data, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
df_res = df_med[['patient.residual_tumor', 'patient.bcr_patient_barcode']].copy()
# Merge key: upper-cased barcode followed by the '-01A' suffix.
df_res['Sample_ID'] = df_res['patient.bcr_patient_barcode'].str.upper() + '-01A'

df_res

In [None]:
# Columns contributed by df_res (everything except the merge key).
residual_columns = [col for col in df_res.columns if col != 'Sample_ID']

# The overview CSV is overwritten with these columns at the end of the notebook,
# so on a later run (or when this cell is executed twice) they already exist in
# combined_df. Dropping stale copies first prevents the merge from creating
# duplicated '_x' / '_y' columns.
combined_df = (
    combined_df
    .drop(columns=residual_columns, errors='ignore')
    .merge(df_res, on='Sample_ID', how='left')
)
combined_df

In [None]:
# Save back to the overview table: this overwrites the CSV at `path` again, now
# including the residual tumor columns merged in above. to_csv writes the
# DataFrame index as the first column by default.
combined_df.to_csv(path)