# Clinical data processing for joining tables (RNA+Clinic)

### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Upload Clinic data

In [2]:
clinic_path ='../Data/Clinical_data_of_patients_with_RNA_study.csv' 
clinic_data = pd.read_csv(clinic_path, delimiter=';')
clinic_data_testing_everolimus = clinic_data[(clinic_data['Arm']=='EVEROLIMUS')]
clinic_data = clinic_data[(clinic_data['Arm']=='NIVOLUMAB')]
clinic_data.head(5)

Unnamed: 0,SUBJID,Cohort,Arm,MAF_Tumor_ID,MAF_Normal_ID,CNV_ID,RNA_ID,CD8_IF_ID,Sex,Age,...,TSC1,USP32,VHL,WNT8A,ZNF800,Angio,Teff,Myeloid,Javelin,Merck18
0,RCC10-20,CM-010,NIVOLUMAB,,,,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,0001 00020,Male,62.0,...,,,,,,33.944389,29.755243,28.439512,30.23792,32.566911
1,RCC10-97,CM-010,NIVOLUMAB,RP-1458_RCCBMS-00097-T_v4_Exome_OnPrem,RP-1458_RCCBMS-00097-N_v1_Exome_OnPrem,RCCBMS-00097-T.called.seg,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,0001 00097,Female,67.0,...,WT,WT,WT,WT,WT,35.142149,28.844206,27.360042,30.276256,32.095831
2,RCC10-141,CM-010,NIVOLUMAB,RP-1458_RCCBMS-00141-T_v1_Exome_OnPrem,RP-1458_RCCBMS-00141-N_v1_Exome_OnPrem,RCCBMS-00141-T.called.seg,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,0014 00141,Female,62.0,...,WT,WT,MUT,WT,WT,33.952069,29.595384,32.586142,29.565893,32.010041
3,RCC10-99,CM-010,NIVOLUMAB,,,,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,0015 00099,Male,60.0,...,,,,,,35.728524,29.22152,31.233694,30.039203,31.80172
4,RCC10-163,CM-010,NIVOLUMAB,RP-1458_RCCBMS-00163-T_v1_Exome_OnPrem,RP-1458_RCCBMS-00163-N_v1_Exome_OnPrem,RCCBMS-00163-T.called.seg,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,,Male,76.0,...,WT,WT,WT,WT,WT,35.09391,29.114337,30.324276,28.993455,30.815757


### Delete unnecessary columns

In [3]:
# Select only necesary classes for the classification
clinic_data_PFS = clinic_data[['Cohort', 'RNA_ID','Sex','Age','MSKCC','IMDC','Sarc','Rhab','Number_of_Prior_Therapies','Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy','Tumor_Shrinkage','PFS','TM_TC_Ratio','ImmunoPhenotype']]
clinic_data_OS = clinic_data[['Cohort', 'RNA_ID','Sex','Age','MSKCC','IMDC','Sarc','Rhab','Number_of_Prior_Therapies','Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy','Tumor_Shrinkage','OS','TM_TC_Ratio','ImmunoPhenotype']]
clinic_data_testing_everolimus = clinic_data_testing_everolimus[['Cohort', 'RNA_ID','Sex','Age','MSKCC','IMDC','Sarc','Rhab','Number_of_Prior_Therapies','Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy','Tumor_Shrinkage','PFS','TM_TC_Ratio','ImmunoPhenotype']]

In [4]:
len(clinic_data_PFS.columns)

14

In [5]:
type(clinic_data_PFS.TM_TC_Ratio.iloc[0]) == float

False

In [7]:
# Save in csv the dataframe with the column deletion
clinic_data_PFS.to_csv("./Prediction PFS/RNA+Clinic joined/Clinical_data_deleted_columns_PFS.csv")

### Classes categorization

1. PFS CSV

In [8]:
# First, change path to start categorization process
clinic_path ='./Prediction PFS/RNA+Clinic joined/Clinical_data_deleted_columns_PFS.csv' # Sin eliminar la columna PFS
clinic_data = pd.read_csv(clinic_path)

In [9]:
clinic_data.head(5)

Unnamed: 0.1,Unnamed: 0,Cohort,RNA_ID,Sex,Age,MSKCC,IMDC,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,PFS,TM_TC_Ratio,ImmunoPhenotype
0,0,CM-010,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,Male,62.0,FAVORABLE,,0.0,0.0,2.0,962.0,-16.40625,12.164384,0.766909,Infiltrated
1,1,CM-010,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,Female,67.0,FAVORABLE,,0.0,0.0,2.0,254.0,-40.0,5.490411,0.773269,Infiltrated
2,2,CM-010,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,Female,62.0,POOR,,0.0,0.0,1.0,800.0,-89.43662,61.906849,1.782859,Infiltrated
3,3,CM-010,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,Male,60.0,FAVORABLE,,0.0,0.0,3.0,790.0,11.787072,1.249315,5.890573,Excluded
4,4,CM-010,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,Male,76.0,INTERMEDIATE,,0.0,0.0,2.0,1241.0,0.0,5.654795,,No_IF


In [10]:
# Sex-----------------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.Sex[i]=='F' or clinic_data.Sex[i]=='FEMALE':
        clinic_data.Sex[i]='Female'
    elif clinic_data.Sex[i]=='M' or clinic_data.Sex[i]=='MALE':
        clinic_data.Sex[i]='Male'

# MSKCC---------------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.MSKCC[i]!='POOR' and clinic_data.MSKCC[i]!='INTERMEDIATE' and clinic_data.MSKCC[i]!='FAVORABLE':
        clinic_data.MSKCC[i]='NA'

#IMDC-------------------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.IMDC[i]!='POOR' and clinic_data.IMDC[i]!='INTERMEDIATE' and clinic_data.IMDC[i]!='FAVORABLE':
        clinic_data.IMDC[i]='NOT_REPORTED'

# Sarc------------------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.Sarc[i]!=0.0 and clinic_data.Sarc[i]!=1.0:
        clinic_data.Sarc[i]=0

# Rhab------------------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.Rhab[i]!=0.0 and clinic_data.Rhab[i]!=1.0:
        clinic_data.Rhab[i]=0

# Number_of_Prior_Therapies-----------------------------
for i in range (len(clinic_data)):
    if clinic_data.Number_of_Prior_Therapies[i]!=1 and clinic_data.Number_of_Prior_Therapies[i]!=2 and clinic_data.Number_of_Prior_Therapies[i]!=3 and clinic_data.Number_of_Prior_Therapies[i]!=4:
        clinic_data.Number_of_Prior_Therapies[i]=clinic_data.Number_of_Prior_Therapies.median()
# ImmunoPhenotype-----------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.ImmunoPhenotype[i]!='Desert' and clinic_data.ImmunoPhenotype[i]!='Infiltrated' and clinic_data.ImmunoPhenotype[i]!='Excluded':
        clinic_data.ImmunoPhenotype[i]='NA'


# # # TM_TC_Ratio---------------------------------------
for i in range (len(clinic_data)):
    if clinic_data.TM_TC_Ratio[i]==-1.0 or np.isnan(clinic_data.TM_TC_Ratio[i]):
        clinic_data.TM_TC_Ratio[i]=clinic_data.TM_TC_Ratio.median()
    if np.isnan(clinic_data.Tumor_Shrinkage[i]):
        clinic_data.Tumor_Shrinkage[i]=clinic_data.Tumor_Shrinkage.median()
    if clinic_data.Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy[i]==-1 or np.isnan(clinic_data.Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy[i]):
        clinic_data.Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy[i]=clinic_data.Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy.median()
    if clinic_data.Age[i]==-1 or np.isnan(clinic_data.Age[i]):
        clinic_data.Age[i]=clinic_data.Age.median()
       
       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinic_data.Sex[i]='Female'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinic_data.Sex[i]='Male'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinic_data.MSKCC[i]='NA'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinic_data.IMDC[i]='NOT_REPORTED'
A value is trying to be set on a copy of a slice from

In [11]:
clinic_data = pd.get_dummies(clinic_data, columns=['Cohort','Sex', 'MSKCC', 'IMDC', 'ImmunoPhenotype' ], prefix=['Cohort','Sex', 'MSKCC', 'IMDC', 'ImmunoPhenotype' ])
clinic_data=clinic_data.drop(['Unnamed: 0'], axis=1)

In [12]:
clinic_data.head(5)

Unnamed: 0,RNA_ID,Age,Sarc,Rhab,Number_of_Prior_Therapies,Days_from_TumorSample_Collection_and_Start_of_Trial_Therapy,Tumor_Shrinkage,PFS,TM_TC_Ratio,Cohort_CM-009,...,MSKCC_NA,MSKCC_POOR,IMDC_FAVORABLE,IMDC_INTERMEDIATE,IMDC_NOT_REPORTED,IMDC_POOR,ImmunoPhenotype_Desert,ImmunoPhenotype_Excluded,ImmunoPhenotype_Infiltrated,ImmunoPhenotype_NA
0,G138701_RCCBMS-00020-T_v1_RNA_OnPrem,62.0,0.0,0.0,2.0,962.0,-16.40625,12.164384,0.766909,0,...,0,0,0,0,1,0,0,0,1,0
1,G138701_RCCBMS-00097-T_v1_RNA_OnPrem,67.0,0.0,0.0,2.0,254.0,-40.0,5.490411,0.773269,0,...,0,0,0,0,1,0,0,0,1,0
2,G138701_RCCBMS-00141-T_v1_RNA_OnPrem,62.0,0.0,0.0,1.0,800.0,-89.43662,61.906849,1.782859,0,...,0,1,0,0,1,0,0,0,1,0
3,G138701_RCCBMS-00099-T_v1_RNA_OnPrem,60.0,0.0,0.0,3.0,790.0,11.787072,1.249315,5.890573,0,...,0,0,0,0,1,0,0,1,0,0
4,G138701_RCCBMS-00163-T_v1_RNA_OnPrem,76.0,0.0,0.0,2.0,1241.0,0.0,5.654795,1.374775,0,...,0,0,0,0,1,0,0,0,0,1


In [13]:
# Save result dataframe in csv
clinic_data.to_csv("./Prediction PFS/RNA+Clinic joined/Clinical_data_categorized_PFS.csv")