## Import modules

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
# Absolute path of the parent of the current working directory; all data
# paths below are built relative to it.
module_path = os.path.abspath(os.pardir)

## Load and preprocess

In [2]:
# Load the supplementary workbook. Sheet 'S1A' is parsed as the clinical +
# response table and sheet 'S1B' as the protein table; header = 1 means the
# second spreadsheet row supplies the column names.
all_data = pd.ExcelFile(module_path + "/data/harel/tableS1.xlsx")

clinical_response_df = all_data.parse('S1A', header = 1) #load in clinical and response data

# Split S1A into outcome columns (keyed by 'Sample ID') and everything else.
response_df = clinical_response_df.loc[:,['Sample ID','Response', 'PFS time (months)', 'OS time (months)']] #response data

clinical_df = clinical_response_df.drop(columns = ['Response', 'PFS time (months)', 'OS time (months)']) #clinical data

protein_df = all_data.parse('S1B', header = 1) #load in protein data
protein_df = protein_df.drop(columns = ['T: Protein IDs', 'T: Majority protein IDs', 'T: Protein names', 'T: ENSG'])
protein_df = protein_df.dropna(subset=['T: Gene names']) #drop unlabeled proteins
# NOTE(review): the zeros imputed here take part in the per-protein mean/std
# computed by the scaler below -- confirm this is the intended imputation.
protein_df = protein_df.fillna(value = 0) #impute missing data with 0

# Reset the index first so that the positional indices collected below are
# also valid row labels for DataFrame.drop.
protein_df = protein_df.reset_index(drop = True)
drop_idx = [i for i, gene in enumerate(protein_df['T: Gene names']) if ';' in gene] #rows with multiple protein names
protein_df = protein_df.drop(drop_idx)

protein_names = protein_df['T: Gene names'].tolist() # get axis names
patient_id = protein_df.columns.tolist() # all column labels, including 'T: Gene names'

# Standardise every protein (row of protein_df) across the remaining columns,
# which are presumably the per-sample measurements -- verify against the sheet.
# StandardScaler works column-wise, hence the transpose before and after.
scaler = StandardScaler() #scaling
sample_df = protein_df.drop(columns = ['T: Gene names'])
protein_df_values = scaler.fit_transform(sample_df.T)

# Rebuild the frame with the sample labels attached to their own values, then
# put the gene-name column back at its original position. Relabelling the
# columns wholesale after appending the gene names last would shift every
# label whenever 'T: Gene names' is not the final column of the sheet.
protein_df = pd.DataFrame(protein_df_values.T, columns = sample_df.columns)
protein_df.insert(patient_id.index('T: Gene names'), 'T: Gene names', protein_names) #reinsert axis names


## Save to CSV

In [3]:
# Write the three prepared tables next to the source workbook, in the order
# clinical, protein, response. The row index is written as well (to_csv default).
for _frame, _relative_path in (
    (clinical_df, '/data/harel/clinical_df.csv'),
    (protein_df, '/data/harel/protein_df.csv'),
    (response_df, '/data/harel/response_df.csv'),
):
    _frame.to_csv(module_path + _relative_path)