In [None]:
from google.colab import drive, files
import os
import numpy as np
import pandas as pd
from glob import glob
from collections import defaultdict
import requests
# Mount Google Drive at /content/drive/; the cells below read from and write to
# paths under this mount point.
drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:
#@title Enter working directory name
working_directory = "/content/drive/MyDrive/CMIPB_Files/Redownload"#@param {type:"string"}
# makedirs also creates any missing parent folders (os.mkdir raises when a
# parent is absent), and exist_ok=True makes re-running this cell a no-op.
os.makedirs(working_directory, exist_ok=True)

In [None]:
# @title Reading in the cytokine name + gene name conversions from google drive.

# Spreadsheet that lists, per feature, its category, its name in the data files
# ("Feature name") and an alternative name ("Additional Feature name").
sheet_url = 'https://docs.google.com/spreadsheets/d/1EIrUaQB7bJ1BO66x9TLfiXrbjpRyIwMQKru0HIgwr1s/edit#gid=0'
# Rewrite the editor URL into the CSV export URL so pandas can read it directly.
url_1 = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')
sheet = pd.read_csv(url_1)

# Build one {feature name -> additional feature name} dict per category.
_name_columns = ['Feature name', 'Additional Feature name']
_is_cytokine_row = sheet["Category"] == 'Plasma cytokine'
_is_gene_row = sheet["Category"] == 'Gene expression'
cytokine_mapping = dict(sheet.loc[_is_cytokine_row, _name_columns].values)
gene_mapping = dict(sheet.loc[_is_gene_row, _name_columns].values)


In [None]:
# @title Download the files if you need to.

def download_tsv(path, name):
  """Download the resource at URL ``path`` and store it in the local file ``name``.

  The response body is written byte-for-byte, so the saved file is exactly
  what the server sent.

  Args:
    path: URL to fetch.
    name: Destination file path; an existing file is overwritten.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status. Without
      this check an HTML error page would be saved as if it were data.
    requests.RequestException: On connection problems or timeout.
  """
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = requests.get(path, timeout = 60)
  response.raise_for_status()
  # The context manager guarantees the file is flushed and closed.
  with open(name, 'wb') as handle:
    handle.write(response.content)

# base_url = 'https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/processed_datasets/training_dataset/master_processed_training_data_TSVs/'

# files = ['abtiter_batchCorrected_data.tsv', 'abtiter_metadata.tsv', 'abtiter_normalized_data.tsv', 'abtiter_raw_data.tsv', 'pbmc_cell_frequency_batchCorrected_data.tsv',
#          'pbmc_cell_frequency_metadata.tsv', 'pbmc_cell_frequency_normalized_data.tsv', 'pbmc_gene_expression_batchCorrected_data.tsv', 'pbmc_gene_expression_metadata.tsv',
#          'pbmc_gene_expression_raw_data.tsv', 'plasma_cytokine_concentrations_batchCorrected_data.tsv', 'plasma_cytokine_concentrations_metadata.tsv', 'plasma_cytokine_concentrations_normalized_data.tsv',
#          'subject_specimen.tsv']

# file_path = os.path.join(working_directory, 'files')
# if os.path.exists(file_path) is False:
#   os.mkdir(file_path)

# for f in files:
#   download_tsv(os.path.join(base_url, f), os.path.join(file_path, f))




In [None]:
# @title Reading in the files with set prefix.

def read_in_file(path, prefix, rename = False):
  """Read a tab-separated table, transpose it and prefix its column names.

  After the transpose the index labels are converted with ``int`` and every
  column name becomes ``prefix + '_' + name``.

  Args:
    path: Location of the tab-separated file.
    prefix: Label placed in front of every column name.
    rename: Optional mapping (anything ``Index.map`` accepts, e.g. a dict)
      applied to the column names before prefixing. ``False`` or ``None``
      skips renaming. A name the mapping does not cover, or maps to a missing
      value, keeps its original name instead of turning into NaN.

  Returns:
    The transposed DataFrame with an integer index and prefixed column names.
  """
  data = pd.read_csv(path, sep = '\t').T
  data.index = data.index.map(int)
  names = list(data.columns)
  if rename is not False and rename is not None:
    mapped = data.columns.map(rename)
    # Index.map yields NaN for unmapped names; fall back to the original name
    # so the string concatenation below cannot fail on a float NaN.
    names = [new if pd.notna(new) else old for new, old in zip(mapped, names)]
  data.columns = [prefix + '_' + str(name) for name in names]
  return data

# --- Settings for this integration run ---------------------------------------
timepoint = 3                   # post-baseline timepoint the target is taken from
task = 'ccl3_task'              # names the output folder
targets = 'Cytokine_CCL3'       # column used both as baseline feature and as target
data_type = 'batchCorrected'    # selects which *_{data_type}_data.tsv files are read

# makedirs creates missing parents too and is a no-op when the folder exists.
os.makedirs(f'/content/drive/MyDrive/CMIPB_Files/{task}/', exist_ok=True)
file_path = '/content/drive/MyDrive/CMIPB_Files/Redownload/files'
metadata = pd.read_csv(os.path.join(file_path, 'subject_specimen.tsv'), sep = '\t')
# Age = leading year of the dataset label minus the year part of year_of_birth.
metadata['age'] = metadata.apply(lambda x:int(x.dataset.split('_')[0]) - int(x.year_of_birth.split('-')[0]), axis = 1)

# --- Load one table per assay -------------------------------------------------
titre_data = read_in_file(os.path.join(file_path, f'abtiter_{data_type}_data.tsv'), 'Titre')

cell_freq_data = read_in_file(os.path.join(file_path, f'pbmc_cell_frequency_{data_type}_data.tsv'), 'Cellfrequency')
cytokine_data = read_in_file(os.path.join(file_path, f'plasma_cytokine_concentrations_{data_type}_data.tsv'), 'Cytokine', rename = cytokine_mapping)
# Any data_type other than 'batchCorrected' reads the raw gene-expression table.
gex_variant = 'batchCorrected' if data_type == 'batchCorrected' else 'raw'
gex_data = read_in_file(os.path.join(file_path, f'pbmc_gene_expression_{gex_variant}_data.tsv'), 'GEX', rename = gene_mapping)

# --- Attach every assay to the specimen metadata ------------------------------
# how='right' keeps every metadata row; assay values are NaN where a specimen
# is absent from that assay table.
merged_obj = metadata
for assay_table in (titre_data, gex_data, cytokine_data, cell_freq_data):
  merged_obj = pd.merge(right = merged_obj, left = assay_table, left_index = True, right_on = 'specimen_id', how = 'right')

# --- Baseline (timepoint 0) features, one row per subject ---------------------
baseline_values = merged_obj[merged_obj['timepoint']==0]
assert baseline_values['subject_id'].value_counts().max() == 1, 'expected exactly one timepoint-0 specimen per subject'
prefixes = {'Titre','Cellfrequency','Cytokine','GEX'}
baseline_feats = baseline_values[[p for p in baseline_values.columns if p.split('_')[0] in prefixes]]
# .copy() gives an independent frame, so the assignments below do not raise
# SettingWithCopyWarning.
subject = baseline_values[['subject_id', 'biological_sex', 'dataset', 'age', 'infancy_vac']].copy()
subject['biological_sex'] = (subject['biological_sex'] == 'Female').astype(int)
subject['infancy_vac'] = (subject['infancy_vac'] == 'aP').astype(int)
baseline_feats = pd.concat([baseline_feats, subject],axis=1)

# --- Target at the chosen timepoint and its fold change over baseline ---------
# NOTE: the name says "day14" but the rows are selected with `timepoint` above.
day14_values = merged_obj[merged_obj['timepoint']==timepoint][['subject_id', targets]].rename(columns = {targets: 'Target'}).set_index('subject_id')
baseline_feats = pd.merge(left = baseline_feats, right = day14_values, right_index = True, left_on = 'subject_id')
# Column arithmetic replaces a row-wise apply; the +1 on both sides keeps the
# ratio defined when the baseline value is 0.
baseline_feats['Target_FC'] = (baseline_feats['Target'] + 1) / (baseline_feats[targets] + 1)
baseline_feats.to_csv(f'/content/drive/MyDrive/CMIPB_Files/{task}/IntegratedData_{data_type}.tsv', sep = '\t')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subject['biological_sex'] = subject['biological_sex'].map(lambda x:1 if x == 'Female' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subject['infancy_vac'] = subject['infancy_vac'].map(lambda x:1 if x == 'aP' else 0)


In [None]:
%ls /content/drive/MyDrive/CMIPB_Files/Redownload/files/

abtiter_batchCorrected_data.tsv              pbmc_gene_expression_batchCorrected.tsv
abtiter_metadata.tsv                         pbmc_gene_expression_metadata.tsv
abtiter_normalized_data.tsv                  pbmc_gene_expression_raw_data.tsv
abtiter_raw_data.tsv                         plasma_cytokine_concentrations_batchCorrected.tsv
pbmc_cell_frequency_batchCorrected_data.tsv  plasma_cytokine_concentrations_metadata.tsv
pbmc_cell_frequency_metadata.tsv             plasma_cytokine_concentrations_normalized_data.tsv
pbmc_cell_frequency_normalized_data.tsv      subject_specimen.tsv


In [None]:
# Folder on the CMI-PB server holding the processed prediction-dataset files.
base_url = 'https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/processed_datasets/prediction_dataset/'

files_to_download = ['abtiter_processed_data.tsv', 'pbmc_cell_frequency_processed_data.tsv', 'pbmc_gene_expression_processed_data.tsv',
                     'plasma_cytokine_concentrations_processed_data.tsv', 'subject_specimen.tsv']

# Downloads land in <working_directory>/test_files; makedirs creates missing
# parents and does nothing when the folder already exists.
file_path = os.path.join(working_directory, 'test_files')
os.makedirs(file_path, exist_ok=True)

for f in files_to_download:
  # base_url ends with '/', so plain concatenation builds the URL. os.path.join
  # is meant for filesystem paths and is only used for the local target.
  download_tsv(base_url + f, os.path.join(file_path, f))



In [None]:
!ls {file_path}

abtiter_processed_data.tsv		 plasma_cytokine_concentrations_processed_data.tsv
pbmc_cell_frequency_processed_data.tsv	 subject_specimen.tsv
pbmc_gene_expression_processed_data.tsv


In [None]:
def read_in_file(path, prefix, rename = False):
  """Read a tab-separated table, transpose it and prefix its column names.

  After the transpose the index labels are converted with ``int`` and every
  column name becomes ``prefix + '_' + name``.

  Args:
    path: Location of the tab-separated file.
    prefix: Label placed in front of every column name.
    rename: Optional mapping (anything ``Index.map`` accepts, e.g. a dict)
      applied to the column names before prefixing. ``False`` or ``None``
      skips renaming. A name the mapping does not cover, or maps to a missing
      value, keeps its original name instead of turning into NaN.

  Returns:
    The transposed DataFrame with an integer index and prefixed column names.
  """
  data = pd.read_csv(path, sep = '\t').T
  data.index = data.index.map(int)
  names = list(data.columns)
  if rename is not False and rename is not None:
    mapped = data.columns.map(rename)
    # Index.map yields NaN for unmapped names; fall back to the original name
    # so the string concatenation below cannot fail on a float NaN.
    names = [new if pd.notna(new) else old for new, old in zip(mapped, names)]
  data.columns = [prefix + '_' + str(name) for name in names]
  return data

# --- Load the specimen metadata and the processed assay tables ----------------
metadata = pd.read_csv(os.path.join(file_path, 'subject_specimen.tsv'), sep = '\t')
# Age = leading year of the dataset label minus the year part of year_of_birth.
metadata['age'] = metadata.apply(lambda x:int(x.dataset.split('_')[0]) - int(x.year_of_birth.split('-')[0]), axis = 1)
titre_data = read_in_file(os.path.join(file_path, 'abtiter_processed_data.tsv'), 'Titre')
cell_freq_data = read_in_file(os.path.join(file_path, 'pbmc_cell_frequency_processed_data.tsv'), 'Cellfrequency')
cytokine_data = read_in_file(os.path.join(file_path, 'plasma_cytokine_concentrations_processed_data.tsv'), 'Cytokine', rename = cytokine_mapping)
gex_data = read_in_file(os.path.join(file_path, 'pbmc_gene_expression_processed_data.tsv'), 'GEX', rename = gene_mapping)

# --- Attach every assay to the specimen metadata ------------------------------
# how='right' keeps every metadata row; assay values are NaN where a specimen
# is absent from that assay table.
merged_obj = metadata
for assay_table in (titre_data, gex_data, cytokine_data, cell_freq_data):
  merged_obj = pd.merge(right = merged_obj, left = assay_table, left_index = True, right_on = 'specimen_id', how = 'right')
training = merged_obj[merged_obj['dataset']!='2022_dataset']
# Test rows: 2022 dataset, specimens taken at or before timepoint 0.
test = merged_obj[(merged_obj['dataset']=='2022_dataset')&(merged_obj['timepoint']<=0)]

# --- Per-subject median of every assay feature --------------------------------
prefixes = {'Titre','Cellfrequency','Cytokine','GEX'}
feature_columns = [p for p in test.columns if p.split('_')[0] in prefixes]
# Selecting the columns on the GroupBy and calling median() directly gives the
# same table as an apply() with a per-group lambda, without the per-group
# Python overhead or the pandas deprecation warning about apply() seeing the
# grouping column.
test_median_vals = test.groupby(['subject_id'])[feature_columns].median().reset_index()
# One metadata row per 2022 subject (first occurrence), without the timepoint.
test_subject_info = metadata[metadata['dataset']=='2022_dataset'].drop_duplicates('subject_id').drop(['timepoint'],axis=1)
test_median_vals = pd.merge(left = test_median_vals, right = test_subject_info, on = 'subject_id', how = 'left')
test_median_vals.to_csv('/content/drive/MyDrive/CMIPB_Files/IntegratedTestData.tsv', sep = '\t')

In [None]:
def read_in_file(path, prefix, rename = False):
  """Read a tab-separated table, transpose it and prefix its column names.

  After the transpose the index labels are converted with ``int`` and every
  column name becomes ``prefix + '_' + name``.

  Args:
    path: Location of the tab-separated file.
    prefix: Label placed in front of every column name.
    rename: Optional mapping (anything ``Index.map`` accepts, e.g. a dict)
      applied to the column names before prefixing. ``False`` or ``None``
      skips renaming. A name the mapping does not cover, or maps to a missing
      value, keeps its original name instead of turning into NaN.

  Returns:
    The transposed DataFrame with an integer index and prefixed column names.
  """
  data = pd.read_csv(path, sep = '\t').T
  data.index = data.index.map(int)
  names = list(data.columns)
  if rename is not False and rename is not None:
    mapped = data.columns.map(rename)
    # Index.map yields NaN for unmapped names; fall back to the original name
    # so the string concatenation below cannot fail on a float NaN.
    names = [new if pd.notna(new) else old for new, old in zip(mapped, names)]
  data.columns = [prefix + '_' + str(name) for name in names]
  return data

file_path = '/content/drive/MyDrive/CMIPB_Files/processed_including_training'

# --- Load the combined specimen metadata and assay tables ---------------------
metadata = pd.read_csv(os.path.join(file_path, 'processed_combinedsubject_specimen.tsv'), sep = '\t')
# Age = leading year of the dataset label minus the year part of year_of_birth.
metadata['age'] = metadata.apply(lambda x:int(x.dataset.split('_')[0]) - int(x.year_of_birth.split('-')[0]), axis = 1)
data_type = 'normalized'#'batchCorrected'
titre_data = read_in_file(os.path.join(file_path, f'processed_combinedabtiter_{data_type}_data.tsv'), 'Titre')
cell_freq_data = read_in_file(os.path.join(file_path, f'processed_combinedpbmc_cell_frequency_{data_type}_data.tsv'), 'Cellfrequency')
cytokine_data = read_in_file(os.path.join(file_path, f'processed_combinedplasma_cytokine_concentrations_{data_type}_data.tsv'), 'Cytokine', rename = cytokine_mapping)
# Any data_type other than 'batchCorrected' reads the raw gene-expression table.
gex_variant = 'batchCorrected' if data_type == 'batchCorrected' else 'raw'
gex_data = read_in_file(os.path.join(file_path, f'processed_combinedpbmc_gene_expression_{gex_variant}_data.tsv'), 'GEX', rename = gene_mapping)

# --- Attach every assay to the specimen metadata ------------------------------
# how='right' keeps every metadata row; assay values are NaN where a specimen
# is absent from that assay table.
merged_obj = metadata
for assay_table in (titre_data, gex_data, cytokine_data, cell_freq_data):
  merged_obj = pd.merge(right = merged_obj, left = assay_table, left_index = True, right_on = 'specimen_id', how = 'right')
training = merged_obj[merged_obj['dataset']!='2022_dataset']
# Test rows: 2022 dataset, specimens taken at or before timepoint 0.
test = merged_obj[(merged_obj['dataset']=='2022_dataset')&(merged_obj['timepoint']<=0)]

# --- Per-subject median of every assay feature --------------------------------
prefixes = {'Titre','Cellfrequency','Cytokine','GEX'}
feature_columns = [p for p in test.columns if p.split('_')[0] in prefixes]
# Selecting the columns on the GroupBy and calling median() directly gives the
# same table as an apply() with a per-group lambda, without the per-group
# Python overhead or the pandas deprecation warning about apply() seeing the
# grouping column.
test_median_vals = test.groupby(['subject_id'])[feature_columns].median().reset_index()
# One metadata row per 2022 subject (first occurrence), without the timepoint.
test_subject_info = metadata[metadata['dataset']=='2022_dataset'].drop_duplicates('subject_id').drop(['timepoint'],axis=1)
test_median_vals = pd.merge(left = test_median_vals, right = test_subject_info, on = 'subject_id', how = 'left')
test_median_vals.to_csv('/content/drive/MyDrive/CMIPB_Files/IntegratedTestData.tsv', sep = '\t')

In [None]:
def combine_training(merged_obj, path = 'IntegratedData_Normalized.tsv'):
  """Build the per-subject training table and write it to ``path`` as TSV.

  Each output row holds a subject's timepoint-0 assay features (columns whose
  prefix before the first '_' is Titre, Cellfrequency, Cytokine or GEX), the
  subject columns, the timepoint-14 ``Titre_IgG_PT`` value
  (``Day14_IgG_Titre``) and its fold change over baseline (``Day14_IgG_FC``).
  ``biological_sex`` is encoded as 1 for 'Female' and 0 otherwise;
  ``infancy_vac`` as 1 for 'aP' and 0 otherwise. Missing values are written
  as 'ND'.

  Args:
    merged_obj: DataFrame with one row per specimen containing at least
      ``subject_id``, ``timepoint``, ``biological_sex``, ``dataset``, ``age``,
      ``infancy_vac`` and ``Titre_IgG_PT``.
    path: Destination of the tab-separated output file.

  Raises:
    AssertionError: If the timepoint-0 rows do not contain exactly one
      specimen per subject at most.
  """
  baseline_values = merged_obj[merged_obj['timepoint']==0]
  assert baseline_values['subject_id'].value_counts().max() == 1, 'expected exactly one timepoint-0 specimen per subject'
  prefixes = {'Titre','Cellfrequency','Cytokine','GEX'}
  feature_columns = [p for p in baseline_values.columns if p.split('_')[0] in prefixes]
  # .copy() gives an independent frame, so the assignments below do not raise
  # SettingWithCopyWarning.
  subject = baseline_values[['subject_id', 'biological_sex', 'dataset', 'age', 'infancy_vac']].copy()
  subject['biological_sex'] = (subject['biological_sex'] == 'Female').astype(int)
  subject['infancy_vac'] = (subject['infancy_vac'] == 'aP').astype(int)
  baseline_feats = pd.concat([baseline_values[feature_columns], subject],axis=1)
  day14_values = merged_obj[merged_obj['timepoint']==14][['subject_id', 'Titre_IgG_PT']].rename(columns = {'Titre_IgG_PT': 'Day14_IgG_Titre'}).set_index('subject_id')
  # Inner merge: subjects without a timepoint-14 row are dropped.
  baseline_feats = pd.merge(left = baseline_feats, right = day14_values, right_index = True, left_on = 'subject_id')
  # Column arithmetic replaces a row-wise apply; the +1 on both sides keeps the
  # ratio defined when the baseline titre is 0.
  baseline_feats['Day14_IgG_FC'] = (baseline_feats['Day14_IgG_Titre'] + 1) / (baseline_feats['Titre_IgG_PT'] + 1)
  baseline_feats.fillna('ND').to_csv(path, sep = '\t')

In [None]:
def check_values(feature, original_file, subject_file, joined_file, subject):
  """Assert that a subject's baseline value in the joined table matches its source.

  The subject's single timepoint-0 specimen is looked up in ``subject_file``;
  the value of ``feature`` for that subject in ``joined_file`` must equal the
  value stored for that specimen in ``original_file``. A missing joined value
  is accepted only when the source has nothing to offer either: the specimen
  is absent from ``original_file`` or its source value is itself missing.

  Args:
    feature: Column name present in both ``original_file`` and ``joined_file``.
    original_file: Source table indexed by specimen id.
    subject_file: Table with ``subject_id``, ``timepoint`` and ``specimen_id``.
    joined_file: Integrated table with a ``subject_id`` column.
    subject: Subject id to check.

  Raises:
    AssertionError: If any of the checks above fails.
  """
  assert feature in original_file.columns, f'{feature} is not a column of the source table'
  baseline_rows = subject_file[(subject_file['subject_id'] == subject) & (subject_file['timepoint'] == 0)]
  assert baseline_rows.shape[0] == 1, f'subject {subject} has {baseline_rows.shape[0]} timepoint-0 specimens, expected 1'
  specimen = baseline_rows.iloc[0]['specimen_id']
  # Look the joined value up once instead of re-filtering for each comparison.
  joined_value = joined_file[joined_file['subject_id']==subject][feature].iloc[0]
  if pd.isna(joined_value):
    # pd.isna recognises every missing-value marker, not only the float whose
    # string form is 'nan'.
    assert specimen not in original_file.index or pd.isna(original_file.loc[specimen, feature]), \
      f'{feature} is missing for subject {subject} but present in the source table'
  else:
    assert joined_value == original_file.loc[specimen, feature], \
      f'{feature} differs between joined and source table for subject {subject}'

In [None]:
# Compare every cell-frequency column, for every subject of the integrated
# table, against the source assay table; check_values raises on a mismatch.
subject_ids = baseline_feats['subject_id'].unique()
for feature in cell_freq_data.columns:
  for subject in subject_ids:
    check_values(
      feature = feature,
      original_file = cell_freq_data,
      subject_file = metadata,
      joined_file = baseline_feats,
      subject = subject,
    )
print('Successful check on d0 cell frequency features.')

In [None]:
# Compare every antibody-titre column, for every subject of the integrated
# table, against the source assay table; check_values raises on a mismatch.
subject_ids = baseline_feats['subject_id'].unique()
for feature in titre_data.columns:
  for subject in subject_ids:
    check_values(feature, titre_data, metadata, baseline_feats, subject = subject)
# The message names the assay that was actually checked in this cell.
print('Successful check on d0 antibody titre features.')

In [None]:
# Compare every cytokine column, for every subject of the integrated table,
# against the source assay table; check_values raises on a mismatch.
subject_ids = baseline_feats['subject_id'].unique()
for feature in cytokine_data.columns:
  for subject in subject_ids:
    check_values(
      feature = feature,
      original_file = cytokine_data,
      subject_file = metadata,
      joined_file = baseline_feats,
      subject = subject,
    )
print('Successful check on d0 olink features.')

In [None]:
# Compare every gene-expression column, for every subject of the integrated
# table, against the source assay table; check_values raises on a mismatch.
subject_ids = baseline_feats['subject_id'].unique()
for feature in gex_data.columns:
  for subject in subject_ids:
    check_values(
      feature = feature,
      original_file = gex_data,
      subject_file = metadata,
      joined_file = baseline_feats,
      subject = subject,
    )
print('Successful check on GEX features.')

In [None]:
# Report how many timepoint-0 specimens each assay table covers.
total_subjects = metadata['subject_id'].unique()
print(f'Metadata file contains {len(total_subjects)} subjects.')
d0_specimens = set(metadata[(metadata['timepoint']==0)]['specimen_id'].unique())
# Each count is the number of D0 specimen ids found in that assay's index.
# NOTE(review): the messages say "subjects"; this equals the specimen count
# only if every subject has a single D0 specimen. Confirm.
cell_freq_sp = len(set(cell_freq_data.index).intersection(d0_specimens))
print(f'Cell frequency data at D0 for {cell_freq_sp} subjects.')
gex_sp = len(set(gex_data.index).intersection(d0_specimens))
print(f'GEX data at D0 for {gex_sp} subjects.')
titre_sp = len(set(titre_data.index).intersection(d0_specimens))
# Print the variable computed just above (the undefined name raised NameError).
print(f'Antibody titre data at D0 for {titre_sp} subjects.')
cytokine_sp = len(set(cytokine_data.index).intersection(d0_specimens))
# Same fix, and the label now names the cytokine assay this count belongs to.
print(f'Cytokine data at D0 for {cytokine_sp} subjects.')
