<a href="https://colab.research.google.com/github/erichardson97/cell_crushers/blob/main/processing_data/training_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from google.colab import drive, files
import os
import numpy as np
import pandas as pd
from glob import glob
from collections import defaultdict
import requests
# Mount Google Drive at /content/drive/ so later cells can read and write files under it.
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
#@title Enter working directory name
working_directory = "/content/drive/MyDrive/CMIPB_Files"#@param {type:"string"}
# Create the working directory the first time the notebook runs; an existing
# directory is left untouched.
if not os.path.exists(working_directory):
  os.mkdir(working_directory)

In [4]:
# @title Reading in the cytokine name + gene name conversions from google drive.

sheet_url = 'https://docs.google.com/spreadsheets/d/1EIrUaQB7bJ1BO66x9TLfiXrbjpRyIwMQKru0HIgwr1s/edit#gid=0'
# Swap the interactive "edit" endpoint for the CSV export endpoint so pandas can
# read the sheet directly.
url_1 = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')
sheet = pd.read_csv(url_1)

# Both lookups map 'Feature name' -> 'Additional Feature name'; they differ only
# in which 'Category' rows they are built from.
_name_columns = ['Feature name', 'Additional Feature name']
cytokine_mapping = dict(sheet.loc[sheet['Category'] == 'Plasma cytokine', _name_columns].values)
gene_mapping = dict(sheet.loc[sheet['Category'] == 'Gene expression', _name_columns].values)


In [9]:
# @title Download the files if you need to.

def download_tsv(path, name):
  """Download the resource at URL `path` and save it to the local file `name`.

  Args:
    path: URL to fetch.
    name: Destination file path; an existing file is overwritten.

  Raises:
    requests.HTTPError: if the server answers with an error status, so that an
      error page is never saved in place of the data.
    requests.Timeout: if the server does not respond within 60 seconds.
  """
  response = requests.get(path, timeout = 60)
  response.raise_for_status()
  # Write the payload byte-for-byte: no decode/re-encode round trip, no extra
  # trailing newline, and the handle is closed even if the write fails.
  with open(name, 'wb') as handle:
    handle.write(response.content)

base_url = 'https://www.cmi-pb.org/downloads/cmipb_challenge_datasets/current/2nd_challenge/processed_datasets/master_processed_training_data_TSVs/'

# File names to fetch from `base_url`; each is saved locally under the same name.
# NOTE: this assignment rebinds `files`, hiding the `files` module imported from
# google.colab in the first cell.
files = [
  'abtiter_batchCorrected_data.tsv',
  'abtiter_metadata.tsv',
  'abtiter_normalized_data.tsv',
  'abtiter_raw_data.tsv',
  'pbmc_cell_frequency_batchCorrected_data.tsv',
  'pbmc_cell_frequency_metadata.tsv',
  'pbmc_cell_frequency_normalized_data.tsv',
  'pbmc_gene_expression_batchCorrected.tsv',
  'pbmc_gene_expression_metadata.tsv',
  'pbmc_gene_expression_raw_data.tsv',
  'plasma_cytokine_concentrations_batchCorrected.tsv',
  'plasma_cytokine_concentrations_metadata.tsv',
  'plasma_cytokine_concentrations_normalized_data.tsv',
  'subject_specimen.tsv',
]

# All downloads land in a 'files' sub-directory of the working directory.
file_path = os.path.join(working_directory, 'files')
if not os.path.exists(file_path):
  os.mkdir(file_path)

# `base_url` already ends with '/', so appending the file name yields the full URL.
for f in files:
  download_tsv(base_url + f, os.path.join(file_path, f))




In [12]:
# @title Reading in the files with set prefix.

def read_in_file(path, prefix, rename = False):
  """Read a tab-separated assay file and return it with one row per specimen.

  The file is read with pandas and transposed, so the file's column labels
  become the (integer) row index and the file's row labels become the columns.

  Args:
    path: Path of the tab-separated file.
    prefix: String prepended (followed by '_') to every column name.
    rename: Optional mapping (anything accepted by `pandas.Index.map`) applied
      to the column names before prefixing. Names the mapping does not cover,
      or maps to a missing value, keep their original name. `False` (the
      default) or `None` disables renaming.

  Returns:
    The transposed DataFrame with an integer index and prefixed column names.
  """
  data = pd.read_csv(path, sep = '\t').T
  data.index = data.index.map(int)
  if rename is not False and rename is not None:
    mapped = data.columns.map(rename)
    # Index.map yields NaN for names absent from the mapping; fall back to the
    # original name instead of failing on str + float below.
    data.columns = [old if pd.isna(new) else new for old, new in zip(data.columns, mapped)]
  data.columns = [f'{prefix}_{column}' for column in data.columns]
  return data

# Specimen-level metadata; every assay table below is joined onto it by specimen_id.
metadata = pd.read_csv(os.path.join(file_path, 'subject_specimen.tsv'), sep = '\t')
# Age = integer before the first '_' in `dataset` minus the integer before the
# first '-' in `year_of_birth`.
metadata['age'] = metadata.apply(lambda x:int(x.dataset.split('_')[0]) - int(x.year_of_birth.split('-')[0]), axis = 1)
data_type = 'normalized'#'batchCorrected'

# File names must match what the download cell wrote to disk: the batch-corrected
# cytokine and GEX files are listed there WITHOUT a '_data' suffix, and no
# normalized GEX file is listed, so the raw GEX file is used in that case.
# NOTE(review): confirm these names against the server listing.
if data_type == 'batchCorrected':
  cytokine_file = 'plasma_cytokine_concentrations_batchCorrected.tsv'
  gex_file = 'pbmc_gene_expression_batchCorrected.tsv'
else:
  cytokine_file = f'plasma_cytokine_concentrations_{data_type}_data.tsv'
  gex_file = 'pbmc_gene_expression_raw_data.tsv'

titre_data = read_in_file(os.path.join(file_path, f'abtiter_{data_type}_data.tsv'), 'Titre')
cell_freq_data = read_in_file(os.path.join(file_path, f'pbmc_cell_frequency_{data_type}_data.tsv'), 'Cellfrequency')
cytokine_data = read_in_file(os.path.join(file_path, cytokine_file), 'Cytokine', rename = cytokine_mapping)
gex_data = read_in_file(os.path.join(file_path, gex_file), 'GEX', rename = gene_mapping)

# Attach each assay to the metadata. how='right' keeps every metadata row, so
# specimens without a given assay get NaN in that assay's columns.
merged_obj = metadata
for assay_data in (titre_data, gex_data, cytokine_data, cell_freq_data):
  merged_obj = pd.merge(left = assay_data, right = merged_obj, left_index = True, right_on = 'specimen_id', how = 'right')


In [None]:
# data_type = 'normalized'#'batchCorrected'
# metadata = pd.read_csv('abtiter_metadata.tsv', sep = '\t')
# titre_data = read_in_file(f'abtiter_{data_type}_data.tsv', 'Titre')
# cell_freq_data = read_in_file(f'pbmc_cell_frequency_{data_type}_data.tsv', 'Cellfrequency')
# cytokine_data = read_in_file(f'plasma_cytokine_concentrations_{data_type}_data.tsv', 'Cytokine', rename = cytokine_mapping)
# gex_data = read_in_file(f'pbmc_gene_expression_{data_type}_data.tsv', 'GEX', rename = gene_mapping)
# cell_freq_data['subject'] = cell_freq_data.index.map(subject_id)
# cytokine_data['subject'] = cytokine_data.index.map(subject_id)
# gex_data['subject'] = gex_data.index.map(subject_id)
# titre_data['subject'] = titre_data.index.map(subject_id)
# which_data = pd.DataFrame([metadata['subject_id'].unique()]).T
# which_data = which_data.set_index(0)
# which_data['Titre'] = np.array(which_data.index.isin(set(titre_data['subject'].unique())), dtype = int)
# which_data['Cytokine'] = np.array(which_data.index.isin(set(cytokine_data['subject'].unique())), dtype = int)
# which_data['PBMC'] = np.array(which_data.index.isin(set(cell_freq_data['subject'].unique())), dtype = int)
# which_data['GEX'] = np.array(which_data.index.isin(set(gex_data['subject'].unique())), dtype = int)
# which_data.to_csv('/content/drive/MyDrive/CMIPB_Files/Subject_HasData.csv')

In [None]:
# Baseline (timepoint 0) rows; no subject may appear more than once.
baseline_values = merged_obj[merged_obj['timepoint']==0]
assert baseline_values['subject_id'].value_counts().max() == 1, 'Expected exactly one baseline specimen per subject.'

# Assay columns are recognised by the prefix given to them when they were read in.
prefixes = set(['Titre','Cellfrequency','Cytokine','GEX'])
baseline_feats = baseline_values[[p for p in baseline_values.columns if p.split('_')[0] in prefixes]]

# Work on an explicit copy so the encodings below do not trigger
# SettingWithCopyWarning.
subject = baseline_values[['subject_id', 'biological_sex', 'dataset', 'age', 'infancy_vac']].copy()
# Binary encodings: 'Female' -> 1, 'aP' -> 1, anything else (including missing) -> 0.
subject['biological_sex'] = (subject['biological_sex'] == 'Female').astype(int)
subject['infancy_vac'] = (subject['infancy_vac'] == 'aP').astype(int)
baseline_feats = pd.concat([baseline_feats, subject],axis=1)

# Prediction target: day-14 IgG-PT titre. The merge is an inner join, so subjects
# without a timepoint-14 row are dropped here.
day14_values = merged_obj[merged_obj['timepoint']==14][['subject_id', 'Titre_IgG_PT']].rename(columns = {'Titre_IgG_PT': 'Day14_IgG_Titre'}).set_index('subject_id')
baseline_feats = pd.merge(left = baseline_feats, right = day14_values, right_index = True, left_on = 'subject_id')
# Fold change of day 14 over baseline, with +1 added to numerator and denominator.
baseline_feats['Day14_IgG_FC'] = (baseline_feats['Day14_IgG_Titre'] + 1) / (baseline_feats['Titre_IgG_PT'] + 1)
# Missing values are written as 'ND'. The path is relative, so the file is
# created in the current working directory.
baseline_feats.fillna('ND').to_csv('IntegratedData_Normalized.tsv', sep = '\t')

In [93]:
def check_values(feature, original_file, subject_file, joined_file, subject):
  """Assert that one subject's baseline value in the joined table matches its source.

  Args:
    feature: Column name present in both `original_file` and `joined_file`.
    original_file: Assay table indexed by specimen id.
    subject_file: Table with 'subject_id', 'timepoint' and 'specimen_id' columns.
    joined_file: Integrated table with a 'subject_id' column and `feature`.
    subject: Subject id to check.

  Raises:
    AssertionError: if the feature is unknown, the subject does not have exactly
      one timepoint-0 specimen, or the joined value disagrees with the source.
  """
  assert feature in original_file.columns, f'{feature} is not a column of the source table.'
  baseline = subject_file[(subject_file['subject_id'] == subject) & (subject_file['timepoint'] == 0)]
  assert baseline.shape[0] == 1, f'Subject {subject} has {baseline.shape[0]} baseline specimens, expected 1.'
  specimen = baseline.iloc[0]['specimen_id']
  joined_value = joined_file.loc[joined_file['subject_id'] == subject, feature].iloc[0]
  if pd.isna(joined_value):
    # A missing joined value is legitimate when the specimen was never assayed
    # OR when the source table itself holds a missing value for it.
    assert specimen not in original_file.index or pd.isna(original_file.loc[specimen, feature]), \
      f'{feature} is missing for subject {subject} but specimen {specimen} has a value in the source table.'
  else:
    assert joined_value == original_file.loc[specimen, feature], \
      f'{feature} differs between joined and source tables for subject {subject} (specimen {specimen}).'

In [None]:
# Compare every baseline cell-frequency value in the integrated table with the
# table it was read from, one (feature, subject) pair at a time.
baseline_subjects = baseline_feats['subject_id'].unique()
for feature in cell_freq_data.columns:
  for subject in baseline_subjects:
    check_values(feature, cell_freq_data, metadata, baseline_feats, subject)
print('Successful check on d0 cell frequency features.')

In [None]:
# Compare every baseline antibody-titre value in the integrated table with the
# table it was read from.
for feature in titre_data.columns:
  for subject in baseline_feats['subject_id'].unique():
    check_values(feature, titre_data, metadata, baseline_feats, subject = subject)
# The message names the assay that was actually checked (it used to say
# "cell frequency").
print('Successful check on d0 antibody titre features.')

In [97]:
# Compare every baseline cytokine value in the integrated table with the table
# it was read from, one (feature, subject) pair at a time.
baseline_subjects = baseline_feats['subject_id'].unique()
for feature in cytokine_data.columns:
  for subject in baseline_subjects:
    check_values(feature, cytokine_data, metadata, baseline_feats, subject)
print('Successful check on d0 olink features.')

Specimen id: 1
Specimen id: 19
Specimen id: 27
Specimen id: 37
Specimen id: 45
Specimen id: 55
Specimen id: 63
Specimen id: 70
Specimen id: 77
Specimen id: 87
Specimen id: 96
Specimen id: 102
Specimen id: 109
Specimen id: 114
Specimen id: 121
Specimen id: 131
Specimen id: 138
Specimen id: 146
Specimen id: 153
Specimen id: 160
Specimen id: 167
Specimen id: 174
Specimen id: 181
Specimen id: 191
Specimen id: 201
Specimen id: 208
Specimen id: 216
Specimen id: 223
Specimen id: 232
Specimen id: 241
Specimen id: 248
Specimen id: 255
Specimen id: 266
Specimen id: 274
Specimen id: 281
Specimen id: 293
Specimen id: 300
Specimen id: 310
Specimen id: 317
Specimen id: 324
Specimen id: 332
Specimen id: 342
Specimen id: 349
Specimen id: 355
Specimen id: 360
Specimen id: 369
Specimen id: 376
Specimen id: 385
Specimen id: 392
Specimen id: 397
Specimen id: 405
Specimen id: 412
Specimen id: 419
Specimen id: 427
Specimen id: 434
Specimen id: 441
Specimen id: 450
Specimen id: 458
Specimen id: 11
Specimen i

In [None]:
# Compare every baseline gene-expression value in the integrated table with the
# table it was read from, one (feature, subject) pair at a time.
baseline_subjects = baseline_feats['subject_id'].unique()
for feature in gex_data.columns:
  for subject in baseline_subjects:
    check_values(feature, gex_data, metadata, baseline_feats, subject)
print('Successful check on GEX features.')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Specimen id: 434
Specimen id: 441
Specimen id: 450
Specimen id: 458
Specimen id: 11
Specimen id: 468
Specimen id: 475
Specimen id: 483
Specimen id: 490
Specimen id: 498
Specimen id: 506
Specimen id: 513
Specimen id: 521
Specimen id: 529
Specimen id: 537
Specimen id: 546
Specimen id: 554
Specimen id: 562
Specimen id: 569
Specimen id: 577
Specimen id: 585
Specimen id: 593
Specimen id: 601
Specimen id: 608
Specimen id: 616
Specimen id: 623
Specimen id: 630
Specimen id: 636
Specimen id: 643
Specimen id: 650
Specimen id: 657
Specimen id: 664
Specimen id: 669
Specimen id: 674
Specimen id: 681
Specimen id: 688
Specimen id: 695
Specimen id: 702
Specimen id: 709
Specimen id: 716
Specimen id: 723
Specimen id: 1
Specimen id: 19
Specimen id: 27
Specimen id: 37
Specimen id: 45
Specimen id: 55
Specimen id: 63
Specimen id: 70
Specimen id: 77
Specimen id: 87
Specimen id: 96
Specimen id: 102
Specimen id: 109
Specimen id: 114
Specimen id: 

In [None]:
# Report how many timepoint-0 specimens have data in each assay table.
total_subjects = metadata['subject_id'].unique()
print(f'Metadata file contains {len(total_subjects)} subjects.')
d0_specimens = set(metadata[(metadata['timepoint']==0)]['specimen_id'].unique())
cell_freq_sp = len(set(cell_freq_data.index).intersection(d0_specimens))
print(f'Cell frequency data at D0 for {cell_freq_sp} subjects.')
gex_sp = len(set(gex_data.index).intersection(d0_specimens))
print(f'GEX data at D0 for {gex_sp} subjects.')
titre_sp = len(set(titre_data.index).intersection(d0_specimens))
# Print the counts computed above; the previous names `titre_specimens` and
# `cytokine_specimens` were never defined.
print(f'Antibody titre data at D0 for {titre_sp} subjects.')
cytokine_sp = len(set(cytokine_data.index).intersection(d0_specimens))
print(f'Cytokine data at D0 for {cytokine_sp} subjects.')
