<a href="https://colab.research.google.com/github/collinwa/MPCCA/blob/main/analysis_notebooks/preprocess_mesa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import glob
import numpy as np
import pandas as pd
import torch
# Allow up to 200 characters per line when NumPy prints arrays, so wide arrays wrap less.
np.set_printoptions(linewidth = 200)

In [2]:
def filter_data(X, min_mean = None, min_var = None, max_missing = None):
  """Return `X` without the columns that fail any of the enabled filters.

  Every threshold is optional; a criterion is skipped when its threshold is None.

  Args:
    X: pandas DataFrame whose columns are screened.
    min_mean: drop columns whose mean is less than or equal to this value.
    min_var: drop columns whose variance (as computed by `DataFrame.var`) is
      less than or equal to this value.
    max_missing: drop columns whose fraction of missing entries is strictly
      greater than this value.

  Returns:
    The result of `X.drop(columns=...)`; `X` itself is left unmodified.
  """
  rejected = []
  if min_mean is not None:
    col_means = X.mean()
    rejected.extend(col_means.index[col_means <= min_mean])
  if min_var is not None:
    col_vars = X.var()
    rejected.extend(col_vars.index[col_vars <= min_var])
  if max_missing is not None:
    # Fraction of missing entries per column.
    missing_frac = X.isna().mean()
    rejected.extend(missing_frac.index[missing_frac > max_missing])
  return X.drop(columns = rejected)

In [3]:
# Exam to preprocess; set to '1' to build the Exam 1 inputs instead.
exam = '5'

# Basic datasets
exp_fn = f'/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/rna/phenotype_matrices/PBMC_Exam_{exam}.expression.bed.gz'
methyl_fn = f'/gpfs/commons/projects/MESA/projects/mapping_mqtls/input/data/topmed_mesa.exam_{exam}.multiethnic.normalized_methylation.bed.gz'
metab_fn = f'/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/metabolites/phenotype_matrices/Metabolites_Exam_{exam}.invnorm.parquet'
protein_fn = f'/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/proteins/phenotype_matrices/Proteins_Exam_{exam}.invnorm.parquet'

# Probe info
probe_snp_overlap_fn = '/gpfs/commons/projects/MESA/projects/annotation/EPIC_polymorphicprobes/freeze8.pass_only.phased.mesa_1319samples.maf01.biallelic.SNVs_overlapping_with_probes.txt.gz'
probe_manifest_fn = '/gpfs/commons/projects/MESA/projects/annotation/EPIC_hg38/EPIC.hg38.manifest.tsv.gz'

# Sample/phenotype/batch info (without exam labels?)
sample_info_fn = '/gpfs/commons/projects/MESA/projects/summary_of_data/unique_individuals_across_molecular_data.txt'
methyl_celltype_fn = '/gpfs/commons/projects/MESA/projects/preprocess_methylation/houseman/estimated_cellcounts_houseman.txt'
exp_celltype_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/deconvolution/TOPMed_MESA_RNAseq.2648_freeze_samples.cibersort_lm22.collapsed.txt'
methyl_batch_fn = '/gpfs/commons/projects/MESA/projects/preprocess_methylation/samplesheet/samplesheet_with_feno.mixup_fix.only_BIS.pass_QC.no_duplicates.no_relatives.freeze_8.multiethnic.txt'
exp_batch_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/metadata/TOPMed_MESA_RNAseq_2973samples_metadata.txt'

# Sample/phenotype/batch info with exam labels
metab_batch_fn = f'/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/metabolites/covariates/Metabolites_Exam_{exam}.txt'
# Glob pattern (not a single file): matches every phenotype table for this exam.
phenotype_fp = f'/gpfs/commons/projects/MESA/projects/pheno_data/exam_{exam}.*.txt'

In [4]:
# Output locations for the collated, sample-matched tables; one CSV per data
# type, tagged with the exam number.
methyl_data_save_fn = f'/gpfs/commons/projects/MESA/projects/mpcca/methyl_cca_collated_exam{exam}.csv'
metab_data_save_fn = f'/gpfs/commons/projects/MESA/projects/mpcca/metab_cca_collated_exam{exam}.csv'
exp_data_save_fn = f'/gpfs/commons/projects/MESA/projects/mpcca/exp_cca_collated_exam{exam}.csv'
protein_data_save_fn = f'/gpfs/commons/projects/MESA/projects/mpcca/protein_cca_collated_exam{exam}.csv'
meta_df_save_fn = f'/gpfs/commons/projects/MESA/projects/mpcca/metadata_cca_collated_exam{exam}.csv'

In [5]:
# Load the sample, batch and cell-type tables as tab-separated files.
# Individual/sample table; its per-assay label columns are split and filtered in a later cell.
sample_info = pd.read_table(sample_info_fn)
# Metabolite covariates: the first file column becomes the index, then the table is
# transposed so the file's header labels become the row index.
# NOTE(review): the later join on 'NWDID' assumes those header labels are NWD ids — confirm.
metab_batch = pd.read_table(metab_batch_fn, index_col=0).T
# RNA-seq sample metadata, indexed by the first file column.
exp_batch = pd.read_table(exp_batch_fn, index_col=0)
# Methylation sample sheet, indexed by the 'TOEID' column.
methyl_batch = pd.read_table(methyl_batch_fn, index_col='TOEID')

# Cell-type tables for the methylation (indexed by 'TOEID') and expression samples.
methyl_ct = pd.read_table(methyl_celltype_fn, index_col='TOEID')
exp_ct = pd.read_table(exp_celltype_fn, index_col=0)

In [6]:
# Note: some expression samples are missing from this sample_info table, so it
#   cannot be used on its own to find the samples with complete data; the data
#   types have to be intersected manually.

# A cell in these columns may hold several ';'-separated sample labels. Split
# each one and explode so that every row carries a single label per column.
split_columns = ['Metabolomics', 'Methylomics', 'Proteomics', 'Transcriptomics_PBMC']
for column in split_columns:
  sample_info = sample_info.assign(
      **{column: sample_info[column].str.split(';')}).explode(column)

# Keep only the rows of the selected exam, drop the columns that are not used
# afterwards, and collapse identical rows.
sample_info = sample_info.loc[sample_info['Exam'].eq(int(exam))]
sample_info = sample_info.drop(
    columns=['Exam', 'Transcriptomics_Mono', 'Transcriptomics_Tcell',
             'nr_molecular_file'])
sample_info = sample_info.drop_duplicates()

In [7]:
# Remove these sample-sheet columns from the methylation batch table so they
# are not carried into the joined metadata.
methyl_batch = methyl_batch.drop(
    labels=[
        'Sample_Name', 'Sample_Group', 'Pool_ID', 'sidno', 'DecodedID',
        'LEVELE1.id', 'exam', 'age', 'race', 'race.1', 'sex', 'sex.1',
        'Basename', 'sidno_exam.ID', 'NWDid',
    ],
    axis='columns')

In [8]:
# Remove these columns from the RNA-seq metadata table so they are not carried
# into the joined metadata.
exp_batch = exp_batch.drop(
    labels=[
        'age', 'exam', 'latest_run_ended', 'participant_id', 'race',
        'sample_type', 'sex', 'study', 'cohort', 'cohort2', 'has_genotype',
        'phase', 'source', 'nwd_id',
    ],
    axis='columns')

In [9]:
# Read every phenotype table matching the exam's glob pattern and place them
# side by side (aligned on the first file column, used as the index).
# glob.glob returns matches in arbitrary order, so sort them to keep the
# resulting column order reproducible across runs and filesystems.
pheno_files = sorted(glob.glob(phenotype_fp))
if not pheno_files:
  # Fail with the pattern in the message instead of pandas' generic
  # "No objects to concatenate" error.
  raise FileNotFoundError('No phenotype files match pattern: ' + phenotype_fp)
pheno_df = pd.concat(
    [pd.read_table(fn, index_col=0) for fn in pheno_files], axis=1).drop(
    columns=['sex', 'sidno', 'age', 'race', 'race_1', 'sex_1', 'race_white',
             'race_black', 'race_chinese', 'race_hispanic'])

In [10]:
# exp_batch seems to include the samples that have extant RNA-seq data but NA
#   for the sample id in the sample_info file.
# metab_batch has an entry for every data point.
# methyl_batch has an entry for every data point.
# Left-join every batch / cell-type / phenotype table onto sample_info, each
# through the sample_info column that holds the matching identifier.
meta_df = (
    sample_info
    .join(exp_batch, on='Transcriptomics_PBMC')
    .join(metab_batch[['column_2', 'column_3']], on='NWDID')
    .join(methyl_batch, on='Methylomics')
    .join(methyl_ct.drop(columns='IID'), on='Methylomics')
    .join(exp_ct, on='Transcriptomics_PBMC')
    .join(pheno_df, on='NWDID')
)
# Number of missing fields in each row.
meta_df['num_nans'] = meta_df.isna().sum(axis=1).to_numpy()
# Index the rows by individual id while keeping NWDID as a regular column too.
meta_df.index = meta_df['NWDID']

In [11]:
# Display the joined metadata table to inspect the result of the joins.
# NOTE(review): the rendered output shows per-participant fields; consider
# clearing outputs before sharing the notebook — confirm against the data-use policy.
meta_df

Unnamed: 0_level_0,sidno,NWDID,Sex,Race,Age,Metabolomics,Methylomics,Proteomics,Transcriptomics_PBMC,3p_bias,...,smoking_packyears,smoking_age_start,smoking_current,smoking_ever,smoking_former,smoking_quit_smoking,cotinine,smoking_score_smoking,smoking_score_methylation,num_nans
NWDID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NWD331470,10006,NWD331470,male,hispanic,77,,,,TOR379757,,...,5.60,18.0,0.0,1.0,1.0,,,,,45
NWD133576,10008,NWD133576,male,white,69,TOM144001,TOE414643,TOP458029,TOR882863,,...,0.00,,0.0,1.0,1.0,,,-7.37,3.11,16
NWD678628,10016,NWD678628,female,hispanic,83,TOM259632,TOE394564,TOP581313,,,...,0.00,0.0,0.0,0.0,0.0,,7.072136,-10.02,3.79,24
NWD302101,10020,NWD302101,female,white,87,TOM709900,TOE769161,TOP493348,TOR811735,0.22,...,0.00,0.0,0.0,0.0,0.0,,,-6.75,3.37,10
NWD132273,10028,NWD132273,male,white,81,TOM971531,TOE553927,TOP579694,TOR726547,0.16,...,,,,,,,,,,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NWD969537,26428,NWD969537,female,black,76,TOM253836,TOE901842,TOP273686,,,...,0.00,0.0,0.0,0.0,0.0,,8.786068,-5.91,2.96,27
NWD133198,26438,NWD133198,male,black,73,TOM531375,,TOP621660,,,...,7.60,16.0,0.0,1.0,1.0,,,,,52
NWD306904,26453,NWD306904,female,black,67,,,,TOR388124,,...,4.86,67.0,1.0,1.0,0.0,,137.500000,,,42
NWD164446,26461,NWD164446,female,black,63,TOM218907,TOE628110,TOP336067,TOR928571,0.18,...,0.00,,0.0,1.0,1.0,,7.072136,-5.79,-2.07,9


In [12]:
# Probe/SNV overlap table: columns 11 and 12 hold ';'-separated lists of probe
# ids and distances. Split both and explode them together so that each row is
# one (probe, distance) pair.
probe_distance = pd.read_table(probe_snp_overlap_fn, usecols=[11, 12]).apply(
    lambda x: x.str.split(';')).explode(['probe', 'distance'])
# Flag probes whose listed distance is below 5.
# NOTE(review): this is a signed comparison; if distances can be negative,
# every negative value is flagged as well — confirm that is intended.
bad_probes_dist = probe_distance[probe_distance['distance'].astype('int') < 5].probe

# Flag probes that the manifest places on chrM, chrX or chrY.
probe_chr = pd.read_table(probe_manifest_fn, usecols=[0, 4])
bad_probes_chr = probe_chr[
    probe_chr['CpG_chrm'].isin(['chrM', 'chrX', 'chrY'])].probeID

# Union of both exclusion lists, in order of first appearance. A Series is
# passed to pd.unique because calling it on a plain Python list is deprecated.
bad_probes = pd.unique(
    pd.concat([bad_probes_chr, bad_probes_dist], ignore_index=True))

In [13]:
# Load a random ~20% subsample of the methylation rows: row 0 (the header) is
# always kept and every other row is skipped with probability 0.8.
# The draw uses a dedicated, seeded generator that is re-created every time
# this cell runs, so the same rows are selected on each run (previously the
# unseeded global NumPy state made the selected probes differ between runs).
# NOTE(review): this relies on pandas calling the skiprows callable once per
# row, in order — confirm for the parser engine in use.
METHYL_SUBSAMPLE_SEED = 0
_methyl_rng = np.random.default_rng(METHYL_SUBSAMPLE_SEED)
skip_f = lambda x: False if x == 0 else bool(_methyl_rng.random() < 0.8)
methyl_data = pd.read_table(methyl_fn, index_col='probe_id', skiprows=skip_f).drop(columns = ['#chr', 'start', 'end'])
# Remove the excluded probes, then transpose so that probes become the columns.
methyl_data = methyl_data.drop(methyl_data.index.intersection(bad_probes)).T
methyl_data.index.name = 'NWDID'

In [14]:
# Load the metabolite and protein matrices and transpose them.
# NOTE(review): presumably this puts individuals on the rows, as for the other
# data types — verify against the parquet layout.
metab_data = pd.read_parquet(metab_fn).T
protein_data = pd.read_parquet(protein_fn).T

In [15]:
# Load the expression matrix indexed by gene_id, drop the positional columns,
# and transpose so that genes become the columns.
exp_data =  pd.read_table(exp_fn, index_col='gene_id').drop(columns = ['#chr', 'start', 'end']).T
# Label the row index (the remaining file columns) as NWDID.
exp_data.index.name = 'NWDID'

In [16]:
# Row labels shared by all four molecular matrices and the metadata table.
common_inds = (
    exp_data.index
    .intersection(methyl_data.index)
    .intersection(metab_data.index)
    .intersection(protein_data.index)
    .intersection(meta_df.index)
)

# Restrict every molecular matrix to those shared rows, in the same order.
exp_data, methyl_data, metab_data, protein_data = (
    frame.loc[common_inds]
    for frame in (exp_data, methyl_data, metab_data, protein_data)
)

In [17]:
# Some inds are still duplicated in the meta_df, and there isn't always a map
#   between the sample and the ind in the mol dataset. Thus, for each NWDID,
#   keep the row with the fewest missing values (sorting by num_nans puts it
#   first, and drop_duplicates keeps the first occurrence). This seems to work.
# Sorting leaves the rows in NWDID order, so the final .loc[common_inds] puts
#   them back in the same row order as the molecular matrices, which were
#   subset with .loc[common_inds].
meta_df = (
    meta_df.loc[common_inds]
    .reset_index(drop=True)
    .sort_values(by=['NWDID', 'num_nans'])
    .drop_duplicates(subset='NWDID')
    .drop('sidno', axis=1)
    .set_index('NWDID')
    .loc[common_inds]
)

In [18]:
# Display the de-duplicated metadata table, now indexed by NWDID.
# NOTE(review): the rendered output shows per-participant fields; consider
# clearing outputs before sharing the notebook — confirm against the data-use policy.
meta_df

Unnamed: 0_level_0,Sex,Race,Age,Metabolomics,Methylomics,Proteomics,Transcriptomics_PBMC,3p_bias,RQS,adapter_contam_pct,...,smoking_packyears,smoking_age_start,smoking_current,smoking_ever,smoking_former,smoking_quit_smoking,cotinine,smoking_score_smoking,smoking_score_methylation,num_nans
NWDID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NWD101761,female,hispanic,61,TOM399029,TOE440608,TOP369611,TOR503055,0.22,8.056686,0.0,...,9.96,34.0,0.0,1.0,1.0,,7.072136,-3.86,3.32,6
NWD103464,female,hispanic,59,TOM403817,TOE296333,TOP357129,,,,,...,0.00,0.0,0.0,0.0,0.0,,7.072136,-4.74,3.73,24
NWD104274,male,white,74,TOM312041,TOE345490,TOP892744,TOR816202,,,,...,0.00,,0.0,1.0,1.0,,7.072136,-6.96,3.54,11
NWD105109,male,white,68,TOM501343,TOE965945,TOP880162,TOR687833,,,,...,0.00,0.0,0.0,0.0,0.0,,7.072136,-5.56,3.41,13
NWD108116,male,black,58,TOM153371,TOE604265,TOP552992,TOR830129,0.22,8.822969,0.0,...,0.00,30.0,0.0,1.0,1.0,,7.072136,-2.95,3.51,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NWD989585,male,hispanic,58,TOM497270,TOE523211,TOP323636,TOR893868,0.19,8.882389,0.0,...,10.80,5.0,0.0,1.0,1.0,,7.072136,-7.18,3.68,5
NWD989947,female,hispanic,71,TOM143489,TOE714862,TOP751194,TOR200471,0.17,10.000000,0.0,...,0.00,,0.0,1.0,1.0,,7.072136,-4.46,4.09,10
NWD993367,female,white,70,TOM837246,TOE832975,TOP380058,TOR105058,0.16,8.865249,0.0,...,5.50,15.0,0.0,1.0,1.0,,7.072136,-4.44,2.76,8
NWD993431,female,black,80,TOM434339,TOE174323,TOP383965,,,,,...,0.75,19.0,0.0,1.0,1.0,,7.072136,-7.84,3.09,28


In [19]:
# Drop every methylation column (probe) that has any missing value among the
# retained rows: with max_missing=0, a column is removed as soon as its
# fraction of missing entries is greater than zero.
methyl_data = filter_data(methyl_data, max_missing=0)

In [20]:
# Write the matched tables to CSV, one file per data type plus the metadata.
# Each file includes the row index, so rows can be matched across files by label.
exp_data.to_csv(exp_data_save_fn)
methyl_data.to_csv(methyl_data_save_fn)
metab_data.to_csv(metab_data_save_fn)
protein_data.to_csv(protein_data_save_fn)
meta_df.to_csv(meta_df_save_fn)