In [None]:
import glob
import numpy as np
import pandas as pd
import torch
import mpcca_collin.MPCCA.micca_model as micca
# Allow up to 200 characters per line when numpy arrays are printed in cell output.
np.set_printoptions(linewidth = 200)

In [None]:
def filter_data(X, min_mean = None, min_var = None, max_missing = None):
  """Return X without the columns that fail any of the requested thresholds.

  Each criterion is evaluated per column and only applied when its threshold
  is not None; with no thresholds given, no columns are removed.

  Args:
    X: DataFrame whose columns are screened.
    min_mean: drop columns whose mean is <= min_mean.
    min_var: drop columns whose variance is <= min_var.
    max_missing: drop columns whose fraction of missing entries is
      > max_missing.

  Returns:
    The result of X.drop(columns=...) for the rejected columns; X itself is
    not modified.
  """
  rejected = []
  if min_mean is not None:
    col_means = X.mean()
    rejected.extend(col_means[col_means <= min_mean].index)
  if min_var is not None:
    col_vars = X.var()
    rejected.extend(col_vars[col_vars <= min_var].index)
  if max_missing is not None:
    # Mean of the boolean NaN mask == fraction of missing rows per column.
    frac_missing = X.isna().mean()
    rejected.extend(frac_missing[frac_missing > max_missing].index)
  return X.drop(columns = rejected)

In [None]:
# Per-modality input tables; each is read in the data-loading cells further
# down in this notebook.
geno_pc_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/covariates/mesa.1319_samples.covariates.txt'
exp_fn ='/gpfs/commons/datasets/controlled/MESA/SHARe_TOPMed/Transcriptomics/TOPMed_MESA_RNAseq_Pilot_expression_data/TOPMed_MESA_RNAseq_Pilot_RSEMv1.3.0.rsem_genes_tpm.txt'
methyl_fn = '/gpfs/commons/projects/MESA/projects/preprocess_methylation/normalization/norm.beta.pc11.exam1.no_duplicates.no_relatives.txt.gz'
metab_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/metabolites/phenotype_matrices/Metabolites_Exam_1.invnorm.parquet'
protein_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/proteins/phenotype_matrices/Proteins_Exam_1.invnorm.parquet'

# Probe annotation tables used to build the list of methylation probes to
# exclude.
probe_snp_overlap_fn = '/gpfs/commons/projects/MESA/projects/annotation/EPIC_polymorphicprobes/freeze8.pass_only.phased.mesa_1319samples.maf01.biallelic.SNVs_overlapping_with_probes.txt.gz'
probe_manifest_fn = '/gpfs/commons/projects/MESA/projects/annotation/EPIC_hg38/EPIC.hg38.manifest.tsv.gz'

# Sample bookkeeping, cell-type and batch/covariate tables; these are joined
# into the combined metadata frame.
sample_info_fn = '/gpfs/commons/projects/MESA/projects/summary_of_data/unique_individuals_across_molecular_data.txt'
methyl_celltype_fn = '/gpfs/commons/projects/MESA/projects/preprocess_methylation/houseman/estimated_cellcounts_houseman.txt'
exp_celltype_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/deconvolution/TOPMed_MESA_RNAseq.2648_freeze_samples.cibersort_lm22.collapsed.txt'
methyl_batch_fn = '/gpfs/commons/projects/MESA/projects/preprocess_methylation/samplesheet/samplesheet_with_feno.mixup_fix.only_BIS.pass_QC.no_duplicates.no_relatives.freeze_8.multiethnic.txt'
exp_batch_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/metadata/TOPMed_MESA_RNAseq_2973samples_metadata.txt'
metab_batch_fn = '/gpfs/commons/datasets/controlled/MESA/topmed_mesa_exchange/metabolites/covariates/Metabolites_Exam_1.txt'

# Glob pattern (note the '*'), not a single file: every match is read and
# concatenated column-wise into the phenotype table.
phenotype_fp = '/gpfs/commons/projects/MESA/projects/pheno_data/exam_1.*.txt'

In [None]:
# Output locations: each collated table in this notebook is written with
# DataFrame.to_json to the path named after it here.
methyl_data_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/methyl_cca_collated.json"
metab_data_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/metab_cca_collated.json"
exp_data_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/exp_cca_collated.json"
geno_data_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/geno_cca_collated.json"
protein_data_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/protein_cca_collated.json"
meta_df_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/metadata_cca_collated.json"
pheno_df_save_fn="/gpfs/commons/projects/MESA/projects/mpcca/phenotypes_cca_collated.json"

In [None]:
# Sample bookkeeping table; keeps the default integer index.
sample_info = pd.read_table(sample_info_fn)

# Cell-type estimate tables, indexed by the ID column they are later joined on.
exp_ct = pd.read_table(exp_celltype_fn, index_col=0)
methyl_ct = pd.read_table(methyl_celltype_fn, index_col='TOEID')

# Batch / covariate tables, one per assay.
methyl_batch = pd.read_table(methyl_batch_fn, index_col='TOEID')
exp_batch = pd.read_table(exp_batch_fn, index_col=0)
# The metabolite covariate file is transposed after loading so that the
# entries of its header row become the row index.
metab_batch = pd.read_table(metab_batch_fn, index_col=0).transpose()

In [None]:
# An assay column may hold several ';'-separated sample IDs in one cell; give
# every ID its own row, one assay column at a time. Rows therefore multiply
# when more than one column holds several IDs.
split_columns = ['Metabolomics', 'Methylomics', 'Proteomics', 'Transcriptomics_PBMC']
for assay_col in split_columns:
  sample_info[assay_col] = sample_info[assay_col].str.split(';')
  sample_info = sample_info.explode(assay_col)

# Keep exam-1 rows that carry a sample ID for every one of the four assays,
# then discard the columns that are no longer needed and any exact duplicates.
is_exam_1 = sample_info['Exam'] == 1
has_every_assay = sample_info[split_columns].notna().all(axis=1)
unused_columns = ['Exam', 'Transcriptomics_Mono', 'Transcriptomics_Tcell',
                  'nr_molecular_file']
sample_info = (
    sample_info[is_exam_1 & has_every_assay]
    .drop(columns=unused_columns)
    .drop_duplicates()
)

In [None]:
# Columns of the methylation sample sheet that are not carried into the joined
# metadata table (presumably because the sample-info table already provides
# them or they are pure identifiers; confirm before changing this list).
methyl_unwanted_cols = [
    'Sample_Name', 'Sample_Group', 'Pool_ID', 'sidno', 'DecodedID',
    'LEVELE1.id', 'exam', 'age', 'race', 'race.1', 'sex', 'sex.1',
    'Basename', 'sidno_exam.ID', 'NWDid',
]
methyl_batch = methyl_batch.drop(columns=methyl_unwanted_cols)

In [None]:
# Columns of the RNA-seq metadata that are not carried into the joined metadata
# table (presumably because the sample-info table already provides them or
# they are pure identifiers; confirm before changing this list).
exp_unwanted_cols = [
    'age', 'exam', 'latest_run_ended', 'participant_id', 'race',
    'sample_type', 'sex', 'study', 'cohort', 'cohort2', 'has_genotype',
    'exclusion_reason', 'phase', 'source', 'nwd_id',
]
exp_batch = exp_batch.drop(columns=exp_unwanted_cols)

In [None]:
# NOTE: Individuals who contributed several samples appear on several rows of
#       this metadata frame. All rows are kept for reference, but they must be
#       filtered later based on the missingness patterns in the covariates.
#       The frame may also contain related individuals?
#
# Each covariate table is attached by looking up the matching ID column of
# sample_info in that table's index.
meta_df = sample_info.join(exp_batch, on='Transcriptomics_PBMC')
meta_df = meta_df.join(metab_batch[['bmi1c', 'column_2', 'column_3']], on="NWDID")
meta_df = meta_df.join(methyl_batch, on='Methylomics')
meta_df = meta_df.join(methyl_ct.drop(columns='IID'), on='Methylomics')
meta_df = meta_df.join(exp_ct, on='Transcriptomics_PBMC')

# Per-row count of missing values. Assigned as a plain array (positionally)
# because the row labels may repeat after the earlier explode.
meta_df['num_nans'] = meta_df.isna().sum(axis=1).to_numpy()
# Replace the row labels with a fresh 0..n-1 index; the previous labels are
# kept as an ordinary column.
meta_df = meta_df.reset_index()
meta_df.to_json(meta_df_save_fn)

In [None]:
# Build the list of methylation probes to exclude.
#
# 1) Probes from the SNP-overlap table. The two columns read here hold
#    ';'-separated lists that are split and exploded in lockstep, giving one
#    (probe, distance) pair per row.
probe_distance = pd.read_table(probe_snp_overlap_fn, usecols=[11, 12]).apply(
    lambda x: x.str.split(';')).explode(['probe', 'distance'])
# NOTE(review): this keeps every pair whose distance is < 5, which includes any
# negative values. If distances in this file are signed, an absolute value may
# be what is intended -- confirm against the file's documentation.
bad_probes_dist = probe_distance[probe_distance['distance'].astype('int') < 5].probe

# 2) Probes whose manifest chromosome is chrM, chrX or chrY.
probe_chr = pd.read_table(probe_manifest_fn, usecols=[0, 4])
excluded_chrms = ['chrM', 'chrX', 'chrY']
bad_probes_chr = probe_chr[probe_chr['CpG_chrm'].isin(excluded_chrms)].probeID

# Union of both lists, de-duplicated in order of first appearance. Calling
# Series.unique() on the concatenated Series avoids passing a plain Python list
# to pd.unique, which recent pandas versions deprecate.
bad_probes = pd.concat([bad_probes_chr, bad_probes_dist], ignore_index=True).unique()

In [None]:
# The methylation matrix is only partially loaded: each data row is skipped
# with probability METHYL_SKIP_PROB, so roughly 20% of the rows are read.
# The generator is seeded and re-created every time this cell runs, so the same
# rows are selected on every run (the original draw was unseeded and changed
# from run to run).
METHYL_SKIP_PROB = 0.8
METHYL_SUBSAMPLE_SEED = 0
_methyl_rng = np.random.default_rng(METHYL_SUBSAMPLE_SEED)


def skip_f(x):
  """Decide whether pandas should skip row number x while reading the file.

  Row 0 (the header line) is never skipped; every other row is skipped with
  probability METHYL_SKIP_PROB using the seeded generator.
  """
  if x == 0:
    return False
  return bool(_methyl_rng.random() < METHYL_SKIP_PROB)


methyl_data = pd.read_table(methyl_fn, index_col=0, skiprows=skip_f)
# Remove the excluded probes (rows of the file), then transpose so that the
# probes become columns.
methyl_data = methyl_data.drop(methyl_data.index.intersection(bad_probes)).T

In [None]:
# Genotype covariate table, indexed by its first column; the 'gender' column is
# not carried forward.
geno_data = pd.read_table(geno_pc_fn, index_col=0)
geno_data = geno_data.drop(columns='gender')

# Both parquet matrices are transposed after loading (presumably so that
# individuals end up on the rows, matching how the row indexes are intersected
# across tables later -- confirm against the files).
metab_data = pd.read_parquet(metab_fn).transpose()
protein_data = pd.read_parquet(protein_fn).transpose()

In [None]:
# Expression matrix: drop the transcript-ID annotation column and transpose so
# that each row is one RNA-seq sample.
exp_data = pd.read_table(exp_fn, index_col=0).drop(columns='transcript_id(s)').T
# Some inds have duplicate samples, keep the one with the least batch/ct nans.
# Sorting by num_nans within NWDID and keeping the first row per NWDID does
# exactly that.
exp_data = (
    meta_df.join(exp_data, on='Transcriptomics_PBMC', how='inner')
    .sort_values(by=['sidno', 'NWDID', 'num_nans'])
    .drop_duplicates(subset='NWDID')
)
# set_index returns a new frame, so its result has to be assigned; previously
# it was discarded and the rows kept meta_df's integer labels instead of NWDID.
exp_data = exp_data.set_index('NWDID')
# Remove the metadata columns again so only expression columns remain. NWDID is
# now the index rather than a column, so it is excluded from the labels to drop.
exp_data = exp_data.drop(columns=meta_df.columns.drop('NWDID'))

In [None]:
# Row labels present in all five tables, starting from the expression table
# and narrowing with each of the others in turn.
common_inds = exp_data.index
for other_index in (geno_data.index, methyl_data.index, metab_data.index,
                    protein_data.index):
  common_inds = common_inds.intersection(other_index)

# Restrict every table to those shared labels, in the same row order.
geno_data, exp_data, methyl_data, metab_data, protein_data = (
    frame.loc[common_inds]
    for frame in (geno_data, exp_data, methyl_data, metab_data, protein_data)
)

In [None]:
# Expression: drop columns whose mean is <= 0.1 or whose variance is <= 0
# (i.e. constant columns).
exp_data = filter_data(exp_data, min_mean=0.1, min_var=0)
# Methylation: max_missing=0 drops every column that has any missing value.
methyl_data = filter_data(methyl_data, max_missing=0)
# Alternative that is currently disabled: fill missing values with the column
# mean instead of dropping the column.
# methyl_data.fillna(methyl_data.mean())

In [None]:
# I don't love saving this as json, open to better options.
# Each aligned table is written to its own output path, in the order listed.
collated_outputs = (
    (geno_data, geno_data_save_fn),
    (exp_data, exp_data_save_fn),
    (methyl_data, methyl_data_save_fn),
    (metab_data, metab_data_save_fn),
    (protein_data, protein_data_save_fn),
)
for collated_frame, save_fn in collated_outputs:
  collated_frame.to_json(save_fn)

In [None]:
# glob.glob returns its matches in an arbitrary, filesystem-dependent order.
# Sorting the file names makes the column order of the concatenated phenotype
# table (and of the saved JSON) the same on every run.
pheno_fns = sorted(glob.glob(phenotype_fp))
# Read every matching phenotype file indexed by its first column, place them
# side by side, then drop the listed columns.
pheno_df = pd.concat(
    [pd.read_table(fn, index_col=0) for fn in pheno_fns], axis=1).drop(
    columns=['sex', 'sidno', 'age', 'race', 'race_1', 'sex_1', 'race_white',
             'race_black', 'race_chinese', 'race_hispanic', 'bmi'])
pheno_df.to_json(pheno_df_save_fn)