In [3]:
import pandas as pd
import scanpy as sc
import anndata
import numpy as np

# Clinical TMT tables, one CSV per cohort / brain region; they are stacked
# row-wise in exactly this order further down.
clinical_csv_paths = [
    'brain/TMT/MSBB_clinical_TMT_1.csv',
    'brain/TMT/Rosmapr1_clinical_TMT.csv',
    'brain/TMT/Rosmapr2_clinical_TMT_1.csv',
    'brain/TMT/RosmapBA637_clinical_TMT_1.csv',
    'brain/TMT/EmoryBA9_clinical_TMT_1.csv',
    'brain/TMT/EmoryBA24_clinical_TMT_1.csv',
]

# Read every table and key its rows by sample ID. A column named
# 'batch.channel' is renamed to 'ID' first; a table without such a column
# is expected to carry an 'ID' column already (set_index raises otherwise).
datasets = [
    pd.read_csv(csv_path)
    .rename(columns={'batch.channel': 'ID'})
    .set_index('ID')
    for csv_path in clinical_csv_paths
]

# Keep only the clinical variables of interest and coerce each of them to a
# numeric dtype; entries that cannot be parsed as numbers become NaN.
columns_to_select = ['age', 'sex', 'Abeta42', 'tTau', 'pTau']
datasets = [
    cohort_table[columns_to_select].apply(pd.to_numeric, errors='coerce')
    for cohort_table in datasets
]

# Stack all cohorts into one samples x variables table.
concatenated_df = pd.concat(datasets, axis='index')

# Wrap the table in an AnnData object. The numeric matrix goes into X and the
# very same DataFrame is attached as obs, so every clinical variable is
# available both as a feature column of X and as per-sample metadata.
adata = sc.AnnData(X=concatenated_df.to_numpy())
adata.obs = concatenated_df





In [4]:
# Remove every "-<digits>" run from the clinical observation names so that
# they use the same label format as the proteomics modality.
# NOTE(review): after stripping, labels are not guaranteed to be unique (the
# printed table lists MSBB.TMT17_AD_SJ twice) — confirm this is intended.
clinical_obs_names = adata.obs.index.str.replace(r'-\d+', '', regex=True)
adata.obs.index = clinical_obs_names
print(adata.obs)

                    age  sex        pmi disease_group  batch      study  \
ID                                                                        
MSBB.TMT12_NL_SJ   86.0    0  11.500000       Control      2       MSBB   
MSBB.TMT2_AD_SJ    81.0    0   3.166667            AD      2       MSBB   
MSBB.TMT17_AD_SJ   85.0    1   5.166667            AD      2       MSBB   
MSBB.TMT17_AD_SJ   90.0    0   5.083333            AD      2       MSBB   
MSBB.TMT1_AD_SJ    77.0    0   6.333333            AD      2       MSBB   
...                 ...  ...        ...           ...    ...        ...   
EmoryBA24.b3.130C  58.0    1  12.500000            AD      7  EmoryBA24   
EmoryBA24.b5.130C  65.0    1  11.500000            AD      7  EmoryBA24   
EmoryBA24.b3.130N  67.0    1  10.000000            AD      7  EmoryBA24   
EmoryBA24.b1.127N  55.0    1   4.000000            AD      7  EmoryBA24   
EmoryBA24.b2.127N  55.0    0   4.500000            AD      7  EmoryBA24   

                   batch

In [5]:
# Read the batch-corrected proteomics matrix from disk.
proteomics_adata = sc.read('adata_batch_corrected_trial2/TMT_brain_combined_corrected.h5ad')

# Per-sample metadata: discard the 'batch' column.
proteomics_adata.obs = proteomics_adata.obs.drop(columns='batch')

# Remove every "-<digits>" run from the observation names (same pattern as
# the one applied to the clinical modality).
proteomics_obs_names = proteomics_adata.obs.index.str.replace(r'-\d+', '', regex=True)
proteomics_adata.obs.index = proteomics_obs_names

# Store the 'Group' labels as plain Python strings.
group_as_text = proteomics_adata.obs['Group'].astype(str)
proteomics_adata.obs['Group'] = group_as_text

# Show the resulting metadata table.
print(proteomics_adata.obs)

                     Group
ID                        
MSBB.TMT1_NL_SJ    Control
MSBB.TMT1_NL_SJ         AD
MSBB.TMT1_NL_SJ         AD
MSBB.TMT2_NL_SJ         AD
MSBB.TMT2_NL_SJ         AD
...                    ...
EmoryBA24.b4.130N       AD
EmoryBA24.b4.128N       AD
EmoryBA24.b4.127C       AD
EmoryBA24.b4.128C       AD
EmoryBA24.b4.130C       AD

[783 rows x 1 columns]


In [6]:
import muon as mu
from muon import MuData
import numpy as np

# Bundle both modalities into one MuData container; the dictionary keys
# become the modality names.
modalities = {'clinical': adata, 'proteomics': proteomics_adata}
mdata = MuData(modalities)

# Summary of the container (overall shape plus one entry per modality).
print(mdata)

  from .autonotebook import tqdm as notebook_tqdm


MuData object with n_obs × n_vars = 783 × 4291
  2 modalities
    clinical:	783 x 8
      obs:	'age', 'sex', 'pmi', 'disease_group', 'batch', 'study', 'batch_org', 'BA'
    proteomics:	783 x 4283
      obs:	'Group'


  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


In [7]:
# Compare the observation labels of the clinical and proteomics modalities.
clinical_obs_names = adata.obs.index
proteomics_obs_names = proteomics_adata.obs.index

# Labels that occur in exactly one of the two modalities. This is a set
# comparison: it ignores row order and collapses repeated labels.
unaligned_indices = clinical_obs_names.symmetric_difference(proteomics_obs_names)
print("Unaligned index names:", unaligned_indices)

# An empty symmetric difference does not prove that the rows line up, so also
# report (a) whether the labels match position by position and (b) how many
# labels are repeats of an earlier one in each modality. Repeated labels make
# a label-based match between the modalities ambiguous.
print("Same labels in the same row order:",
      clinical_obs_names.equals(proteomics_obs_names))
print("Duplicated obs names (clinical):",
      int(clinical_obs_names.duplicated().sum()))
print("Duplicated obs names (proteomics):",
      int(proteomics_obs_names.duplicated().sum()))



Unaligned index names: Index([], dtype='object', name='ID')


In [12]:
# Persist the combined MuData container to a single .h5mu file.
output_h5mu = 'mudata_prepared_trial3/TMT_brain_mudata.h5mu'
mdata.write(output_h5mu)

Non-numeric data in adata.X has been converted to NaN.


  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)
