# Compile all post-mortem metadata of interest
#### This notebook extracts metadata (including genomic variant info) for post-mortem samples and matches it to the snRNAseq library IDs and saves a per-sample metadata file

In [39]:
# load the libraries
import pandas as pd
import numpy as np

#### Load the data

In [40]:
# load data
full_metadata = pd.read_excel('./raw_data/metadata/dataset_1282_basic_06-16-2023.xlsx')
full_metadata = full_metadata[['projid', 'study', 'pmi', 'age_death', 'msex', 'amyloid',  'ceradsc',  'nft', 'braaksc', 'cogdx', 'niareagansc', 'ad_reagan', 'apoe_genotype']]
bmc_ids = pd.read_csv('./raw_data/metadata/BMC_sample_id_conversion.csv')
our_sample_ids = pd.read_csv('./raw_data/metadata/sample_IDs_ABCA7-LOF_PFC.csv')
lof_annotation = pd.read_csv('./raw_data/metadata/ABCA7_samples.nLOF.tsv', sep = '\t')
batch2_ids = pd.read_csv('./raw_data/metadata/Fastq_paths_432_PFC_HM_updated.csv')

colnames = full_metadata.columns[1:]
dictionary2 = dict(zip(full_metadata['projid'], np.array(full_metadata[colnames])))

prefix = '/home/gridsan/djuna/homer/github/ABCA7lof2/raw_data/cellranger_counts_out/'
suffix = '/outs/molecule_info.h5'

  warn("Workbook contains no default style, apply openpyxl's default")


#### get metadata for batch 1 and 2

In [41]:
# batch 1 metadata
bmc_ids = bmc_ids[bmc_ids['flowcell']=='4819F']
library_id = [y.split(' ')[-1] for y in [x.split(' : ')[0] for x in bmc_ids['sample_ID']]]
sample_number = [y.split(' (')[0] for y in [x.split(' : ')[1] for x in bmc_ids['sample_ID']]]
dictionary = dict(zip(our_sample_ids['JMB ID'], our_sample_ids['PROJID']))
projids = [dictionary[int(x)] for x in sample_number]

path = [prefix+x+suffix for x in library_id]

meta = np.concatenate((np.array(projids).reshape(-1,1), np.array(library_id).reshape(-1,1), np.array(path).reshape(-1,1), np.stack([dictionary2[x] for x in projids])), axis = 1)
df = pd.DataFrame(meta)
df.columns = np.concatenate((np.array(['projid', 'sample_id', 'molecule_h5']), np.array(colnames)))
df['seq_batch'] = 'JBM'

In [42]:
# batch 2 metadata
all_proj = lof_annotation['projid']
batch1_proj = set(df['projid'])
batch2_proj = all_proj[[x not in batch1_proj for x in all_proj]]

temp = batch2_ids[[x in set(batch2_proj) for x in batch2_ids['projid']]]
dictionary3 = dict(zip(temp['projid'], temp['Library_ID']))
library_id = [dictionary3[x] for x in batch2_proj]
path = [prefix+x+suffix for x in library_id]
meta2 = np.concatenate((np.array(batch2_proj).reshape(-1,1), np.array(library_id).reshape(-1,1), np.array(path).reshape(-1,1), np.stack([dictionary2[x] for x in batch2_proj])), axis = 1)
df2 = pd.DataFrame(meta2)
df2.columns = np.concatenate((np.array(['projid', 'sample_id', 'molecule_h5']), np.array(colnames)))
df2['seq_batch'] = '171013Tsa'

#### combine all the metadata across batches

In [43]:
all_metadata = pd.concat((df, df2), axis=0)
rosmap_clinical = pd.read_csv('./raw_data/metadata/ROSMAP_clinical.csv')
dictionary4 = dict(zip(rosmap_clinical['projid'], rosmap_clinical['individualID']))
all_metadata['individualID'] = [dictionary4[x] for x in all_metadata['projid']]

#### extract the variant information

In [44]:
# get high quality WGSids corresponding to selected samples
wgs_qc = pd.read_csv('./raw_data/metadata/WGS_sample_QC_info.csv')
wgs_qc_pass = wgs_qc[wgs_qc['QC']=='Pass']
wgs_qc_pass = wgs_qc_pass.sort_values('GQN', ascending = False)
wgs_qc_pass = wgs_qc_pass[np.invert(wgs_qc_pass.duplicated('projid', keep='first'))]
dictionary5 = dict(zip(wgs_qc_pass['projid'], wgs_qc_pass['WGS_id']))
all_metadata['WGSid'] = [dictionary5[x] for x in all_metadata['projid']]

In [45]:
# get variant info
variant_info = pd.read_csv('./raw_data/ROSMAP_WGS/HIGHandMED_coding_annotations_syn11724057_subset.csv', index_col=0)

keep = set(['splice_region_variant&intron_variant',
       'missense_variant&splice_region_variant',
       'splice_region_variant',
       'frameshift_variant', 
       'splice_acceptor_variant&intron_variant',
       'stop_gained',
       '5_prime_UTR_premature_start_codon_gain_variant',
       'splice_donor_variant&intron_variant',
       'frameshift_variant&splice_region_variant']) # keep potentially high-impact variants

variant_subset = variant_info[[x in keep for x in variant_info['EFFECT']]]

genes = set(['SORL1', 'TREM2', 'ABCA7', 'ATP8B4', 'ABCA1', 'ADAM10'])
variant_subset = variant_subset[[x in genes for x in variant_subset['GENE']]]
variant_subset = variant_subset[variant_subset['FILTER_PASS']]

  variant_info = pd.read_csv('./raw_data/ROSMAP_WGS/HIGHandMED_coding_annotations_syn11724057_subset.csv', index_col=0)


In [87]:
# split info and genotypes
colnames = ['CHROM',
 'POS',
 'REF_x',
 'ALT_0',
 'ALT_1',
 'ALT_2',
 'ID',
 'REF_y',
 'ALT',
 'EFFECT',
 'IMPACT',
 'GENE',
 'GENEID',
 'HGVS_C',
 'HGVC_P',
 'LOF',
 'NMD',
 'MAF']

colnames2 = ['CHROM',
 'POS',
 'REF_x',
 'ALT_0',
 'ALT_1',
 'ALT_2',
 'FILTER_PASS',
 'numalt',
 'AF_0',
 'AF_1',
 'AF_2',
 'ID',
 'REF_y',
 'ALT',
 'EFFECT',
 'IMPACT',
 'GENE',
 'GENEID',
 'HGVS_C',
 'HGVC_P',
 'LOF',
 'NMD',
 '1000Gp3_AF',
 'N -1/-1',
 'N 0/0',
 'N 0/1',
 'N 0/2',
 'N 0/3',
 'N 0/4',
 'N 0/5',
 'N 0/6',
 'N 1/1',
 'N 1/2',
 'N 1/3',
 'N 1/4',
 'N 1/5',
 'N 1/6',
 'N 2/2',
 'N 2/3',
 'N 2/4',
 'N 2/5',
 'N 2/6',
 'N 3/3',
 'N 3/4',
 'N 3/5',
 'N 3/6',
 'N 4/4',
 'N 4/5',
 'N 4/6',
 'N 5/5',
 'N 5/6',
 'N 6/6',
 'MAF']

variant_subset_info = variant_subset[colnames]
variant_subset_geno = variant_subset[variant_subset.columns[[x not in set(colnames2) for x in variant_subset.columns]]]



In [103]:
# select potential LOF variants
index = np.invert(variant_subset_info['LOF'].isnull())
# add Steinberg variant
index2 = variant_subset_info['HGVS_C']=='c.5570+5G>C'
index_all = index | index2

variant_subset_info_LOF = variant_subset_info[index_all]
variant_subset_geno_LOF = variant_subset_geno[index_all]
variant_subset_geno_LOF_our_samples = variant_subset_geno_LOF[all_metadata['WGSid']]

# add frequency of LOF variant-carriers to metadata
all_metadata['ABCA7LoF'] = np.array((variant_subset_geno_LOF_our_samples[variant_subset_info_LOF['GENE']=='ABCA7']!='0/0').sum(axis=0))
all_metadata['TREM2High'] = np.array((variant_subset_geno_LOF_our_samples[variant_subset_info_LOF['GENE']=='TREM2']!='0/0').sum(axis=0))
all_metadata['ABCA1High'] = np.array((variant_subset_geno_LOF_our_samples[variant_subset_info_LOF['GENE']=='ABCA1']!='0/0').sum(axis=0))
all_metadata['ADAM10High'] = np.array((variant_subset_geno_LOF_our_samples[variant_subset_info_LOF['GENE']=='ADAM10']!='0/0').sum(axis=0))
all_metadata['ATP8B4High'] = np.array((variant_subset_geno_LOF_our_samples[variant_subset_info_LOF['GENE']=='ATP8B4']!='0/0').sum(axis=0))
all_metadata['SORL1High'] = np.array((variant_subset_geno_LOF_our_samples[variant_subset_info_LOF['GENE']=='SORL1']!='0/0').sum(axis=0))

#### save the data

In [96]:
# save supplementary table
variant_subset_info.to_csv('./supplementary_tables/datax.csv')

In [10]:
# save data
var_info = pd.concat((variant_subset_info, variant_subset_geno), axis = 1)

var_info.to_csv('./raw_data/metadata/single_cell_individual_LOF_info.csv')
var_info.to_csv('./supplementary_tables/data_s1.csv')

In [38]:
# save metadata

all_metadata.to_csv('./raw_data/metadata/single_cell_individual_metadata.csv')

# deidentifying the metadata 
n = ['sample_id','WGSid','study', 'pmi', 'age_death',
       'msex', 'amyloid', 'ceradsc', 'nft', 'braaksc', 'cogdx', 'niareagansc',
       'ad_reagan', 'apoe_genotype', 'seq_batch',
       'ABCA7LoF', 'TREM2High', 'ABCA1High', 'ADAM10High', 'ATP8B4High',
       'SORL1High']
temp = all_metadata[n]
temp['seq_batch'] = ['batch_1' if x=='JBM' else 'batch_2' for x in temp['seq_batch']]
temp['age_death'] = [int(x) for x in temp['age_death']]
temp['pmi'] = [int(x) for x in temp['pmi']]
temp['age_death'] = ['>90' if x>90 else x for x in temp['age_death']]

temp.to_csv('./supplementary_tables/data_s2.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['seq_batch'] = ['batch_1' if x=='JBM' else 'batch_2' for x in temp['seq_batch']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['age_death'] = [int(x) for x in temp['age_death']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['pmi'] = [int(x) for x in temp['pmi']]
A value is trying t

In [54]:
# also save variant info for all individuals
variant_subset_geno
dictionary6 = dict(zip(dictionary5.values(), dictionary5.keys()))

variant_subset_geno_LOF2 = variant_subset_geno_LOF[np.array(list(dictionary6.keys()))]
variant_subset_geno_LOF2.columns = [dictionary6[x] for x in variant_subset_geno_LOF2.columns if x ]
vars_temp = np.array((variant_subset_geno_LOF2[variant_subset_info_LOF['GENE']=='ABCA7']!='0/0').sum(axis=0))

df = pd.DataFrame(np.concatenate((np.array(variant_subset_geno_LOF2.columns).reshape(-1,1), vars_temp.reshape(-1,1)), axis = 1))
df.columns = ['projid', 'ABCA7LoF']

len(np.unique(np.array(variant_subset_geno_LOF2.columns)))==variant_subset_geno_LOF2.shape[1]

df2 = pd.concat((variant_subset_info_LOF, variant_subset_geno_LOF2), axis = 1)

df2.to_csv('./processed_data/rosmap_proteomics/all_samples_lof_genotypes.csv')
df.to_csv('./processed_data/rosmap_proteomics/all_samples_lof_summary.csv')