In [None]:
"""
Author: Wen-Jou Chang
Baylor College of Medicine

This notebook analyzes the overlap between Illumina methylation array probes and CoRSIV regions. It:
1. processes probe manifest files from both Zhou Lab and Illumina for different array types (HM450, EPIC, MSA)
2. merges the different SIV region sets into one unified set of CoRSIV regions and pads them
3. filters for autosomal probes and identifies which probes overlap with CoRSIV regions for downstream analysis
4. annotates the CoRSIV regions and probes

We used two sets of probe manifest files:

1. Zhou Lab manifest files for probe coordinates, ID, and Gencode annotations
HM450: https://github.com/zhou-lab/InfiniumAnnotationV1/raw/main/Anno/HM450/HM450.hg38.manifest.gencode.v36.tsv.gz
EPIC: https://github.com/zhou-lab/InfiniumAnnotationV1/raw/main/Anno/EPIC/EPIC.hg38.manifest.gencode.v36.tsv.gz
MSA: https://github.com/zhou-lab/InfiniumAnnotationV1/raw/main/Anno/MSA/MSA.hg38.manifest.gencode.v41.tsv.gz  


2. Illumina manifest files for UCSC gene annotations
HM450: https://webdata.illumina.com/downloads/productfiles/humanmethylation450/humanmethylation450_15017482_v1-2.csv
EPIC: https://webdata.illumina.com/downloads/productfiles/methylationEPIC/infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip
MSA: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/infiniummethylationscreening/MSA-48v1-0_20102838_A1.csv  

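Please download all files above, decompress the .gz/.zip archives, and place the resulting .tsv/.csv files in ILLUMINA_DIR before running the code below (the code reads the uncompressed files).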
"""

In [2]:
# initialization

import pandas as pd
from pybedtools import BedTool
import os
from functools import reduce

ILLUMINA_DIR = "PUT YOUR DIRECTORY HERE"


In [None]:
def filter_autosome(array_type):
    """
    Filter a Zhou Lab hg38 manifest to autosomal cg/ch probes and write it as a 4-column table.

    Reads ``{ILLUMINA_DIR}/{array_type}.hg38.manifest.gencode.v{ver}.tsv`` (v41 for "MSA",
    v36 otherwise), keeps probes whose ID starts with "cg" or "ch." and whose chromosome is
    chr1-chr22, sorts them by chromosome number then start position, and writes
    chrom / start / end / probeID (tab-separated, no header, no index) to
    ``data/humanData/{array_type}.hg38.txt``.

    Args:
        array_type (str): "HM450", "EPIC" or "MSA".

    Returns:
        pandas.DataFrame: the filtered, sorted table with columns
        CpG_chrm, CpG_beg, CpG_end, probeID.
    """
    # chromosome name -> numeric rank; used both as the autosome whitelist and as the sort key
    chr_order = {f'chr{i}': i for i in range(1, 23)}
    ver = "36" if array_type != "MSA" else "41"
    df = pd.read_csv(f"{ILLUMINA_DIR}/{array_type}.hg38.manifest.gencode.v{ver}.tsv", header=0, sep="\t")

    # vectorized masks; na=False treats a missing probe ID as "not a cg/ch probe" instead of raising
    probe_ids = df["probeID"]
    is_cpg_probe = probe_ids.str.startswith("cg", na=False) | probe_ids.str.startswith("ch.", na=False)
    is_autosomal = df["CpG_chrm"].isin(list(chr_order))

    # .loc + .astype build a fresh frame, so no column is assigned on a filtered slice
    df = (
        df.loc[is_cpg_probe & is_autosomal, ["CpG_chrm", "CpG_beg", "CpG_end", "probeID"]]
        .astype({"CpG_beg": int, "CpG_end": int})
        .sort_values(
            by=['CpG_chrm', 'CpG_beg'],
            # map chromosome names to numbers so chr2 sorts before chr10
            key=lambda x: x.map(chr_order) if x.name == 'CpG_chrm' else x
        )
    )

    df.to_csv(f"data/humanData/{array_type}.hg38.txt", sep="\t", index=False, header=False)
    return df

# Write the filtered autosomal probe tables for HM450 and EPIC (MSA is not processed in this loop).
for array_type in ["HM450", "EPIC"]:
    filter_autosome(array_type)

In [None]:
## Merge Illumina HM450 and EPIC probes (only these two files are concatenated here; MSA is not included).
## Input is sorted by chrom then numeric start, as bedtools merge requires.
## -d -1: per the bedtools merge docs, intervals must overlap by at least 1 bp to be merged (book-ended intervals stay separate).
## -c 4 -o distinct: column 4 of the output is the comma-separated list of distinct IDs in each merged interval.
## NOTE(review): file names are relative to the current directory, while filter_autosome writes to data/humanData/ — confirm where this is run.
cat HM450.hg38.txt  EPIC.hg38.txt | sort -k1,1 -k2,2n | bedtools merge -i - -d -1 -c 4 -o distinct > HM450_EPIC.clean.bed

## Merge ME, SIV, ESS and CoRSIV (2019) regions into one set, with the same merge options as above.
## The ME, SIV and ESS regions were converted from hg19 to hg38 coordinates beforehand.
cat corsiv2019.txt ESS.hg38.bed ME.hg38.bed SIV.hg38.bed | sort -k1,1 -k2,2n | bedtools merge -i - -d -1 -c 4 -o distinct > data/humanData/corsiv_control/all_corsiv_regions.merged.bed

In [3]:
# filter for regions on chr1-chr22 then pad regions on both ends to have a size with multiple of 100
df = pd.read_csv("data/humanData/corsiv_control/all_corsiv_regions.merged.bed", sep="\t", names=["chr", "start", 'end', "id"])
# autosomes only; .copy() so the column assignments below act on an independent frame, not a slice
autosome = ["chr"+str(i) for i in range(1,23)]
df = df[df["chr"].isin(autosome)].copy()

# new_id is "<chromosome number>_<start>_<end>" built from the unpadded coordinates
df["new_id"] = df["chr"].str[3:] + "_" + df["start"].astype(str) + "_" + df["end"].astype(str)
df["block_size"] = df['end'] - df["start"]

# regions with new_id 19_40223366_40223480 and 19_40223493_40223693 will overlap after padding individually, so we merge them first
merge_ids = ['19_40223366_40223480', '19_40223493_40223693']
merge_rows = df[df['new_id'].isin(merge_ids)]

if merge_rows.empty:
    # Neither region is present (e.g. a different input set): nothing to merge.
    # Skipping avoids appending a meaningless "19_nan_nan" row.
    df = df.reset_index(drop=True)
else:
    new_start = merge_rows['start'].min()
    new_end = merge_rows['end'].max()
    new_id = f"19_{new_start}_{new_end}"
    merged_ids = ','.join(merge_rows['id'].tolist())

    new_row = {
        'chr': 'chr19',
        'start': new_start,
        'end': new_end,
        'new_id': new_id,
        'id': merged_ids,
        'block_size': new_end - new_start
    }

    # replace the individual regions with the single spanning region;
    # pd.concat is the public replacement for the removed/private DataFrame.append
    df = df[~df['new_id'].isin(merge_ids)]
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
  
def pad_to_hundred(row):
    """
    Pad a region so that its size is a multiple of 100 bp and at least 200 bp.

    Padding is split evenly between both ends; when the padding is odd the extra
    base pair goes on the start side. If there is not enough room before the region
    (the start would become negative), the start is clamped at 0 and the remaining
    padding is added to the end, so the resulting size is unchanged.

    Args:
        row: mapping-like record (a dict or a DataFrame row) with integer 'start',
            'end' and 'block_size' entries; 'block_size' is expected to equal end - start.

    Returns:
        A copy of ``row`` with 'start', 'end' and 'block_size' updated. The input
        object itself is left untouched.
    """
    # work on a copy: callbacks passed to DataFrame.apply should not mutate their argument
    row = row.copy()

    size = row['block_size']
    # round up to the next multiple of 100, with a floor of 200 (also covers size 0)
    target = max(200, -(-size // 100) * 100)
    pad_size = target - size

    left = pad_size // 2 + (pad_size % 2)  # pad start with one more bp if the padding is odd
    right = pad_size // 2

    # keep the start coordinate non-negative; shift the surplus padding to the end
    room = max(row['start'], 0)
    if left > room:
        left = room
        right = pad_size - left

    row['start'] -= left
    row['end'] += right
    row['block_size'] = row['end'] - row['start']
    return row
# pad every region, one row at a time
df = df.apply(pad_to_hundred, axis=1)

# order rows by numeric chromosome (chr1 .. chr22), then by start coordinate
chr_order = dict(zip((f'chr{i}' for i in range(1, 23)), range(1, 23)))
df = df.sort_values(
    by=['chr', 'start'],
    key=lambda col: col.map(chr_order) if col.name == 'chr' else col,
)

# write the padded regions: coordinates, coordinate-based ID, then the original source IDs
padded_bed_columns = ["chr", "start", "end", "new_id", "id"]
df[padded_bed_columns].to_csv(
    "data/humanData/corsiv_control/corsiv_regions_autosome_padded.bed",
    sep="\t",
    index=0,
    header=0,
)

In [None]:
# sanity check on padded corsivs
# 1) print any region whose size is not a multiple of 100 or is below 200 bp
bad_size = (df["block_size"] % 100 != 0) | (df["block_size"] < 200)
print(df[bad_size].to_string())

# 2) intersect the padded set with itself and count hits per region;
#    the cell output lists regions with a count above 1
temp = BedTool.from_dataframe(df[["chr", "start", "end", "new_id"]])
res = temp.intersect(temp, wa=True, c=True).to_dataframe(
    names=["chr", "start", "end", "new_id", "count"]
)
res[res["count"]>1]

In [None]:

# Probe/CoRSIV overlap table: 4 probe columns followed by 5 CoRSIV columns.
# NOTE(review): this file is not written by any cell visible in this notebook — confirm how it is generated.
probe_overlap_columns = ["Probe_Chr", "Probe_Start", "Probe_End", "Probe_ID", "CoRSIV_Chr", "CoRSIV_Start", "CoRSIV_End", "CoRSIV_ID1", "CoRSIV_ID2"]
corsiv_probe_df = pd.read_csv("data/humanData/corsiv_control/corsiv_HM450_EPIC_probes.bed", sep="\t", names=probe_overlap_columns)

# column groups shared by the HM450 and EPIC branches
ucsc_columns = ["Name", 'UCSC_RefGene_Name', 'UCSC_RefGene_Group']
hg38_columns = ['CHR_hg38', 'Start_hg38', 'End_hg38']
manifest_keys = ["Name"] + hg38_columns
probe_keys = ["Probe_ID", "Probe_Chr", "Probe_Start", "Probe_End"]

# HM450: UCSC annotation from the Illumina manifest, hg38 coordinates joined in by probe name.
# NOTE(review): this path differs from where filter_autosome writes (data/humanData/HM450.hg38.txt) — confirm.
hm450_base = pd.read_csv("cleaned_data/illumina/HM450.hg38.bed", sep="\t", names=hg38_columns + ["Name"])
hm450_manifest = pd.read_csv(f"{ILLUMINA_DIR}/humanmethylation450_15017482_v1-2.csv", skiprows=7)[ucsc_columns]
hm450_manifest = hm450_manifest.merge(hm450_base, on=["Name"])[ucsc_columns + hg38_columns]
hm450_cleaned = pd.merge(hm450_manifest, corsiv_probe_df, left_on=manifest_keys, right_on=probe_keys)

# EPIC: the hg38 coordinate columns are read directly from the Illumina manifest
epic_manifest = pd.read_csv(f"{ILLUMINA_DIR}/infinium-methylationepic-v-1-0-b5-manifest-file.csv", skiprows=7)
epic_manifest = epic_manifest[ucsc_columns + hg38_columns]
epic_cleaned = pd.merge(epic_manifest, corsiv_probe_df, left_on=manifest_keys, right_on=probe_keys)

def _unique_semicolon_items(value):
    """Return the distinct ';'-separated items of ``value`` in first-seen order ('' for non-strings)."""
    if not isinstance(value, str):
        return ""
    # dict.fromkeys de-duplicates while keeping order; joining a set() instead would give a
    # hash-dependent order that can differ between runs
    return ";".join(dict.fromkeys(value.split(";")))


# stack the EPIC and HM450 CoRSIV-overlapping probes and drop exact duplicate rows
manifest_annotation = pd.concat([epic_cleaned, hm450_cleaned]).drop_duplicates()

# probes with neither a UCSC gene name nor a gene group are labelled "Intergenic"
manifest_annotation.loc[
    (manifest_annotation["UCSC_RefGene_Name"].isna()) & (manifest_annotation["UCSC_RefGene_Group"].isna()),
    "UCSC_RefGene_Group"
] = "Intergenic"

# de-duplicated versions of the ';'-separated UCSC annotation strings
manifest_annotation["Unique_UCSC_RefGene_Name"] = manifest_annotation["UCSC_RefGene_Name"].apply(_unique_semicolon_items)
manifest_annotation["Unique_UCSC_RefGene_Group"] = manifest_annotation["UCSC_RefGene_Group"].apply(_unique_semicolon_items)

# the manifest key columns duplicate the Probe_* columns after the merge, so drop them
manifest_annotation = manifest_annotation.drop(columns=["Name", 'CHR_hg38', 'Start_hg38', 'End_hg38'])


# Zhou Lab Gencode v36 annotation for both arrays
epic_gencode = pd.read_csv(f"{ILLUMINA_DIR}/EPIC.hg38.manifest.gencode.v36.tsv", sep="\t")
hm450_gencode = pd.read_csv(f"{ILLUMINA_DIR}/HM450.hg38.manifest.gencode.v36.tsv", sep="\t")
gencode_annotation = pd.concat([epic_gencode, hm450_gencode]).drop_duplicates()
gencode_annotation = gencode_annotation.rename(columns={"CpG_chrm": "Probe_Chr", "CpG_beg": "Probe_Start", "CpG_end": "Probe_End", "probeID": "Probe_ID", 'genesUniq': "Unique_GencodeV36_Name"})
# De-duplicate again after narrowing to the columns we keep: rows that differed only in dropped
# columns are identical now and would otherwise be repeated in the merged output.
gencode_annotation = gencode_annotation[["Probe_ID", "Probe_Chr", "Probe_Start", "Probe_End", 'Unique_GencodeV36_Name']].drop_duplicates()
corsiv_annotated = pd.merge(gencode_annotation, manifest_annotation, on=["Probe_ID", "Probe_Chr", "Probe_Start", "Probe_End"])

# Flag which array(s) carry each probe. The membership sets are built once up front
# rather than once per row.
probe_key_columns = ['Probe_Chr', 'Probe_Start', 'Probe_End', 'Probe_ID']
manifest_key_columns = ['CHR_hg38', 'Start_hg38', 'End_hg38', 'Name']
epic_probe_keys = set(map(tuple, epic_cleaned[manifest_key_columns].values))
hm450_probe_keys = set(map(tuple, hm450_cleaned[manifest_key_columns].values))
annotated_probe_keys = list(map(tuple, corsiv_annotated[probe_key_columns].values))
corsiv_annotated['EPIC'] = [key in epic_probe_keys for key in annotated_probe_keys]
corsiv_annotated['HM450'] = [key in hm450_probe_keys for key in annotated_probe_keys]

def _select_ids(id_list, prefix_matches):
    """Join (with ',') the comma-separated IDs whose text before the first '_' satisfies ``prefix_matches``."""
    return ','.join([token for token in id_list.split(',') if prefix_matches(token.split('_')[0])])


# One output column per source data set, filled from CoRSIV_ID2.
# IDs whose prefix is purely numeric are assigned to the CoRSIV2019 column.
source_id_rules = {
    'ESS_ID': lambda prefix: prefix == 'ESS',
    'CoRSIV2019_ID': lambda prefix: prefix.isnumeric(),
    'ME_ID': lambda prefix: prefix == 'ME',
    'SIV_ID': lambda prefix: prefix == 'SIV',
}
for column_name, rule in source_id_rules.items():
    corsiv_annotated[column_name] = corsiv_annotated['CoRSIV_ID2'].apply(
        lambda ids, rule=rule: _select_ids(ids, rule)
    )

# Select and order the output columns, exposing CoRSIV_ID1 under the name CoRSIV_ID
annotated_output_columns = [
    'Probe_ID', 'Probe_Chr', 'Probe_Start', 'Probe_End',
    'Unique_GencodeV36_Name', 'Unique_UCSC_RefGene_Name', 'Unique_UCSC_RefGene_Group',
    'UCSC_RefGene_Name', 'UCSC_RefGene_Group',
    'CoRSIV_Chr', 'CoRSIV_Start', 'CoRSIV_End', 'CoRSIV_ID1', 'EPIC',
    'HM450', 'ME_ID', 'SIV_ID', 'ESS_ID', 'CoRSIV2019_ID',
]
corsiv_annotated = corsiv_annotated[annotated_output_columns].rename(columns={'CoRSIV_ID1': 'CoRSIV_ID'})

# Sort by numeric chromosome, then by probe start
chr_order = dict(zip((f'chr{i}' for i in range(1, 23)), range(1, 23)))
corsiv_annotated = corsiv_annotated.sort_values(
    by=['Probe_Chr', 'Probe_Start'],
    key=lambda col: col.map(chr_order) if col.name == 'Probe_Chr' else col,
)
corsiv_annotated.to_excel("data/humanData/corsiv_control/corsiv_annotated_manifest.xlsx", index=0)

In [None]:
# Count, for each padded CoRSIV region, the overlapping features in five annotation files:
# promoters (tss), gene bodies (gb), 3' regions (tes), CpG sites (cpg) and merged HM450/EPIC probes (probect).
# The annotation tracks can be obtained from the UCSC Genome Browser.
# The output has one line per region per annotation file: the region columns, the annotation label, then the count.
# NOTE(review): Illumina/HM450_EPIC.clean.bed is resolved relative to data/humanData — confirm that the merged probe file was placed there.
cd data/humanData

bedtools intersect -a corsiv_control/corsiv_regions_autosome_padded.bed -b hg38_promoters_3kb.tab.txt hg38_gene_bodies.tab.txt hg38_three_prime_region_3kb.tab.txt hg38.CpG.bed.txt Illumina/HM450_EPIC.clean.bed -wa -C -names tss gb tes cpg probect > corsiv_control/corsiv_annotated.bed

In [14]:
import pandas as pd

# Column layout shared by the four source region files, and where each file lives
col_names = ["chr", "start", "end", "id"]
files = {
    "ME": "ME.hg38.bed",
    "SIV": "SIV.hg38.bed",
    "ESS": "ESS.hg38.bed",
    "CoRSIV2019": "corsiv2019.txt",
}


def _load_regions(path):
    """Read one headerless, tab-separated region file and add a 'chr:start-end' coords column."""
    regions = pd.read_csv(path, sep="\t", names=col_names)
    regions["coords"] = regions["chr"] + ":" + regions["start"].astype(str) + "-" + regions["end"].astype(str)
    return regions


# data set name -> region table with coordinates string
dfs = {name: _load_regions(path) for name, path in files.items()}

# Load the per-region overlap counts (one row per region per annotation type) and reshape
# them to one row per region with one count column per annotation type
corsiv_bed = pd.read_csv("../data/humanData/corsiv_control/corsiv_annotated.bed", sep="\t", names=["chr", "start", "end", "CoRSIV_ID", "original_id", "dtype", "count"])
region_key_columns = ["chr", "start", "end", "CoRSIV_ID", "original_id"]
corsiv_bed = corsiv_bed.pivot(index=region_key_columns, columns="dtype", values="count").reset_index()
corsiv_bed["block_size"] = corsiv_bed["end"] - corsiv_bed["start"]


def _ids_from_source(id_list, prefix_matches):
    """Join (with ',') the comma-separated IDs whose text before the first '_' satisfies ``prefix_matches``."""
    return ','.join([token for token in id_list.split(',') if prefix_matches(token.split('_')[0])])


# Split original_id into one column per source data set.
# IDs whose prefix is purely numeric are assigned to the CoRSIV2019 column.
source_rules = {
    'ME': lambda prefix: prefix == 'ME',
    'SIV': lambda prefix: prefix == 'SIV',
    'ESS': lambda prefix: prefix == 'ESS',
    'CoRSIV2019': lambda prefix: prefix.isnumeric(),
}
for source_name, rule in source_rules.items():
    corsiv_bed[source_name] = corsiv_bed['original_id'].apply(
        lambda ids, rule=rule: _ids_from_source(ids, rule)
    )

corsiv_bed = corsiv_bed.drop(columns=["original_id"])

# Add, for each source data set, a column with the original coordinates of the contributing regions
source_names = ['ME', 'SIV', 'ESS', 'CoRSIV2019']
for name in source_names:
    # region ID -> "chr:start-end" for this data set (empty lookup if the data set was not loaded)
    id_to_coords = dict(zip(dfs[name]['id'], dfs[name]['coords'])) if name in dfs else {}

    def _coords_for(id_list, lookup=id_to_coords):
        """Translate a comma-separated ID list into the matching comma-separated coordinates."""
        if not id_list:
            return ''
        return ','.join([lookup.get(region_id, '') for region_id in id_list.split(',')])

    corsiv_bed[f'{name}_COORDS'] = corsiv_bed[f'{name}'].apply(_coords_for)

# Turn the ID columns into flags: True when the region has at least one ID from that data set
corsiv_bed[source_names] = corsiv_bed[source_names].notna() & (corsiv_bed[source_names] != '')

# Final column order, paired with the human-readable header written to the CSV
output_layout = [
    ("CoRSIV_ID", "CoRSIV_ID"),
    ("chr", "Chromosome"),
    ("start", "Start"),
    ("end", "End"),
    ("cpg", "Number of CpG"),
    ("block_size", "Region Size (bp)"),
    ("tss", "TSS Counts"),
    ("gb", "Gene Body Counts"),
    ("tes", "TES Counts"),
    ("probect", "Probe Counts"),
    ("ME", "ME"),
    ("SIV", "SIV"),
    ("ESS", "ESS"),
    ("CoRSIV2019", "CoRSIV2019"),
    ("ME_COORDS", "ME_COORDS"),
    ("SIV_COORDS", "SIV_COORDS"),
    ("ESS_COORDS", "ESS_COORDS"),
    ("CoRSIV2019_COORDS", "CoRSIV2019_COORDS"),
]
corsiv_bed = corsiv_bed[[source for source, _ in output_layout]]
corsiv_bed.columns = [label for _, label in output_layout]
corsiv_bed.to_csv("../data/humanData/corsiv_control/annotated_corsiv_all.csv", index=False)

In [None]:
# Reduced version of the annotated table for downstream use: count columns only, short names
df = pd.read_csv("../data/humanData/corsiv_control/annotated_corsiv_all.csv")
downstream_names = {
    "CoRSIV_ID": 'CoRSIV_ID',
    "Chromosome": 'chr',
    "Start": 'start',
    "End": 'end',
    "Number of CpG": 'CpG_count',
    "Region Size (bp)": 'block_size',
    "TSS Counts": 'tss_count',
    "Gene Body Counts": 'Gene_body_count',
    "TES Counts": 'tes_count',
    "Probe Counts": 'Union Count',
}
cleaned_df = df[list(downstream_names)].rename(columns=downstream_names)
cleaned_df.to_csv("../data/humanData/corsiv_control/annotated_corsiv_clean.csv", index=False) # clean version for downstream