## Imports

In [1]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
import subprocess
import pickle

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

## Configuration and Data Download

In [2]:
# Pickled DRS modification-call table; loaded further down via detect_and_load_file().
DORADO_FILE = Path('../Exemplar_Data/annotated_output/pickle_output/08_07_24_GM12878_chr12-112000000-114000000_annotated_valid_kmer.pkl')

# Download locations of the orthogonal datasets. Kept commented out: the
# notebook reads the files from DATA_DIR and does not fetch them itself.
# DATA_URLS = {
#     'glori1': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41592-025-02680-9/MediaObjects/41592_2025_2680_MOESM5_ESM.xlsb',
#     'glori2': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41592-025-02680-9/MediaObjects/41592_2025_2680_MOESM3_ESM.xlsb',
#     'm5c': 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE225614&format=file&file=GSE225614%5FHEK293T%2DWT%5Fsites%2Etsv%2Egz',
#     'bid_seq': 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE179798&format=file&file=GSE179798_HEK293T_mRNA_WT_BID-seq.xlsx',
#     'praise': 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41589-023-01304-7/MediaObjects/41589_2023_1304_MOESM3_ESM.xlsx',
#     # 'inosine': 'refer to figure 6c code'
# }

# Directory holding the orthogonal datasets; created on first run so the
# files can be dropped into it.
DATA_DIR = Path("../Exemplar_Data/orthogonal_data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Optional download step (disabled).
# NOTE(review): Path(url).name on the two GEO URLs yields the query string
# ('?acc=...'), not the file names expected below -- derive the output name
# explicitly before re-enabling this loop.
# for name, url in DATA_URLS.items():
#     if url == 'YOUR_URL_HERE':
#         continue
#     output_file = DATA_DIR / Path(url).name
#     if not output_file.exists():
#         subprocess.run(['wget', '-q', '-O', str(output_file), url], check=True)

# Expected local file names of the orthogonal datasets inside DATA_DIR.
M6A_GLORI1_FILE = DATA_DIR / "41592_2025_2680_MOESM5_ESM.xlsb"
M6A_GLORI2_FILE = DATA_DIR / "41592_2025_2680_MOESM3_ESM.xlsb"
M5C_FILE = DATA_DIR / "GSE225614_HEK293T-WT_sites.tsv.gz"
PSI_BIDSEQ_FILE = DATA_DIR / "GSE179798_HEK293T_mRNA_WT_BID-seq.xlsx"
PSI_PRAISE_FILE = DATA_DIR / "41589_2023_1304_MOESM3_ESM.xlsx"
# No download URL is listed for this file in DATA_URLS above.
INO_FILE = DATA_DIR / "Data_S2_A-to-I_sites_identified_by_slic-seq.xlsx"

## Color Scheme

In [3]:
# Plot colour (hex RGB string) used for each modification type.
MOD_COLORS = dict(
    m6A='#0072B2',
    m5C='#CC79A7',
    psi='#D55E00',
    inosine='#009E73',
)

# Values of the DRS `mod` column that are counted as each modification type.
MOD_CODES = dict(
    m6A=['a'],
    m5C=['m'],
    psi=['17802', 'psi'],
    inosine=['17596', 'inosine', 'I'],
)

## Data Loader

In [4]:
from typing import Union, Any
import gzip

class OrthogonalDataloader:
    """Load one tabular dataset file, picking the pandas reader from its suffixes.

    Supported suffixes (matched case-insensitively): ``.xlsb`` / ``.xlsx``
    (``pandas.read_excel``), ``.csv`` and ``.tsv`` / ``.txt``
    (``pandas.read_csv``), each optionally followed by ``.gz`` for the
    text formats.
    """

    def __init__(self, file_path: Union[Path, str]) -> None:
        """Remember the path to load.

        Args:
            file_path: Location of the dataset file.

        Raises:
            FileNotFoundError: If ``file_path`` is not an existing regular file.
        """
        self.file_path = Path(file_path)
        if not self.file_path.is_file():
            raise FileNotFoundError(f"File not found: '{self.file_path}'")

    def load_data(self, **kwargs: Any) -> Union[pd.DataFrame, dict, Any]:
        """Read the file with the reader matching its suffix.

        Loading is best-effort: any failure (including an unsupported file
        type) is printed and reported as ``None`` instead of raising.

        Args:
            **kwargs: Forwarded to the pandas reader. Defaults filled in only
                when the caller did not supply them: ``sheet_name=None``
                (all sheets) and a suffix-based ``engine`` for Excel files;
                ``sep='\\t'`` for ``.tsv`` / ``.txt``; and a suffix-based
                ``compression`` for the text formats.

        Returns:
            A DataFrame, a ``{sheet name: DataFrame}`` dict for Excel files
            read with ``sheet_name=None``, or ``None`` when loading failed.
        """
        suffixes = [suffix.lower() for suffix in self.file_path.suffixes]

        try:
            if '.xlsb' in suffixes or '.xlsx' in suffixes:
                kwargs.setdefault('sheet_name', None)
                # setdefault (rather than a fixed keyword) lets callers
                # override the engine without a duplicate-keyword TypeError.
                kwargs.setdefault('engine', 'pyxlsb' if '.xlsb' in suffixes else None)
                return pd.read_excel(self.file_path, **kwargs)

            if '.csv' in suffixes or '.tsv' in suffixes or '.txt' in suffixes:
                # A '.csv' suffix takes precedence, so only pure tsv/txt
                # files get the tab separator default.
                if '.csv' not in suffixes:
                    kwargs.setdefault('sep', '\t')
                kwargs.setdefault('compression', 'gzip' if '.gz' in suffixes else None)
                return pd.read_csv(self.file_path, **kwargs)

            raise ValueError(f"Unsupported file type: {''.join(suffixes)}")
        except Exception as e:
            # Deliberate best-effort: callers treat None as "dataset unavailable".
            print(f"Failed to load {self.file_path.name}: {e}")
            return None

## Load All Data

In [5]:
import os
import gzip

def is_git_lfs_pointer(filepath):
    """Check whether a file is a Git LFS pointer rather than real data.

    Only the leading bytes are read, in binary mode, so the check does not
    depend on the platform text encoding and never scans a whole binary
    file looking for a newline.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        True if the file starts with the Git LFS pointer signature; False
        otherwise, including when the file cannot be opened.
    """
    signature = b'version https://git-lfs.github.com'
    try:
        with open(filepath, 'rb') as f:
            return f.read(len(signature)) == signature
    except (OSError, TypeError, ValueError):
        # Unreadable or invalid path: treat as "not a pointer" and let the
        # actual loaders report the real problem.
        return False

def detect_and_load_file(filepath):
    """Load a site table, trying each supported on-disk format in turn.

    Formats are attempted in this order: parquet, gzip-compressed pickle,
    plain pickle, then tab-separated BED text (read as gzip when the path
    ends in ``.gz``). The first reader that succeeds wins and prints which
    format was used.

    Args:
        filepath: Path (``str`` or ``Path``) of the file to load.

    Returns:
        The loaded table. pandas DataFrames (unpickled or built by the BED
        reader) are converted to polars; any other unpickled object is
        returned unchanged.

    Raises:
        ValueError: If the file is a Git LFS pointer, or if every reader
            failed (the message lists the reason given by each one).
    """
    if is_git_lfs_pointer(filepath):
        raise ValueError(
            f"File {filepath} is a Git LFS pointer, not actual data!\n"
            f"Run 'git lfs pull' in your repository to download the actual file."
        )

    # One "<format>: <reason>" entry per reader that failed, for the final error.
    failures = []

    try:
        df = pl.read_parquet(filepath)
        print(f"Loaded as parquet: {len(df):,} sites")
        return df
    except Exception as e:
        failures.append(f"parquet: {e}")

    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only point this loader at files from a trusted source.
    for opener, label in ((gzip.open, "gzipped pickle"), (open, "pickle")):
        try:
            with opener(filepath, 'rb') as f:
                df = pickle.load(f)
            if isinstance(df, pd.DataFrame):
                df = pl.from_pandas(df)
            print(f"Loaded as {label}: {len(df):,} sites")
            return df
        except Exception as e:
            failures.append(f"{label}: {e}")

    try:
        compression = 'gzip' if str(filepath).endswith('.gz') else None

        df = pd.read_csv(
            filepath,
            sep='\t',
            compression=compression,
            header=None,
            comment='#'
        )

        # Name as many leading columns as the file has (12, 6 or none of the
        # standard names); the remaining columns become col_<index>.
        bed12_names = [
            'chrom', 'drs_start', 'drs_end', 'name', 'score', 'strand',
            'thick_start', 'thick_end', 'item_rgb', 'block_count',
            'block_sizes', 'block_starts'
        ]
        n_cols = len(df.columns)
        if n_cols >= 12:
            named = bed12_names
        elif n_cols >= 6:
            named = bed12_names[:6]
        else:
            named = []
        df.columns = named + [f'col_{i}' for i in range(len(named), n_cols)]

        # A '|'-delimited name field is unpacked into annotation columns;
        # the first row decides whether the field is delimited at all.
        if 'name' in df.columns and len(df) > 0:
            sample_name = str(df['name'].iloc[0])
            if '|' in sample_name:
                name_parts = df['name'].str.split('|', expand=True)
                if name_parts.shape[1] >= 4:
                    df['gene_id'] = name_parts[0]
                    df['gene_name'] = name_parts[1]
                    df['mod'] = name_parts[2]
                    df['feature_type'] = name_parts[3]

        df = pl.from_pandas(df)
        print(f"Loaded as BED: {len(df):,} sites")
        return df
    except Exception as e:
        print(f"BED loading failed: {e}")
        failures.append(f"BED: {e}")

    raise ValueError(
        f"Could not load file {filepath}\n"
        f"Tried: parquet, pickle (gzipped and regular), and BED formats\n"
        "Reasons:\n  " + "\n  ".join(failures)
    )

# Load the DRS site table from the path configured in DORADO_FILE.
drs_df = detect_and_load_file(DORADO_FILE)

Loaded as pickle: 172 sites


In [6]:
def process_orthogonal_sites(df, chr_col: str, pos_col: str):
    """Build the set of ``"chr<name>_<position>"`` site IDs from a table.

    The ``chr`` prefix is added row by row, so a table that mixes ``1`` and
    ``chr1`` style names still yields one consistent ID format. Rows with a
    missing chromosome or position are skipped.

    Args:
        df: pandas DataFrame holding the sites, or None.
        chr_col: Name of the chromosome column.
        pos_col: Name of the position column; values must be convertible to
            float and are truncated to int.

    Returns:
        Set of site ID strings; empty when ``df`` is None or has no usable rows.
    """
    if df is None or len(df) == 0:
        return set()

    # Missing values would otherwise crash the int cast or become 'chrnan_...' keys.
    usable = df[[chr_col, pos_col]].dropna()
    if len(usable) == 0:
        return set()

    chrom = usable[chr_col].astype(str)
    chrom = chrom.where(chrom.str.startswith('chr'), 'chr' + chrom)
    # Via float first so values such as 123.0 or '123.0' become '123'.
    pos_int = usable[pos_col].astype(float).astype(int).astype(str)
    return set(chrom + '_' + pos_int)

def _require_sheet(workbook, matches, source):
    """Return the first sheet of `workbook` whose name satisfies `matches`.

    Raises ValueError when `workbook` is not a sheet dict (e.g. loading
    failed and returned None) or no sheet name matches. Failing here keeps a
    missing sheet from surfacing later as a NameError, or from silently
    reusing a variable left over from an earlier kernel run.
    """
    if isinstance(workbook, dict):
        for sheet_name, sheet in workbook.items():
            if matches(sheet_name):
                return sheet
    raise ValueError(f"No matching sheet found in {source}")


# m6A: union of the '10ng' sheets of the two GLORI workbooks.
loader = OrthogonalDataloader(M6A_GLORI1_FILE)
glori1_raw = loader.load_data()
glori1_df = _require_sheet(glori1_raw, lambda name: '10ng' in name, M6A_GLORI1_FILE.name)

loader = OrthogonalDataloader(M6A_GLORI2_FILE)
glori2_raw = loader.load_data()
glori2_df = _require_sheet(glori2_raw, lambda name: '10ng' in name, M6A_GLORI2_FILE.name)

glori1_sites = process_orthogonal_sites(glori1_df, 'Chr', 'Site')
glori2_sites = process_orthogonal_sites(glori2_df, 'Chr', 'Site')
m6a_validated = glori1_sites | glori2_sites

# m5C: drop sites annotated as rRNA or tRNA.
loader = OrthogonalDataloader(M5C_FILE)
m5c_raw = loader.load_data()
m5c_df = m5c_raw[~m5c_raw['gene_type'].isin(['rRNA', 'tRNA'])].copy()
m5c_validated = process_orthogonal_sites(m5c_df, 'chromosome', 'position')

# Psi (BID-seq): the real header is the third row of the sheet, data follows it.
loader = OrthogonalDataloader(PSI_BIDSEQ_FILE)
bid_raw = loader.load_data()
bid_df = bid_raw['Sheet1'] if isinstance(bid_raw, dict) else bid_raw
bid_df.columns = bid_df.iloc[2]
bid_df = bid_df[3:].reset_index(drop=True)

# Psi (PRAISE): first sheet whose name contains 'dataset 2' (case-insensitive);
# the real header is the second row of the sheet.
loader = OrthogonalDataloader(PSI_PRAISE_FILE)
praise_raw = loader.load_data()
praise_df = _require_sheet(
    praise_raw, lambda name: 'dataset 2' in name.lower(), PSI_PRAISE_FILE.name
)

praise_df.columns = praise_df.iloc[1]
praise_df = praise_df[2:].reset_index(drop=True)

# Keep only rows whose accession starts with NM_ or XM_.
if 'chr_name' in praise_df.columns:
    praise_df['accession'] = praise_df['chr_name'].astype(str)
    praise_df = praise_df[
        praise_df['accession'].str.startswith('NM_') |
        praise_df['accession'].str.startswith('XM_')
    ].copy()

def parse_chr_site(chr_site_str):
    """Split a ``"<chrom>_<pos>"`` or ``"<chrom>_<start>-<end>"`` string.

    Args:
        chr_site_str: Value to parse; missing values (None / NaN) are accepted.

    Returns:
        ``(chrom, position)`` with ``position`` as an int (the start of a
        range), or ``(None, None)`` when the value is missing, does not
        contain exactly one underscore, or has a non-integer position.
    """
    try:
        if pd.isna(chr_site_str):
            return None, None
        chrom, separator, pos_part = str(chr_site_str).strip().partition('_')
        # Exactly one underscore is required: a missing separator or a second
        # underscore in the remainder both make the value unparseable.
        if not separator or '_' in pos_part:
            return None, None
        # For a range the text before the first '-' is the position.
        start = pos_part.strip().split('-')[0].strip()
        return chrom.strip(), int(start)
    except (ValueError, TypeError):
        # Non-integer position text, or an input pd.isna cannot reduce to a bool.
        return None, None

# Psi (PRAISE): derive genomic coordinates from the 'chr_site' column and keep
# only rows that parsed. praise_filtered is bound up front so a sheet without
# 'chr_site' yields an empty PRAISE set instead of a NameError (or a stale
# value from an earlier kernel run).
praise_filtered = None
if 'chr_site' in praise_df.columns:
    parsed = praise_df['chr_site'].apply(parse_chr_site)
    praise_df['chromosome'] = [x[0] for x in parsed]
    praise_df['genomic_position'] = [x[1] for x in parsed]
    praise_filtered = praise_df[
        praise_df['chromosome'].notna() &
        praise_df['genomic_position'].notna()
    ].copy()
else:
    print("PRAISE sheet has no 'chr_site' column; skipping PRAISE sites")

bid_sites = process_orthogonal_sites(bid_df, 'chr', 'pos')
praise_sites = process_orthogonal_sites(praise_filtered, 'chromosome', 'genomic_position')
psi_validated = bid_sites | praise_sites

# Inosine: pool every 'HEK293T-rep' sheet, restricted to the listed Location values.
loader = OrthogonalDataloader(INO_FILE)
ino_raw = loader.load_data()
keep_locations = ['intergenic', 'exonic', 'UTR3', 'UTR5', 'UTR5;UTR3']
ino_sheets = ino_raw if isinstance(ino_raw, dict) else {}
ino_dfs = [
    sheet[sheet['Location'].isin(keep_locations)].copy()
    for sheet_name, sheet in ino_sheets.items()
    if 'HEK293T-rep' in sheet_name
]
if not ino_dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise ValueError(f"No 'HEK293T-rep' sheets could be loaded from {INO_FILE.name}")
combined_ino = pd.concat(ino_dfs, ignore_index=True)
ino_validated = process_orthogonal_sites(combined_ino, 'Chromosome', 'position')

print(f"m6A validated: {len(m6a_validated):,}")
print(f"m5C validated: {len(m5c_validated):,}")
print(f"Psi validated: {len(psi_validated):,}")
print(f"Inosine validated: {len(ino_validated):,}")

m6A validated: 108,822
m5C validated: 2,191
Psi validated: 2,114
Inosine validated: 29,745


## Filter DRS to Validated Sites

In [7]:
# Build the "<chrom>_<drs_end>" key unless the table already carries one.
if 'site_id' not in drs_df.columns:
    chrom_text = pl.col('chrom').cast(pl.Utf8)
    end_text = pl.col('drs_end').cast(pl.Int64).cast(pl.Utf8)
    drs_df = drs_df.with_columns((chrom_text + '_' + end_text).alias('site_id'))

# Keep only DRS rows whose site appears in any orthogonal dataset.
all_validated = set().union(m6a_validated, m5c_validated, psi_validated, ino_validated)
is_validated = pl.col('site_id').is_in(list(all_validated))
drs_df = drs_df.filter(is_validated)

print(f"Filtered DRS to validated sites: {len(drs_df):,}")

Filtered DRS to validated sites: 51


## Find Multi-Mod Genes

In [8]:
print("Finding genes with multiple modifications...")

# Orthogonal site set for each modification type; any type not listed here
# falls back to the inosine set.
sites_by_mod = {
    'm6A': m6a_validated,
    'm5C': m5c_validated,
    'psi': psi_validated,
}

validated_genes = {mod_type: set() for mod_type in MOD_CODES.keys()}

for mod_type, codes in MOD_CODES.items():
    validated_sites = sites_by_mod.get(mod_type, ino_validated)
    code_strings = [str(code) for code in codes]

    # Rows called as this modification AND located at a validated site of the same type.
    is_this_mod = pl.col('mod').cast(pl.Utf8).is_in(code_strings)
    is_confirmed = pl.col('site_id').is_in(list(validated_sites))
    matched = drs_df.filter(is_this_mod).filter(is_confirmed)

    gene_ids = set(matched['gene_id'].unique().to_list())
    gene_ids.discard(None)
    validated_genes[mod_type] = gene_ids

    print(f"  {mod_type}: {len(gene_ids)} genes")


def _genes_with_all(*mod_types):
    """Genes validated for every one of the given modification types."""
    return set.intersection(*(validated_genes[m] for m in mod_types))


# Keys are the member modification names joined with '_'.
three_way_combos = {
    '_'.join(trio): _genes_with_all(*trio)
    for trio in (
        ('m6A', 'm5C', 'psi'),
        ('m6A', 'm5C', 'inosine'),
        ('m6A', 'psi', 'inosine'),
        ('m5C', 'psi', 'inosine'),
    )
}

print("\nThree-way combinations:")
for combo_name, genes in three_way_combos.items():
    if genes:
        print(f"  {combo_name}: {len(genes)} genes")

Finding genes with multiple modifications...
  m6A: 12 genes
  m5C: 0 genes
  psi: 0 genes
  inosine: 1 genes

Three-way combinations:


## Create Gene Dataframe

In [9]:
def create_gene_details(drs_df, gene_sets, combinations):
    """Summarise every gene of the requested modification combinations.

    Args:
        drs_df: polars DataFrame of DRS sites; the columns read here are
            gene_id, gene_name, chrom, strand, mod, drs_start and drs_end.
        gene_sets: Mapping of combination name (modification names joined
            by '_') to the gene IDs in that combination.
        combinations: Combination names to report; names missing from
            ``gene_sets`` or mapped to an empty collection are skipped.

    Returns:
        A polars DataFrame with one row per (combination, gene), sorted by
        ``total_sites`` descending, or None when no row was produced.
    """
    records = []

    for combo_name in combinations:
        if combo_name not in gene_sets:
            continue
        members = gene_sets[combo_name]
        if len(members) == 0:
            continue

        mods = combo_name.split('_')
        # Only names known to MOD_CODES are counted; the n_<mod>_sites
        # columns still cover every name and report 0 for unknown ones.
        countable = [mod for mod in mods if mod in MOD_CODES]

        for gene_id in members:
            gene_rows = drs_df.filter(pl.col('gene_id') == gene_id)
            if len(gene_rows) == 0:
                continue

            site_counts = {
                mod: len(
                    gene_rows.filter(
                        pl.col('mod').cast(pl.Utf8).is_in([str(c) for c in MOD_CODES[mod]])
                    )
                )
                for mod in countable
            }

            first_pos = gene_rows['drs_start'].min()
            last_pos = gene_rows['drs_end'].max()

            record = {
                'gene_id': gene_id,
                'gene_name': gene_rows['gene_name'][0],
                'chromosome': gene_rows['chrom'][0],
                'strand': gene_rows['strand'][0],
                'start': first_pos,
                'end': last_pos,
                'span_kb': round((last_pos - first_pos) / 1000, 2),
                'combination': combo_name,
                'modifications': ','.join(mods),
            }
            for mod in mods:
                record[f'n_{mod}_sites'] = site_counts.get(mod, 0)
            record['total_sites'] = sum(site_counts.values())
            records.append(record)

    if not records:
        return None
    return pl.DataFrame(records).sort('total_sites', descending=True)

# Only combinations that contain at least one gene are summarised.
combinations_to_plot = [
    combo for combo, members in three_way_combos.items() if len(members) > 0
]
multi_mod_df = create_gene_details(drs_df, three_way_combos, combinations_to_plot)

if multi_mod_df is None:
    print("\nNo multi-modification genes found")
else:
    print(f"\nFound {len(multi_mod_df)} genes with multiple modifications:")
    print(multi_mod_df)


No multi-modification genes found


## Create Swarm Plots

In [10]:
# One figure per modification combination, with one swarm-plot panel per gene.
if multi_mod_df is None or len(multi_mod_df) == 0:
    print("No genes to plot")
else:
    combinations = multi_mod_df['combination'].unique().to_list()
    
    for combo_name in combinations:
        print(f"\nPlotting {combo_name}...")
        
        # multi_mod_df is sorted by total_sites (descending) when it is built,
        # so head(10) keeps the ten genes with the most sites.
        combo_genes = multi_mod_df.filter(
            pl.col('combination') == combo_name
        ).head(10)
        
        if len(combo_genes) == 0:
            continue
        
        gene_list = combo_genes['gene_name'].to_list()
        # All rows of one combination share the same 'modifications' string.
        mods = combo_genes['modifications'][0].split(',')
        
        fig, axes = plt.subplots(len(gene_list), 1, 
                                figsize=(12, 3.5 * len(gene_list)))
        # plt.subplots returns a bare Axes (not an array) for a single panel.
        if len(gene_list) == 1:
            axes = [axes]
        
        for ax, gene_name in zip(axes, gene_list):
            # NOTE(review): sites are looked up by gene_name here, while the
            # summary table is built per gene_id -- genes sharing a name
            # would be pooled into one panel. Confirm this is intended.
            gene_data = drs_df.filter(pl.col('gene_name') == gene_name)
            
            # Skipping leaves this panel empty in the figure.
            if len(gene_data) == 0:
                continue
            
            strand = gene_data['strand'][0]
            
            # One record per site: its drs_start and the modification it was called as.
            plot_data = []
            for mod_name in mods:
                mod_sites = gene_data.filter(
                    pl.col('mod').cast(pl.Utf8).is_in([str(c) for c in MOD_CODES[mod_name]])
                )
                
                for row in mod_sites.iter_rows(named=True):
                    plot_data.append({
                        'position': row['drs_start'],
                        'modification': mod_name
                    })
            
            if len(plot_data) == 0:
                continue
            
            plot_df = pd.DataFrame(plot_data)
            
            sns.swarmplot(
                data=plot_df,
                x='position',
                hue='modification',
                palette=MOD_COLORS,
                ax=ax,
                size=6,
                alpha=0.8,
                hue_order=mods
            )
            
            # Direction annotation shown in the panel title.
            strand_symbol = '→' if strand == '+' else '←'
            strand_text = f"5' {strand_symbol} 3'" if strand == '+' else f"3' {strand_symbol} 5'"
            
            ax.set_title(f"{gene_name} ({strand} strand) | {strand_text}", 
                       fontsize=14, fontweight='bold', pad=10)
            ax.set_xlabel("Genomic Position (bp)", fontsize=12)
            ax.set_ylabel("")
            ax.set_yticks([])
            
            # Minus-strand genes are drawn with the x-axis reversed.
            if strand == '-':
                ax.invert_xaxis()
            
            # Plain (non-scientific) tick labels, then thousands separators.
            # ticklabel_format must run while the default formatter is still
            # installed, i.e. before set_major_formatter replaces it.
            ax.ticklabel_format(style='plain', axis='x')
            ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x):,}'))
            
            # Legend is placed outside the axes, to the right.
            ax.legend(title='Modification', bbox_to_anchor=(1.02, 1), 
                     loc='upper left', frameon=True)
            ax.grid(axis='x', alpha=0.3, linestyle='--')
        
        plt.tight_layout()
        plt.show()
        
        print(f"  Displayed {combo_name} swarm plot")

No genes to plot
