## Setup and Imports

In [1]:
"""
=================================================================================
RNA Modification Comparison Notebook
=================================================================================
Purpose: Compare Direct RNA Sequencing (DRS) modification calls with orthogonal
         validation methods for HEK293 and GM12878 cell lines

Modifications analyzed:
- m6A (N6-methyladenosine) - GLORI-Seq 1.0 & 2.0+
- m5C (5-methylcytosine) - Published dataset
- Pseudouridine (Œ®) - BID-Seq & PRAISE
- Inosine (A-to-I editing) - SLIC-seq
- 2'-O-Methylation - Tang et al.

Author: [Your Name]
Date: [Current Date]
=================================================================================
"""

# Standard libraries
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
from collections import defaultdict
from typing import Any, Union, Dict, Set
import gzip
import pickle
from functools import reduce

# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import LogNorm
from matplotlib_venn import venn2, venn3
import seaborn as sns

# Analysis libraries
from sklearn.metrics import mean_squared_error

# Configure matplotlib for publication-quality figures
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = ['Helvetica']
mpl.rcParams['figure.dpi'] = 600
mpl.rcParams['savefig.dpi'] = 600
mpl.rcParams['savefig.transparent'] = True
mpl.rcParams['savefig.bbox'] = 'tight'
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.right'] = False

print("‚úì All imports successful!")
print("‚úì Matplotlib configured for publication-quality output")

‚úì All imports successful!
‚úì Matplotlib configured for publication-quality output


## Configuration

In [2]:
"""
=================================================================================
CONFIGURATION - UPDATE THESE PATHS FOR YOUR SYSTEM
=================================================================================
"""

# Base directory
BASE_DIR = Path("/Volumes/AJS_SSD")

# DRS data directories
HEK293_DRS_DIR = BASE_DIR / "HEK293" / "modkit_output" / "mod_specific_dataframes"
GM12878_DRS_DIR = BASE_DIR / "GM12878" / "mod_specific_dataframes"

# Orthogonal data paths
ORTHOG_DIR = BASE_DIR / "HEK293" / "orthogonal_datasets"

# m6A files
M6A_GLORI1_NEW = ORTHOG_DIR / "m6A" / "GLORI_1.0" / "41592_2025_2680_MOESM5_ESM(1).xlsb"
M6A_GLORI2_FILE = ORTHOG_DIR / "m6A" / "GLORI_2.0+" / "41592_2025_2680_MOESM3_ESM.xlsb"

# m5C file
M5C_FILE = ORTHOG_DIR / "m5C" / "GSE225614_HEK293T-WT_sites.tsv.gz"

# Pseudouridine files
PSI_BIDSEQ_FILE = ORTHOG_DIR / "psi" / "BID-Seq" / "GSE179798_HEK293T_mRNA_WT_BID-seq.xlsx"
PSI_PRAISE_FILE = ORTHOG_DIR / "psi" / "PRAISE" / "41589_2015_BFnchembio1836_MOESM158_ESM.xlsx"

# Inosine file
INO_FILE = ORTHOG_DIR / "inosine" / "Data_S2_A-to-I_sites_identified_by_slic-seq.xlsx"

# 2'OMe file
TWOME_FILE = ORTHOG_DIR / "2OMe" / "1-s2.0-S2667237524000365-mmc3.xlsx"

# GENCODE annotation
GENCODE_GTF = BASE_DIR / "HEK293" / "gencode_annotations" / "gencode.v47.annotation.gtf"

# Output directories
OUTPUT_DIR = BASE_DIR / "HEK293" / "scripts" / "notebooks" / "Plots" / "Plots_Updated_GLORI_Same_Sample"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

VALIDATED_DIR = BASE_DIR / "HEK293" / "orthogonal_validated"
VALIDATED_DIR.mkdir(parents=True, exist_ok=True)

print("‚úì Configuration loaded")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Validated sites directory: {VALIDATED_DIR}")

‚úì Configuration loaded
  Output directory: /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample
  Validated sites directory: /Volumes/AJS_SSD/HEK293/orthogonal_validated


## Color Scheme Definition

In [3]:
"""
=================================================================================
COLOR SCHEME - Consistent colors for all plots
=================================================================================
"""

MODIFICATION_COLORS = {
    # m6A Technologies
    'm6a': {
        'DRS_HEK293': '#1f77b4',       # Blue
        'DRS_GM12878': '#aec7e8',      # Light blue
        'GLORI1': '#ff7f0e',           # Orange
        'GLORI2': '#d62728',           # Red
        'GLORI_combined': '#9467bd',   # Purple
        # Overlap colors for 3-way Venns
        'HEK_GLORI1': '#bcbd22',
        'HEK_GLORI2': '#17becf',
        'GLORI1_GLORI2': '#e377c2',
        'ALL_THREE': '#7f7f7f',
    },
    
    # m5C Technologies
    'm5c': {
        'DRS_HEK293': '#2ca02c',
        'DRS_GM12878': '#98df8a',
        'Orthogonal': '#e377c2',
        'HEK_Orth': '#8c564b',
        'GM_Orth': '#c49c94',
        'HEK_GM': '#bcbd22',
        'ALL_THREE': '#7f7f7f',
    },
    
    # Pseudouridine
    'psi': {
        'DRS_HEK293': '#9467bd',
        'DRS_GM12878': '#c5b0d5',
        'BID-seq': '#8c564b',
        'PRAISE': '#ff7f0e',
        'Combined': '#d62728',
        'HEK_BID': '#bcbd22',
        'HEK_PRAISE': '#17becf',
        'BID_PRAISE': '#e377c2',
        'ALL_THREE': '#7f7f7f',
    },
    
    # Inosine
    'inosine': {
        'DRS_HEK293': '#17becf',
        'DRS_GM12878': '#9edae5',
        'Orthogonal': '#ff7f0e',
        'HEK_Orth': '#8c564b',
        'GM_Orth': '#c49c94',
        'HEK_GM': '#bcbd22',
        'ALL_THREE': '#7f7f7f',
    },
    
    # 2'-O-Methylation
    '2ome': {
        'DRS_HEK293_A': '#e377c2',
        'DRS_HEK293_C': '#f7b6d2',
        'DRS_HEK293_G': '#7f7f7f',
        'DRS_HEK293_U': '#c7c7c7',
        'DRS_GM12878_A': '#1f77b4',
        'DRS_GM12878_C': '#aec7e8',
        'DRS_GM12878_G': '#ff7f0e',
        'DRS_GM12878_U': '#ffbb78',
        'Orthogonal_A': '#2ca02c',
        'Orthogonal_C': '#98df8a',
        'Orthogonal_G': '#d62728',
        'Orthogonal_U': '#ff9896',
    }
}

ALPHA = 0.7

HEATMAP_CMAPS = {
    'm6a_glori1': 'YlOrRd',
    'm6a_glori2': 'YlGnBu',
    'm6a_combined': 'RdPu',
    'm5c': 'Greens',
    'psi': 'Purples',
    'inosine': 'Blues',
    '2ome': 'Oranges',
}

print("‚úì Color scheme loaded")

‚úì Color scheme loaded


## Data Loading Class

In [4]:
"""
=================================================================================
UNIVERSAL DATA LOADER
=================================================================================
"""

class OrthogonalDataloader:
    """Universal dataloader for various file types"""
    def __init__(self, file_path: Union[Path, str]) -> None:
        self.file_path = Path(file_path)
        if not self.file_path.is_file():
            raise FileNotFoundError(f"File not found: '{self.file_path}'")

    def load_data(self, **kwargs: Any) -> Union[pd.DataFrame, Dict[str, pd.DataFrame], Any]:
        """Load data from file"""
        suffixes = self.file_path.suffixes
        compression = 'gzip' if '.gz' in suffixes else None
        
        print(f"  Loading '{self.file_path.name}'...")
        
        try:
            if '.pkl' in suffixes:
                if compression == 'gzip':
                    with gzip.open(self.file_path, 'rb') as f:
                        return pickle.load(f, **kwargs)
                else:
                    with open(self.file_path, 'rb') as f:
                        return pickle.load(f, **kwargs)
            
            elif '.xlsb' in suffixes or '.xlsx' in suffixes:
                if 'sheet_name' not in kwargs:
                    kwargs['sheet_name'] = None
                engine = 'pyxlsb' if '.xlsb' in suffixes else None
                return pd.read_excel(self.file_path, engine=engine, **kwargs)

            elif '.csv' in suffixes:
                return pd.read_csv(self.file_path, compression=compression, **kwargs)

            elif '.tsv' in suffixes or '.txt' in suffixes:
                if 'sep' not in kwargs:
                    kwargs['sep'] = '\t'
                return pd.read_csv(self.file_path, compression=compression, **kwargs)

            else:
                raise ValueError(f"Unsupported file type: {''.join(suffixes)}")
        
        except Exception as e:
            print(f"  ‚ùå Failed to load {self.file_path.name}: {e}")
            return None

print("‚úì DataLoader class defined")

‚úì DataLoader class defined


## Load DRS Data

In [5]:
"""
=================================================================================
LOAD DRS DATA (Dorado/Modkit Output)
=================================================================================
"""

def load_drs_data(base_dir: Path, cell_line: str) -> Dict[str, Any]:
    """Load all DRS modification data for a cell line"""
    mod_mapping = {
        'a': 'm6a',
        '17802': 'psi',
        '17596': 'inosine',
        'm': 'm5c',
        '19227': '2OMeU',
        '19228': '2OMeC',
        '19229': '2OMeG',
        '69426': '2OMeA'
    }
    
    drs_data = {}
    
    if not base_dir.exists():
        print(f"  ‚ö†Ô∏è  Directory not found: {base_dir}")
        return drs_data
    
    parquet_files = [f for f in base_dir.iterdir() if f.suffix == '.parquet' and not f.name.startswith('.')]
    
    print(f"  Found {len(parquet_files)} files")
    
    for file in parquet_files:
        try:
            if 'filtered_' in file.name:
                mod_code = file.name.split('filtered_')[1].split('_')[0]
                mod_key = mod_mapping.get(mod_code, mod_code)
                
                df = pl.read_parquet(file)
                full_key = f"{cell_line}_{mod_key}"
                drs_data[full_key] = df
                print(f"    ‚úì {full_key}: {len(df):,} rows")
        except Exception as e:
            print(f"    ‚ùå Failed {file.name}: {e}")
    
    return drs_data

print("\n" + "="*80)
print("LOADING DRS DATA")
print("="*80)

print("\nüìÇ Loading HEK293 DRS data...")
hek293_drs_data = load_drs_data(HEK293_DRS_DIR, 'HEK293')

print("\nüìÇ Loading GM12878 DRS data...")
gm12878_drs_data = load_drs_data(GM12878_DRS_DIR, 'GM12878')

# Combine
dorado_mods_dict = {**hek293_drs_data, **gm12878_drs_data}

print(f"\n‚úì Loaded {len(dorado_mods_dict)} DRS datasets")
print(f"  Available: {list(dorado_mods_dict.keys())}")


LOADING DRS DATA

üìÇ Loading HEK293 DRS data...
  Found 8 files
    ‚úì HEK293_m6a: 8,235,544 rows
    ‚úì HEK293_psi: 8,312,485 rows
    ‚úì HEK293_inosine: 8,235,544 rows
    ‚úì HEK293_m5c: 6,732,738 rows
    ‚úì HEK293_2OMeU: 8,312,485 rows
    ‚úì HEK293_2OMeC: 6,732,738 rows
    ‚úì HEK293_2OMeG: 7,104,902 rows
    ‚úì HEK293_2OMeA: 8,235,544 rows

üìÇ Loading GM12878 DRS data...
  Found 8 files
    ‚úì GM12878_m6a: 8,408,754 rows
    ‚úì GM12878_psi: 8,122,125 rows
    ‚úì GM12878_inosine: 8,408,754 rows
    ‚úì GM12878_m5c: 7,161,645 rows
    ‚úì GM12878_2OMeU: 8,122,125 rows
    ‚úì GM12878_2OMeC: 7,161,645 rows
    ‚úì GM12878_2OMeG: 7,575,765 rows
    ‚úì GM12878_2OMeA: 8,408,754 rows

‚úì Loaded 16 DRS datasets
  Available: ['HEK293_m6a', 'HEK293_psi', 'HEK293_inosine', 'HEK293_m5c', 'HEK293_2OMeU', 'HEK293_2OMeC', 'HEK293_2OMeG', 'HEK293_2OMeA', 'GM12878_m6a', 'GM12878_psi', 'GM12878_inosine', 'GM12878_m5c', 'GM12878_2OMeU', 'GM12878_2OMeC', 'GM12878_2OMeG', 'GM12878_2

## Parse GENCODE GTF Functions

In [6]:
"""
=================================================================================
GENCODE GTF PARSING FUNCTIONS
=================================================================================
Must be defined BEFORE loading PRAISE data
"""

def parse_gencode_gtf(gtf_path):
    """Parse GENCODE GTF for transcript-to-genomic mapping"""
    transcript_exons = defaultdict(list)
    gene_transcripts = defaultdict(list)
    transcript_info = {}
    
    opener = gzip.open if str(gtf_path).endswith('.gz') else open
    
    with opener(gtf_path, 'rt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue
            
            chrom, source, feature, start, end, score, strand, frame, attributes = fields
            
            attr_dict = {}
            for attr in attributes.split(';'):
                attr = attr.strip()
                if not attr:
                    continue
                parts = attr.split(' ', 1)
                if len(parts) == 2:
                    key, val = parts
                    attr_dict[key] = val.strip('"')
            
            transcript_id = attr_dict.get('transcript_id')
            gene_name = attr_dict.get('gene_name')
            
            if feature == 'transcript':
                transcript_type = attr_dict.get('transcript_type', '')
                tags = attr_dict.get('tag', '')
                transcript_info[transcript_id] = {
                    'gene_name': gene_name,
                    'chrom': chrom,
                    'strand': strand,
                    'transcript_type': transcript_type,
                    'is_canonical': 'Ensembl_canonical' in tags or 'MANE_Select' in tags,
                }
                if gene_name:
                    gene_transcripts[gene_name].append(transcript_id)
            
            elif feature == 'exon':
                exon_number = int(attr_dict.get('exon_number', 0))
                if transcript_id:
                    transcript_exons[transcript_id].append({
                        'chrom': chrom,
                        'start': int(start),
                        'end': int(end),
                        'strand': strand,
                        'exon_number': exon_number
                    })
    
    for tid in transcript_exons:
        transcript_exons[tid].sort(key=lambda x: x['exon_number'])
    
    gene_to_transcript = {}
    for gene_name, transcripts in gene_transcripts.items():
        canonical = [t for t in transcripts if transcript_info.get(t, {}).get('is_canonical', False)]
        if canonical:
            best = canonical[0]
        else:
            def transcript_length(tid):
                exons = transcript_exons.get(tid, [])
                return sum(e['end'] - e['start'] + 1 for e in exons)
            protein_coding = [t for t in transcripts 
                            if transcript_info.get(t, {}).get('transcript_type') == 'protein_coding']
            best = max(protein_coding, key=transcript_length) if protein_coding else max(transcripts, key=transcript_length)
        gene_to_transcript[gene_name] = best
    
    return dict(transcript_exons), gene_to_transcript, transcript_info

def transcript_to_genomic(transcript_id, position, transcript_exons):
    """Convert transcript position to genomic position"""
    if transcript_id not in transcript_exons:
        return None, None, None
    exons = transcript_exons[transcript_id]
    if not exons:
        return None, None, None
    
    chrom = exons[0]['chrom']
    strand = exons[0]['strand']
    
    if strand == '-':
        exons = list(reversed(exons))
    
    cumulative_length = 0
    for exon in exons:
        exon_length = exon['end'] - exon['start'] + 1
        if cumulative_length + exon_length >= position:
            offset = position - cumulative_length - 1
            genomic_pos = exon['start'] + offset if strand == '+' else exon['end'] - offset
            return chrom, genomic_pos, strand
        cumulative_length += exon_length
    
    return chrom, None, strand

print("‚úì GENCODE GTF functions loaded")

‚úì GENCODE GTF functions loaded


## Load m6A Orthogonal Data

In [7]:
# ============================================================
# GLORI-1 (NEW)
# ============================================================
print("\nüìÇ Loading GLORI-1 (NEW)...")
loader = OrthogonalDataloader(M6A_GLORI1_NEW)
glori1_raw = loader.load_data()
if isinstance(glori1_raw, dict):
    print(f"  Multi-sheet Excel: {len(glori1_raw)} sheets")
    new_glori1_df = None
    for sheet_name, df in glori1_raw.items():
        if '10ng' in sheet_name:
            new_glori1_df = df
            print(f"  ‚úì Using sheet: '{sheet_name}'")
            break
    if new_glori1_df is None:
        raise ValueError("Could not find GLORI-1 sheet with '10ng' in name")
    glori1_raw = new_glori1_df
else:
    print(f"  Single DataFrame")

def process_new_glori1(df):
    """Process new GLORI-1 with 2 replicates (already in percentage)"""
    df = df.copy()
    if 'm6A_level_mean' in df.columns:
        return df
    # Data is already in percentage, no need to multiply by 100
    df['m6A_level_rep1_pct'] = df['m6A_level_rep1 (%)']
    df['m6A_level_rep2_pct'] = df['m6A_level_rep2 (%)']
    df['m6A_level_rep3_pct'] = df['m6A_level_rep3 (%)']
    df['m6A_level_mean'] = (df['m6A_level_rep1_pct'] + df['m6A_level_rep2_pct'] + df['m6A_level_rep3_pct']) / 3
    print(f"  ‚úì Processed: {len(df):,} sites, Mean: {df['m6A_level_mean'].mean():.2f}%")
    return df

new_glori1 = process_new_glori1(glori1_raw)

# """
# =================================================================================
# LOAD m6A ORTHOGONAL DATA (GLORI-Seq)
# =================================================================================
# """

# print("\n" + "="*80)
# print("LOADING m6A ORTHOGONAL DATA")
# print("="*80)

# # ============================================================
# # GLORI-1 (NEW)
# # ============================================================
# print("\nüìÇ Loading GLORI-1 (NEW)...")
# loader = OrthogonalDataloader(M6A_GLORI1_NEW)
# glori1_raw = loader.load_data()

# if isinstance(glori1_raw, dict):
#     print(f"  Multi-sheet Excel: {len(glori1_raw)} sheets")
#     new_glori1_df = None
#     for sheet_name, df in glori1_raw.items():
#         if 'm6A_level_rep1' in df.columns and 'm6A_level_rep2' in df.columns:
#             new_glori1_df = df
#             print(f"  ‚úì Using sheet: '{sheet_name}'")
#             break
#     if new_glori1_df is None:
#         raise ValueError("Could not find GLORI-1 sheet with correct columns")
#     glori1_raw = new_glori1_df
# else:
#     print(f"  Single DataFrame")

# def process_new_glori1(df):
#     """Process new GLORI-1 with 2 replicates"""
#     df = df.copy()
#     if 'm6A_level_mean' in df.columns:
#         return df
#     df['m6A_level_rep1_pct'] = df['m6A_level_rep1'] * 100
#     df['m6A_level_rep2_pct'] = df['m6A_level_rep2'] * 100
#     df['m6A_level_mean'] = (df['m6A_level_rep1_pct'] + df['m6A_level_rep2_pct']) / 2
#     df['AGCov_mean'] = (df['AGCov_rep1'] + df['AGCov_rep2']) / 2
#     print(f"  ‚úì Processed: {len(df):,} sites, Mean: {df['m6A_level_mean'].mean():.2f}%")
#     return df

# new_glori1 = process_new_glori1(glori1_raw)

# ============================================================
# GLORI-2 (10ng only)
# ============================================================
print("\nüìÇ Loading GLORI-2 (10ng)...")
loader = OrthogonalDataloader(M6A_GLORI2_FILE)
glori2_raw = loader.load_data()

def process_glori2_10ng(sheets_dict):
    """Load and process only the 10ng GLORI-2 data"""
    # Find the 10ng sheet
    target_df = None
    for sheet_name, df in sheets_dict.items():
        if '10ng' in sheet_name and ('mRNA_input' in sheet_name or 'ng_mRNA' in sheet_name):
            target_df = df.copy()
            print(f"  ‚úì Found sheet: '{sheet_name}'")
            break
    
    if target_df is None:
        raise ValueError("Could not find 10ng GLORI-2 sheet")
    
    # Find all m6A_level columns (replicates)
    m6a_cols = [col for col in target_df.columns if 'm6A_level' in col and col != 'm6A_level_mean']
    
    if m6a_cols:
        # Calculate mean across replicates
        target_df['m6A_level_mean'] = target_df[m6a_cols].mean(axis=1)
        print(f"  ‚úì Averaged {len(m6a_cols)} replicates")
    
    print(f"  ‚úì Processed: {len(target_df):,} sites, Mean: {target_df['m6A_level_mean'].mean():.2f}%")
    return target_df

glori2_10ng = process_glori2_10ng(glori2_raw)
combined_glori_2 = glori2_10ng

print("\n‚úì m6A orthogonal data loaded")


üìÇ Loading GLORI-1 (NEW)...
  Loading '41592_2025_2680_MOESM5_ESM(1).xlsb'...
  Multi-sheet Excel: 2 sheets
  ‚úì Using sheet: '10ng_mRNA_input'
  ‚úì Processed: 76,452 sites, Mean: 46.70%

üìÇ Loading GLORI-2 (10ng)...
  Loading '41592_2025_2680_MOESM3_ESM.xlsb'...
  ‚úì Found sheet: '10ng_mRNA_input'
  ‚úì Averaged 3 replicates
  ‚úì Processed: 101,613 sites, Mean: 45.40%

‚úì m6A orthogonal data loaded


## Load Other Orthogonal Data

In [8]:
"""
=================================================================================
LOAD OTHER ORTHOGONAL DATA
=================================================================================
"""

# ============================================================
# m5C
# ============================================================
print("\nüìÇ Loading m5C...")
loader = OrthogonalDataloader(M5C_FILE)
m5c_raw = loader.load_data()
m5c_orthogonal_df = m5c_raw[
    ~(m5c_raw['gene_type'] == 'rRNA') & ~(m5c_raw['gene_type'] == 'tRNA')
].copy()
print(f"  ‚úì {len(m5c_orthogonal_df):,} sites")

# ============================================================
# BID-seq
# ============================================================
print("\nüìÇ Loading BID-seq...")
loader = OrthogonalDataloader(PSI_BIDSEQ_FILE)
bid_raw = loader.load_data()
bid_seq_df = bid_raw['Sheet1'] if isinstance(bid_raw, dict) else bid_raw
bid_seq_df.columns = bid_seq_df.iloc[2]
bid_seq_df = bid_seq_df[3:].reset_index(drop=True)
print(f"  ‚úì {len(bid_seq_df):,} sites")

# ============================================================
# PRAISE
# ============================================================
print("\nüìÇ Loading PRAISE...")
loader = OrthogonalDataloader(PSI_PRAISE_FILE)
praise_raw = loader.load_data()

if isinstance(praise_raw, dict):
    praise_df = None
    for sheet_name in praise_raw.keys():
        if 'human' in sheet_name.lower() and 'sites' in sheet_name.lower():
            praise_df = praise_raw[sheet_name]
            print(f"  ‚úì Using: '{sheet_name}'")
            break
    if praise_df is None:
        raise ValueError("PRAISE sheet not found")
else:
    praise_df = praise_raw

# Map to genomic coordinates
print("  Parsing GENCODE...")
transcript_exons, gene_to_transcript, transcript_info = parse_gencode_gtf(GENCODE_GTF)

results = []
for idx, row in praise_df.iterrows():
    gene_name = row['gene']
    position = row['Postion']
    transcript_id = gene_to_transcript.get(gene_name)
    if transcript_id:
        chrom, genomic_pos, strand = transcript_to_genomic(transcript_id, position, transcript_exons)
    else:
        transcript_id, chrom, genomic_pos, strand = None, None, None, None
    results.append({
        'transcript_id': transcript_id,
        'chromosome': chrom,
        'genomic_position': genomic_pos,
        'strand': strand
    })

praise_with_genomic = praise_df.copy()
for col in ['transcript_id', 'chromosome', 'genomic_position', 'strand']:
    praise_with_genomic[col] = [r[col] for r in results]

praise_filtered = praise_with_genomic[
    (praise_with_genomic['chromosome'].notna()) & 
    (praise_with_genomic['genomic_position'].notna()) &
    (~praise_with_genomic['gene'].str.contains('trna', case=False, na=False)) &
    (~praise_with_genomic['gene'].str.contains('rrna', case=False, na=False))
]
print(f"  ‚úì {len(praise_filtered):,} sites mapped")

# ============================================================
# Inosine
# ============================================================
print("\nüìÇ Loading Inosine...")
loader = OrthogonalDataloader(INO_FILE)
ino_raw = loader.load_data()

keep_locations = ['intergenic', 'exonic', 'UTR3', 'UTR5', 'UTR5;UTR3']
ino_dfs = []
for sheet_name, df in ino_raw.items():
    if 'HEK293T-rep' in sheet_name:
        rep_num = sheet_name.split('rep')[1][0]
        df_filtered = df[df['Location'].isin(keep_locations)].copy()
        df_filtered['replicate'] = f'ino_{rep_num}'
        ino_dfs.append(df_filtered)

combined_ino = pd.concat(ino_dfs, ignore_index=True)
print(f"  ‚úì {len(combined_ino):,} sites")

# ============================================================
# 2'OMe
# ============================================================
print("\nüìÇ Loading 2'OMe...")
loader = OrthogonalDataloader(TWOME_FILE)
twome_raw = loader.load_data()

twome_df = None
for sheet_name in twome_raw.keys():
    if 'HEK293T' in sheet_name or 'HEK293' in sheet_name:
        twome_df = twome_raw[sheet_name]
        break

df = twome_df.copy()
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)
df_condensed = df[['Chr', 'Position', 'Strand', 'Nm', 'ID']].copy()
df_condensed = df_condensed.rename(columns={'ID': 'Gene'})
df_condensed['Position'] = pd.to_numeric(df_condensed['Position'])

OMe_A = df_condensed[df_condensed['Nm'] == 'A'].copy()
OMe_C = df_condensed[df_condensed['Nm'] == 'C'].copy()
OMe_G = df_condensed[df_condensed['Nm'] == 'G'].copy()
OMe_U = df_condensed[df_condensed['Nm'] == 'U'].copy()

print(f"  ‚úì A={len(OMe_A)}, C={len(OMe_C)}, G={len(OMe_G)}, U={len(OMe_U)}")

print("\n‚úì All orthogonal data loaded")


üìÇ Loading m5C...
  Loading 'GSE225614_HEK293T-WT_sites.tsv.gz'...
  ‚úì 2,191 sites

üìÇ Loading BID-seq...
  Loading 'GSE179798_HEK293T_mRNA_WT_BID-seq.xlsx'...
  ‚úì 543 sites

üìÇ Loading PRAISE...
  Loading '41589_2015_BFnchembio1836_MOESM158_ESM.xlsx'...
  ‚úì Using: 'œà sites in human'
  Parsing GENCODE...
  ‚úì 1,900 sites mapped

üìÇ Loading Inosine...
  Loading 'Data_S2_A-to-I_sites_identified_by_slic-seq.xlsx'...
  ‚úì 54,010 sites

üìÇ Loading 2'OMe...
  Loading '1-s2.0-S2667237524000365-mmc3.xlsx'...
  ‚úì A=314, C=650, G=645, U=450

‚úì All orthogonal data loaded


## Utility Function

In [9]:
"""
=================================================================================
UTILITY FUNCTIONS
=================================================================================
"""

def get_drs_sites(mod_dict: Dict, cell_line: str, mod: str) -> Set[str]:
    """Extract DRS sites as set"""
    key = f"{cell_line}_{mod}"
    if key not in mod_dict:
        return set()
    
    df = mod_dict[key]
    if 'Adjusted_Mod_Proportion' in df.columns:
        df_filtered = df.filter((pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20))
    else:
        df_filtered = df.filter(pl.col('Score') >= 20)
    
    sample_chr = df_filtered['Chromosome'][0] if len(df_filtered) > 0 else None
    if sample_chr and not str(sample_chr).startswith('chr'):
        sites = set(('chr' + df_filtered['Chromosome'].cast(pl.Utf8) + '_' + 
                    df_filtered['End'].cast(pl.Int64).cast(pl.Utf8)).to_list())
    else:
        sites = set((df_filtered['Chromosome'].cast(pl.Utf8) + '_' + 
                    df_filtered['End'].cast(pl.Int64).cast(pl.Utf8)).to_list())
    return sites

def get_drs_values(mod_dict: Dict, cell_line: str, mod: str) -> pl.DataFrame:
    """Extract DRS sites WITH values"""
    key = f"{cell_line}_{mod}"
    if key not in mod_dict:
        return None
    
    df = mod_dict[key]
    if 'Adjusted_Mod_Proportion' in df.columns:
        df_filtered = df.filter((pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20))
    else:
        df_filtered = df.filter(pl.col('Score') >= 20)
    
    sample_chr = df_filtered['Chromosome'][0] if len(df_filtered) > 0 else None
    if sample_chr and not str(sample_chr).startswith('chr'):
        df_filtered = df_filtered.with_columns([
            ('chr' + pl.col('Chromosome').cast(pl.Utf8) + '_' + 
             pl.col('End').cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
        ])
    else:
        df_filtered = df_filtered.with_columns([
            (pl.col('Chromosome').cast(pl.Utf8) + '_' + 
             pl.col('End').cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
        ])
    return df_filtered.select(['site_id', 'Adjusted_Mod_Proportion'])

def process_orthogonal_sites(df, chr_col: str, pos_col: str) -> Set[str]:
    """Extract orthogonal sites as set"""
    if df is None:
        return set()
    
    if isinstance(df, pl.DataFrame):
        sample_chr = df[chr_col][0] if len(df) > 0 else None
        if sample_chr and not str(sample_chr).startswith('chr'):
            sites = set(('chr' + df[chr_col].cast(pl.Utf8) + '_' + 
                        df[pos_col].cast(pl.Float64).cast(pl.Int64).cast(pl.Utf8)).to_list())
        else:
            sites = set((df[chr_col].cast(pl.Utf8) + '_' + 
                        df[pos_col].cast(pl.Float64).cast(pl.Int64).cast(pl.Utf8)).to_list())
    else:
        sample_chr = str(df[chr_col].iloc[0]) if len(df) > 0 else None
        pos_int = df[pos_col].astype(float).astype(int).astype(str)
        if sample_chr and sample_chr.startswith('chr'):
            sites = set(df[chr_col].astype(str) + '_' + pos_int)
        else:
            sites = set('chr' + df[chr_col].astype(str) + '_' + pos_int)
    return sites

def process_orthogonal_values(df, chr_col: str, pos_col: str, value_col: str) -> pl.DataFrame:
    """Extract orthogonal sites WITH values"""
    if isinstance(df, pl.DataFrame):
        sample_chr = df[chr_col][0] if len(df) > 0 else None
        if sample_chr and not str(sample_chr).startswith('chr'):
            df_processed = df.with_columns([
                ('chr' + pl.col(chr_col).cast(pl.Utf8) + '_' + 
                 pl.col(pos_col).cast(pl.Float64).cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
            ])
        else:
            df_processed = df.with_columns([
                (pl.col(chr_col).cast(pl.Utf8) + '_' + 
                 pl.col(pos_col).cast(pl.Float64).cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
            ])
    else:
        ortho_clean = df.dropna(subset=[pos_col])
        sample_chr = str(ortho_clean[chr_col].iloc[0]) if len(ortho_clean) > 0 else None
        pos_int = ortho_clean[pos_col].astype(float).astype(int).astype(str)
        if sample_chr and sample_chr.startswith('chr'):
            site_ids = ortho_clean[chr_col].astype(str) + '_' + pos_int
        else:
            site_ids = 'chr' + ortho_clean[chr_col].astype(str) + '_' + pos_int
        df_processed = pl.DataFrame({
            'site_id': site_ids.tolist(),
            value_col: ortho_clean[value_col].tolist()
        })
    return df_processed.select(['site_id', value_col])

print("‚úì Utility functions loaded")

‚úì Utility functions loaded


## GLORI Combintation

In [10]:
"""
=================================================================================
GLORI COMBINATION HELPER (FIXED)
=================================================================================
"""

def create_glori_combined_values(new_glori1, combined_glori_2, mode='intersection'):
    """Create combined GLORI dataset"""
    
    # GLORI-1: Ensure proper types before converting to polars
    glori1_clean = new_glori1.copy()
    glori1_clean['Chr'] = glori1_clean['Chr'].astype(str)
    glori1_clean['Site'] = pd.to_numeric(glori1_clean['Site'], errors='coerce').astype('Int64')
    glori1_clean['m6A_level_mean'] = pd.to_numeric(glori1_clean['m6A_level_mean'], errors='coerce')
    glori1_clean = glori1_clean.dropna(subset=['Site', 'm6A_level_mean'])
    
    glori1_pl = pl.from_pandas(glori1_clean[['Chr', 'Site', 'm6A_level_mean']]).with_columns([
        (pl.col('Chr').cast(pl.Utf8) + '_' + pl.col('Site').cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
    ]).select(['site_id', 'm6A_level_mean'])
    
    # GLORI-2: Ensure proper types before converting to polars
    glori2_clean = combined_glori_2.copy()
    glori2_clean['Chr'] = glori2_clean['Chr'].astype(str)
    glori2_clean['Site'] = pd.to_numeric(glori2_clean['Site'], errors='coerce').astype('Int64')
    glori2_clean['m6A_level_mean'] = pd.to_numeric(glori2_clean['m6A_level_mean'], errors='coerce')
    glori2_clean = glori2_clean.dropna(subset=['Site', 'm6A_level_mean'])
    
    glori2_pl = pl.from_pandas(glori2_clean[['Chr', 'Site', 'm6A_level_mean']]).with_columns([
        (pl.col('Chr').cast(pl.Utf8) + '_' + pl.col('Site').cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
    ]).select(['site_id', 'm6A_level_mean'])
    
    if mode == 'intersection':
        combined = glori1_pl.join(
            glori2_pl.rename({'m6A_level_mean': 'm6A_level_mean_g2'}), 
            on='site_id', how='inner'
        ).with_columns([
            ((pl.col('m6A_level_mean') + pl.col('m6A_level_mean_g2')) / 2).alias('m6A_combined')
        ]).select(['site_id', 'm6A_combined'])
    else:
        glori1_prep = glori1_pl.select([pl.col('site_id'), pl.col('m6A_level_mean').alias('value')])
        glori2_prep = glori2_pl.select([pl.col('site_id'), pl.col('m6A_level_mean').alias('value')])
        all_glori = pl.concat([glori1_prep, glori2_prep])
        combined = all_glori.group_by('site_id').agg([
            pl.col('value').mean().alias('m6A_combined')
        ]).select(['site_id', 'm6A_combined'])
    
    return combined

print("‚úÖ GLORI combination helper loaded (fixed)")

‚úÖ GLORI combination helper loaded (fixed)


## Venn Diagram Helpers

In [11]:
"""
=================================================================================
VENN DIAGRAM HELPER FUNCTIONS
=================================================================================
"""

def plot_venn2_colored(sets, labels, colors, title, ax=None):
    """Plot 2-way Venn with custom colors"""
    if ax is None:
        fig, ax = plt.subplots(figsize=(4, 4))
    plt.sca(ax)
    venn2(sets, set_labels=labels, set_colors=colors, alpha=ALPHA)
    ax.set_title(title, fontweight='bold', fontsize=14)
    return ax

def plot_venn3_colored(sets, labels, patch_colors, title, ax=None):
    """Plot 3-way Venn with custom colors"""
    if ax is None:
        fig, ax = plt.subplots(figsize=(5, 5))
    plt.sca(ax)
    venn = venn3(sets, set_labels=labels)
    for region_id, color in patch_colors.items():
        patch = venn.get_patch_by_id(region_id)
        if patch:
            patch.set_facecolor(color)
            patch.set_alpha(ALPHA)
    ax.set_title(title, fontweight='bold', fontsize=14)
    return ax

print("‚úì Venn helpers loaded")

‚úì Venn helpers loaded


In [12]:
"""
=================================================================================
OVERLAP PERCENTAGE ANALYSIS
=================================================================================
"""

def calculate_overlap_percentages(set1, set2, set1_name, set2_name):
    """
    Calculate comprehensive overlap statistics between two sets
    
    Returns:
    --------
    dict with overlap metrics
    """
    intersection = set1 & set2
    union = set1 | set2
    
    # Jaccard Index: intersection / union
    jaccard = len(intersection) / len(union) if len(union) > 0 else 0
    
    # Percentage of set1 found in set2
    pct_set1_in_set2 = (len(intersection) / len(set1) * 100) if len(set1) > 0 else 0
    
    # Percentage of set2 found in set1
    pct_set2_in_set1 = (len(intersection) / len(set2) * 100) if len(set2) > 0 else 0
    
    # Overlap coefficient: intersection / min(set1, set2)
    overlap_coef = (len(intersection) / min(len(set1), len(set2))) * 100 if min(len(set1), len(set2)) > 0 else 0
    
    return {
        'comparison': f'{set1_name} vs {set2_name}',
        'set1_name': set1_name,
        'set2_name': set2_name,
        'set1_size': len(set1),
        'set2_size': len(set2),
        'intersection': len(intersection),
        'union': len(union),
        'jaccard_index': jaccard * 100,  # Convert to percentage
        f'pct_{set1_name}_in_{set2_name}': pct_set1_in_set2,
        f'pct_{set2_name}_in_{set1_name}': pct_set2_in_set1,
        'overlap_coefficient': overlap_coef
    }

def analyze_m6a_overlap_agreement(dorado_mods_dict, new_glori1, combined_glori_2, 
                                   cell_line='HEK293', output_path=None):
    """
    Compare Dorado-GLORI overlap against GLORI-GLORI overlap
    
    Key question: Is Dorado's agreement with orthogonal methods similar to 
    the agreement between the two orthogonal methods themselves?
    """
    
    if output_path is None:
        output_path = OUTPUT_DIR / f'm6a_overlap_analysis_{cell_line}.csv'
    
    print("\n" + "="*80)
    print(f"m6A OVERLAP ANALYSIS - {cell_line.upper()}")
    print("="*80)
    
    # Get site sets
    drs_sites = get_drs_sites(dorado_mods_dict, cell_line, 'm6a')
    glori1_sites = process_orthogonal_sites(new_glori1, 'Chr', 'Site')
    glori2_sites = process_orthogonal_sites(combined_glori_2, 'Chr', 'Site')
    
    print(f"\nüìä Dataset Sizes:")
    print(f"  {cell_line} DRS:  {len(drs_sites):>8,} sites")
    print(f"  GLORI-1:         {len(glori1_sites):>8,} sites")
    print(f"  GLORI-2:         {len(glori2_sites):>8,} sites")
    
    # Calculate all pairwise overlaps
    results = []
    
    # 1. Dorado vs GLORI-1
    print(f"\n{'‚îÄ'*80}")
    print(f"1Ô∏è‚É£  {cell_line} DRS vs GLORI-1")
    print(f"{'‚îÄ'*80}")
    result1 = calculate_overlap_percentages(drs_sites, glori1_sites, 
                                            f'{cell_line}_DRS', 'GLORI1')
    results.append(result1)
    print(f"  Overlap: {result1['intersection']:,} sites")
    print(f"  Jaccard: {result1['jaccard_index']:.2f}%")
    print(f"  {result1['pct_HEK293_DRS_in_GLORI1' if cell_line=='HEK293' else 'pct_GM12878_DRS_in_GLORI1']:.2f}% of {cell_line} sites found in GLORI-1")
    print(f"  {result1['pct_GLORI1_in_HEK293_DRS' if cell_line=='HEK293' else 'pct_GLORI1_in_GM12878_DRS']:.2f}% of GLORI-1 sites found in {cell_line}")
    
    # 2. Dorado vs GLORI-2
    print(f"\n{'‚îÄ'*80}")
    print(f"2Ô∏è‚É£  {cell_line} DRS vs GLORI-2")
    print(f"{'‚îÄ'*80}")
    result2 = calculate_overlap_percentages(drs_sites, glori2_sites, 
                                            f'{cell_line}_DRS', 'GLORI2')
    results.append(result2)
    print(f"  Overlap: {result2['intersection']:,} sites")
    print(f"  Jaccard: {result2['jaccard_index']:.2f}%")
    print(f"  {result2['pct_HEK293_DRS_in_GLORI2' if cell_line=='HEK293' else 'pct_GM12878_DRS_in_GLORI2']:.2f}% of {cell_line} sites found in GLORI-2")
    print(f"  {result2['pct_GLORI2_in_HEK293_DRS' if cell_line=='HEK293' else 'pct_GLORI2_in_GM12878_DRS']:.2f}% of GLORI-2 sites found in {cell_line}")
    
    # 3. GLORI-1 vs GLORI-2 (GOLD STANDARD COMPARISON)
    print(f"\n{'‚îÄ'*80}")
    print(f"‚≠ê GLORI-1 vs GLORI-2 (Orthogonal Method Agreement)")
    print(f"{'‚îÄ'*80}")
    result3 = calculate_overlap_percentages(glori1_sites, glori2_sites, 
                                            'GLORI1', 'GLORI2')
    results.append(result3)
    print(f"  Overlap: {result3['intersection']:,} sites")
    print(f"  Jaccard: {result3['jaccard_index']:.2f}%")
    print(f"  {result3['pct_GLORI1_in_GLORI2']:.2f}% of GLORI-1 sites found in GLORI-2")
    print(f"  {result3['pct_GLORI2_in_GLORI1']:.2f}% of GLORI-2 sites found in GLORI-1")
    
    # Create summary DataFrame
    summary_df = pd.DataFrame(results)
    
    # Comparative Analysis
    print(f"\n{'='*80}")
    print("üìä COMPARATIVE ANALYSIS")
    print(f"{'='*80}")
    
    glori_jaccard = result3['jaccard_index']
    drs_glori1_jaccard = result1['jaccard_index']
    drs_glori2_jaccard = result2['jaccard_index']
    
    print(f"\nüéØ Jaccard Index Comparison:")
    print(f"  GLORI-1 vs GLORI-2:     {glori_jaccard:>6.2f}%  ‚≠ê (orthogonal baseline)")
    print(f"  {cell_line} vs GLORI-1: {drs_glori1_jaccard:>6.2f}%  ({drs_glori1_jaccard/glori_jaccard*100:>5.1f}% of baseline)")
    print(f"  {cell_line} vs GLORI-2: {drs_glori2_jaccard:>6.2f}%  ({drs_glori2_jaccard/glori_jaccard*100:>5.1f}% of baseline)")
    
    avg_drs_jaccard = (drs_glori1_jaccard + drs_glori2_jaccard) / 2
    print(f"\n  Average {cell_line} agreement: {avg_drs_jaccard:.2f}% ({avg_drs_jaccard/glori_jaccard*100:.1f}% of baseline)")
    
    # Interpretation
    print(f"\nüí° Interpretation:")
    if avg_drs_jaccard >= glori_jaccard * 0.8:
        print(f"  ‚úÖ {cell_line} DRS shows STRONG agreement with orthogonal methods")
        print(f"     (comparable to inter-method agreement)")
    elif avg_drs_jaccard >= glori_jaccard * 0.6:
        print(f"  ‚ö†Ô∏è  {cell_line} DRS shows MODERATE agreement with orthogonal methods")
        print(f"     (lower than inter-method agreement)")
    else:
        print(f"  ‚ùå {cell_line} DRS shows WEAK agreement with orthogonal methods")
        print(f"     (substantially lower than inter-method agreement)")
    
    # Save results
    summary_df.to_csv(output_path, index=False)
    print(f"\n‚úÖ Saved detailed results to {output_path}")
    
    # Create visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Plot 1: Jaccard Index comparison
    ax1 = axes[0]
    comparisons = [f'{cell_line}\nvs\nGLORI-1', f'{cell_line}\nvs\nGLORI-2', 'GLORI-1\nvs\nGLORI-2']
    jaccard_values = [drs_glori1_jaccard, drs_glori2_jaccard, glori_jaccard]
    colors_plot = ['#1f77b4', '#1f77b4', '#ff7f0e']
    
    bars = ax1.bar(comparisons, jaccard_values, color=colors_plot, alpha=0.7, edgecolor='black')
    ax1.axhline(y=glori_jaccard, color='#ff7f0e', linestyle='--', linewidth=2, 
                label='Orthogonal Baseline', alpha=0.7)
    ax1.set_ylabel('Jaccard Index (%)', fontsize=12, fontweight='bold')
    ax1.set_title(f'm6A Overlap Agreement - {cell_line}', fontsize=14, fontweight='bold')
    ax1.legend()
    ax1.grid(axis='y', alpha=0.3)
    
    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
    
    # Plot 2: Overlap coefficient comparison
    ax2 = axes[1]
    overlap_coefs = [result1['overlap_coefficient'], 
                     result2['overlap_coefficient'], 
                     result3['overlap_coefficient']]
    
    bars2 = ax2.bar(comparisons, overlap_coefs, color=colors_plot, alpha=0.7, edgecolor='black')
    ax2.axhline(y=result3['overlap_coefficient'], color='#ff7f0e', linestyle='--', 
                linewidth=2, label='Orthogonal Baseline', alpha=0.7)
    ax2.set_ylabel('Overlap Coefficient (%)', fontsize=12, fontweight='bold')
    ax2.set_title('Overlap Coefficient Comparison', fontsize=14, fontweight='bold')
    ax2.legend()
    ax2.grid(axis='y', alpha=0.3)
    
    # Add value labels
    for bar in bars2:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    
    viz_path = output_path.with_suffix('.pdf')
    plt.savefig(viz_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úÖ Saved visualization to {viz_path}")
    
    return summary_df

print("‚úÖ Overlap analysis functions loaded")

‚úÖ Overlap analysis functions loaded


## Heatmap Functions

In [13]:
"""
=================================================================================
HEATMAP FUNCTIONS
=================================================================================
"""

def plot_single_heatmap(ax, drs_df, ortho_df, drs_col, ortho_col, title, colormap):
    """Plot single 2D histogram heatmap"""
    merged = drs_df.join(ortho_df, on='site_id', how='inner')
    
    if len(merged) == 0:
        ax.text(0.5, 0.5, 'No overlapping sites', ha='center', va='center', fontsize=12)
        ax.set_title(title, fontweight='bold', fontsize=11)
        return
    
    drs_values = merged[drs_col].to_numpy()
    ortho_values = merged[ortho_col].to_numpy()
    
    bandwidth = 5
    bins = np.arange(0, 100 + bandwidth, bandwidth)
    hist, xedges, yedges = np.histogram2d(drs_values, ortho_values, bins=bins)
    
    im = ax.imshow(hist.T, norm=LogNorm(vmin=1, vmax=10**3),
                   origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]],
                   aspect='auto', cmap=colormap, interpolation='nearest')
    
    cbar = plt.colorbar(im, ax=ax, ticks=np.logspace(0, 3, 4))
    cbar.set_ticklabels(['$10^0$', '$10^1$', '$10^2$', '$10^3$'])
    cbar.set_label('Site count', fontsize=10)
    
    correlation = np.corrcoef(drs_values, ortho_values)[0, 1]
    
    ax.set_xlabel('DRS Mod %', fontsize=11)
    ax.set_ylabel('Orthogonal Mod %', fontsize=11)
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.plot([0, 100], [0, 100], 'k--', linewidth=1.5)
    
    stats_text = f'n = {len(merged):,}\nr = {correlation:.3f}'
    ax.text(0.05, 0.95, stats_text, transform=ax.transAxes, 
            fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

def plot_m6a_heatmaps_complete(dorado_mods_dict, new_glori1, combined_glori_2, mode='both', output_path=None):
    """Complete m6A heatmaps - all 4 comparisons"""
    if output_path is None:
        output_path = OUTPUT_DIR / f'm6a_heatmaps_{mode}_complete.pdf'
    
    print(f"\nm6A Heatmaps - {mode.upper()}")
    
    glori1_df = process_orthogonal_values(new_glori1, 'Chr', 'Site', 'm6A_level_mean')
    glori2_df = process_orthogonal_values(combined_glori_2, 'Chr', 'Site', 'm6A_level_mean')
    glori_int = create_glori_combined_values(new_glori1, combined_glori_2, mode='intersection')
    glori_union = create_glori_combined_values(new_glori1, combined_glori_2, mode='union')
    
    if mode == 'HEK293':
        drs_df = get_drs_values(dorado_mods_dict, 'HEK293', 'm6a')
        fig = plt.figure(figsize=(16, 8))
        
        ax1 = plt.subplot(2, 2, 1)
        plot_single_heatmap(ax1, drs_df, glori1_df, 'Adjusted_Mod_Proportion', 
                           'm6A_level_mean', 'HEK293 vs GLORI-1', HEATMAP_CMAPS['m6a_glori1'])
        ax2 = plt.subplot(2, 2, 2)
        plot_single_heatmap(ax2, drs_df, glori2_df, 'Adjusted_Mod_Proportion', 
                           'm6A_level_mean', 'HEK293 vs GLORI-2', HEATMAP_CMAPS['m6a_glori2'])
        ax3 = plt.subplot(2, 2, 3)
        plot_single_heatmap(ax3, drs_df, glori_int, 'Adjusted_Mod_Proportion', 
                           'm6A_combined', 'HEK293 vs GLORI-1 ‚à© GLORI-2', HEATMAP_CMAPS['m6a_combined'])
        ax4 = plt.subplot(2, 2, 4)
        plot_single_heatmap(ax4, drs_df, glori_union, 'Adjusted_Mod_Proportion', 
                           'm6A_combined', 'HEK293 vs GLORI-1 ‚à™ GLORI-2', 'magma')
        fig.suptitle('m6A Modification Levels - HEK293', fontsize=16, fontweight='bold')
    
    elif mode == 'GM12878':
        drs_df = get_drs_values(dorado_mods_dict, 'GM12878', 'm6a')
        if drs_df is None:
            print("  ‚ö†Ô∏è  GM12878 data not available")
            return
        fig = plt.figure(figsize=(16, 8))
        
        ax1 = plt.subplot(2, 2, 1)
        plot_single_heatmap(ax1, drs_df, glori1_df, 'Adjusted_Mod_Proportion', 
                           'm6A_level_mean', 'GM12878 vs GLORI-1', HEATMAP_CMAPS['m6a_glori1'])
        ax2 = plt.subplot(2, 2, 2)
        plot_single_heatmap(ax2, drs_df, glori2_df, 'Adjusted_Mod_Proportion', 
                           'm6A_level_mean', 'GM12878 vs GLORI-2', HEATMAP_CMAPS['m6a_glori2'])
        ax3 = plt.subplot(2, 2, 3)
        plot_single_heatmap(ax3, drs_df, glori_int, 'Adjusted_Mod_Proportion', 
                           'm6A_combined', 'GM12878 vs GLORI-1 ‚à© GLORI-2', HEATMAP_CMAPS['m6a_combined'])
        ax4 = plt.subplot(2, 2, 4)
        plot_single_heatmap(ax4, drs_df, glori_union, 'Adjusted_Mod_Proportion', 
                           'm6A_combined', 'GM12878 vs GLORI-1 ‚à™ GLORI-2', 'magma')
        fig.suptitle('m6A Modification Levels - GM12878', fontsize=16, fontweight='bold')
    
    elif mode == 'both':
        hek_drs = get_drs_values(dorado_mods_dict, 'HEK293', 'm6a')
        gm_drs = get_drs_values(dorado_mods_dict, 'GM12878', 'm6a')
        
        if gm_drs is not None:
            fig = plt.figure(figsize=(16, 16))
            n_rows = 4
        else:
            fig = plt.figure(figsize=(16, 8))
            n_rows = 2
        
        # HEK293
        ax1 = plt.subplot(n_rows, 2, 1)
        plot_single_heatmap(ax1, hek_drs, glori1_df, 'Adjusted_Mod_Proportion', 
                           'm6A_level_mean', 'HEK293 vs GLORI-1', HEATMAP_CMAPS['m6a_glori1'])
        ax2 = plt.subplot(n_rows, 2, 2)
        plot_single_heatmap(ax2, hek_drs, glori2_df, 'Adjusted_Mod_Proportion', 
                           'm6A_level_mean', 'HEK293 vs GLORI-2', HEATMAP_CMAPS['m6a_glori2'])
        ax3 = plt.subplot(n_rows, 2, 3)
        plot_single_heatmap(ax3, hek_drs, glori_int, 'Adjusted_Mod_Proportion', 
                           'm6A_combined', 'HEK293 vs GLORI-1 ‚à© GLORI-2', HEATMAP_CMAPS['m6a_combined'])
        ax4 = plt.subplot(n_rows, 2, 4)
        plot_single_heatmap(ax4, hek_drs, glori_union, 'Adjusted_Mod_Proportion', 
                           'm6A_combined', 'HEK293 vs GLORI-1 ‚à™ GLORI-2', 'magma')
        
        # GM12878 (if available)
        if gm_drs is not None:
            ax5 = plt.subplot(4, 2, 5)
            plot_single_heatmap(ax5, gm_drs, glori1_df, 'Adjusted_Mod_Proportion', 
                               'm6A_level_mean', 'GM12878 vs GLORI-1', HEATMAP_CMAPS['m6a_glori1'])
            ax6 = plt.subplot(4, 2, 6)
            plot_single_heatmap(ax6, gm_drs, glori2_df, 'Adjusted_Mod_Proportion', 
                               'm6A_level_mean', 'GM12878 vs GLORI-2', HEATMAP_CMAPS['m6a_glori2'])
            ax7 = plt.subplot(4, 2, 7)
            plot_single_heatmap(ax7, gm_drs, glori_int, 'Adjusted_Mod_Proportion', 
                               'm6A_combined', 'GM12878 vs GLORI-1 ‚à© GLORI-2', HEATMAP_CMAPS['m6a_combined'])
            ax8 = plt.subplot(4, 2, 8)
            plot_single_heatmap(ax8, gm_drs, glori_union, 'Adjusted_Mod_Proportion', 
                               'm6A_combined', 'GM12878 vs GLORI-1 ‚à™ GLORI-2', 'magma')
        
        fig.suptitle('m6A Modification Levels - Both Cell Lines', fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úì Saved to {output_path}")

print("‚úì Heatmap functions loaded")

‚úì Heatmap functions loaded


## m6A Venn Diagrams

In [14]:
"""
=================================================================================
m6A VENN DIAGRAMS - SYMMETRIC LAYOUTS
=================================================================================
"""

def plot_m6a_venns(dorado_mods_dict, new_glori1, combined_glori_2, 
                   mode='both', glori_combine_mode='intersection', output_path=None):
    """m6A Venns - symmetric layouts for HEK293 and GM12878"""
    if output_path is None:
        output_path = OUTPUT_DIR / f'm6a_venns_{mode}_{glori_combine_mode}.pdf'
    
    print(f"\nm6A Venns - {mode.upper()} - {glori_combine_mode.upper()}")
    
    colors = MODIFICATION_COLORS['m6a']
    glori1_sites = process_orthogonal_sites(new_glori1, 'Chr', 'Site')
    glori2_sites = process_orthogonal_sites(combined_glori_2, 'Chr', 'Site')
    
    glori_combined = glori1_sites & glori2_sites if glori_combine_mode == 'intersection' else glori1_sites | glori2_sites
    combine_label = 'GLORI-1 ‚à© GLORI-2' if glori_combine_mode == 'intersection' else 'GLORI-1 ‚à™ GLORI-2'
    
    print(f"  GLORI-1: {len(glori1_sites):,}")
    print(f"  GLORI-2: {len(glori2_sites):,}")
    print(f"  {combine_label}: {len(glori_combined):,}")
    
    if mode in ['HEK293', 'GM12878']:
        drs_sites = get_drs_sites(dorado_mods_dict, mode, 'm6a')
        print(f"  {mode} DRS: {len(drs_sites):,}")
        
        color_drs = colors['DRS_HEK293'] if mode == 'HEK293' else colors['DRS_GM12878']
        
        # SYMMETRIC LAYOUT: 2x2 grid (same for both cell lines)
        fig = plt.figure(figsize=(10, 10))
        
        # 1. DRS vs Combined GLORI
        ax1 = plt.subplot(2, 2, 1)
        plot_venn2_colored([drs_sites, glori_combined], [f'{mode} DRS', combine_label],
                          [color_drs, colors['GLORI_combined']], 
                          f'm6A: {mode} vs GLORI Combined', ax1)
        
        # 2. Three-way (DRS, GLORI-1, GLORI-2)
        ax2 = plt.subplot(2, 2, 2)
        patch_colors = {
            '100': color_drs, '010': colors['GLORI1'], '001': colors['GLORI2'],
            '110': colors['HEK_GLORI1'], '101': colors['HEK_GLORI2'],
            '011': colors['GLORI1_GLORI2'], '111': colors['ALL_THREE']
        }
        plot_venn3_colored([drs_sites, glori1_sites, glori2_sites],
                          [mode, 'GLORI-1', 'GLORI-2'], patch_colors, 
                          f'm6A: {mode} All Methods', ax2)
        
        # 3. DRS vs GLORI-1 only
        ax3 = plt.subplot(2, 2, 3)
        plot_venn2_colored([drs_sites, glori1_sites], [f'{mode} DRS', 'GLORI-1'],
                          [color_drs, colors['GLORI1']], 
                          f'm6A: {mode} vs GLORI-1', ax3)
        
        # 4. DRS vs GLORI-2 only
        ax4 = plt.subplot(2, 2, 4)
        plot_venn2_colored([drs_sites, glori2_sites], [f'{mode} DRS', 'GLORI-2'],
                          [color_drs, colors['GLORI2']], 
                          f'm6A: {mode} vs GLORI-2', ax4)
        
        fig.suptitle(f'm6A Sites - {mode} ({glori_combine_mode})', fontsize=16, fontweight='bold')
    
    elif mode == 'both':
        hek_sites = get_drs_sites(dorado_mods_dict, 'HEK293', 'm6a')
        gm_sites = get_drs_sites(dorado_mods_dict, 'GM12878', 'm6a')
        
        print(f"  HEK293: {len(hek_sites):,}, GM12878: {len(gm_sites):,}")
        
        fig = plt.figure(figsize=(15, 10))
        
        # Row 1: Overall comparisons
        ax1 = plt.subplot(3, 3, 1)
        patch_colors = {
            '100': colors['DRS_HEK293'], '010': colors['DRS_GM12878'], '001': colors['GLORI_combined'],
            '110': '#bcbd22', '101': '#17becf', '011': '#e377c2', '111': '#7f7f7f'
        }
        plot_venn3_colored([hek_sites, gm_sites, glori_combined],
                          ['HEK293', 'GM12878', 'GLORI'], patch_colors, 
                          'm6A: Both Cells vs GLORI', ax1)
        
        ax2 = plt.subplot(3, 3, 2)
        plot_venn2_colored([hek_sites, gm_sites], ['HEK293', 'GM12878'],
                          [colors['DRS_HEK293'], colors['DRS_GM12878']], 
                          'm6A: Cell Line Comparison', ax2)
        
        ax3 = plt.subplot(3, 3, 3)
        plot_venn2_colored([glori1_sites, glori2_sites], ['GLORI-1', 'GLORI-2'],
                          [colors['GLORI1'], colors['GLORI2']], 
                          'm6A: GLORI Methods', ax3)
        
        # Row 2: HEK293
        ax4 = plt.subplot(3, 3, 4)
        plot_venn2_colored([hek_sites, glori_combined], ['HEK293', combine_label],
                          [colors['DRS_HEK293'], colors['GLORI_combined']], 
                          'm6A: HEK293 vs GLORI', ax4)
        
        ax5 = plt.subplot(3, 3, 5)
        patch_colors = {
            '100': colors['DRS_HEK293'], '010': colors['GLORI1'], '001': colors['GLORI2'],
            '110': colors['HEK_GLORI1'], '101': colors['HEK_GLORI2'],
            '011': colors['GLORI1_GLORI2'], '111': colors['ALL_THREE']
        }
        plot_venn3_colored([hek_sites, glori1_sites, glori2_sites],
                          ['HEK293', 'GLORI-1', 'GLORI-2'], patch_colors, 
                          'm6A: HEK293 All Methods', ax5)
        
        ax6 = plt.subplot(3, 3, 6)
        plot_venn2_colored([hek_sites, glori1_sites], ['HEK293', 'GLORI-1'],
                          [colors['DRS_HEK293'], colors['GLORI1']], 
                          'm6A: HEK293 vs GLORI-1', ax6)
        
        # Row 3: GM12878 (SYMMETRIC TO HEK293)
        ax7 = plt.subplot(3, 3, 7)
        plot_venn2_colored([gm_sites, glori_combined], ['GM12878', combine_label],
                          [colors['DRS_GM12878'], colors['GLORI_combined']], 
                          'm6A: GM12878 vs GLORI', ax7)
        
        ax8 = plt.subplot(3, 3, 8)
        patch_colors = {
            '100': colors['DRS_GM12878'], '010': colors['GLORI1'], '001': colors['GLORI2'],
            '110': colors['HEK_GLORI1'], '101': colors['HEK_GLORI2'],
            '011': colors['GLORI1_GLORI2'], '111': colors['ALL_THREE']
        }
        plot_venn3_colored([gm_sites, glori1_sites, glori2_sites],
                          ['GM12878', 'GLORI-1', 'GLORI-2'], patch_colors, 
                          'm6A: GM12878 All Methods', ax8)
        
        ax9 = plt.subplot(3, 3, 9)
        plot_venn2_colored([gm_sites, glori1_sites], ['GM12878', 'GLORI-1'],
                          [colors['DRS_GM12878'], colors['GLORI1']], 
                          'm6A: GM12878 vs GLORI-1', ax9)
        
        fig.suptitle(f'm6A Sites - Complete ({glori_combine_mode})', fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úì Saved to {output_path}")

print("‚úì m6A Venn function loaded (symmetric)")

‚úì m6A Venn function loaded (symmetric)


## Other Modification Venn Diagrams

In [15]:
"""
=================================================================================
OTHER MODIFICATION VENN DIAGRAMS - SYMMETRIC LAYOUTS
=================================================================================
"""

# ============================================================
# m5C
# ============================================================

def plot_m5c_venns(dorado_mods_dict, m5c_orthogonal_df, mode='both', output_path=None):
    """m5C Venns - symmetric for HEK293 and GM12878"""
    if output_path is None:
        output_path = OUTPUT_DIR / f'm5c_venns_{mode}.pdf'
    
    print(f"\nm5C Venns - {mode.upper()}")
    colors = MODIFICATION_COLORS['m5c']
    orth_sites = process_orthogonal_sites(m5c_orthogonal_df, 'chromosome', 'position')
    print(f"  Orthogonal: {len(orth_sites):,}")
    
    if mode in ['HEK293', 'GM12878']:
        drs_sites = get_drs_sites(dorado_mods_dict, mode, 'm5c')
        print(f"  {mode} DRS: {len(drs_sites):,}, Overlap: {len(drs_sites & orth_sites):,}")
        
        color_drs = colors['DRS_HEK293'] if mode == 'HEK293' else colors['DRS_GM12878']
        
        # SYMMETRIC LAYOUT: Single comparison (only 1 orthogonal method)
        fig, ax = plt.subplots(figsize=(5, 5))
        plot_venn2_colored([drs_sites, orth_sites], [f'{mode} DRS', 'Orthogonal'],
                          [color_drs, colors['Orthogonal']], 
                          f'm5C: {mode} vs Orthogonal', ax)
        fig.suptitle(f'm5C Sites - {mode}', fontsize=16, fontweight='bold')
        
    elif mode == 'both':
        hek_sites = get_drs_sites(dorado_mods_dict, 'HEK293', 'm5c')
        gm_sites = get_drs_sites(dorado_mods_dict, 'GM12878', 'm5c')
        
        print(f"  HEK293: {len(hek_sites):,}, GM12878: {len(gm_sites):,}")
        
        fig = plt.figure(figsize=(10, 10))
        
        # Row 1: Overall
        ax1 = plt.subplot(2, 2, 1)
        patch_colors = {
            '100': colors['DRS_HEK293'], '010': colors['DRS_GM12878'], '001': colors['Orthogonal'],
            '110': colors['HEK_GM'], '101': colors['HEK_Orth'], '011': colors['GM_Orth'],
            '111': colors['ALL_THREE']
        }
        plot_venn3_colored([hek_sites, gm_sites, orth_sites],
                          ['HEK293', 'GM12878', 'Orthogonal'], patch_colors, 
                          'm5C: Three-way', ax1)
        
        ax2 = plt.subplot(2, 2, 2)
        plot_venn2_colored([hek_sites, gm_sites], ['HEK293', 'GM12878'],
                          [colors['DRS_HEK293'], colors['DRS_GM12878']], 
                          'm5C: Cell Lines', ax2)
        
        # Row 2: Cell-specific (SYMMETRIC)
        ax3 = plt.subplot(2, 2, 3)
        plot_venn2_colored([hek_sites, orth_sites], ['HEK293 DRS', 'Orthogonal'],
                          [colors['DRS_HEK293'], colors['Orthogonal']], 
                          'm5C: HEK293 vs Orthogonal', ax3)
        
        ax4 = plt.subplot(2, 2, 4)
        plot_venn2_colored([gm_sites, orth_sites], ['GM12878 DRS', 'Orthogonal'],
                          [colors['DRS_GM12878'], colors['Orthogonal']], 
                          'm5C: GM12878 vs Orthogonal', ax4)
        
        fig.suptitle('m5C Sites - Complete', fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úì Saved to {output_path}")

# ============================================================
# Pseudouridine
# ============================================================

def plot_psi_venns(dorado_mods_dict, bid_seq_df, praise_filtered, mode='both', output_path=None):
    """Pseudouridine Venns - symmetric for HEK293 and GM12878"""
    if output_path is None:
        output_path = OUTPUT_DIR / f'psi_venns_{mode}.pdf'
    
    print(f"\nPseudouridine Venns - {mode.upper()}")
    colors = MODIFICATION_COLORS['psi']
    
    bid_sites = process_orthogonal_sites(bid_seq_df, 'chr', 'pos')
    praise_sites = process_orthogonal_sites(praise_filtered, 'chromosome', 'genomic_position')
    combined_orth = bid_sites & praise_sites
    all_orth = bid_sites | praise_sites
    
    print(f"  BID-seq: {len(bid_sites):,}, PRAISE: {len(praise_sites):,}")
    print(f"  BID ‚à© PRAISE: {len(combined_orth):,}, BID ‚à™ PRAISE: {len(all_orth):,}")
    
    if mode in ['HEK293', 'GM12878']:
        drs_sites = get_drs_sites(dorado_mods_dict, mode, 'psi')
        print(f"  {mode} DRS: {len(drs_sites):,}")
        
        color_drs = colors['DRS_HEK293'] if mode == 'HEK293' else colors['DRS_GM12878']
        
        # SYMMETRIC LAYOUT: 2x2 grid
        fig = plt.figure(figsize=(10, 10))
        
        # 1. DRS vs All Orthogonal (Union)
        ax1 = plt.subplot(2, 2, 1)
        plot_venn2_colored([drs_sites, all_orth], [f'{mode} DRS', 'All Orthogonal'],
                          [color_drs, colors['Combined']], 
                          f'Œ®: {mode} vs All Orthogonal', ax1)
        
        # 2. Three-way (DRS, PRAISE, BID-seq)
        ax2 = plt.subplot(2, 2, 2)
        patch_colors = {
            '100': color_drs, '010': colors['PRAISE'], '001': colors['BID-seq'],
            '110': colors['HEK_PRAISE'], '101': colors['HEK_BID'], '011': colors['BID_PRAISE'],
            '111': colors['ALL_THREE']
        }
        plot_venn3_colored([drs_sites, praise_sites, bid_sites],
                          [mode, 'PRAISE', 'BID-seq'], patch_colors, 
                          f'Œ®: {mode} All Methods', ax2)
        
        # 3. DRS vs PRAISE only
        ax3 = plt.subplot(2, 2, 3)
        plot_venn2_colored([drs_sites, praise_sites], [f'{mode} DRS', 'PRAISE'],
                          [color_drs, colors['PRAISE']], 
                          f'Œ®: {mode} vs PRAISE', ax3)
        
        # 4. DRS vs BID-seq only
        ax4 = plt.subplot(2, 2, 4)
        plot_venn2_colored([drs_sites, bid_sites], [f'{mode} DRS', 'BID-seq'],
                          [color_drs, colors['BID-seq']], 
                          f'Œ®: {mode} vs BID-seq', ax4)
        
        fig.suptitle(f'Pseudouridine (Œ®) Sites - {mode}', fontsize=16, fontweight='bold')
    
    elif mode == 'both':
        hek_sites = get_drs_sites(dorado_mods_dict, 'HEK293', 'psi')
        gm_sites = get_drs_sites(dorado_mods_dict, 'GM12878', 'psi')
        
        print(f"  HEK293: {len(hek_sites):,}, GM12878: {len(gm_sites):,}")
        
        fig = plt.figure(figsize=(15, 10))
        
        # Row 1: Overall
        ax1 = plt.subplot(3, 3, 1)
        patch_colors = {
            '100': colors['DRS_HEK293'], '010': colors['DRS_GM12878'], '001': colors['Combined'],
            '110': '#bcbd22', '101': '#17becf', '011': '#e377c2', '111': '#7f7f7f'
        }
        plot_venn3_colored([hek_sites, gm_sites, all_orth],
                          ['HEK293', 'GM12878', 'All Orth'], patch_colors, 
                          'Œ®: Both Cells vs Orth', ax1)
        
        ax2 = plt.subplot(3, 3, 2)
        plot_venn2_colored([hek_sites, gm_sites], ['HEK293', 'GM12878'],
                          [colors['DRS_HEK293'], colors['DRS_GM12878']], 
                          'Œ®: Cell Lines', ax2)
        
        ax3 = plt.subplot(3, 3, 3)
        plot_venn2_colored([praise_sites, bid_sites], ['PRAISE', 'BID-seq'],
                          [colors['PRAISE'], colors['BID-seq']], 
                          'Œ®: Orth Methods', ax3)
        
        # Row 2: HEK293 (SYMMETRIC LAYOUT)
        ax4 = plt.subplot(3, 3, 4)
        plot_venn2_colored([hek_sites, all_orth], ['HEK293 DRS', 'All Orth'],
                          [colors['DRS_HEK293'], colors['Combined']], 
                          'Œ®: HEK293 vs All Orth', ax4)
        
        ax5 = plt.subplot(3, 3, 5)
        patch_colors = {
            '100': colors['DRS_HEK293'], '010': colors['PRAISE'], '001': colors['BID-seq'],
            '110': colors['HEK_PRAISE'], '101': colors['HEK_BID'], '011': colors['BID_PRAISE'],
            '111': colors['ALL_THREE']
        }
        plot_venn3_colored([hek_sites, praise_sites, bid_sites],
                          ['HEK293', 'PRAISE', 'BID'], patch_colors, 
                          'Œ®: HEK293 All Methods', ax5)
        
        ax6 = plt.subplot(3, 3, 6)
        plot_venn2_colored([hek_sites, praise_sites], ['HEK293', 'PRAISE'],
                          [colors['DRS_HEK293'], colors['PRAISE']], 
                          'Œ®: HEK293 vs PRAISE', ax6)
        
        # Row 3: GM12878 (SYMMETRIC TO HEK293)
        ax7 = plt.subplot(3, 3, 7)
        plot_venn2_colored([gm_sites, all_orth], ['GM12878 DRS', 'All Orth'],
                          [colors['DRS_GM12878'], colors['Combined']], 
                          'Œ®: GM12878 vs All Orth', ax7)
        
        ax8 = plt.subplot(3, 3, 8)
        patch_colors = {
            '100': colors['DRS_GM12878'], '010': colors['PRAISE'], '001': colors['BID-seq'],
            '110': colors['HEK_PRAISE'], '101': colors['HEK_BID'], '011': colors['BID_PRAISE'],
            '111': colors['ALL_THREE']
        }
        plot_venn3_colored([gm_sites, praise_sites, bid_sites],
                          ['GM12878', 'PRAISE', 'BID'], patch_colors, 
                          'Œ®: GM12878 All Methods', ax8)
        
        ax9 = plt.subplot(3, 3, 9)
        plot_venn2_colored([gm_sites, praise_sites], ['GM12878', 'PRAISE'],
                          [colors['DRS_GM12878'], colors['PRAISE']], 
                          'Œ®: GM12878 vs PRAISE', ax9)
        
        fig.suptitle('Pseudouridine (Œ®) Sites - Complete', fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úì Saved to {output_path}")

# ============================================================
# Inosine
# ============================================================

def plot_inosine_venns(dorado_mods_dict, combined_ino, mode='both', output_path=None):
    """Inosine Venns - symmetric for HEK293 and GM12878"""
    if output_path is None:
        output_path = OUTPUT_DIR / f'inosine_venns_{mode}.pdf'
    
    print(f"\nInosine Venns - {mode.upper()}")
    colors = MODIFICATION_COLORS['inosine']
    orth_sites = process_orthogonal_sites(combined_ino, 'Chromosome', 'position')
    print(f"  Orthogonal: {len(orth_sites):,}")
    
    if mode in ['HEK293', 'GM12878']:
        drs_sites = get_drs_sites(dorado_mods_dict, mode, 'inosine')
        print(f"  {mode} DRS: {len(drs_sites):,}, Overlap: {len(drs_sites & orth_sites):,}")
        
        color_drs = colors['DRS_HEK293'] if mode == 'HEK293' else colors['DRS_GM12878']
        
        # SYMMETRIC LAYOUT: Single comparison
        fig, ax = plt.subplots(figsize=(5, 5))
        plot_venn2_colored([drs_sites, orth_sites], [f'{mode} DRS', 'Orthogonal'],
                          [color_drs, colors['Orthogonal']], 
                          f'Inosine: {mode} vs Orthogonal', ax)
        fig.suptitle(f'Inosine Sites - {mode}', fontsize=16, fontweight='bold')
        
    elif mode == 'both':
        hek_sites = get_drs_sites(dorado_mods_dict, 'HEK293', 'inosine')
        gm_sites = get_drs_sites(dorado_mods_dict, 'GM12878', 'inosine')
        
        print(f"  HEK293: {len(hek_sites):,}, GM12878: {len(gm_sites):,}")
        
        fig = plt.figure(figsize=(10, 10))
        
        # Row 1: Overall
        ax1 = plt.subplot(2, 2, 1)
        patch_colors = {
            '100': colors['DRS_HEK293'], '010': colors['DRS_GM12878'], '001': colors['Orthogonal'],
            '110': colors['HEK_GM'], '101': colors['HEK_Orth'], '011': colors['GM_Orth'],
            '111': colors['ALL_THREE']
        }
        plot_venn3_colored([hek_sites, gm_sites, orth_sites],
                          ['HEK293', 'GM12878', 'Orth'], patch_colors, 
                          'Inosine: Three-way', ax1)
        
        ax2 = plt.subplot(2, 2, 2)
        plot_venn2_colored([hek_sites, gm_sites], ['HEK293', 'GM12878'],
                          [colors['DRS_HEK293'], colors['DRS_GM12878']], 
                          'Inosine: Cell Lines', ax2)
        
        # Row 2: Cell-specific (SYMMETRIC)
        ax3 = plt.subplot(2, 2, 3)
        plot_venn2_colored([hek_sites, orth_sites], ['HEK293 DRS', 'Orthogonal'],
                          [colors['DRS_HEK293'], colors['Orthogonal']], 
                          'Inosine: HEK293 vs Orth', ax3)
        
        ax4 = plt.subplot(2, 2, 4)
        plot_venn2_colored([gm_sites, orth_sites], ['GM12878 DRS', 'Orthogonal'],
                          [colors['DRS_GM12878'], colors['Orthogonal']], 
                          'Inosine: GM12878 vs Orth', ax4)
        
        fig.suptitle('Inosine Sites - Complete', fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úì Saved to {output_path}")

# ============================================================
# 2'-O-Methylation
# ============================================================

def plot_2ome_venns(dorado_mods_dict, OMe_A, OMe_C, OMe_G, OMe_U, mode='both', output_path=None):
    """2'OMe Venns - symmetric for HEK293 and GM12878"""
    if output_path is None:
        output_path = OUTPUT_DIR / f'2ome_venns_{mode}.pdf'
    
    print(f"\n2'OMe Venns - {mode.upper()}")
    colors = MODIFICATION_COLORS['2ome']
    
    orth_A = process_orthogonal_sites(OMe_A, 'Chr', 'Position')
    orth_C = process_orthogonal_sites(OMe_C, 'Chr', 'Position')
    orth_G = process_orthogonal_sites(OMe_G, 'Chr', 'Position')
    orth_U = process_orthogonal_sites(OMe_U, 'Chr', 'Position')
    orth_all = orth_A | orth_C | orth_G | orth_U
    
    print(f"  Orth: A={len(orth_A)}, C={len(orth_C)}, G={len(orth_G)}, U={len(orth_U)}, Total={len(orth_all)}")
    
    if mode in ['HEK293', 'GM12878']:
        drs_A = get_drs_sites(dorado_mods_dict, mode, '2OMeA')
        drs_C = get_drs_sites(dorado_mods_dict, mode, '2OMeC')
        drs_G = get_drs_sites(dorado_mods_dict, mode, '2OMeG')
        drs_U = get_drs_sites(dorado_mods_dict, mode, '2OMeU')
        drs_all = drs_A | drs_C | drs_G | drs_U
        
        print(f"  {mode}: A={len(drs_A)}, C={len(drs_C)}, G={len(drs_G)}, U={len(drs_U)}, Total={len(drs_all)}")
        
        # SYMMETRIC LAYOUT: 2x3 grid (same for both cell lines)
        fig = plt.figure(figsize=(15, 10))
        
        # Row 1: Overall + individual bases A & C
        ax1 = plt.subplot(2, 3, 1)
        color_drs_main = colors[f'DRS_{mode.upper()}_A']
        plot_venn2_colored([drs_all, orth_all], [f'{mode} DRS', 'Orthogonal'],
                          [color_drs_main, colors['Orthogonal_A']], 
                          f"2'OMe: {mode} Combined", ax1)
        
        ax2 = plt.subplot(2, 3, 2)
        plot_venn2_colored([drs_A, orth_A], [f'{mode} DRS', 'Orth'],
                          [colors[f'DRS_{mode.upper()}_A'], colors['Orthogonal_A']], 
                          f"2'OMe-A: {mode}", ax2)
        
        ax3 = plt.subplot(2, 3, 3)
        plot_venn2_colored([drs_C, orth_C], [f'{mode} DRS', 'Orth'],
                          [colors[f'DRS_{mode.upper()}_C'], colors['Orthogonal_C']], 
                          f"2'OMe-C: {mode}", ax3)
        
        # Row 2: Individual bases G & U + summary
        ax4 = plt.subplot(2, 3, 4)
        plot_venn2_colored([drs_G, orth_G], [f'{mode} DRS', 'Orth'],
                          [colors[f'DRS_{mode.upper()}_G'], colors['Orthogonal_G']], 
                          f"2'OMe-G: {mode}", ax4)
        
        ax5 = plt.subplot(2, 3, 5)
        plot_venn2_colored([drs_U, orth_U], [f'{mode} DRS', 'Orth'],
                          [colors[f'DRS_{mode.upper()}_U'], colors['Orthogonal_U']], 
                          f"2'OMe-U: {mode}", ax5)
        
        ax6 = plt.subplot(2, 3, 6)
        ax6.axis('off')
        total_overlap = len(drs_all & orth_all)
        summary = f"""
        2'O-Methylation
        {mode} vs HEK293T
        
        DRS Total:   {len(drs_all):,}
          A: {len(drs_A):,}
          C: {len(drs_C):,}
          G: {len(drs_G):,}
          U: {len(drs_U):,}
        
        Orth Total:  {len(orth_all):,}
        
        Overlap:     {total_overlap:,}
        """
        ax6.text(0.1, 0.5, summary, fontsize=10, verticalalignment='center', fontfamily='monospace')
        
        fig.suptitle(f"2'-O-Methylation - {mode}", fontsize=16, fontweight='bold')
    
    elif mode == 'both':
        hek_A = get_drs_sites(dorado_mods_dict, 'HEK293', '2OMeA')
        hek_C = get_drs_sites(dorado_mods_dict, 'HEK293', '2OMeC')
        hek_G = get_drs_sites(dorado_mods_dict, 'HEK293', '2OMeG')
        hek_U = get_drs_sites(dorado_mods_dict, 'HEK293', '2OMeU')
        hek_all = hek_A | hek_C | hek_G | hek_U
        
        gm_A = get_drs_sites(dorado_mods_dict, 'GM12878', '2OMeA')
        gm_C = get_drs_sites(dorado_mods_dict, 'GM12878', '2OMeC')
        gm_G = get_drs_sites(dorado_mods_dict, 'GM12878', '2OMeG')
        gm_U = get_drs_sites(dorado_mods_dict, 'GM12878', '2OMeU')
        gm_all = gm_A | gm_C | gm_G | gm_U
        
        print(f"  HEK293: {len(hek_all):,}, GM12878: {len(gm_all):,}")
        
        fig = plt.figure(figsize=(18, 12))
        
        # Row 1: Overall comparisons
        ax1 = plt.subplot(3, 4, 1)
        patch_colors = {
            '100': colors['DRS_HEK293_A'], '010': colors['DRS_GM12878_A'], '001': colors['Orthogonal_A'],
            '110': '#bcbd22', '101': '#17becf', '011': '#e377c2', '111': '#7f7f7f'
        }
        plot_venn3_colored([hek_all, gm_all, orth_all],
                          ['HEK', 'GM', 'Orth'], patch_colors, "2'OMe: Three-way", ax1)
        
        ax2 = plt.subplot(3, 4, 2)
        plot_venn2_colored([hek_all, gm_all], ['HEK293', 'GM12878'],
                          [colors['DRS_HEK293_A'], colors['DRS_GM12878_A']], 
                          "2'OMe: Cell Lines", ax2)
        
        ax3 = plt.subplot(3, 4, 3)
        plot_venn2_colored([hek_all, orth_all], ['HEK293', 'Orth'],
                          [colors['DRS_HEK293_A'], colors['Orthogonal_A']], 
                          "2'OMe: HEK vs Orth", ax3)
        
        ax4 = plt.subplot(3, 4, 4)
        plot_venn2_colored([gm_all, orth_all], ['GM12878', 'Orth'],
                          [colors['DRS_GM12878_A'], colors['Orthogonal_A']], 
                          "2'OMe: GM vs Orth", ax4)
        
        # Row 2: HEK293 by base (SYMMETRIC)
        for idx, (base, hek, orth, hek_color, orth_color) in enumerate([
            ('A', hek_A, orth_A, colors['DRS_HEK293_A'], colors['Orthogonal_A']),
            ('C', hek_C, orth_C, colors['DRS_HEK293_C'], colors['Orthogonal_C']),
            ('G', hek_G, orth_G, colors['DRS_HEK293_G'], colors['Orthogonal_G']),
            ('U', hek_U, orth_U, colors['DRS_HEK293_U'], colors['Orthogonal_U'])
        ]):
            ax = plt.subplot(3, 4, 5 + idx)
            plot_venn2_colored([hek, orth], ['HEK DRS', 'Orth'],
                              [hek_color, orth_color], f"2'OMe-{base}: HEK", ax)
        
        # Row 3: GM12878 by base (SYMMETRIC TO HEK293)
        for idx, (base, gm, orth, gm_color, orth_color) in enumerate([
            ('A', gm_A, orth_A, colors['DRS_GM12878_A'], colors['Orthogonal_A']),
            ('C', gm_C, orth_C, colors['DRS_GM12878_C'], colors['Orthogonal_C']),
            ('G', gm_G, orth_G, colors['DRS_GM12878_G'], colors['Orthogonal_G']),
            ('U', gm_U, orth_U, colors['DRS_GM12878_U'], colors['Orthogonal_U'])
        ]):
            ax = plt.subplot(3, 4, 9 + idx)
            plot_venn2_colored([gm, orth], ['GM DRS', 'Orth'],
                              [gm_color, orth_color], f"2'OMe-{base}: GM", ax)
        
        fig.suptitle("2'-O-Methylation - Both Cell Lines", fontsize=16, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
    plt.show()
    print(f"‚úì Saved to {output_path}")

print("‚úì All Venn functions loaded (symmetric layouts)")

‚úì All Venn functions loaded (symmetric layouts)


## RMSE Calculcation Function

In [16]:
"""
=================================================================================
RMSE/RMSD CALCULATION FUNCTION (FIXED)
=================================================================================
"""

def calculate_rmse(drs_df, ortho_df, drs_col, ortho_col, 
                   chr_col_drs, pos_col_drs, chr_col_ortho, pos_col_ortho,
                   comparison_name, scale_ortho_by_100=False, show_examples=False,
                   ortho_already_processed=False):
    """
    Calculate RMSE between DRS and orthogonal (matched sites only)
    
    Parameters:
    -----------
    ortho_already_processed : bool
        If True, ortho_df already has 'site_id' column (skip coordinate processing)
    """
    
    print(f"\n{'‚îÄ'*70}")
    print(f"{comparison_name}")
    print(f"{'‚îÄ'*70}")
    
    # Process DRS
    drs_processed = drs_df.with_columns([
        (pl.col(chr_col_drs).cast(pl.Utf8) + '_' + 
         pl.col(pos_col_drs).cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
    ]).select(['site_id', drs_col])
    
    # Process Orthogonal (or use as-is if already processed)
    if ortho_already_processed:
        # Already has site_id and value columns
        ortho_processed = ortho_df
        print(f"  Using pre-processed orthogonal data")
    else:
        # Need to create site IDs
        if isinstance(ortho_df, pl.DataFrame):
            sample_chr = ortho_df[chr_col_ortho][0] if len(ortho_df) > 0 else None
            if sample_chr and not str(sample_chr).startswith('chr'):
                ortho_processed = ortho_df.with_columns([
                    ('chr' + pl.col(chr_col_ortho).cast(pl.Utf8) + '_' + 
                     pl.col(pos_col_ortho).cast(pl.Float64).cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
                ])
            else:
                ortho_processed = ortho_df.with_columns([
                    (pl.col(chr_col_ortho).cast(pl.Utf8) + '_' + 
                     pl.col(pos_col_ortho).cast(pl.Float64).cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
                ])
            ortho_processed = ortho_processed.select(['site_id', ortho_col])
        else:
            ortho_clean = ortho_df.dropna(subset=[pos_col_ortho])
            sample_chr = str(ortho_clean[chr_col_ortho].iloc[0]) if len(ortho_clean) > 0 else None
            pos_int = ortho_clean[pos_col_ortho].astype(float).astype(int).astype(str)
            if sample_chr and sample_chr.startswith('chr'):
                site_ids = ortho_clean[chr_col_ortho].astype(str) + '_' + pos_int
            else:
                site_ids = 'chr' + ortho_clean[chr_col_ortho].astype(str) + '_' + pos_int
            ortho_processed = pl.DataFrame({
                'site_id': site_ids.tolist(),
                ortho_col: ortho_clean[ortho_col].tolist()
            })
        
        if scale_ortho_by_100:
            ortho_processed = ortho_processed.with_columns([
                (pl.col(ortho_col) * 100).alias(ortho_col)
            ])
    
    # Inner join
    merged = drs_processed.join(ortho_processed, on='site_id', how='inner')
    print(f"  Matched: {len(merged):,}")
    
    if len(merged) == 0:
        print("  ‚ö†Ô∏è  No overlap")
        return None
    
    drs_values = merged[drs_col].to_numpy()
    ortho_values = merged[ortho_col].to_numpy()
    
    valid_mask = ~(np.isnan(drs_values) | np.isnan(ortho_values))
    if not valid_mask.all():
        drs_values = drs_values[valid_mask]
        ortho_values = ortho_values[valid_mask]
    
    if len(drs_values) == 0:
        return None
    
    differences = drs_values - ortho_values
    rmse = np.sqrt(mean_squared_error(drs_values, ortho_values))
    mae = np.mean(np.abs(differences))
    correlation = np.corrcoef(drs_values, ortho_values)[0, 1]
    
    if show_examples:
        print(f"  Examples: ", end="")
        for i in range(min(3, len(drs_values))):
            print(f"({drs_values[i]:.1f},{ortho_values[i]:.1f}) ", end="")
        print()
    
    print(f"  RMSE: {rmse:.2f}%, MAE: {mae:.2f}%, r: {correlation:.3f}")
    
    return {
        'comparison': comparison_name,
        'n_sites': len(drs_values),
        'rmse': rmse,
        'rmsd': rmse,
        'mae': mae,
        'correlation': correlation,
        'r_squared': correlation**2,
        'mean_diff': np.mean(differences),
        'std_diff': np.std(differences),
        'drs_mean': drs_values.mean(),
        'ortho_mean': ortho_values.mean()
    }

print("‚úì RMSE function loaded (fixed)")

‚úì RMSE function loaded (fixed)


## RMSE for DRS vs Orthogonal

In [17]:
"""
=================================================================================
RMSE: DRS vs ORTHOGONAL - ALL CELL LINES (FIXED)
=================================================================================
"""

def calculate_all_rmse(dorado_mods_dict, new_glori1, combined_glori_2, 
                       m5c_orthogonal_df, bid_seq_df, praise_filtered,
                       cell_line='HEK293', show_examples=False):
    """Calculate RMSE for all modifications"""
    
    print("\n" + "="*80)
    print(f"RMSE: DRS vs ORTHOGONAL - {cell_line.upper()}")
    print("="*80)
    
    results = {}
    cell_lines = ['HEK293', 'GM12878'] if cell_line == 'both' else [cell_line]
    
    for cl in cell_lines:
        if f'{cl}_m6a' not in dorado_mods_dict:
            print(f"\n‚ö†Ô∏è  {cl} data not available")
            continue
        
        print(f"\n{'üî¥'*35} {cl} {'üî¥'*35}")
        
        # Get filtered DRS
        drs_m6a = dorado_mods_dict[f'{cl}_m6a'].filter(
            (pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20)
        )
        
        # m6A - GLORI-1
        result = calculate_rmse(drs_m6a, new_glori1, 'Adjusted_Mod_Proportion', 'm6A_level_mean',
                               'Chromosome', 'End', 'Chr', 'Site',
                               f"{cl} vs GLORI-1", False, show_examples, 
                               ortho_already_processed=False)
        if result: results[f'{cl}_m6A_GLORI1'] = result
        
        # m6A - GLORI-2
        result = calculate_rmse(drs_m6a, combined_glori_2, 'Adjusted_Mod_Proportion', 'm6A_level_mean',
                               'Chromosome', 'End', 'Chr', 'Site',
                               f"{cl} vs GLORI-2", False, show_examples,
                               ortho_already_processed=False)
        if result: results[f'{cl}_m6A_GLORI2'] = result
        
        # m6A - GLORI Intersection (ALREADY PROCESSED - just has site_id and m6A_combined)
        glori_int = create_glori_combined_values(new_glori1, combined_glori_2, 'intersection')
        result = calculate_rmse(drs_m6a, glori_int, 'Adjusted_Mod_Proportion', 'm6A_combined',
                               'Chromosome', 'End', 'site_id', 'site_id',  # Dummy values, won't be used
                               f"{cl} vs GLORI-1‚à©GLORI-2", False, show_examples,
                               ortho_already_processed=True)  # ‚≠ê KEY FIX
        if result: results[f'{cl}_m6A_GLORI_int'] = result
        
        # m5C
        if f'{cl}_m5c' in dorado_mods_dict:
            drs_m5c = dorado_mods_dict[f'{cl}_m5c'].filter(
                (pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20)
            )
            result = calculate_rmse(drs_m5c, m5c_orthogonal_df, 'Adjusted_Mod_Proportion', 'ratio',
                                   'Chromosome', 'End', 'chromosome', 'position',
                                   f"{cl} vs m5C Orth", True, show_examples,
                                   ortho_already_processed=False)
            if result: results[f'{cl}_m5C'] = result
        
        # Pseudouridine - BID-seq
        if f'{cl}_psi' in dorado_mods_dict:
            drs_psi = dorado_mods_dict[f'{cl}_psi'].filter(
                (pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20)
            )
            
            bid_numeric = bid_seq_df.copy()
            bid_numeric['Frac_Ave %'] = pd.to_numeric(bid_numeric['Frac_Ave %'], errors='coerce')
            bid_numeric['pos'] = pd.to_numeric(bid_numeric['pos'], errors='coerce')
            bid_numeric = bid_numeric.dropna(subset=['Frac_Ave %', 'pos'])
            
            result = calculate_rmse(drs_psi, bid_numeric, 'Adjusted_Mod_Proportion', 'Frac_Ave %',
                                   'Chromosome', 'End', 'chr', 'pos',
                                   f"{cl} vs BID-seq", False, show_examples,
                                   ortho_already_processed=False)
            if result: results[f'{cl}_Psi_BID'] = result
            
            # Pseudouridine - PRAISE
            praise_avg = praise_filtered.dropna(subset=['genomic_position']).copy()
            praise_avg['stop_avg_pct'] = ((
                praise_avg['rep1-difference of stop rate'] + 
                praise_avg['rep2-difference of stop rate'] + 
                praise_avg['rep3-difference of stop rate']
            ) / 3) * 100
            
            result = calculate_rmse(drs_psi, praise_avg, 'Adjusted_Mod_Proportion', 'stop_avg_pct',
                                   'Chromosome', 'End', 'chromosome', 'genomic_position',
                                   f"{cl} vs PRAISE", False, show_examples,
                                   ortho_already_processed=False)
            if result: results[f'{cl}_Psi_PRAISE'] = result
    
    # Summary
    if results:
        summary_df = pd.DataFrame(results).T.round(3)
        col_order = ['comparison', 'n_sites', 'correlation', 'r_squared', 
                     'rmse', 'mae', 'mean_diff', 'std_diff', 'drs_mean', 'ortho_mean']
        summary_df = summary_df[col_order]
        
        print("\n" + "="*80)
        print("SUMMARY")
        print("="*80)
        print("\n" + summary_df.to_string())
        
        summary_path = OUTPUT_DIR / f'RMSE_summary_{cell_line}.csv'
        summary_df.to_csv(summary_path, index=False)
        print(f"\n‚úì Saved to {summary_path}")
        
        return summary_df
    
    return None

print("‚úì RMSE DRS vs Orthogonal function loaded (fixed)")

‚úì RMSE DRS vs Orthogonal function loaded (fixed)


## RMSE Between Cell Lines (GM12878 vs HEK293)

In [18]:
"""
=================================================================================
RMSE BETWEEN CELL LINES: GM12878 vs HEK293 (FIXED)
=================================================================================
"""

def calculate_cell_line_rmse(dorado_mods_dict, modifications=['m6a', 'm5c', 'psi', 'inosine']):
    """Calculate RMSE between GM12878 and HEK293 at matching positions"""
    
    print("\n" + "="*80)
    print("RMSE: GM12878 vs HEK293 (Cell Line Comparison)")
    print("="*80)
    
    results = {}
    
    for mod in modifications:
        hek_key = f'HEK293_{mod}'
        gm_key = f'GM12878_{mod}'
        
        print(f"\n{'‚îÄ'*70}")
        print(f"{mod.upper()}")
        print(f"{'‚îÄ'*70}")
        
        if hek_key not in dorado_mods_dict or gm_key not in dorado_mods_dict:
            print(f"  ‚ö†Ô∏è  Data not available for both cell lines")
            continue
        
        # Filter both
        hek_df = dorado_mods_dict[hek_key].filter(
            (pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20)
        )
        gm_df = dorado_mods_dict[gm_key].filter(
            (pl.col('Adjusted_Mod_Proportion') >= 20) & (pl.col('Score') >= 20)
        )
        
        # Add site IDs
        hek_processed = hek_df.with_columns([
            (pl.col('Chromosome').cast(pl.Utf8) + '_' + 
             pl.col('End').cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
        ]).select(['site_id', 'Adjusted_Mod_Proportion'])
        
        gm_processed = gm_df.with_columns([
            (pl.col('Chromosome').cast(pl.Utf8) + '_' + 
             pl.col('End').cast(pl.Int64).cast(pl.Utf8)).alias('site_id')
        ]).select(['site_id', 'Adjusted_Mod_Proportion'])
        
        # Inner join
        merged = hek_processed.join(
            gm_processed.rename({'Adjusted_Mod_Proportion': 'GM_Mod'}), 
            on='site_id', how='inner'
        )
        
        print(f"  HEK: {len(hek_processed):,}, GM: {len(gm_processed):,}, Matched: {len(merged):,}")
        
        if len(merged) == 0:
            continue
        
        hek_values = merged['Adjusted_Mod_Proportion'].to_numpy()
        gm_values = merged['GM_Mod'].to_numpy()
        
        valid_mask = ~(np.isnan(hek_values) | np.isnan(gm_values))
        hek_values = hek_values[valid_mask]
        gm_values = gm_values[valid_mask]
        
        if len(hek_values) == 0:
            continue
        
        differences = hek_values - gm_values
        rmse = np.sqrt(mean_squared_error(hek_values, gm_values))
        mae = np.mean(np.abs(differences))
        correlation = np.corrcoef(hek_values, gm_values)[0, 1]
        
        print(f"  RMSE: {rmse:.2f}%, r: {correlation:.3f}, Diff: {np.mean(differences):+.2f}¬±{np.std(differences):.2f}%")
        
        results[mod] = {
            'modification': mod.upper(),
            'n_sites': len(hek_values),
            'rmse': rmse,
            'mae': mae,
            'correlation': correlation,
            'r_squared': correlation**2,
            'mean_diff': np.mean(differences),
            'std_diff': np.std(differences),
            'hek_mean': hek_values.mean(),
            'gm_mean': gm_values.mean()
        }
    
    if results:
        summary_df = pd.DataFrame(results).T.round(3)
        
        print("\n" + "="*80)
        print("CELL LINE RMSE SUMMARY")
        print("="*80)
        print("\n" + summary_df.to_string())
        
        summary_path = OUTPUT_DIR / 'RMSE_cell_line_comparison.csv'
        summary_df.to_csv(summary_path, index=False)
        print(f"\n‚úì Saved to {summary_path}")
        
        # Visualization - FIXED INDEXING
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))
        
        colors_plot = ['#1f77b4', '#2ca02c', '#9467bd', '#17becf']
        
        # Bar plots
        ax1.barh(summary_df['modification'], summary_df['rmse'], 
                color=colors_plot[:len(summary_df)], alpha=0.7)
        ax1.set_xlabel('RMSE (%)')
        ax1.set_title('RMSE: GM12878 vs HEK293', fontweight='bold')
        ax1.grid(axis='x', alpha=0.3)
        
        ax2.barh(summary_df['modification'], summary_df['correlation'], 
                color=colors_plot[:len(summary_df)], alpha=0.7)
        ax2.set_xlabel('Correlation')
        ax2.set_title('Correlation: GM12878 vs HEK293', fontweight='bold')
        ax2.set_xlim([0, 1])
        ax2.grid(axis='x', alpha=0.3)
        
        ax3.barh(summary_df['modification'], summary_df['n_sites'], 
                color=colors_plot[:len(summary_df)], alpha=0.7)
        ax3.set_xlabel('Matched Sites')
        ax3.set_title('Overlapping Sites', fontweight='bold')
        ax3.grid(axis='x', alpha=0.3)
        
        # Scatter - FIXED: Use enumerate to get integer index
        for i, (idx, row) in enumerate(summary_df.iterrows()):
            ax4.scatter(row['correlation'], row['rmse'], s=row['n_sites']/50, 
                       alpha=0.7, label=row['modification'], color=colors_plot[i])
        ax4.set_xlabel('Correlation')
        ax4.set_ylabel('RMSE (%)')
        ax4.set_title('RMSE vs Correlation', fontweight='bold')
        ax4.legend()
        ax4.grid(alpha=0.3)
        
        fig.suptitle('Cell Line Agreement: GM12878 vs HEK293', fontsize=16, fontweight='bold')
        plt.tight_layout()
        
        viz_path = OUTPUT_DIR / 'RMSE_cell_line_comparison_plot.pdf'
        plt.savefig(viz_path, format='pdf', dpi=600, bbox_inches='tight')
        plt.show()
        print(f"\n‚úì Visualization saved to {viz_path}")
        
        return summary_df
    
    return None

print("‚úì Cell line RMSE function loaded (fixed)")

‚úì Cell line RMSE function loaded (fixed)


## Save Validated Sites

In [19]:
"""
=================================================================================
SAVE ORTHOGONALLY VALIDATED SITES
=================================================================================
"""

def save_validated_sites(dorado_mods_dict, new_glori1, combined_glori_2, 
                         m5c_orthogonal_df, bid_seq_df, praise_filtered, combined_ino,
                         cell_line='HEK293'):
    """Save sites validated by both DRS and orthogonal methods"""
    
    print("\n" + "="*80)
    print(f"SAVING VALIDATED SITES - {cell_line.upper()}")
    print("="*80)
    
    cell_lines = ['HEK293', 'GM12878'] if cell_line == 'both' else [cell_line]
    
    for cl in cell_lines:
        if f'{cl}_m6a' not in dorado_mods_dict:
            print(f"\n‚ö†Ô∏è  {cl} data not available")
            continue
        
        print(f"\n{'‚îÄ'*70}")
        print(f"{cl}")
        print(f"{'‚îÄ'*70}")
        
        output_dir = VALIDATED_DIR / cl
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # m6A - GLORI-1
        drs_m6a = get_drs_values(dorado_mods_dict, cl, 'm6a')
        glori1_vals = process_orthogonal_values(new_glori1, 'Chr', 'Site', 'm6A_level_mean')
        
        validated = drs_m6a.join(glori1_vals.rename({'m6A_level_mean': 'GLORI1_value'}), 
                                on='site_id', how='inner')
        if len(validated) > 0:
            path = output_dir / f'{cl}_m6A_GLORI1_validated.csv'
            validated.write_csv(path)
            print(f"  ‚úì m6A-GLORI1: {len(validated):,} sites ‚Üí {path.name}")
        
        # m6A - GLORI-2
        glori2_vals = process_orthogonal_values(combined_glori_2, 'Chr', 'Site', 'm6A_level_mean')
        validated = drs_m6a.join(glori2_vals.rename({'m6A_level_mean': 'GLORI2_value'}), 
                                on='site_id', how='inner')
        if len(validated) > 0:
            path = output_dir / f'{cl}_m6A_GLORI2_validated.csv'
            validated.write_csv(path)
            print(f"  ‚úì m6A-GLORI2: {len(validated):,} sites ‚Üí {path.name}")
        
        # m5C
        if f'{cl}_m5c' in dorado_mods_dict:
            drs_m5c = get_drs_values(dorado_mods_dict, cl, 'm5c')
            m5c_vals = process_orthogonal_values(m5c_orthogonal_df, 'chromosome', 'position', 'ratio')
            m5c_vals = m5c_vals.with_columns([(pl.col('ratio') * 100).alias('ratio')])
            
            validated = drs_m5c.join(m5c_vals.rename({'ratio': 'Orth_value'}), 
                                    on='site_id', how='inner')
            if len(validated) > 0:
                path = output_dir / f'{cl}_m5C_validated.csv'
                validated.write_csv(path)
                print(f"  ‚úì m5C: {len(validated):,} sites ‚Üí {path.name}")
        
        # Pseudouridine - BID-seq
        if f'{cl}_psi' in dorado_mods_dict:
            drs_psi = get_drs_values(dorado_mods_dict, cl, 'psi')
            
            bid_numeric = bid_seq_df.copy()
            bid_numeric['Frac_Ave %'] = pd.to_numeric(bid_numeric['Frac_Ave %'], errors='coerce')
            bid_numeric['pos'] = pd.to_numeric(bid_numeric['pos'], errors='coerce')
            bid_numeric = bid_numeric.dropna(subset=['Frac_Ave %', 'pos'])
            
            bid_vals = process_orthogonal_values(bid_numeric, 'chr', 'pos', 'Frac_Ave %')
            validated = drs_psi.join(bid_vals.rename({'Frac_Ave %': 'BID_value'}), 
                                    on='site_id', how='inner')
            if len(validated) > 0:
                path = output_dir / f'{cl}_Psi_BIDseq_validated.csv'
                validated.write_csv(path)
                print(f"  ‚úì Œ®-BIDseq: {len(validated):,} sites ‚Üí {path.name}")
        
        # Inosine
        if f'{cl}_inosine' in dorado_mods_dict:
            drs_ino = get_drs_values(dorado_mods_dict, cl, 'inosine')
            ino_sites = process_orthogonal_sites(combined_ino, 'Chromosome', 'position')
            
            validated = drs_ino.filter(pl.col('site_id').is_in(list(ino_sites)))
            if len(validated) > 0:
                path = output_dir / f'{cl}_Inosine_validated.csv'
                validated.write_csv(path)
                print(f"  ‚úì Inosine: {len(validated):,} sites ‚Üí {path.name}")
    
    print(f"\n‚úì Validated sites saved to {VALIDATED_DIR}")

print("‚úì Save validated sites function loaded")

‚úì Save validated sites function loaded


## Master Execution

In [20]:
"""
=================================================================================
MASTER EXECUTION FUNCTION
=================================================================================
"""

def run_complete_analysis(cell_line='both', include_rmse=True, 
                          include_cell_line_rmse=True, 
                          save_validated=True,
                          show_rmse_examples=False):
    """
    Complete analysis for any cell line(s)
    
    Parameters:
    -----------
    cell_line : str
        'HEK293', 'GM12878', or 'both'
    include_rmse : bool
        Calculate RMSE vs orthogonal methods
    include_cell_line_rmse : bool
        Calculate RMSE between GM12878 and HEK293
    save_validated : bool
        Save validated sites to CSV
    show_rmse_examples : bool
        Show example paired values in RMSE
    """
    
    print("\n" + "="*80)
    print(f"üöÄ COMPLETE ANALYSIS: {cell_line.upper()} üöÄ")
    print("="*80)
    
    # ============================================================
    # VENN DIAGRAMS
    # ============================================================
    print("\n" + "üìä"*40)
    print("VENN DIAGRAMS")
    print("üìä"*40)
    
    print("\nüî¥ m6A")
    plot_m6a_venns(dorado_mods_dict, new_glori1, combined_glori_2, 
                   mode=cell_line, glori_combine_mode='intersection')
    plot_m6a_venns(dorado_mods_dict, new_glori1, combined_glori_2, 
                   mode=cell_line, glori_combine_mode='union')
    
    print("\nüü¢ m5C")
    plot_m5c_venns(dorado_mods_dict, m5c_orthogonal_df, mode=cell_line)
    
    print("\nüü£ Pseudouridine")
    plot_psi_venns(dorado_mods_dict, bid_seq_df, praise_filtered, mode=cell_line)
    
    print("\nüîµ Inosine")
    plot_inosine_venns(dorado_mods_dict, combined_ino, mode=cell_line)
    
    print("\nüü§ 2'OMe")
    plot_2ome_venns(dorado_mods_dict, OMe_A, OMe_C, OMe_G, OMe_U, mode=cell_line)

    # ============================================================
    # OVERLAP PERCENTAGE ANALYSIS
    # ============================================================
    print("\n" + "üîç"*40)
    print("OVERLAP ANALYSIS")
    print("üîç"*40)
    
    if cell_line in ['HEK293', 'GM12878']:
        overlap_summary = analyze_m6a_overlap_agreement(
            dorado_mods_dict, new_glori1, combined_glori_2, cell_line=cell_line
        )
    elif cell_line == 'both':
        print("\nüî¥ HEK293")
        overlap_hek = analyze_m6a_overlap_agreement(
            dorado_mods_dict, new_glori1, combined_glori_2, cell_line='HEK293'
        )
        print("\nüîµ GM12878")
        overlap_gm = analyze_m6a_overlap_agreement(
            dorado_mods_dict, new_glori1, combined_glori_2, cell_line='GM12878'
        )
    
    # ============================================================
    # HEATMAPS
    # ============================================================
    print("\n" + "üìà"*40)
    print("HEATMAPS")
    print("üìà"*40)
    
    print("\nüî¥ m6A Heatmaps")
    plot_m6a_heatmaps_complete(dorado_mods_dict, new_glori1, combined_glori_2, mode=cell_line)
    
    # ============================================================
    # RMSE - DRS vs ORTHOGONAL
    # ============================================================
    rmse_summary = None
    if include_rmse:
        print("\n" + "üìä"*40)
        print("RMSE: DRS vs ORTHOGONAL")
        print("üìä"*40)
        
        rmse_summary = calculate_all_rmse(
            dorado_mods_dict, new_glori1, combined_glori_2,
            m5c_orthogonal_df, bid_seq_df, praise_filtered,
            cell_line=cell_line, show_examples=show_rmse_examples
        )
    
    # ============================================================
    # RMSE - BETWEEN CELL LINES
    # ============================================================
    cell_line_rmse = None
    if include_cell_line_rmse and cell_line == 'both':
        print("\n" + "üîÑ"*40)
        print("RMSE: GM12878 vs HEK293")
        print("üîÑ"*40)
        
        cell_line_rmse = calculate_cell_line_rmse(dorado_mods_dict)
    
    # ============================================================
    # SAVE VALIDATED SITES
    # ============================================================
    if save_validated:
        print("\n" + "üíæ"*40)
        print("SAVING VALIDATED SITES")
        print("üíæ"*40)
        
        save_validated_sites(dorado_mods_dict, new_glori1, combined_glori_2,
                            m5c_orthogonal_df, bid_seq_df, praise_filtered, combined_ino,
                            cell_line=cell_line)
    
    # ============================================================
    # SUMMARY
    # ============================================================
    print("\n" + "="*80)
    print("‚úì‚úì‚úì ANALYSIS COMPLETE ‚úì‚úì‚úì")
    print("="*80)
    
    print(f"\nüìÇ Output directory: {OUTPUT_DIR}")
    print("\nGenerated files:")
    print("  üìä Venn Diagrams:")
    print("     - m6a_venns_*_intersection.pdf")
    print("     - m6a_venns_*_union.pdf")
    print("     - m5c_venns_*.pdf")
    print("     - psi_venns_*.pdf")
    print("     - inosine_venns_*.pdf")
    print("     - 2ome_venns_*.pdf")
    
    print("\n  üìà Heatmaps:")
    print("     - m6a_heatmaps_*_complete.pdf")
    
    if include_rmse:
        print("\n  üìä RMSE Summaries:")
        print("     - RMSE_summary_*.csv")
    
    if include_cell_line_rmse and cell_line == 'both':
        print("\n  üîÑ Cell Line Comparison:")
        print("     - RMSE_cell_line_comparison.csv")
        print("     - RMSE_cell_line_comparison_plot.pdf")
    
    if save_validated:
        print(f"\n  üíæ Validated Sites:")
        print(f"     - {VALIDATED_DIR}/")
    
    return {
        'rmse_orthogonal': rmse_summary,
        'rmse_cell_lines': cell_line_rmse
    }

print("‚úì Master execution function loaded")

‚úì Master execution function loaded


## Run Everything

In [21]:
"""
=================================================================================
EXECUTE: RUN COMPLETE ANALYSIS
=================================================================================
"""

# ============================================================
# Choose your mode and run
# ============================================================

# Option 1: HEK293 only
# results = run_complete_analysis('HEK293', include_rmse=True, 
#                                 include_cell_line_rmse=False, 
#                                 save_validated=True)

# Option 2: GM12878 only
# results = run_complete_analysis('GM12878', include_rmse=True, 
#                                 include_cell_line_rmse=False, 
#                                 save_validated=True)

# Option 3: Both cell lines + cell line comparison (RECOMMENDED)
results = run_complete_analysis('both', 
                               include_rmse=True, 
                               include_cell_line_rmse=True, 
                               save_validated=True,
                               show_rmse_examples=False)

print("\n" + "üéâ"*40)
print("ALL DONE!")
print("üéâ"*40)


üöÄ COMPLETE ANALYSIS: BOTH üöÄ

üìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìä
VENN DIAGRAMS
üìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìä

üî¥ m6A

m6A Venns - BOTH - INTERSECTION
  GLORI-1: 76,452
  GLORI-2: 101,613
  GLORI-1 ‚à© GLORI-2: 69,243
  HEK293: 67,517, GM12878: 71,468


  plt.tight_layout()
  plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp
  plt.show()


‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/m6a_venns_both_intersection.pdf

m6A Venns - BOTH - UNION
  GLORI-1: 76,452
  GLORI-2: 101,613
  GLORI-1 ‚à™ GLORI-2: 108,822
  HEK293: 67,517, GM12878: 71,468


  plt.tight_layout()
  plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp
  plt.show()


‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/m6a_venns_both_union.pdf

üü¢ m5C

m5C Venns - BOTH
  Orthogonal: 2,191


1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp


  HEK293: 18,159, GM12878: 31,473
‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/m5c_venns_both.pdf

üü£ Pseudouridine

Pseudouridine Venns - BOTH
  BID-seq: 543, PRAISE: 1,801
  BID ‚à© PRAISE: 1, BID ‚à™ PRAISE: 2,343


  plt.show()
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp


  HEK293: 3,103, GM12878: 3,877
‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/psi_venns_both.pdf

üîµ Inosine

Inosine Venns - BOTH
  Orthogonal: 29,745


  plt.show()
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp


  HEK293: 6,956, GM12878: 12,986
‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/inosine_venns_both.pdf

üü§ 2'OMe

2'OMe Venns - BOTH
  Orth: A=314, C=650, G=645, U=450, Total=2059


  plt.show()
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp


  HEK293: 4,215, GM12878: 8,035
‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/2ome_venns_both.pdf

üîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîç
OVERLAP ANALYSIS
üîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîçüîç

üî¥ HEK293

m6A OVERLAP ANALYSIS - HEK293


  plt.show()



üìä Dataset Sizes:
  HEK293 DRS:    67,517 sites
  GLORI-1:           76,452 sites
  GLORI-2:          101,613 sites

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
1Ô∏è‚É£  HEK293 DRS vs GLORI-1
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
  Overlap: 33,316 sites
  Jaccard: 30.11%
  49.34% of HEK293 sites found in GLORI-1
  43.58% of GLORI-1 sites found in HEK293

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp
  plt.show()
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp
  plt.show()


‚úÖ Saved visualization to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/m6a_overlap_analysis_HEK293.pdf

üîµ GM12878

m6A OVERLAP ANALYSIS - GM12878

üìä Dataset Sizes:
  GM12878 DRS:    71,468 sites
  GLORI-1:           76,452 sites
  GLORI-2:          101,613 sites

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
1Ô∏è‚É£  GM12878 DRS vs GLORI-1
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
  Overlap: 33,062 sites
  Jaccard: 28.79%
  46.26% of GM12878 sites found in GLORI-1
  43.25% of GLORI-1 sites found in GM12878

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
  plt.savefig(output_path, format='pdf', dpi=600, bbox_inches='tight')
1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp
  plt.show()


‚úì Saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/m6a_heatmaps_both_complete.pdf

üìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìä
RMSE: DRS vs ORTHOGONAL
üìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìäüìä

RMSE: DRS vs ORTHOGONAL - BOTH

üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥ HEK293 üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥üî¥

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î

1 extra bytes in post.stringData array
'created' timestamp seems very low; regarding as unix timestamp
  plt.show()



‚úì Visualization saved to /Volumes/AJS_SSD/HEK293/scripts/notebooks/Plots/Plots_Updated_GLORI_Same_Sample/RMSE_cell_line_comparison_plot.pdf

üíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæ
SAVING VALIDATED SITES
üíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæüíæ

SAVING VALIDATED SITES - BOTH

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
HEK293
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
  ‚úì m6A-GLORI1: 33,316 sites ‚Üí HEK293_m6A_GLOR

## Spot Checks

In [22]:
! pip install pyarrow



In [41]:
def spot_check(df, chrom, gene_start, gene_end, df_chrom_col, df_start_col, df_end_col):
    df_sites_in_gene = df[
        (df[df_chrom_col] == chrom) &
        (pd.to_numeric(df[df_end_col]) >= gene_start) & 
        (pd.to_numeric(df[df_end_col]) <= gene_end) &
        (pd.to_numeric(df[df_start_col]) > gene_start) & 
        (pd.to_numeric(df[df_start_col]) < gene_end) 
    ]
    print(f'The number of mods within the gene for the df is {len(df_sites_in_gene)}')
    print(df_sites_in_gene)
    return None

In [42]:
new_glori1

spot_check(new_glori1, chrom = 'chr9', gene_start = 127_505_500, gene_end = 127_508_500, df_chrom_col = 'Chr', df_start_col = 'Site', df_end_col = 'Site')

The number of mods within the gene for the df is 10
        Chr       Site Strand    Gene  m6A_level_rep1 (%)  m6A_level_rep2 (%)  \
71124  chr9  127505538      -  NIBAN2              20.536              29.885   
71125  chr9  127505632      -  NIBAN2              57.778              76.364   
71126  chr9  127506795      -  NIBAN2              12.360              14.865   
71127  chr9  127506829      -  NIBAN2              11.215              16.667   
71128  chr9  127506856      -  NIBAN2              18.310              29.167   
71129  chr9  127506873      -  NIBAN2              57.778              40.541   
71130  chr9  127506963      -  NIBAN2              78.261              76.471   
71131  chr9  127507143      -  NIBAN2              62.500              60.606   
71132  chr9  127507386      -  NIBAN2              14.286              16.667   
71133  chr9  127507944      -  NIBAN2              13.793              13.043   

       m6A_level_rep3 (%)  m6A_level_rep1_pct  m6A_level

In [43]:
combined_glori_2
spot_check(combined_glori_2, chrom = 'chr9', gene_start = 127_505_500, gene_end = 127_508_500, df_chrom_col = 'Chr', df_start_col = 'Site', df_end_col = 'Site')

The number of mods within the gene for the df is 10
        Chr       Site Strand    Gene  m6A_level_rep1 (%)  m6A_level_rep2 (%)  \
94610  chr9  127505538      -  NIBAN2              18.889              27.933   
94611  chr9  127505632      -  NIBAN2              57.143              71.111   
94612  chr9  127506795      -  NIBAN2              16.667              11.852   
94613  chr9  127506829      -  NIBAN2              17.188              12.230   
94614  chr9  127506856      -  NIBAN2              36.364              35.135   
94615  chr9  127506873      -  NIBAN2              46.250              59.615   
94616  chr9  127506963      -  NIBAN2              64.865              77.778   
94617  chr9  127507143      -  NIBAN2              44.737              51.724   
94618  chr9  127507386      -  NIBAN2              12.632              17.273   
94619  chr9  127507944      -  NIBAN2              15.681              16.152   

       m6A_level_rep3 (%)  m6A_level_mean  
94610       

In [44]:
# GPR137 = m5c_orthogonal_df[m5c_orthogonal_df['gene_name'].str.contains('NIBAN2')]
spot_check(m5c_orthogonal_df, chrom = '9', gene_start = 127_505_500, gene_end = 127_508_500, df_chrom_col = 'chromosome', df_start_col = 'position', df_end_col = 'position')
# GPR137

The number of mods within the gene for the df is 1
     chromosome   position strand       gene_type                gene_name  \
2238          9  127506470      -  protein_coding  ENSG00000136830(NIBAN2)   

     gene_pos  unconverted  converted  ratio  
2238    72520           44         11    0.8  


In [45]:
bid_seq_df
spot_check(bid_seq_df, chrom = 'chr9', gene_start = 127_505_500, gene_end = 127_508_500, df_chrom_col = 'chr', df_start_col = 'pos', df_end_col = 'pos')

The number of mods within the gene for the df is 1
2    chr        pos    name        refseq      seg strand Deletion_rep1  \
63  chr9  127506478  NIBAN2  NM_001035534  3' UTR       -      0.511111   

2  Deletion_rep2 Deletion_rep3 Deletion_Ave Motif_1 Motif_2 Frac_rep1 %  \
63      0.386364      0.410256      0.43591   GGTGT     NaN    93.35806   

2  Frac_rep2 % Frac_rep3 % Frac_Ave % Deletion count_rep1 Deletion count_rep2  \
63   80.261231   83.135909  85.585067                  23                  16   

2  Deletion count_rep3  
63                  14  


In [28]:
praise_filtered

Unnamed: 0,Accession Number,Postion,rep1-difference of stop rate,rep2-difference of stop rate,rep3-difference of stop rate,Region,gene,transcript_id,chromosome,genomic_position,strand
0,NM_000041,1035,0.4146,0.4594,0.4285,stop codon,APOE,ENST00000252486.9,chr19,44909262.0,+
1,NM_000081,8851,0.4615,0.5333,0.2500,CDS,LYST,ENST00000389793.7,chr1,235792037.0,-
2,NM_000100,315,0.3830,0.3782,0.3140,CDS,CSTB,ENST00000640406.1,chr21,43774445.0,-
3,NM_000112,4152,0.4666,0.6363,0.4444,3' UTR,SLC26A2,ENST00000286298.5,chr5,149983498.0,+
5,NM_000120,1120,0.5454,0.3500,0.0902,CDS,EPHX1,ENST00000614058.4,chr1,225840017.0,+
...,...,...,...,...,...,...,...,...,...,...,...
2069,NR_110021,927,0.5136,0.5510,0.4181,noncoding,DKC1,ENST00000696588.1,chrX,154765165.0,+
2070,NR_110022,450,0.3653,0.3333,0.4680,noncoding,DKC1,ENST00000696588.1,chrX,154764688.0,+
2071,NR_110023,450,0.4743,0.4166,0.4893,noncoding,DKC1,ENST00000696588.1,chrX,154764688.0,+
2072,NR_110266,2190,0.3601,0.3333,0.1849,noncoding,TMEM161B,ENST00000296595.11,chr5,88220712.0,-


In [29]:
combined_ino

Unnamed: 0,Gene ID,Chromosome,position,strand,coverage,truncated reads,Gene symbol,Location,repeatfamily,replicate
0,ENSG00000284733.1,chr1,492159,-,63,51,"OR4F29(dist=40462),RF00026(dist=24217)",intergenic,./.,ino_1
1,ENSG00000284662.1,chr1,727131,-,29,16,"OR4F16(dist=40458),RNU6-1199P(dist=31102)",intergenic,./.,ino_1
2,ENSG00000284662.1,chr1,727162,-,332,292,"OR4F16(dist=40489),RNU6-1199P(dist=31071)",intergenic,./.,ino_1
3,ENSG00000284662.1,chr1,727708,-,33,19,"OR4F16(dist=41035),RNU6-1199P(dist=30525)",intergenic,./.,ino_1
4,ENSG00000284662.1,chr1,753268,-,6,6,"OR4F16(dist=66595),RNU6-1199P(dist=4965)",intergenic,SINE/Alu,ino_1
...,...,...,...,...,...,...,...,...,...,...
54005,ENSG00000239225.1,chrY,11207389,-,17,6,"TTTY23(dist=1295427),RN7SL702P(dist=1066084)",intergenic,SINE/Alu,ino_3
54006,ENSG00000239225.1,chrY,11208349,-,8,6,"TTTY23(dist=1296387),RN7SL702P(dist=1065124)",intergenic,SINE/Alu,ino_3
54007,ENSG00000239225.1,chrY,11210258,-,22,6,"TTTY23(dist=1298296),RN7SL702P(dist=1063215)",intergenic,SINE/Alu,ino_3
54008,-,chrY,12163926,-,32,25,"NONE(dist=NONE),RN7SL702P(dist=109547)",intergenic,SINE/Alu,ino_3
