## Setup and Imports

In [1]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import pickle
import os
from typing import Dict, List, Set, Tuple, Optional

# Try custom plotting style
try:
    from genometechlab_plotting import setup_style, get_colors
    setup_style('inline')
    HAS_CUSTOM_STYLE = True
except ImportError:
    HAS_CUSTOM_STYLE = False
    plt.style.use('seaborn-v0_8-darkgrid')

# Try IntervalTree for overlap detection
try:
    from intervaltree import IntervalTree
    HAS_INTERVALTREE = True
except ImportError:
    print("IntervalTree not available - overlap detection disabled")
    HAS_INTERVALTREE = False

Loaded 7 Helvetica fonts


## Helper Functions

In [2]:
def parse_gtf_attributes(attributes_string):
    """Parse GTF attribute string"""
    attributes = {}
    for pair in attributes_string.strip().split(';'):
        if not pair.strip():
            continue
        parts = pair.strip().split(' ', 1)
        if len(parts) == 2:
            key, value = parts
            attributes[key] = value.strip('"')
    return attributes

def gencode_slop(g_slop):
    """Parse GENCODE GTF attributes"""
    attrs = parse_gtf_attributes(g_slop)
    exon_num = attrs.get('exon_number')
    iso_name = attrs.get('transcript_name', '')
    
    if exon_num:
        try:
            exon_num = int(exon_num)
        except:
            exon_num = None
    
    return iso_name, exon_num

## Transcript Filtering

In [3]:
def identify_transcripts_to_filter(gtf_path, config):
    """Identify transcripts to filter based on configuration"""
    if not gtf_path.exists():
        raise FileNotFoundError(f"GTF file not found: {gtf_path}")
    
    print("Identifying transcripts to filter...")
    
    filtered = defaultdict(set)
    
    if HAS_INTERVALTREE:
        chrom_intervals = defaultdict(IntervalTree)
        gene_to_intervals = defaultdict(lambda: defaultdict(list))
    
    transcript_to_gene_biotype = {}
    transcript_tags = defaultdict(set)
    transcript_support_level = {}
    transcript_biotypes = {}
    gene_biotypes = {}
    zero_width_count = 0
    
    # First pass: collect information
    with open(gtf_path, 'r') as infile:
        for line in tqdm(infile, desc="Reading GTF"):
            if line.startswith("#"):
                continue
            
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue
            
            feature_type = fields[2]
            attrs = parse_gtf_attributes(fields[8])
            
            gene_id = attrs.get('gene_id')
            gene_name = attrs.get('gene_name')
            gene_biotype = attrs.get('gene_biotype', attrs.get('gene_type', 'unknown'))
            transcript_id = attrs.get('transcript_id')
            transcript_biotype = attrs.get('transcript_biotype', attrs.get('transcript_type', 'unknown'))
            
            if gene_id and gene_biotype:
                gene_biotypes[gene_id] = gene_biotype
            
            if transcript_id:
                if transcript_biotype:
                    transcript_biotypes[transcript_id] = transcript_biotype
                
                if gene_biotype:
                    transcript_to_gene_biotype[transcript_id] = gene_biotype
                
                if 'tag' in attrs:
                    tags = attrs['tag'].split(',') if ',' in attrs['tag'] else [attrs['tag']]
                    transcript_tags[transcript_id].update(tags)
                
                if 'transcript_support_level' in attrs:
                    tsl = attrs['transcript_support_level']
                    if tsl != 'NA':
                        try:
                            transcript_support_level[transcript_id] = int(tsl.replace('tsl', ''))
                        except:
                            pass
            
            if HAS_INTERVALTREE and config.get('filter_overlaps', True):
                if feature_type in ["gene", "exon", "CDS", "transcript"]:
                    chrom = fields[0]
                    start = int(fields[3])
                    end = int(fields[4])
                    
                    if start >= end:
                        zero_width_count += 1
                        continue
                    
                    if gene_name:
                        chrom_intervals[chrom][start:end] = gene_name
                        gene_to_intervals[gene_name][chrom].append((start, end))
    
    if zero_width_count > 0:
        print(f"Skipped {zero_width_count} zero-width intervals")
    
    # Apply filters
    print("Applying biotype filters...")
    
    allowed_gene_biotypes = set(config.get('allowed_gene_biotypes', ['protein_coding']))
    for transcript_id, gene_biotype in transcript_to_gene_biotype.items():
        if gene_biotype not in allowed_gene_biotypes:
            filtered['non_protein_coding_gene'].add(transcript_id)
    
    allowed_transcript_biotypes = set(config.get('allowed_transcript_biotypes', ['protein_coding']))
    for transcript_id, transcript_biotype in transcript_biotypes.items():
        if transcript_biotype not in allowed_transcript_biotypes:
            filtered['non_protein_coding_transcript'].add(transcript_id)
    
    if config.get('filter_retained_introns', True):
        for transcript_id, tags in transcript_tags.items():
            if 'retained_intron' in tags:
                filtered['retained_intron'].add(transcript_id)
    
    if config.get('filter_readthrough', True):
        for transcript_id, tags in transcript_tags.items():
            if 'readthrough_transcript' in tags:
                filtered['readthrough'].add(transcript_id)
    
    if config.get('filter_nmd', True):
        for transcript_id, tags in transcript_tags.items():
            if 'nonsense_mediated_decay' in tags or 'NMD' in tags:
                filtered['nmd'].add(transcript_id)
    
    if config.get('filter_non_canonical', False):
        for transcript_id, tags in transcript_tags.items():
            if not ('canonical' in tags or 'basic' in tags or 'MANE_Select' in tags):
                filtered['non_canonical'].add(transcript_id)
    
    min_tsl = config.get('min_transcript_support_level')
    if min_tsl is not None:
        for transcript_id in transcript_biotypes:
            tsl = transcript_support_level.get(transcript_id, 999)
            if tsl > min_tsl:
                filtered['low_support'].add(transcript_id)
    
    # Find overlapping genes
    if HAS_INTERVALTREE and config.get('filter_overlaps', True):
        overlapping_genes = set()
        genes_checked = set()
        
        for gene_name, chrom_dict in gene_to_intervals.items():
            if gene_name in genes_checked:
                continue
            genes_checked.add(gene_name)
            
            for chrom, intervals in chrom_dict.items():
                for start, end in intervals:
                    overlaps = chrom_intervals[chrom][start:end]
                    overlapping_gene_names = set(overlap.data for overlap in overlaps)
                    
                    if len(overlapping_gene_names) > 1:
                        overlapping_genes.update(overlapping_gene_names)
        
        with open(gtf_path, 'r') as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                
                fields = line.strip().split('\t')
                if len(fields) < 9:
                    continue
                
                attrs = parse_gtf_attributes(fields[8])
                gene_name = attrs.get('gene_name')
                transcript_id = attrs.get('transcript_id')
                
                if gene_name in overlapping_genes and transcript_id:
                    filtered['overlapping_gene'].add(transcript_id)
    
    # Map transcript names to IDs
    transcript_name_to_id = {}
    with open(gtf_path, 'r') as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:
                continue
            attrs = parse_gtf_attributes(fields[8])
            transcript_id = attrs.get('transcript_id')
            transcript_name = attrs.get('transcript_name')
            if transcript_id and transcript_name:
                transcript_name_to_id[transcript_name] = transcript_id
    
    final_filtered = defaultdict(set)
    for reason, transcript_ids in filtered.items():
        final_filtered[reason].update(transcript_ids)
        for transcript_id in transcript_ids:
            for name, tid in transcript_name_to_id.items():
                if tid == transcript_id:
                    final_filtered[reason].add(name)
    
    print("\nFiltering summary:")
    total_filtered = set()
    for reason, items in final_filtered.items():
        print(f"  {reason}: {len(items):,} transcripts")
        total_filtered.update(items)
    print(f"  Total unique items to filter: {len(total_filtered):,}")
    
    return final_filtered, total_filtered

## Data Loading

In [4]:
def slice_df_by_chrom_and_coords(df, chrom, start, stop, codon_start, codon_stop):
    """Calculate positions relative to start codon"""
    tmp_df = df.filter(
        (pl.col("Chromosome") == chrom) & 
        (pl.col("Start") >= start) & 
        (pl.col("Start") + 1 <= stop)
    )
    positions = tmp_df["Start"].to_list()
    return [pos - codon_stop if pos > codon_stop else pos - codon_start for pos in positions]

def load_parquet_files(directory_path):
    """Load parquet files and infer modification types"""
    directory_path = Path(directory_path)
    
    if not directory_path.exists():
        raise ValueError(f"Directory not found: {directory_path}")
    
    if not directory_path.is_dir():
        raise ValueError(f"Path is not a directory: {directory_path}")
    
    parquet_files = sorted(directory_path.glob('*.parquet'))
    
    if not parquet_files:
        raise ValueError(f"No parquet files found in directory: {directory_path}")
    
    print(f"\nLoading {len(parquet_files)} parquet file(s)...")
    
    code_mappings = {
        '17596': 'Ino',
        '17802': 'Psi',
        '19227': '2OmethylU',
        '19228': '2OmethylC',
        '19229': '2OmethylG',
        '69426': '2OmethylA',
    }
    letter_codes = {'a': 'm6A', 'm': 'm5C'}
    
    df_dict = {}
    
    for parquet_path in parquet_files:
        filename = parquet_path.stem
        
        mod_type = None
        for code, mt in code_mappings.items():
            if code in filename:
                mod_type = mt
                break
        if mod_type is None:
            for letter, mt in letter_codes.items():
                if f'_{letter}_' in filename or filename.endswith(f'_{letter}'):
                    mod_type = mt
                    break
        
        if mod_type is None:
            print(f"  Skipping {parquet_path.name}: could not infer modification type")
            continue
        
        df = pl.read_parquet(parquet_path)
        
        if 'Chromosome' not in df.columns or 'Start' not in df.columns:
            print(f"  Skipping {parquet_path.name}: missing required columns")
            continue
        
        if mod_type in df_dict:
            df_dict[mod_type] = pl.concat([df_dict[mod_type], df])
        else:
            df_dict[mod_type] = df
        
        print(f"  Loaded {parquet_path.name} -> {mod_type}: {len(df):,} rows")
    
    print(f"\nLoaded {len(df_dict)} modification type(s): {list(df_dict.keys())}")
    return df_dict

## GTF Processing Function

In [5]:
def process_gtf_and_calculate_positions(df_dict, gtf_path, config):
    """Process GTF with filtering and calculate positions relative to start codons"""
    print(f"\nParsing GTF file: {gtf_path}")
    
    filtered_transcripts, all_filtered = identify_transcripts_to_filter(gtf_path, config)
    
    exon_num_to_coords = defaultdict(dict)
    start_codon_to_exon_num = defaultdict(int)
    transcript_to_gene = {}
    
    transcripts_loaded = 0
    start_codons_loaded = 0
    transcripts_skipped = 0
    
    with open(gtf_path, 'r') as infile:
        for line in tqdm(infile, desc="Reading GTF"):
            if line[0] == "#":
                continue
            split_line = line.strip().split('\t')
            
            attrs = parse_gtf_attributes(split_line[8])
            transcript_id = attrs.get('transcript_id')
            transcript_name = attrs.get('transcript_name')
            gene_name = attrs.get('gene_name', '')
            
            if transcript_id and transcript_id in all_filtered:
                transcripts_skipped += 1
                continue
            if transcript_name and transcript_name in all_filtered:
                transcripts_skipped += 1
                continue
            
            if transcript_name and gene_name:
                transcript_to_gene[transcript_name] = gene_name
            
            if split_line[2] == "exon":
                iso_name, exon_num = gencode_slop(split_line[8])
                if iso_name and exon_num:
                    if iso_name not in exon_num_to_coords:
                        transcripts_loaded += 1
                    exon_num_to_coords[iso_name][exon_num] = (int(split_line[3]), int(split_line[4]))
            
            if split_line[2] == "start_codon":
                chrom = split_line[0]
                start, stop = (int(split_line[3]), int(split_line[4]))
                iso_name, exon_num = gencode_slop(split_line[8])
                if iso_name and exon_num:
                    start_codon_to_exon_num[(chrom, start, stop, iso_name)] = exon_num
                    start_codons_loaded += 1
    
    print(f"Loaded {transcripts_loaded:,} transcripts (skipped {transcripts_skipped:,})")
    print(f"Found {start_codons_loaded:,} start codons")
    print(f"Unique genes: {len(set(transcript_to_gene.values())):,}")
    
    print("\nFiltering breakdown:")
    for reason, items in filtered_transcripts.items():
        print(f"  {reason}: {len(items):,}")
    
    print("\nCalculating positions relative to start codons...")
    results = {}
    
    for mod_type, df in df_dict.items():
        print(f"  Processing {mod_type}...")
        positions = []
        
        for key in tqdm(start_codon_to_exon_num.keys(), desc=f"    {mod_type}"):
            chrom, start, stop, iso_name = key
            exon_num = start_codon_to_exon_num[key]
            if exon_num not in exon_num_to_coords[iso_name]:
                continue
            slice_start, slice_stop = exon_num_to_coords[iso_name][exon_num]
            positions.extend(slice_df_by_chrom_and_coords(df, chrom, slice_start, slice_stop, start, stop))
        
        results[mod_type] = positions
        print(f"    Found {len(positions):,} positions")
    
    stats = {
        'transcripts_loaded': transcripts_loaded,
        'transcripts_skipped': transcripts_skipped,
        'start_codons_analyzed': start_codons_loaded,
        'unique_genes': len(set(transcript_to_gene.values())),
        'filtering_breakdown': {k: len(v) for k, v in filtered_transcripts.items()}
    }
    
    return results, stats

## Plotting

In [6]:
def get_modification_colors(mod_names):
    """Get colors for modifications"""
    colors = {}
    
    default_colors = {
        'm5C': '#2ca02c',
        'm6A': '#1f77b4',
        'Ino': '#ff7f0e',
        'Psi': '#d62728',
        '2OmethylA': '#9467bd',
        '2OmethylC': '#8c564b',
        '2OmethylG': '#e377c2',
        '2OmethylU': '#7f7f7f',
    }
    
    if HAS_CUSTOM_STYLE:
        try:
            for i, mod_name in enumerate(mod_names):
                color = get_colors(i)
                if isinstance(color, str) and (color.startswith('#') or color in plt.colors.CSS4_COLORS):
                    colors[mod_name] = color
                elif isinstance(color, (list, tuple)) and len(color) >= 3:
                    colors[mod_name] = tuple(float(c) for c in color[:4])
                else:
                    colors[mod_name] = default_colors.get(mod_name, f'C{i}')
        except Exception as e:
            print(f"Could not use custom colors: {e}")
            colors = default_colors
    else:
        colors = default_colors
    
    return colors

def plot_modifications(results, output_path='rna_modifications_plot.pdf', 
                       window_size=200, bin_width=100, y_limit=None,
                       figure_size=(10, 2.5), show_kde=True, y_axis_mode='dynamic',
                       stats=None, save_plot=True):
    """Create figure with modification distributions"""
    
    results_filtered = {k: v for k, v in results.items() if len(v) > 0}
    
    if not results_filtered:
        print("No modifications found to plot!")
        return
    
    colors = get_modification_colors(list(results_filtered.keys()))
    
    n_mods = len(results_filtered)
    fig, axes = plt.subplots(n_mods, 1, 
                             figsize=(figure_size[0], figure_size[1] * n_mods),
                             sharex=True, sharey=(y_axis_mode == 'shared'))
    
    if n_mods == 1:
        axes = [axes]
    
    plotted_ylims = {}
    
    for ax, (mod_name, positions) in zip(axes, results_filtered.items()):
        positions_array = np.array(positions)
        filtered_pos = [x for x in positions_array if -window_size <= x <= window_size]
        
        print(f"{mod_name}: {len(filtered_pos):,} modifications in window")
        
        if len(filtered_pos) > 0:
            color = colors[mod_name]
            
            sns.histplot(
                filtered_pos,
                ax=ax,
                discrete=True,
                color=color,
                label=mod_name,
                kde=show_kde,
                kde_kws={'clip': (-window_size, window_size), 'bw_adjust': 0.5} if show_kde else None,
                stat="density",
                binwidth=bin_width,
                linewidth=0.2,
                element="step",
                fill=False
            )
            
            ax.axvline(x=0, color='black', linestyle='dashdot', linewidth=0.2)
            plotted_ylims[mod_name] = ax.get_ylim()
        
        ax.grid(False)
        ax.legend(loc='upper right')
    
    if y_limit is None and y_axis_mode != 'dynamic':
        if y_axis_mode == 'shared':
            if plotted_ylims:
                max_ylim = max([ylim[1] for ylim in plotted_ylims.values()])
                for ax in axes:
                    ax.set_ylim(0, max_ylim)
        
        elif y_axis_mode == 'grouped':
            high_density_mods = ['m6A', 'Ino', 'Psi']
            high_density_ylims = [plotted_ylims[m][1] for m in high_density_mods if m in plotted_ylims]
            if high_density_ylims:
                high_density_max = max(high_density_ylims)
                for ax, (mod_name, _) in zip(axes, results_filtered.items()):
                    if mod_name in high_density_mods:
                        ax.set_ylim(0, high_density_max)
    
    elif y_limit is not None:
        for ax in axes:
            ax.set_ylim([0, y_limit])
    
    if stats:
        title = (f'RNA Modifications Relative to Start Codons\n'
                f'({stats["unique_genes"]:,} genes, {stats["start_codons_analyzed"]:,} start codons)')
    else:
        title = 'RNA Modifications Relative to Start Codons'
    
    if ax == axes[0]:
        axes[0].set_title(title, fontsize=12, fontweight='bold', pad=10)
    
    axes[-1].set_xlabel('Position relative to start codon (bp)', fontsize=11)
    axes[-1].set_xlim([-window_size, window_size])
    
    plt.tight_layout()
    
    if save_plot:
        output_path = Path(output_path)
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"\nFigure saved to {output_path}")
        
        png_path = output_path.with_suffix('.png')
        fig.savefig(png_path, dpi=300, bbox_inches='tight')
        print(f"PNG version saved to {png_path}")
    else:
        print("\nDisplaying plot (not saved)")
        plt.show()
    
    if save_plot:
        plt.close()

In [7]:
def plot_combined(results, output_path='rna_modifications_combined.pdf',
                 window_size=200, bin_width=100, stats=None, save_plot=True):
    """Create a single plot with all modifications overlaid"""
    
    results_filtered = {k: v for k, v in results.items() if len(v) > 0}
    
    if not results_filtered:
        print("No modifications found to plot!")
        return
    
    colors = get_modification_colors(list(results_filtered.keys()))
    
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    
    for mod_name, positions in results_filtered.items():
        positions_array = np.array(positions)
        filtered_pos = [x for x in positions_array if -window_size <= x <= window_size]
        
        if len(filtered_pos) > 0:
            color = colors[mod_name]
            
            sns.histplot(
                filtered_pos,
                ax=ax,
                discrete=True,
                color=color,
                label=f'{mod_name} (n={len(filtered_pos):,})',
                kde=True,
                kde_kws={'clip': (-window_size, window_size), 'bw_adjust': 0.5},
                stat="density",
                binwidth=bin_width,
                linewidth=0.2,
                element="step",
                fill=False,
                alpha=0.7
            )
    
    ax.axvline(x=0, color='black', linestyle='dashdot', linewidth=0.2)
    
    ax.set_xlabel('Position relative to start codon (bp)', fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    
    if stats:
        title = (f'RNA Modifications Relative to Start Codons\n'
                f'({stats["unique_genes"]:,} genes, {stats["start_codons_analyzed"]:,} start codons)')
    else:
        title = 'RNA Modifications Relative to Start Codons'
    
    ax.set_title(title, fontsize=14, fontweight='bold', pad=10)
    ax.legend(loc='upper right', frameon=True, fontsize=10)
    ax.grid(False)
    ax.set_xlim([-window_size, window_size])
    
    if save_plot:
        output_path = Path(output_path)
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"Combined figure saved to {output_path}")
    else:
        print("Displaying combined plot (not saved)")
        plt.show()
    
    if save_plot:
        plt.close()

## Utility

In [8]:
def print_statistics(results, stats=None):
    """Print summary statistics"""
    print("\n" + "="*60)
    print("MODIFICATION ANALYSIS SUMMARY")
    print("="*60)
    
    if stats:
        print("\nFiltering Statistics:")
        print(f"  Transcripts retained: {stats['transcripts_loaded']:,}")
        print(f"  Transcripts filtered: {stats['transcripts_skipped']:,}")
        print(f"  Start codons analyzed: {stats['start_codons_analyzed']:,}")
        print(f"  Unique genes: {stats['unique_genes']:,}")
        
        if 'filtering_breakdown' in stats:
            print("\nFiltering breakdown:")
            for reason, count in stats['filtering_breakdown'].items():
                print(f"    {reason}: {count:,}")
    
    print("\nModification Counts Near Start Codons:")
    total = 0
    for mod_name, positions in results.items():
        count = len(positions)
        total += count
        print(f"  {mod_name}: {count:,} modifications")
        
        if count > 0:
            pos_array = np.array(positions)
            print(f"    Range: [{pos_array.min()}, {pos_array.max()}] bp")
            print(f"    Within ±100bp: {np.sum(np.abs(pos_array) <= 100):,}")
            print(f"    Within ±50bp: {np.sum(np.abs(pos_array) <= 50):,}")
    
    print(f"\n  Total: {total:,} modifications")
    print("="*60 + "\n")

def save_results(results, output_path, stats=None):
    """Save results to pickle file"""
    output_path = Path(output_path)
    save_dict = {'results': results}
    if stats:
        save_dict['stats'] = stats
    
    with open(output_path, 'wb') as f:
        pickle.dump(save_dict, f)
    print(f"Saved results to {output_path}")

def load_results(pickle_path):
    """Load results from pickle file"""
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)
    print(f"Loaded results from {pickle_path}")
    
    if isinstance(data, dict) and 'results' in data:
        return data['results'], data.get('stats', None)
    else:
        return data, None

## Paths

In [11]:
# Get GTF File if not local:
GTF_FILE_GZ = './gencode.v47.annotation.gtf.gz'

if not os.path.exists(GTF_FILE_GZ):
    print("Downloading GTF file...")
    subprocess.run(['wget', '-q', '-O', GTF_FILE_GZ, 
                    'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.annotation.gtf.gz'])
    print(f"  Downloaded to {GTF_FILE_GZ}")

GTF_FILE = GTF_FILE_GZ[:-3]  # Remove .gz extension

if not os.path.exists(GTF_FILE):
    print("Decompressing GTF file...")
    subprocess.run(['gunzip', '-k', GTF_FILE_GZ])  # -k keeps the original
    print(f"  Decompressed to {GTF_FILE}")

# Set your paths here
PARQUET_DIR = Path('../Exemplar_Data/parquet_files/')  # Directory containing parquet files
GTF_PATH = Path(f'{GTF_FILE}')
OUTPUT_DIR = Path('./')  # Output directory

# Plotting parameters
WINDOW_SIZE = 200  # Window around start codon (bp)
BIN_WIDTH = 100    # Histogram bin width
Y_AXIS_MODE = 'dynamic'  # 'dynamic', 'shared', or 'grouped'
SHOW_KDE = True    # Show KDE overlay
SAVE_PLOTS = False  # Set to False to only display plots without saving

# Filtering configuration
config = {
    'filter_overlaps': True,
    'filter_retained_introns': True,
    'filter_readthrough': True,
    'filter_nmd': True,
    'allowed_gene_biotypes': ['protein_coding'],
    'allowed_transcript_biotypes': ['protein_coding'],
    'filter_non_canonical': False,  # Set to True for canonical only
    'min_transcript_support_level': None  # Set to 1-5 for TSL filtering
}

## Run Analysis

In [None]:
%matplotlib inline

# Load parquet files
df_dict = load_parquet_files(PARQUET_DIR)

# Process GTF and calculate positions
results, stats = process_gtf_and_calculate_positions(df_dict, GTF_PATH, config)

# Save results for future use
save_results(results, OUTPUT_DIR / 'results.pkl', stats)

# Print statistics
print_statistics(results, stats)

# Create main plot
plot_modifications(
    results,
    output_path=OUTPUT_DIR / 'rna_modifications_plot.pdf',
    window_size=WINDOW_SIZE,
    bin_width=BIN_WIDTH,
    show_kde=SHOW_KDE,
    y_axis_mode=Y_AXIS_MODE,
    stats=stats,
    save_plot=SAVE_PLOTS  # Uses the config setting
)

# Create combined plot
plot_combined(
    results,
    output_path=OUTPUT_DIR / 'rna_modifications_combined.pdf',
    window_size=WINDOW_SIZE,
    bin_width=BIN_WIDTH,
    stats=stats,
    save_plot=SAVE_PLOTS  # Uses the config setting
)


Loading 8 parquet file(s)...
  Loaded Dorado_filtered_20_20_17596.parquet -> Ino: 14 rows
  Loaded Dorado_filtered_20_20_17802.parquet -> Psi: 2 rows
  Loaded Dorado_filtered_20_20_19227.parquet -> 2OmethylU: 4 rows
  Loaded Dorado_filtered_20_20_19228.parquet -> 2OmethylC: 1 rows
  Loaded Dorado_filtered_20_20_19229.parquet -> 2OmethylG: 1 rows
  Loaded Dorado_filtered_20_20_69426.parquet -> 2OmethylA: 1 rows
  Loaded Dorado_filtered_20_20_a.parquet -> m6A: 106 rows
  Loaded Dorado_filtered_20_20_m.parquet -> m5C: 44 rows

Loaded 8 modification type(s): ['Ino', 'Psi', '2OmethylU', '2OmethylC', '2OmethylG', '2OmethylA', 'm6A', 'm5C']

Parsing GTF file: gencode.v47.annotation.gtf
Identifying transcripts to filter...


Reading GTF: 4105490it [01:18, 52316.44it/s]


Skipped 705 zero-width intervals
Applying biotype filters...
