In [None]:
import subprocess
import os


def make_fasta(output_dir = "results"):
    """Build a GM12878 consensus FASTA from the GIAB HG001 benchmark variants.

    Steps, all executed inside ``output_dir``:

    1. download the HG001 GRCh38 benchmark VCF and the GRCh38 no-alt
       analysis-set reference (the reference is gunzipped after download);
    2. keep only biallelic SNPs (``bcftools view -v snps -m2 -M2``) and index
       the resulting VCF;
    3. apply those SNPs to the reference with ``bcftools consensus -H 1`` and
       write the result to ``GM12878.fasta``;
    4. delete the downloaded and intermediate files.

    The commands run with ``cwd=output_dir`` rather than ``os.chdir`` so the
    caller's working directory is left untouched (calling the function twice
    no longer nests ``results/results``).

    Args:
        output_dir: Directory that receives ``GM12878.fasta``. It is created
            if it does not exist.

    Raises:
        subprocess.CalledProcessError: If any download or bcftools command
            exits with a non-zero status. Later steps are not attempted.
        FileNotFoundError: If an expected intermediate file is missing during
            clean-up.
    """
    os.makedirs(output_dir, exist_ok=True)

    variants_vcf = "HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"
    reference_fasta = "GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
    biallelic_vcf = "GM12878_SNPs_biallelic.vcf.gz"
    giab_release = "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release"

    # The commands rely on shell features (`&&`, `>`), hence shell=True.
    # They contain only fixed strings, no caller-supplied text.
    commands = [
        f"wget {giab_release}/NA12878_HG001/latest/GRCh38/{variants_vcf}",
        f"wget {giab_release}/references/GRCh38/{reference_fasta}.gz && gunzip {reference_fasta}.gz",
        f"bcftools view -v snps -m2 -M2 {variants_vcf} -Oz -o {biallelic_vcf}",
        f"bcftools index {biallelic_vcf}",
        f"bcftools consensus -f {reference_fasta} -H 1 {biallelic_vcf} > GM12878.fasta",
    ]
    for command in commands:
        # check=True stops the pipeline at the first failing step instead of
        # carrying on with missing or truncated inputs.
        subprocess.run(command, shell=True, check=True, cwd=output_dir)

    # Remove everything except the final consensus FASTA.
    for leftover in (variants_vcf, reference_fasta, biallelic_vcf, f"{biallelic_vcf}.csi"):
        os.remove(os.path.join(output_dir, leftover))

    print(f"Done! Fasta file saved to {output_dir}/GM12878.fasta")

In [None]:
# Run the pipeline with the default output directory ("results").
# This downloads data with wget and shells out to bcftools.
make_fasta()

In [None]:
from gtfparse import read_gtf
import polars as pl
from IPython.display import display

In [None]:
# Parse the GENCODE v47 basic annotation and keep only protein-coding exon
# records located on chr9.
df = read_gtf("/data/common/genome/gencode.v47.basic.annotation.gtf")
is_exon = pl.col('feature') == 'exon'
is_protein_coding = pl.col('gene_type') == 'protein_coding'
on_chr9 = pl.col('seqname').is_in(['chr9'])
filtered_df = df.filter(is_exon & is_protein_coding & on_chr9)

# Coordinates are cast to strings and exon_number to a 64-bit integer.
coordinates_as_text = [pl.col(name).cast(pl.Utf8) for name in ('start', 'end')]
as_string = filtered_df.with_columns(coordinates_as_text)
as_num = as_string.with_columns(pl.col('exon_number').cast(pl.Int64))

# with_row_index() prepends an "index" column numbering the rows.
indexed_df = as_num.with_row_index()

display(indexed_df)

In [None]:
# Collect, per transcript, the row index of the lowest- and highest-numbered exon
first_indices = []
last_indices = []

for _, transcript_exons in indexed_df.group_by('transcript_id'):
    ordered_index = transcript_exons.sort('exon_number')['index']
    first_indices.append(ordered_index[0])
    last_indices.append(ordered_index[-1])

# "start" of each first exon becomes the literal "START"; every other row keeps its value
start_expr = (
    pl.when(pl.col("index").is_in(first_indices))
    .then(pl.lit("START"))
    .otherwise(pl.col("start"))
    .alias("start")
)

# "end" of each last exon becomes the literal "END"; every other row keeps its value
end_expr = (
    pl.when(pl.col("index").is_in(last_indices))
    .then(pl.lit("END"))
    .otherwise(pl.col("end"))
    .alias("end")
)

placeholder_df = indexed_df.with_columns([start_expr, end_expr])

sorted_df = placeholder_df.sort('seqname', 'transcript_id', 'exon_number')

display(sorted_df)

In [None]:
import polars as pl
from IPython.display import display

In [None]:
# Read the two tab-separated quantification tables, then render both.
quant_tsv_1, quant_tsv_2 = (
    pl.read_csv(tsv_path, separator='\t')
    for tsv_path in ("../ENCFF189XTO.tsv", "../ENCFF971DVB.tsv")
)
display(quant_tsv_1, quant_tsv_2)


In [None]:
# Keep only the rows whose transcript_ID occurs in both tables.
joined_tsv = quant_tsv_1.join(quant_tsv_2, on='transcript_ID', how='inner')
display(joined_tsv)

# transcript_count is the arithmetic mean of the rep1 and rep2 columns.
replicate_sum = pl.col('rep1ENCSR368UNC') + pl.col('rep2ENCSR368UNC')
averaged_counts = joined_tsv.with_columns((replicate_sum / 2).alias('transcript_count'))
clean_tsv = averaged_counts.select(
    ["annot_transcript_id", "annot_transcript_name", "transcript_count"]
)

In [None]:
import polars as pl 
from gtfparse import read_gtf

In [None]:
# Parse the GENCODE v44 basic GTF and write the parsed table out as Parquet.
gtf_source = "/data/common/genome/gencode.v44.basic.annotation.gtf"
parquet_target = "../reference_files/gencode.v44.basic.annotation.gtf.parquet"

gtf_file = read_gtf(gtf_source)
gtf_file.write_parquet(parquet_target)

In [None]:
import os
# Switch the working directory to the benchmark checkout; the os.getcwd()-based
# path built below is resolved from here.
os.chdir("/zata/zippy/ramirezc/splice-model-benchmark")

import sys
# Make the SpliceTransformer sources under <cwd>/reference_files importable.
# os.getcwd has to be *called*: formatting the bare function object produced
# the bogus path "<built-in function getcwd>/reference_files/SpliceTransformer".
splice_transformer_path = os.path.join(os.getcwd(), 'reference_files', 'SpliceTransformer')
sys.path.append(splice_transformer_path)

import pandas as pd
import numpy as np
from pyfaidx import Fasta
import argparse
import vcf as pyvcf
from pyensembl import Genome
import tqdm
import os
from sptransformer import Annotator
import torch

In [None]:
annotator = Annotator()
gtf = annotator.gtf

tis_names = [
    'Adipose Tissue', 'Blood', 'Blood Vessel', 'Brain', 'Colon', 'Heart', 'Kidney',
    'Liver', 'Lung', 'Muscle', 'Nerve', 'Small Intestine', 'Skin', 'Spleen', 'Stomach',
]

# Toy input: a 10-base sequence with 4000 'N' characters on either side.
flank = 'N' * 4000
example_seq = flank + 'ACGTAGGGCG' + flank

# One-hot encode with the model's own encoder and move the result to the model's device.
encoded_seq = annotator.model.one_hot_encode(example_seq)
encoded_tensor = torch.tensor(encoded_seq).to(annotator.model.device)
print(encoded_tensor.shape)

# step() takes an encoded sequence shaped (Batch, 4, Length), so add a batch
# axis, convert to float and swap the last two axes to get (1, 4, Length).
input_seq = encoded_tensor.unsqueeze(0).float().transpose(1, 2)
output = annotator.model.step(input_seq)
print(output.shape)

In [None]:
import torch

# Rewrite the checkpoint so that state-dict keys containing
# "attn.pos_emb.weights_" use "attn.pos_emb.weights." instead, then save a copy.
old_fragment = "attn.pos_emb.weights_"
new_fragment = "attn.pos_emb.weights."

save_path = 'model/weights/SpTransformer_pytorch.ckpt'
save_dict = torch.load(save_path, map_location='cpu')

# str.replace leaves keys without the fragment unchanged, so every entry is
# carried over and only the matching keys are renamed.
new_state_dict = {
    key.replace(old_fragment, new_fragment): value
    for key, value in save_dict["state_dict"].items()
}
save_dict["state_dict"] = new_state_dict

new_save_path = 'model/weights/SpTransformer_pytorch_fixed.ckpt'
torch.save(save_dict, new_save_path)

print(f"Modified checkpoint saved to {new_save_path}")

In [None]:
import torch
from pangolin.model import *
import os
# Relative model paths below are resolved from the benchmark checkout.
os.chdir("/zata/zippy/ramirezc/splice-model-benchmark")

model_path = "reference_files/pangolin/models/final.{model_index}.{model_num}.3"

# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell also runs on machines without CUDA. map_location makes torch.load place
# the stored tensors on that same device.
pangolin_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# For each model number, load the five weight files final.1.<num>.3 ... final.5.<num>.3.
model_nums = [0, 2, 4, 6]
models = []
for i in model_nums:
    for j in range(1, 6):
        model = Pangolin(L, W, AR)  # L, W, AR come from `from pangolin.model import *`
        weights = torch.load(
            model_path.format(model_index=j, model_num=i),
            map_location=pangolin_device,
        )
        model.load_state_dict(weights)
        model.to(pangolin_device)
        model.eval()  # inference mode
        models.append(model)

print(models)


In [1]:
import zarr
import os

In [24]:
# Open three groups of the Pangolin prediction store in read-only mode.
# (The spelling "predicitons" is kept because other cells use this name.)
predictions_store = "/zata/zippy/ramirezc/splice-model-benchmark/results/pangolin_predictions.zarr"

splice_site_predicitons = zarr.open_group(
    store=f"{predictions_store}/splice_site_predictions", mode="r"
)
splice_site_truth = zarr.open_group(
    store=f"{predictions_store}/splice_site_truth", mode="r"
)
splice_sites = zarr.open_group(
    store=f"{predictions_store}/splice_sites", mode="r"
)

In [31]:
# Print the stored prediction, then the stored truth value, for chr1 at index 11963635.
site_index = 11963635
for per_chrom_group in (splice_site_predicitons, splice_site_truth):
    print(per_chrom_group["chr1"][site_index])

0.9036884307861328
1


In [14]:
# Show the first 200 entries of the 'metadata' array in the splice_sites group.
print(splice_sites['metadata'][:200])

[('chr1',  53945928, '-') ('chr1',  53939984, '-')
 ('chr1',  53940096, '-') ('chr1',  53930038, '-')
 ('chr1',  53930144, '-') ('chr1',  53928371, '-')
 ('chr1',  53928439, '-') ('chr1',  53923903, '-')
 ('chr1',  53923946, '-') ('chr1',  53921560, '-')
 ('chr1',  11934854, '+') ('chr1',  11947975, '+')
 ('chr1',  11948066, '+') ('chr1',  11949772, '+')
 ('chr1',  11949905, '+') ('chr1',  11950356, '+')
 ('chr1',  11950519, '+') ('chr1',  11952622, '+')
 ('chr1',  11952734, '+') ('chr1',  11954829, '+')
 ('chr1',  11954892, '+') ('chr1',  11956916, '+')
 ('chr1',  11957013, '+') ('chr1',  11957841, '+')
 ('chr1',  11957942, '+') ('chr1',  11958515, '+')
 ('chr1',  11958646, '+') ('chr1',  11960645, '+')
 ('chr1',  11960766, '+') ('chr1',  11963531, '+')
 ('chr1',  11963635, '+') ('chr1',  11964174, '+')
 ('chr1',  11964299, '+') ('chr1',  11964643, '+')
 ('chr1',  11964784, '+') ('chr1',  11965479, '+')
 ('chr1',  11965592, '+') ('chr1',  11966250, '+')
 ('chr1',  11966315, '+') ('chr

In [1]:
from nucleotide_transformer.pretrained import get_pretrained_segment_nt_model
import haiku as hk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the pretrained "segment_nt" model; max_positions is 5000 + 1.
segment_nt_bundle = get_pretrained_segment_nt_model(
    model_name="segment_nt",
    max_positions=5000 + 1,
)
parameters, forward_fn, tokenizer, config = segment_nt_bundle

# Wrap the forward function with Haiku's transform.
forward_fn = hk.transform(forward_fn)

# Positions of the two splice-site features within the model's feature list.
feature_names = config.features
donor_idx = feature_names.index('splice_donor')
acceptor_idx = feature_names.index('splice_acceptor')
print(feature_names)

Downloading model's hyperparameters json file...
Downloaded model's hyperparameters.
Downloading model's weights...
Downloaded model's weights...
['protein_coding_gene', 'lncRNA', 'exon', 'intron', 'splice_donor', 'splice_acceptor', '5UTR', '3UTR', 'CTCF-bound', 'polyA_signal', 'enhancer_Tissue_specific', 'enhancer_Tissue_invariant', 'promoter_Tissue_specific', 'promoter_Tissue_invariant']


In [14]:
import os
# Switch the working directory to the benchmark checkout before the imports
# below. NOTE(review): presumably this is what makes `models.spliceai`
# importable from the repository root; confirm.
os.chdir("/zata/zippy/ramirezc/splice-model-benchmark")

import polars as pl
import numpy as np
from models.spliceai import SpliceAIEvaluator
from IPython.display import display

In [7]:
# range(1, 11, 2) yields 1, 3, 5, 7, 9, so this prints the odd chromosomes chr1..chr9.
print([f"chr{i}" for i in range(1, 11, 2)])

['chr1', 'chr3', 'chr5', 'chr7', 'chr9']


In [4]:
# Create the SpliceAI evaluator and display the table returned by its GENCODE filter step.
evaluator = SpliceAIEvaluator()
# NOTE(review): _filter_gencode is underscore-prefixed, i.e. presumably a
# private helper; calling it directly is for inspection only.
sorted_df = evaluator._filter_gencode()
display(sorted_df)

Filtering GENCODE GTF...
Number of expressed transcripts: 10660
Number of transcripts per chromsome: shape: (5, 2)
┌─────────┬───────┐
│ seqname ┆ count │
│ ---     ┆ ---   │
│ cat     ┆ u32   │
╞═════════╪═══════╡
│ chr1    ┆ 281   │
│ chr3    ┆ 136   │
│ chr5    ┆ 113   │
│ chr7    ┆ 129   │
│ chr9    ┆ 98    │
└─────────┴───────┘
Done


index,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,level,havana_gene,transcript_id,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
u32,cat,cat,cat,str,str,f32,cat,i64,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str
1043,"""chr1""","""HAVANA""","""exon""","""EXCLUDE""","""53945929""",,"""-""",0,"""ENSG00000081870.11""","""protein_coding""","""HSPB11""","""2""","""OTTHUMG00000008408.4""","""ENST00000194214.9""","""protein_coding""","""HSPB11-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000023114.1""",1,"""ENSE00001841796.1""","""""","""ENSP00000194214.5""","""CCDS41341.1"""
1044,"""chr1""","""HAVANA""","""exon""","""53939985""","""53940097""",,"""-""",0,"""ENSG00000081870.11""","""protein_coding""","""HSPB11""","""2""","""OTTHUMG00000008408.4""","""ENST00000194214.9""","""protein_coding""","""HSPB11-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000023114.1""",2,"""ENSE00001334213.1""","""""","""ENSP00000194214.5""","""CCDS41341.1"""
1045,"""chr1""","""HAVANA""","""exon""","""53930039""","""53930145""",,"""-""",0,"""ENSG00000081870.11""","""protein_coding""","""HSPB11""","""2""","""OTTHUMG00000008408.4""","""ENST00000194214.9""","""protein_coding""","""HSPB11-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000023114.1""",3,"""ENSE00000772733.1""","""""","""ENSP00000194214.5""","""CCDS41341.1"""
1046,"""chr1""","""HAVANA""","""exon""","""53928372""","""53928440""",,"""-""",0,"""ENSG00000081870.11""","""protein_coding""","""HSPB11""","""2""","""OTTHUMG00000008408.4""","""ENST00000194214.9""","""protein_coding""","""HSPB11-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000023114.1""",4,"""ENSE00001242678.2""","""""","""ENSP00000194214.5""","""CCDS41341.1"""
1047,"""chr1""","""HAVANA""","""exon""","""53923904""","""53923947""",,"""-""",0,"""ENSG00000081870.11""","""protein_coding""","""HSPB11""","""2""","""OTTHUMG00000008408.4""","""ENST00000194214.9""","""protein_coding""","""HSPB11-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000023114.1""",5,"""ENSE00003618665.1""","""""","""ENSP00000194214.5""","""CCDS41341.1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
4821,"""chr9""","""HAVANA""","""exon""","""69046385""","""69046482""",,"""+""",0,"""ENSG00000165060.12""","""protein_coding""","""FXN""","""2""","""OTTHUMG00000019977.11""","""ENST00000643639.1""","""protein_coding""","""FXN-207""","""""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000052568.4""",2,"""ENSE00003532023.1""","""""","""ENSP00000496143.1""","""CCDS6626.1"""
4822,"""chr9""","""HAVANA""","""exon""","""69053140""","""69053260""",,"""+""",0,"""ENSG00000165060.12""","""protein_coding""","""FXN""","""2""","""OTTHUMG00000019977.11""","""ENST00000643639.1""","""protein_coding""","""FXN-207""","""""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000052568.4""",3,"""ENSE00001089856.1""","""""","""ENSP00000496143.1""","""CCDS6626.1"""
4823,"""chr9""","""HAVANA""","""exon""","""69064938""","""69065035""",,"""+""",0,"""ENSG00000165060.12""","""protein_coding""","""FXN""","""2""","""OTTHUMG00000019977.11""","""ENST00000643639.1""","""protein_coding""","""FXN-207""","""""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000052568.4""",4,"""ENSE00001089858.1""","""""","""ENSP00000496143.1""","""CCDS6626.1"""
4824,"""chr9""","""HAVANA""","""exon""","""69072612""","""EXCLUDE""",,"""+""",0,"""ENSG00000165060.12""","""protein_coding""","""FXN""","""2""","""OTTHUMG00000019977.11""","""ENST00000643639.1""","""protein_coding""","""FXN-207""","""""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000052568.4""",5,"""ENSE00001887404.2""","""""","""ENSP00000496143.1""","""CCDS6626.1"""


In [12]:
# Load both quantification tables and keep rows whose transcript_ID occurs in both.
quant_tsv_1, quant_tsv_2 = (
    pl.read_csv(tsv_path, separator='\t')
    for tsv_path in (
        "reference_files/transcript_quantifications_rep1.tsv",
        "reference_files/transcript_quantifications_rep2.tsv",
    )
)
joined_tsv = quant_tsv_1.join(quant_tsv_2, on='transcript_ID', how='inner')

# transcript_count is the mean of the rep1 and rep2 columns.
mean_count = (pl.col('rep1ENCSR368UNC') + pl.col('rep2ENCSR368UNC')) / 2
averaged_counts = joined_tsv.with_columns(mean_count.alias('transcript_count'))
clean_tsv = averaged_counts.select(
    ["annot_transcript_id", "annot_transcript_name", "transcript_count"]
)

# Transcripts with a mean count of at least 2.0 are treated as expressed.
is_expressed = pl.col('transcript_count') >= 2.0
expressed_transcripts = clean_tsv.filter(is_expressed).get_column('annot_transcript_id').to_list()

# Keep transcript-level, protein-coding records on chr1/3/5/7/9 whose
# transcript_id is in the expressed set.
gtf = pl.read_parquet("reference_files/gencode.v29.primary_assembly.annotation_UCSC_names.gtf.parquet")
odd_chromosomes = [f"chr{i}" for i in range(1, 11, 2)]
filtered_df = gtf.filter(
    (pl.col('feature') == 'transcript')
    & (pl.col('gene_type') == 'protein_coding')
    & (pl.col('seqname').is_in(odd_chromosomes))
    & (pl.col('transcript_id').is_in(expressed_transcripts))
)
display(filtered_df)

seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,level,havana_gene,transcript_id,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
cat,cat,cat,i64,i64,f32,cat,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""chr1""","""HAVANA""","""transcript""",1013423,1014540,,"""+""",0,"""ENSG00000187608.9""","""protein_coding""","""ISG15""","""2""","""OTTHUMG00000040777.4""","""ENST00000379389.4""","""protein_coding""","""ISG15-201""","""1""","""basic,appris_principal_2,CCDS""","""OTTHUMT00000097989.1""","""""","""""","""""","""ENSP00000368699.4""","""CCDS6.1"""
"""chr1""","""HAVANA""","""transcript""",1216908,1232001,,"""-""",0,"""ENSG00000078808.16""","""protein_coding""","""SDF4""","""2""","""OTTHUMG00000001812.6""","""ENST00000360001.10""","""protein_coding""","""SDF4-202""","""1""","""basic,appris_alternative_2,CCD…","""OTTHUMT00000005064.1""","""""","""""","""""","""ENSP00000353094.6""","""CCDS30553.1"""
"""chr1""","""HAVANA""","""transcript""",1324767,1328897,,"""+""",0,"""ENSG00000224051.6""","""protein_coding""","""CPTP""","""2""","""OTTHUMG00000003171.3""","""ENST00000343938.8""","""protein_coding""","""CPTP-201""","""2""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000008742.1""","""""","""""","""""","""ENSP00000343890.4""","""CCDS30555.1"""
"""chr1""","""HAVANA""","""transcript""",1373730,1375157,,"""-""",0,"""ENSG00000175756.13""","""protein_coding""","""AURKAIP1""","""2""","""OTTHUMG00000001413.3""","""ENST00000338370.7""","""protein_coding""","""AURKAIP1-203""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000008273.1""","""""","""""","""""","""ENSP00000342676.3""","""CCDS25.1"""
"""chr1""","""HAVANA""","""transcript""",1373730,1375495,,"""-""",0,"""ENSG00000175756.13""","""protein_coding""","""AURKAIP1""","""2""","""OTTHUMG00000001413.3""","""ENST00000338338.9""","""protein_coding""","""AURKAIP1-202""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000004082.2""","""""","""""","""""","""ENSP00000340656.5""","""CCDS25.1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr9""","""HAVANA""","""transcript""",137217452,137219361,,"""+""",0,"""ENSG00000284976.1""","""protein_coding""","""BX255925.3""","""2""","""OTTHUMG00000192535.1""","""ENST00000645271.1""","""protein_coding""","""BX255925.3-201""","""""","""CAGE_supported_TSS,basic,appri…","""OTTHUMT00000495558.1""","""""","""""","""""","""ENSP00000494172.1""",""""""
"""chr9""","""HAVANA""","""transcript""",137241213,137243707,,"""+""",0,"""ENSG00000188229.5""","""protein_coding""","""TUBB4B""","""2""","""OTTHUMG00000131783.2""","""ENST00000340384.4""","""protein_coding""","""TUBB4B-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000254715.1""","""""","""""","""""","""ENSP00000341289.4""","""CCDS7039.1"""
"""chr9""","""HAVANA""","""transcript""",137255173,137273546,,"""+""",0,"""ENSG00000188986.6""","""protein_coding""","""NELFB""","""2""","""OTTHUMG00000131778.2""","""ENST00000343053.4""","""protein_coding""","""NELFB-201""","""1""","""non_ATG_start,basic,appris_alt…","""OTTHUMT00000254710.2""","""""","""""","""""","""ENSP00000339495.5""","""CCDS7040.2"""
"""chr9""","""HAVANA""","""transcript""",137551199,137552555,,"""+""",0,"""ENSG00000182154.7""","""protein_coding""","""MRPL41""","""2""","""OTTHUMG00000020987.1""","""ENST00000371443.5""","""protein_coding""","""MRPL41-201""","""1""","""basic,appris_principal_1,CCDS""","""OTTHUMT00000055327.1""","""""","""""","""""","""ENSP00000360498.5""","""CCDS7046.1"""


In [23]:
# Per-row difference between the "end" and "start" columns of the filtered transcripts.
# NOTE(review): GTF coordinates are normally 1-based and inclusive, in which
# case the span in bases would be end - start + 1; confirm before relying on
# the exact value.
transcript_spans = filtered_df['end'].to_numpy() - filtered_df['start'].to_numpy()
# The statistic is np.median, so the label says "Median" (it used to say "Average").
print(f"Median length of transcripts: {np.median(transcript_spans)}")

Average length of transcripts: 12760.0
