In [None]:
import subprocess
import os


def make_fasta(output_dir = "results"):
    """Build a GM12878 (HG001) consensus FASTA on GRCh38 inside ``output_dir``.

    Steps, each run with ``output_dir`` as the working directory:

    1. Download the GIAB HG001 v4.2.1 benchmark VCF and the GRCh38 no-alt
       analysis-set reference, then decompress the reference.
    2. Keep only biallelic SNPs (``bcftools view -v snps -m2 -M2``).
    3. Index the filtered VCF (``bcftools index``).
    4. Apply the variants to the reference with ``bcftools consensus -H 1``
       and write ``GM12878.fasta``.
    5. Delete the downloaded and intermediate files.

    The caller's working directory is left untouched, so relative paths used
    elsewhere in the notebook keep resolving the same way and the function can
    be called more than once.

    Args:
        output_dir: Directory that receives ``GM12878.fasta``; created if
            it does not exist.

    Raises:
        subprocess.CalledProcessError: If any external command exits non-zero.
        FileNotFoundError: If ``wget``, ``gunzip`` or ``bcftools`` is not
            installed.
    """
    os.makedirs(output_dir, exist_ok=True)

    giab_release = "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release"
    variants_vcf = "HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"
    reference_fasta = "GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
    reference_archive = reference_fasta + ".gz"
    snps_vcf = "GM12878_SNPs_biallelic.vcf.gz"
    consensus_fasta = "GM12878.fasta"

    pipeline = [
        ["wget", f"{giab_release}/NA12878_HG001/latest/GRCh38/{variants_vcf}"],
        ["wget", f"{giab_release}/references/GRCh38/{reference_archive}"],
        # Name the archive explicitly instead of relying on gunzip guessing the suffix.
        ["gunzip", reference_archive],
        ["bcftools", "view", "-v", "snps", "-m2", "-M2", variants_vcf, "-Oz", "-o", snps_vcf],
        ["bcftools", "index", snps_vcf],
        # -o replaces the shell redirect, so no shell is needed.
        ["bcftools", "consensus", "-f", reference_fasta, "-H", "1", "-o", consensus_fasta, snps_vcf],
    ]
    for command in pipeline:
        # check=True stops at the first failing step instead of carrying on
        # with missing inputs; cwd keeps every file inside output_dir without
        # changing the process-wide working directory.
        subprocess.run(command, cwd=output_dir, check=True)

    # Only reached when every step succeeded; on failure the intermediates
    # are kept for inspection.
    for leftover in (variants_vcf, reference_fasta, snps_vcf, snps_vcf + ".csi"):
        os.remove(os.path.join(output_dir, leftover))

    print(f"Done! Fasta file saved to {output_dir}/GM12878.fasta")

In [None]:
# Run the download-and-consensus pipeline with the default output directory ("results").
make_fasta()

In [None]:
from gtfparse import read_gtf
import polars as pl
from IPython.display import display

In [None]:
# Load the GENCODE v47 basic annotation.
df = read_gtf("/data/common/genome/gencode.v47.basic.annotation.gtf")

# Keep exon records of protein-coding genes located on chr9.
is_exon = pl.col('feature') == 'exon'
is_protein_coding = pl.col('gene_type') == 'protein_coding'
on_chr9 = pl.col('seqname').is_in(['chr9'])
filtered_df = df.filter(is_exon & is_protein_coding & on_chr9)

# Coordinates become strings, exon numbers become integers.
as_string = filtered_df.with_columns(pl.col('start', 'end').cast(pl.Utf8))
as_num = as_string.with_columns(pl.col('exon_number').cast(pl.Int64))

# Prepend an "index" column holding each row's position.
indexed_df = as_num.with_row_index()

display(indexed_df)

In [None]:
# For every transcript, record the row index of the exon that sorts first
# and the one that sorts last by exon_number.
first_indices, last_indices = [], []

for _, transcript_exons in indexed_df.group_by('transcript_id'):
    ordered_index = transcript_exons.sort('exon_number')['index']
    first_indices.append(ordered_index[0])
    last_indices.append(ordered_index[-1])

# Swap the start of each first exon for "START" and the end of each last
# exon for "END"; every other coordinate is kept as is.
is_first_exon = pl.col("index").is_in(first_indices)
is_last_exon = pl.col("index").is_in(last_indices)

placeholder_df = indexed_df.with_columns(
    pl.when(is_first_exon).then(pl.lit("START")).otherwise(pl.col("start")).alias("start"),
    pl.when(is_last_exon).then(pl.lit("END")).otherwise(pl.col("end")).alias("end"),
)

sorted_df = placeholder_df.sort(['seqname', 'transcript_id', 'exon_number'])

display(sorted_df)

In [None]:
import polars as pl
from IPython.display import display

In [None]:
# Both quantification tables are tab-separated.
tsv_options = {"separator": '\t'}

quant_tsv_1 = pl.read_csv("../ENCFF189XTO.tsv", **tsv_options)
quant_tsv_2 = pl.read_csv("../ENCFF971DVB.tsv", **tsv_options)

display(quant_tsv_1, quant_tsv_2)


In [None]:
# Keep only transcripts present in both tables.
joined_tsv = quant_tsv_1.join(quant_tsv_2, how='inner', on='transcript_ID')
display(joined_tsv)

# Arithmetic mean of the two replicate count columns.
replicate_sum = pl.col('rep1ENCSR368UNC') + pl.col('rep2ENCSR368UNC')
averaged_counts = joined_tsv.with_columns((replicate_sum / 2).alias('transcript_count'))

# Reduce to the transcript identifier, its name and the averaged count.
clean_tsv = averaged_counts.select(
    ["annot_transcript_id", "annot_transcript_name", "transcript_count"]
)

In [3]:
import polars as pl 
from gtfparse import read_gtf

In [6]:
# Parse the GENCODE v44 basic annotation and store it as a parquet file.
gtf_source = "/data/common/genome/gencode.v44.basic.annotation.gtf"
parquet_target = "../reference_files/gencode.v44.basic.annotation.gtf.parquet"

gtf_file = read_gtf(gtf_source)
gtf_file.write_parquet(parquet_target)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'tag', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'hgnc_id', 'havana_gene', 'ont', 'protein_id', 'ccdsid', 'artif_dupl']
