# Simulate a transcriptome with TE transcripts using polyester

1. Generate transcriptome 
 - Spliced and unspliced transcripts from GENCODE annotation
 - L1 transcripts from full-length L1HS-L1PA6 annotations in the reference genome
 OR
 - L1 consensus sequences from RepBase (ask Mike for the file)

2. Simulate reads with polyester (see code from `./mikes_old_notebook`)
3. Quantify reads with salmon
 - build index of transcriptome (use same transcriptome from step 1)
 - quantify reads with salmon

4. Compare with original count matrix
 - figure out how to get read counts from salmon (https://salmon.readthedocs.io/en/latest/file_formats.html)

In [1]:
import subprocess
from collections import defaultdict

import pandas as pd
import pyranges as pr

from src.make_txome import make_txome
from src.simulate import run_polyester

## Generating transcriptome 

### Get L1 transcripts from annotations in reference genome

In [3]:
# Extract full-length L1HS loci from the parsed RepeatMasker table, write them
# to a BED file, then pull their sequences out of the genome fasta.

# NOTE(review): the rmsk table is read from "resources/" while every other path
# in this notebook uses "../resources/" -- confirm the intended working directory.
RMSK_TSV = "resources/hg38.rmsk.tsv"
L1_BED = "../resources/hg38_FL_L1HS.bed"
GENOME_FA = "../resources/hg38.fa"
L1_FA = "../resources/hg38_FL_L1HS.fa"
MIN_FL_LENGTH = 6000  # elements must be strictly longer than this to count as full-length

# read parsed rmsk file
# NOTE: ignore has_promoter column for now, not sure if it is accurate
rmsk = pd.read_csv(RMSK_TSV, sep="\t")

# Keep only L1HS elements above the length cutoff.
# NOTE(review): the notebook outline mentions L1HS-L1PA6, but only L1HS is kept here.
rmsk = rmsk[(rmsk["repName"] == "L1HS") & (rmsk["length"] > MIN_FL_LENGTH)]

# Rename the coordinate columns to the names PyRanges requires.
rmsk = rmsk[["genoName", "genoStart", "genoEnd", "strand"]].rename(
    columns={
        "genoName": "Chromosome",
        "genoStart": "Start",
        "genoEnd": "End",
        "strand": "Strand",
    }
)

# save as bedfile
pr.PyRanges(rmsk).to_bed(L1_BED)

# use bedfile to extract sequences from fasta, save to new fasta
# (-s makes bedtools reverse-complement features on the minus strand).
# An argument list is used instead of a `!` shell line so that paths are passed
# verbatim and a failing bedtools run stops the notebook instead of silently
# leaving a missing or stale fasta for the next cell.
getfasta = subprocess.run(
    ["bedtools", "getfasta", "-s", "-fi", GENOME_FA, "-bed", L1_BED, "-fo", L1_FA],
    capture_output=True,
    text=True,
)
if getfasta.stderr:
    # Surface bedtools warnings in the cell output, as the `!` magic did.
    print(getfasta.stderr)
if getfasta.returncode != 0:
    raise RuntimeError(
        f"bedtools getfasta exited with status {getfasta.returncode}; {L1_FA} was not written correctly"
    )

In [4]:
# Build the transcriptome, restricted to chr22.
# make_txome (imported from src.make_txome) is used here to make a GTF that
# also carries the L1 annotations; it is given the genome fasta, the GENCODE
# annotation and the L1 fasta.
TXOME_GTF = "../resources/gencode.v44.primary_assembly.basic.annotation.gtf"
make_txome(
    "chr22_txome",
    GENOME_FA,
    TXOME_GTF,
    L1_FA,
    chromosome="chr22",
)

INFO:root:Using bedtools from /logg/LOG-G4/mcuoco/projects/salmonTE_testing/.conda/bin/bedtools
INFO:root:Chromosome chr22 fasta written to /iblm/netapp/data4/mcuoco/tmp/tmpdjn2n4pw.fa
INFO:root:Chromosome chr22 gtf written to /iblm/netapp/data4/mcuoco/tmp/tmpxodq6lj1.gtf
INFO:root:Saving spliceu txome to new_data


In [None]:
# TODO
# - check if the above worked and makes sense
# - maybe redo with the consensus files instead
# - add other l1 files