# Simulate a transcriptome with TE transcripts using polyester

1. Generate transcriptome 
 - Spliced and unspliced transcripts from GENCODE annotation
 - L1 transcripts from full-length L1HS-L1PA6 annotations in reference genome
 OR
 - L1 consensus sequences from RepBase (ask mike for file)

2. Simulate reads with polyester (see code from `./mikes_old_notebook`)
3. Quantify reads with salmon
 - build index of transcriptome (use same transcriptome from step 1)
 - quantify reads with salmon

4. Compare with original count matrix
 - figure out how to get read counts from salmon (https://salmon.readthedocs.io/en/latest/file_formats.html)

## Generating transcriptome 

In [6]:
GENOME_FA = "resources/hg38.fa"
TXOME_GTF = "resources/gencode.v43.basic.annotation.gtf"
RMSK = "resources/hg38.fa.out.gz"

In [7]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict
from pyutils.make_txome import make_txome
from pyutils.simulate import run_polyester

### Get L1 transcripts from annotations in reference genome

In [19]:
# Load RepeatMasker output into a dataframe
rmsk_df = pd.read_csv(RMSK, sep="\s+", skiprows=3, header=None, compression="gzip")

In [20]:
rmsk_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,463,1.3,0.6,1.7,chr1,10001,10468,(248945954),+,(TAACCC)n,Simple_repeat,1,471,(0),1
1,3612,11.4,21.5,1.3,chr1,10469,11447,(248944975),C,TAR1,Satellite/telo,(399),1712,483,2
2,484,25.1,13.2,0.0,chr1,11505,11675,(248944747),C,L1MC5a,LINE/L1,(2382),395,199,3
3,239,29.4,1.9,1.0,chr1,11678,11780,(248944642),C,MER5B,DNA/hAT-Charlie,(74),104,1,4
4,318,23.0,3.7,0.0,chr1,15265,15355,(248941067),C,MIR3,SINE/MIR,(119),143,49,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5520113,744,3.8,1.1,2.5,chrY,57216407,57217320,(10095),+,(TTAGGG)n,Simple_repeat,1,924,(0),4607726
5520114,55,10.5,4.3,0.0,chrY,57217321,57217413,(10002),+,(GT)n,Simple_repeat,1,97,(0),4607727
5520115,2240,11.9,0.0,1.0,chrY_KI270740v1_random,229,540,(36700),C,AluY,SINE/Alu,(0),311,3,4607728
5520116,1433,14.1,4.0,0.4,chrY_KI270740v1_random,625,867,(36373),C,L1PA7,LINE/L1,(2),6152,5901,4607729


In [35]:
# Filter for L1HS to L1PA6 annotations
l1_df = rmsk_df[rmsk_df[9].str.contains("L1HS")]
# CHANGE TO l1_df = rmsk_df[rmsk_df[9].str.contains('L1HS|L1PA[1-6]')]
# WHEN AWARE OF LENGTH OF L1PA1-6

In [36]:
l1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
24292,2082,0.4,0.0,0.4,chr1,12816084,12816355,(236140067),C,L1HS,LINE/L1,(0),6155,5914,20430
27163,2320,0.8,0.0,0.0,chr1,14257711,14257955,(234698467),+,L1HS,LINE/L1,5910,6154,(1),22856
65478,3107,0.9,0.0,0.0,chr1,28985834,28986182,(219970240),C,L1HS,LINE/L1,(4),6151,5803,55671
68952,8095,0.5,0.0,0.0,chr1,30567814,30568750,(218387672),+,L1HS,LINE/L1,5203,6139,(16),58634
78927,29267,0.6,0.0,0.0,chr1,34566056,34572105,(214384317),C,L1HS,LINE/L1,(0),6155,124,67007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5510639,4793,0.7,0.0,0.2,chrY,20690314,20690858,(36536557),C,L1HS,LINE/L1,(0),6155,5612,4600409
5511004,7228,1.2,0.0,0.0,chrY,20958692,20959548,(36267867),C,L1HS,LINE/L1,(0),6155,5299,4600658
5515241,19281,0.7,0.0,0.0,chrY,23956872,23959834,(33267581),C,L1HS,LINE/L1,(0),6155,3193,4603915
5517909,19281,0.7,0.0,0.1,chrY,25710292,25713238,(31514177),+,L1HS,LINE/L1,3193,6139,(16),4605941


In [45]:
l1_df.loc[:, 15] = l1_df[6] - l1_df[5]
l1_df = l1_df[l1_df[15] >= 6000]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l1_df.loc[:,15]= l1_df[6] - l1_df[5]


In [46]:
l1_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
78927,29267,0.6,0.0,0.0,chr1,34566056,34572105,(214384317),C,L1HS,LINE/L1,(0),6155,124,67007,6049
152203,28782,0.3,0.1,0.2,chr1,67078892,67084915,(181871507),C,L1HS,LINE/L1,(10),6145,126,128189,6023
160412,28765,0.9,0.0,0.2,chr1,71513699,71519742,(177436680),+,L1HS,LINE/L1,298,6328,(0),134967,6043
176017,28834,0.5,0.0,0.0,chr1,80939204,80945257,(168011165),C,L1HS,LINE/L1,(0),6155,132,147789,6053
181032,27641,0.4,0.0,0.1,chr1,84052390,84058406,(164898016),C,L1HS,LINE/L1,(13),6142,125,152004,6016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5475639,28601,0.6,0.0,0.0,chrX,155516017,155522048,(518847),C,L1HS,LINE/L1,(0),6155,124,4573434,6031
5485359,28376,0.7,0.3,0.1,chrY,3443551,3449565,(53777850),+,L1HS,LINE/L1,125,6155,(0),4580303,6014
5487522,28420,0.7,0.1,0.0,chrY,4948914,4954938,(52272477),C,L1HS,LINE/L1,(0),6155,127,4581871,6024
5488398,28664,0.4,0.0,0.0,chrY,5606145,5612199,(51615216),C,L1HS,LINE/L1,(0),6155,124,4582621,6054


In [48]:
# Read the reference genome
genome = SeqIO.to_dict(SeqIO.parse(GENOME_FA, "fasta"))

In [49]:
# Extract L1 transcripts based on the coordinates
l1_sequences = {}
for index, row in l1_df.iterrows():
    chrom = row[4]
    start = row[5]
    end = row[6]
    strand = row[8]
    l1_id = f"L1_{chrom}_{start}_{end}"
    l1_sequences[l1_id] = genome[chrom].seq[start:end]
    if strand == "C":
        l1_sequences[l1_id] = l1_sequences[l1_id].reverse_complement()

# You can now work with `l1_sequences`, which is a dictionary with keys being the L1 annotation IDs and values being the L1 sequences.

In [50]:
l1_sequences

{'L1_chr1_34566056_34572105': Seq('gggggaggagccaagatggccgaataggaacagctccggtctacagctcccagc...aaa'),
 'L1_chr1_67078892_67084915': Seq('gggaggagccaagatggccgaataggaacagctccggtctacagctcccagcgt...taa'),
 'L1_chr1_71513699_71519742': Seq('aggaggagccaagatggccgaataggaacagctccggtgaacagctccggtcta...aaa'),
 'L1_chr1_80939204_80945257': Seq('agccaagatggccgaataggaacagctctggtctacagctcccagcgtgagcga...aaa'),
 'L1_chr1_84052390_84058406': Seq('ggggaggagccaagatggccgaataggaacagctccggtctacagctcccagcg...taa'),
 'L1_chr1_85748520_85754548': Seq('agggaggagccaagatggccgaataggaacagctccggtctacagctcccagcg...aaa'),
 'L1_chr1_85927068_85933100': Seq('gaggaggagccaagacggccgaataggaacagctccggtctacagctcccagcg...aaa'),
 'L1_chr1_86679081_86685111': Seq('ggggaggagccaagatggccgaataggaacagctccggtctacagctcccagcg...aaa'),
 'L1_chr1_104770248_104776278': Seq('ggcggaggagccaagatggccgaataggaacagctccggtctacagctcccagc...aaa'),
 'L1_chr1_104843834_104849864': Seq('ggggaggagccaagatggccgaataggaacagctccggtctacagctcccagcg...aaa'),
 'L1_c

In [None]:
# TODO
# - put them in fasta file

### Generating transcriptome
# - use the code from make_txome to make a gtf file with the L1 annotations