In [27]:
from myutils import rmsk
import pyranges as pr
import pandas as pd
import numpy as np

# import rmsk_to_gtf() from the file /netapp/LOG-G4/mcuoco/projects/bulk_te_bench/workflow/make_txome/make_txome.py
from netapp.LOG_G4.mcuoco.projects.bulk_te_bench.workflow.make_txome.make_txome import (
    rmsk_to_gtf,
)

ModuleNotFoundError: No module named 'netapp'

In [6]:
## ADAPTED FROM make_txome.py

"""
DEFINE FUNCTIONS 
"""


def rmsk_to_gtf(rmsk: pd.DataFrame) -> pd.DataFrame:
    "convert RepeatMasker output file to gtf"

    rmsk.rename(
        columns={
            "genoName": "Chromosome",
            "genoStart": "Start",
            "genoEnd": "End",
            "strand": "Strand",
        },
        inplace=True,
    )

    # gene level
    rmsk.Start = rmsk.Start - 1  # make 0 based
    rmsk["gene_id"] = rmsk["repName"]
    rmsk["family_id"] = rmsk["repFamily"]
    rmsk["class_id"] = rmsk["repClass"]
    rmsk["gene_name"] = rmsk["repName"] + ":TE"
    rmsk = rmsk[
        [
            "Chromosome",
            "Start",
            "End",
            "Strand",
            "gene_id",
            "family_id",
            "class_id",
            "gene_name",
        ]
    ].copy()
    rmsk["Source"] = "RepeatMasker"
    rmsk["Feature"] = "gene"
    rmsk["gene_type"] = "retrogene"

    # tx level
    rmsktx = rmsk.copy().reset_index(drop=True)
    rmsktx["Feature"] = "transcript"
    rmsktx["transcript_id"] = (
        rmsktx["gene_id"] + "_dup" + rmsktx.groupby("gene_id").cumcount().astype(str)
    )

    # exon level
    rmskex = rmsktx.copy().reset_index(drop=True)
    rmskex["Feature"] = "exon"
    rmskex["exon_id"] = rmskex.transcript_id
    rmskex["exon_number"] = 1

    return pd.concat([rmsk, rmsktx, rmskex]).sort_values(["Chromosome", "Start", "End"])


def gtf_to_bed(rmsk_file, genes, family):
    "convert gtf to bed"

    bed = rmsk_file[rmsk_file["repName"] == family]

    bed = bed[["genoName", "genoStart", "genoEnd", "repName", "strand"]].copy()
    # bed.to_csv(f"../data/{family}.bed", sep="\t", index=False, header=False)

    # get rid of chr in chromosome names

    truncated_l1 = rmsk_to_gtf(bed[bed["is_full_length"] == False])
    full_l1 = rmsk_to_gtf(bed[bed["is_full_length"] == True])

    ### Full length intronic and intergenic regions
    full_l1["Length_TE"] = full_l1["End"] - full_l1["Start"]

    full_l1 = (
        pr.PyRanges(full_l1, int64=True)
        .join(
            pr.PyRanges(genes, int64=True),
            how="left",
            report_overlap=True,
            suffix="_gene",
        )
        .df
    )
    return full_l1

    full_l1[(full_l1["Overlap"] == full_l1["Length_TE"])].to_csv(
        f"../data/full_intronic_{family}.bed", sep="\t", index=False, header=False
    )
    full_l1[(full_l1["Overlap"] != full_l1["Length_TE"])].to_csv(
        f"../data/full_intergenic_{family}.bed", sep="\t", index=False, header=False
    )

    ### Truncated intronic and intergenic regions
    truncated_l1["Length_TE"] = truncated_l1["End"] - truncated_l1["Start"]
    truncated_l1 = (
        pr.PyRanges(truncated_l1, int64=True)
        .join(
            pr.PyRanges(genes, int64=True),
            how="left",
            report_overlap=True,
            suffix="_gene",
        )
        .df
    )

    truncated_l1[(truncated_l1["Overlap"] == truncated_l1["Length_TE"])].to_csv(
        f"../data/truncate_intronic_{family}.bed", sep="\t", index=False, header=False
    )

    truncated_l1[(truncated_l1["Overlap"] != truncated_l1["Length_TE"])].to_csv(
        f"../data/truncate_intergenic_{family}.bed", sep="\t", index=False, header=False
    )

    return bed

## Read in gene functions

In [3]:
gtf = pr.read_gtf("../data/gencode.v26.basic.annotation.gtf.gz").df
genes = []
for gene_id, df in gtf.groupby("gene_id"):
    assert (
        df["Strand"].unique().shape[0] == 1
    ), f"This gene: {gene_id} is on two strands"

    row = {
        "Chromosome": df["Chromosome"].unique()[0],
        "Start": min(df["Start"]),
        "End": max(df["End"]),
        "Strand": df["Strand"].unique()[0],
        "gene_id": gene_id,
        "gene_name": df["gene_name"].unique()[0],
    }
    genes.append(row)

# additional filtering for future methods
genes = pd.DataFrame(genes)
genes["Length_Gene"] = genes["End"] - genes["Start"]
genes["Chromosome"] = genes["Chromosome"].str.replace(
    "chr", ""
)  # get rid of chr in chromosome names

genes

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,gene_name,Length_Gene
0,X,100627108,100639991,-,ENSG00000000003.14,TSPAN6,12883
1,X,100584801,100599885,+,ENSG00000000005.5,TNMD,15084
2,20,50934866,50958555,-,ENSG00000000419.12,DPM1,23689
3,1,169849630,169894267,-,ENSG00000000457.13,SCYL3,44637
4,1,169662006,169854080,+,ENSG00000000460.16,C1orf112,192074
...,...,...,...,...,...,...,...
58214,1,157203603,157205062,-,ENSG00000284592.1,RP11-71G12.2,1459
58215,11,1880044,1880147,+,ENSG00000284594.1,MIR7847,103
58216,17,75498547,75498628,+,ENSG00000284595.1,MIR6785,81
58217,7,102471468,102471531,+,ENSG00000284596.1,MIR4467,63


In [4]:
genes["Chromosome"] = genes["Chromosome"].str.replace("chr", "")
genes

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,gene_name,Length_Gene
0,X,100627108,100639991,-,ENSG00000000003.14,TSPAN6,12883
1,X,100584801,100599885,+,ENSG00000000005.5,TNMD,15084
2,20,50934866,50958555,-,ENSG00000000419.12,DPM1,23689
3,1,169849630,169894267,-,ENSG00000000457.13,SCYL3,44637
4,1,169662006,169854080,+,ENSG00000000460.16,C1orf112,192074
...,...,...,...,...,...,...,...
58214,1,157203603,157205062,-,ENSG00000284592.1,RP11-71G12.2,1459
58215,11,1880044,1880147,+,ENSG00000284594.1,MIR7847,103
58216,17,75498547,75498628,+,ENSG00000284595.1,MIR6785,81
58217,7,102471468,102471531,+,ENSG00000284596.1,MIR4467,63


In [5]:
rmsk_file = rmsk.read_rmsk("../data/hg38.fa.out.gz")
rmsk_file["genoName"].unique()

rmsk_file["genoName"] = rmsk_file["genoName"].str.replace("chr", "")
rmsk_file = rmsk_file[rmsk_file["genoName"].isin(genes["Chromosome"].unique())]
# keep only standard chromosomes


rmsk_file

Unnamed: 0,swScore,milliDiv,milliDel,milliIns,genoName,genoStart,genoEnd,genoLeft,strand,repName,repStart,repEnd,repLeft,id,repClass,repFamily,length,age,has_promoter,is_full_length
0,463,13.0,6.0,17.0,1,10001,10468,248945954,+,(TAACCC)n,1,471,0,1,Simple_repeat,,470,2.980451,,
1,3612,114.0,215.0,13.0,1,10469,11447,248944975,-,TAR1,399,1712,483,2,Satellite,telo,1229,28.103632,,
2,484,251.0,132.0,0.0,1,11505,11675,248944747,-,L1MC5a,2382,395,199,3,LINE,L1,196,69.454621,False,False
3,239,294.0,19.0,10.0,1,11678,11780,248944642,-,MER5B,74,104,1,4,DNA,hAT-Charlie,103,84.814840,,
4,318,230.0,37.0,0.0,1,15265,15355,248941067,-,MIR3,119,143,49,5,SINE,MIR,94,62.428022,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5520110,256,304.0,19.0,10.0,Y,57214797,57214899,12516,+,MER5B,1,104,74,4607723,DNA,hAT-Charlie,103,88.594475,,
5520111,497,204.0,152.0,36.0,Y,57214902,57215093,12322,+,L1MC5a,199,395,2382,4607724,LINE,L1,196,54.111517,False,False
5520112,6064,115.0,204.0,10.0,Y,57215117,57216403,11012,+,TAR1,469,2068,43,4607725,Satellite,telo,1599,28.371854,,
5520113,744,38.0,11.0,25.0,Y,57216407,57217320,10095,+,(TTAGGG)n,1,924,0,4607726,Simple_repeat,,923,8.862834,,


## Make bed files for LINE-1 families

In [18]:
L1_families = ["L1HS", "L1PA2", "L1PA3", "L1PA6"]

for family in L1_families:
    df = gtf_to_bed(rmsk_file, genes, family)

In [16]:
df

Unnamed: 0,genoName,genoStart,genoEnd,repName,strand
24292,1,12816084,12816355,L1HS,-
27163,1,14257711,14257955,L1HS,+
65478,1,28985834,28986182,L1HS,-
68952,1,30567814,30568750,L1HS,+
78927,1,34566056,34572105,L1HS,-
...,...,...,...,...,...
5510639,Y,20690314,20690858,L1HS,-
5511004,Y,20958692,20959548,L1HS,-
5515241,Y,23956872,23959834,L1HS,-
5517909,Y,25710292,25713238,L1HS,+


In [17]:
rmsk_file[rmsk_file[rmsk_file["repName"] == "L1HS"]["is_full_length"] == True]

  rmsk_file[rmsk_file[rmsk_file['repName']=='L1HS']['is_full_length'] == True]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [15]:
df = gtf_to_bed(rmsk_file, genes, "L1HS")
df

KeyError: 'is_full_length'

In [14]:
pd.read_csv(
    "/netapp/LOG-G4/mcuoco/projects/bulk_te_bench/results/full_txome/txome_rmsk.gtf",
    sep="\t",
    header=None,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,RepeatMasker,exon,26791,27053,.,+,.,"transcript_id ""AluSp_dup0""; gene_id ""AluSp""; g..."
1,chr1,RepeatMasker,exon,31436,31733,.,+,.,"transcript_id ""AluJo_dup0""; gene_id ""AluJo""; g..."
2,chr1,RepeatMasker,exon,39624,39924,.,+,.,"transcript_id ""AluSx_dup0""; gene_id ""AluSx""; g..."
3,chr1,RepeatMasker,exon,51585,51880,.,+,.,"transcript_id ""AluYj4_dup0""; gene_id ""AluYj4"";..."
4,chr1,RepeatMasker,exon,61863,62160,.,-,.,"transcript_id ""AluSc_dup0""; gene_id ""AluSc""; g..."
...,...,...,...,...,...,...,...,...,...
899540,chrY,RepeatMasker,exon,57201488,57201792,.,+,.,"transcript_id ""AluSp_dup45579""; gene_id ""AluSp..."
899541,chrY,RepeatMasker,exon,57202265,57202567,.,+,.,"transcript_id ""AluSz_dup91148""; gene_id ""AluSz..."
899542,chrY,RepeatMasker,exon,57202569,57202876,.,+,.,"transcript_id ""AluSc_dup30553""; gene_id ""AluSc..."
899543,chrY,RepeatMasker,exon,57203561,57203838,.,+,.,"transcript_id ""AluSz_dup91149""; gene_id ""AluSz..."


In [18]:
gtf = pr.read_gtf(
    "/netapp/LOG-G4/mcuoco/projects/bulk_te_bench/results/full_txome/txome_rmsk.gtf"
)
gtf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,transcript_id,gene_id,gene_name,family_id,class_id,gene_type,transcript_type,exon_id,exon_number,contained_in
0,chr1,RepeatMasker,exon,26790,27053,.,+,.,AluSp_dup0,AluSp,AluSp:TE,Alu,SINE,retrogene,retrogene,AluSp_dup0,1.0,
1,chr1,RepeatMasker,exon,31435,31733,.,+,.,AluJo_dup0,AluJo,AluJo:TE,Alu,SINE,retrogene,retrogene,AluJo_dup0,1.0,
2,chr1,RepeatMasker,exon,39623,39924,.,+,.,AluSx_dup0,AluSx,AluSx:TE,Alu,SINE,retrogene,retrogene,AluSx_dup0,1.0,
3,chr1,RepeatMasker,exon,51584,51880,.,+,.,AluYj4_dup0,AluYj4,AluYj4:TE,Alu,SINE,retrogene,retrogene,AluYj4_dup0,1.0,
4,chr1,RepeatMasker,exon,76892,77201,.,+,.,AluSz_dup0,AluSz,AluSz:TE,Alu,SINE,retrogene,retrogene,AluSz_dup0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899540,chrY,RepeatMasker,exon,57178820,57179120,.,-,.,AluJr4_dup11436,AluJr4,AluJr4:TE,Alu,SINE,retrogene,retrogene,AluJr4_dup11436,1.0,
899541,chrY,RepeatMasker,exon,57180797,57181083,.,-,.,AluJb_dup84782,AluJb,AluJb:TE,Alu,SINE,retrogene,retrogene,AluJb_dup84782,1.0,
899542,chrY,RepeatMasker,exon,57181121,57181411,.,-,.,AluSz_dup91146,AluSz,AluSz:TE,Alu,SINE,retrogene,retrogene,AluSz_dup91146,1.0,
899543,chrY,RepeatMasker,exon,57198515,57198819,.,-,.,AluSq2_dup55478,AluSq2,AluSq2:TE,Alu,SINE,retrogene,retrogene,AluSq2_dup55478,1.0,


In [25]:
bed = gtf.df[gtf.df["gene_id"] == "L1HS"]
bed

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,transcript_id,gene_id,gene_name,family_id,class_id,gene_type,transcript_type,exon_id,exon_number,contained_in
17591,chr1,RepeatMasker,exon,85748519,85754548,.,+,.,L1HS_dup1,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup1,1.0,ENSG00000171502.15-I (-)
17612,chr1,RepeatMasker,exon,85927067,85933100,.,+,.,L1HS_dup2,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup2,1.0,ENSG00000171502.15-I (-)
30301,chr1,RepeatMasker,exon,197707714,197713746,.,+,.,L1HS_dup9,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup9,1.0,ENSG00000213047.13-I (-)
48568,chr1,RepeatMasker,exon,34566055,34572105,.,-,.,L1HS_dup0,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup0,1.0,
59192,chr1,RepeatMasker,exon,104770247,104776278,.,-,.,L1HS_dup3,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup3,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883538,chrX,RepeatMasker,exon,66180696,66186728,.,-,.,L1HS_dup153,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup153,1.0,ENSG00000089472.18-I (+)
885360,chrX,RepeatMasker,exon,83059584,83065637,.,-,.,L1HS_dup156,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup156,1.0,
891393,chrX,RepeatMasker,exon,141421202,141427246,.,-,.,L1HS_dup158,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup158,1.0,ENSG00000277215.2-I (+)
892506,chrX,RepeatMasker,exon,155516016,155522048,.,-,.,L1HS_dup161,L1HS,L1HS:TE,L1,LINE,retrogene,retrogene,L1HS_dup161,1.0,"ENSG00000224533.4-I (+),ENSG00000185973.12-I (-)"


In [None]:
bed = gtf[gtf["gene_id"] == "L1HS"]


bed = bed[["genoName", "genoStart", "genoEnd", "repName", "strand"]].copy()
# bed.to_csv(f"../data/{family}.bed", sep="\t", index=False, header=False)


# get rid of chr in chromosome names

truncated_l1 = rmsk_to_gtf(bed[bed["is_full_length"] == False])
full_l1 = rmsk_to_gtf(bed[bed["is_full_length"] == True])

### Full length intronic and intergenic regions
full_l1["Length_TE"] = full_l1["End"] - full_l1["Start"]

full_l1 = (
    pr.PyRanges(full_l1, int64=True)
    .join(
        pr.PyRanges(genes, int64=True),
        how="left",
        report_overlap=True,
        suffix="_gene",
    )
    .df
)
full_l1