In [1]:
from myutils import rmsk
import pyranges as pr
import pandas as pd
import numpy as np

# Creating BED files for LINE-1 families

In [17]:
## ADAPTED FROM make_txome.py

"""
DEFINE FUNCTIONS 
"""


def rmsk_to_gtf(rmsk: pd.DataFrame) -> pd.DataFrame:
    """Convert a RepeatMasker table into GTF-style gene/transcript/exon rows.

    Every input row yields three output rows (Feature = "gene", "transcript"
    and "exon") that share the same coordinates.

    Args:
        rmsk: RepeatMasker table. Must contain the columns ``genoName``,
            ``genoStart``, ``genoEnd``, ``strand``, ``repName``, ``repFamily``
            and ``repClass``. The frame passed in is NOT modified.

    Returns:
        A new DataFrame sorted by Chromosome, Start, End with the columns
        Chromosome, Start, End, Strand, gene_id, family_id, class_id,
        gene_name, Source, Feature, gene_type, transcript_id, exon_id and
        exon_number. The transcript/exon-only columns are NaN on rows of a
        level that does not define them.
    """
    # Non-inplace rename returns a new frame, so every assignment below lands
    # on that private copy instead of on the caller's table (which is often a
    # filtered slice of a larger frame).
    rmsk = rmsk.rename(
        columns={
            "genoName": "Chromosome",
            "genoStart": "Start",
            "genoEnd": "End",
            "strand": "Strand",
        }
    )

    # gene level
    rmsk["Start"] = rmsk["Start"] - 1  # make 0 based
    rmsk["gene_id"] = rmsk["repName"]
    rmsk["family_id"] = rmsk["repFamily"]
    rmsk["class_id"] = rmsk["repClass"]
    rmsk["gene_name"] = rmsk["repName"] + ":TE"
    rmsk = rmsk[
        [
            "Chromosome",
            "Start",
            "End",
            "Strand",
            "gene_id",
            "family_id",
            "class_id",
            "gene_name",
        ]
    ].copy()
    rmsk["Source"] = "RepeatMasker"
    rmsk["Feature"] = "gene"
    rmsk["gene_type"] = "retrogene"

    # tx level: one transcript per repeat copy; copies of the same repName are
    # numbered in row order (<repName>_dup0, <repName>_dup1, ...).
    rmsktx = rmsk.copy().reset_index(drop=True)
    rmsktx["Feature"] = "transcript"
    rmsktx["transcript_id"] = (
        rmsktx["gene_id"] + "_dup" + rmsktx.groupby("gene_id").cumcount().astype(str)
    )

    # exon level: a single exon spanning the whole transcript
    rmskex = rmsktx.copy().reset_index(drop=True)
    rmskex["Feature"] = "exon"
    rmskex["exon_id"] = rmskex.transcript_id
    rmskex["exon_number"] = 1

    return pd.concat([rmsk, rmsktx, rmskex]).sort_values(["Chromosome", "Start", "End"])


def gtf_to_bed(rmsk_file, genes, family, out_dir="../data"):
    """Write the RepeatMasker hits of one repeat family to a 5-column BED file.

    Despite the name, the input is the RepeatMasker table, not a GTF.

    Args:
        rmsk_file: DataFrame with at least the columns ``genoName``,
            ``genoStart``, ``genoEnd``, ``repName`` and ``strand``.
        genes: Currently unused; kept so existing calls keep working.
        family: ``repName`` value to select. Also used as the output file stem.
        out_dir: Directory that receives ``<family>.bed``. Defaults to
            ``"../data"``, the location that was previously hard-coded.

    Returns:
        The filtered five-column DataFrame (genoName, genoStart, genoEnd,
        repName, strand) that was written, with the original row index.
    """
    bed_columns = ["genoName", "genoStart", "genoEnd", "repName", "strand"]
    bed = rmsk_file.loc[rmsk_file["repName"] == family, bed_columns].copy()

    # Tab-separated, no header, no index column.
    # NOTE(review): genoStart is written unchanged, whereas rmsk_to_gtf
    # subtracts 1 to "make 0 based" — confirm which convention read_rmsk uses.
    bed.to_csv(f"{out_dir}/{family}.bed", sep="\t", index=False, header=False)

    # TODO: an unreachable section used to follow an early `return bed` here.
    # It meant to split elements into full-length vs. truncated and then into
    # intronic vs. intergenic via a pyranges join against `genes`. It could
    # not have run as written: `bed` no longer carries `is_full_length`,
    # `repFamily` or `repClass` after the column selection above. Re-implement
    # it from the unfiltered rows if those extra BED files are needed.
    return bed

## Read in gene annotations

In [3]:
gtf = pr.read_gtf("../data/gencode.v26.basic.annotation.gtf.gz").df

# Collapse all annotation rows of a gene into one gene-level interval that
# spans from the smallest Start to the largest End seen for that gene_id.
gene_rows = []
for gene_id, df in gtf.groupby("gene_id"):
    strands = df["Strand"].unique()
    assert strands.shape[0] == 1, f"This gene: {gene_id} is on two strands"

    row = dict(
        Chromosome=df["Chromosome"].unique()[0],
        Start=min(df["Start"]),
        End=max(df["End"]),
        Strand=strands[0],
        gene_id=gene_id,
        gene_name=df["gene_name"].unique()[0],
    )
    gene_rows.append(row)

genes = pd.DataFrame(gene_rows)

# Extra columns / normalisation used by later steps.
genes["Length_Gene"] = genes["End"] - genes["Start"]
# Drop "chr" from the chromosome names.
genes["Chromosome"] = genes["Chromosome"].str.replace("chr", "")

genes

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,gene_name,Length_Gene
0,X,100627108,100639991,-,ENSG00000000003.14,TSPAN6,12883
1,X,100584801,100599885,+,ENSG00000000005.5,TNMD,15084
2,20,50934866,50958555,-,ENSG00000000419.12,DPM1,23689
3,1,169849630,169894267,-,ENSG00000000457.13,SCYL3,44637
4,1,169662006,169854080,+,ENSG00000000460.16,C1orf112,192074
...,...,...,...,...,...,...,...
58214,1,157203603,157205062,-,ENSG00000284592.1,RP11-71G12.2,1459
58215,11,1880044,1880147,+,ENSG00000284594.1,MIR7847,103
58216,17,75498547,75498628,+,ENSG00000284595.1,MIR6785,81
58217,7,102471468,102471531,+,ENSG00000284596.1,MIR4467,63


In [None]:
# Strip "chr" from the gene chromosome names, then display the table.
# NOTE(review): the cell that builds `genes` already applies this exact
# replacement, so this cell looks redundant (re-running it changes nothing once
# the prefix is gone) — consider deleting it.
genes["Chromosome"] = genes["Chromosome"].str.replace("chr", "")
genes

In [14]:
rmsk_file = rmsk.read_rmsk("../data/hg38.fa.out.gz")

# Drop "chr" so the repeat chromosome names use the same form as
# genes["Chromosome"].
rmsk_file["genoName"] = rmsk_file["genoName"].str.replace("chr", "")

# Keep only repeats on chromosomes that also occur in the gene table
# (i.e. keep only standard chromosomes).
rmsk_file = rmsk_file[rmsk_file["genoName"].isin(genes["Chromosome"].unique())]

rmsk_file

Unnamed: 0,swScore,milliDiv,milliDel,milliIns,genoName,genoStart,genoEnd,genoLeft,strand,repName,repStart,repEnd,repLeft,id,repClass,repFamily,length,age,has_promoter,is_full_length
0,463,13.0,6.0,17.0,1,10001,10468,248945954,+,(TAACCC)n,1,471,0,1,Simple_repeat,,470,2.980451,,
1,3612,114.0,215.0,13.0,1,10469,11447,248944975,-,TAR1,399,1712,483,2,Satellite,telo,1229,28.103632,,
2,484,251.0,132.0,0.0,1,11505,11675,248944747,-,L1MC5a,2382,395,199,3,LINE,L1,196,69.454621,False,False
3,239,294.0,19.0,10.0,1,11678,11780,248944642,-,MER5B,74,104,1,4,DNA,hAT-Charlie,103,84.814840,,
4,318,230.0,37.0,0.0,1,15265,15355,248941067,-,MIR3,119,143,49,5,SINE,MIR,94,62.428022,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5520110,256,304.0,19.0,10.0,Y,57214797,57214899,12516,+,MER5B,1,104,74,4607723,DNA,hAT-Charlie,103,88.594475,,
5520111,497,204.0,152.0,36.0,Y,57214902,57215093,12322,+,L1MC5a,199,395,2382,4607724,LINE,L1,196,54.111517,False,False
5520112,6064,115.0,204.0,10.0,Y,57215117,57216403,11012,+,TAR1,469,2068,43,4607725,Satellite,telo,1599,28.371854,,
5520113,744,38.0,11.0,25.0,Y,57216407,57217320,10095,+,(TTAGGG)n,1,924,0,4607726,Simple_repeat,,923,8.862834,,


## Make bed files for LINE-1 families

In [18]:
# repName values of the LINE-1 families to export.
L1_families = ["L1HS", "L1PA2", "L1PA3", "L1PA6"]

# gtf_to_bed writes ../data/<family>.bed for each family as a side effect.
# `df` is rebound on every pass, so only the last family's table is kept.
for family in L1_families:
    df = gtf_to_bed(rmsk_file, genes, family)

In [16]:
# Show the table returned by the most recent gtf_to_bed call.
# NOTE(review): after a top-to-bottom run the family loop leaves `df` bound to
# the last entry of L1_families; the saved output below lists L1HS rows and has
# a lower execution count than the loop cell, so it is presumably stale.
df

Unnamed: 0,genoName,genoStart,genoEnd,repName,strand
24292,1,12816084,12816355,L1HS,-
27163,1,14257711,14257955,L1HS,+
65478,1,28985834,28986182,L1HS,-
68952,1,30567814,30568750,L1HS,+
78927,1,34566056,34572105,L1HS,-
...,...,...,...,...,...
5510639,Y,20690314,20690858,L1HS,-
5511004,Y,20958692,20959548,L1HS,-
5515241,Y,23956872,23959834,L1HS,-
5517909,Y,25710292,25713238,L1HS,+
