In [2]:
import os

# Location of the GENCODE v45 primary-assembly annotation (gzipped GTF) that the
# GTF examples in this notebook read. Set the GTF_HG38 environment variable to
# point at your own copy; when it is unset the original default path is used.
GTF_hg38 = os.environ.get(
    "GTF_HG38",
    "/Users/ckuo/genome_data/gencode.v45.primary_assembly.annotation.gtf.gz",
)

Get all the genes from a GTF file

In [3]:
from genomkit import GAnnotation

# Load the annotation, select every record whose element type is "gene",
# and write the resulting regions to hg38_genes.bed.
gtf = GAnnotation(
    file_path=GTF_hg38,
    file_format="gtf",
)
genes = gtf.get_regions(element_type="gene")
genes_bed = "hg38_genes.bed"
genes.write(filename=genes_bed)

Loading Data: 100%|██████████| 3428060/3428060 [00:23<00:00, 147905.28it/s]


Extract exon, intron, and intergenic regions in BED format from a GTF file

In [4]:
from genomkit import GRegions
from genomkit import GAnnotation

# Load the annotation and collect the gene and exon regions.
gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
genes = gtf.get_regions(element_type="gene")
exons = gtf.get_regions(element_type="exon")

# Introns: the gene regions with the exon regions subtracted.
# inplace=False returns a new set instead of modifying `genes`.
introns = genes.subtract(exons, inplace=False)

# Intergenic regions: the hg38 chromosomes with the gene regions subtracted.
# NOTE(review): get_chromosomes() is called for its effect on `chromosomes`
# (its return value is not used) - presumably it fills the set in place.
chromosomes = GRegions(name="chromosomes")
chromosomes.get_chromosomes(organism="hg38")
intergenic_regions = chromosomes.subtract(genes, inplace=False)

# Write the three region sets, one file each.
for region_set, bed_name in (
    (exons, "hg38_exons.bed"),
    (introns, "hg38_introns.bed"),
    (intergenic_regions, "hg38_intergenic_regions.bed"),
):
    region_set.write(filename=bed_name)



Loading Data: 100%|██████████| 3428060/3428060 [00:23<00:00, 145496.08it/s]


Get all promoter regions in BED format from a GTF file

In [5]:
from genomkit import GAnnotation

# Load the annotation and take the gene regions as the starting point.
gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
genes = gtf.get_regions(element_type="gene")

# Promoter window: anchored at the 5' end of each gene, extended 2000 upstream
# and 0 downstream (units are whatever GRegions.resize uses - presumably bp).
promoter_window = dict(
    extend_upstream=2000,
    extend_downstream=0,
    center="5prime",
)
# inplace=False keeps `genes` untouched and returns the resized copy.
promoters = genes.resize(inplace=False, **promoter_window)
promoters.write(filename="hg38_promoters.bed")

Loading Data: 100%|██████████| 3428060/3428060 [00:23<00:00, 143017.55it/s]


Extract the genes by their biotypes from a GTF file

In [6]:
from genomkit import GAnnotation

gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")

# Write one file per biotype, selecting gene records whose "gene_type"
# attribute equals the biotype name.
target_biotypes = ["protein_coding", "lncRNA", "snRNA", "miRNA"]
for biotype in target_biotypes:
    genes = gtf.get_regions(
        element_type="gene",
        attribute="gene_type",
        value=biotype,
    )
    # e.g. hg38_genes_protein_coding.bed
    output_bed = "hg38_genes_" + biotype + ".bed"
    genes.write(filename=output_bed)

Loading Data: 100%|██████████| 3428060/3428060 [00:24<00:00, 138936.39it/s]


In [None]:
from genomkit import GRegions, GCoverages

# Compute coverage from two BED files: one supplies the windows (`regions`)
# and the other supplies the signal (`scores`). In this example both sets are
# loaded from the same test file. The cell only prints the resulting table;
# no heatmap is drawn here.
example_bed = "/Users/ckuo/github/genomkit/tests/test_files/bed/example.bed"
DMSs = GRegions(name="DMSs", load=example_bed)
TSSs = GRegions(name="TSSs", load=example_bed)
DMSs.sort()
TSSs.sort()

cov = GCoverages(bin_size=2)
cov.calculate_coverage_GRegions(regions=TSSs, scores=DMSs)
# To inspect individual entries, iterate cov.coverage.items() - use a loop
# variable other than `cov`, which would rebind the GCoverages object.
df = cov.get_dataframe()
print(df)