In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
GTF_PATH = "../datasets/gencode.v49.chr_patch_hapl_scaff.annotation.gtf"
miRNA_GTF_PATH = "../datasets/preprocessed/miRNA_gtf.csv"

pd.set_option('display.max_colwidth', None)

In [6]:
cols = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]

gtf_df = pd.read_csv(GTF_PATH, sep='\t', comment='#', names=cols)
gtf_df.shape

(8063229, 9)

In [7]:
def parse_attributes(attr):
    d = {}
    for a in attr.split(";"):
        a = a.strip()
        if a == "":
            continue
        if " " in a:
            key, val = a.split(" ", 1)
            d[key] = val.strip('"')
    return d

chunks = np.array_split(gtf_df, 10)
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)} ...")    
    attr_df = chunk['attribute'].apply(parse_attributes).apply(pd.Series)
    merged = pd.concat([chunk.drop(columns=['attribute']), attr_df], axis=1)    
    results.append(merged)

complete_gtf_df = pd.concat(results, ignore_index=True)

  return bound(*args, **kwds)


Processing chunk 1/10 ...
Processing chunk 2/10 ...
Processing chunk 3/10 ...
Processing chunk 4/10 ...
Processing chunk 5/10 ...
Processing chunk 6/10 ...
Processing chunk 7/10 ...
Processing chunk 8/10 ...
Processing chunk 9/10 ...
Processing chunk 10/10 ...


### Try to search for a specific miRNA ID

In [8]:
result = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('hsa-let-7a-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


No results found about 'hsa-let-7a-1'. Try another miRNA ID.

In [9]:
result = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('hsa-mir-103b-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


No results found about 'hsa-mir-103b-1'.

### Filter miRNA entries by searching for 'miRNA' keyword

In [16]:
mirna_df = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('miRNA', case=False, na=False)).any(axis=1)]
mirna_df.shape

(5835, 26)

It is sufficient to filter only by **'gene_type'** column.

In [31]:
mirna_df = complete_gtf_df[complete_gtf_df['gene_type'] == 'miRNA']
mirna_df.shape

(5835, 26)

In [24]:
mirna_df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
2415,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,HGNC:50039,,,,,
2416,chr1,ENSEMBL,transcript,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,HGNC:50039,,,,,
2417,chr1,ENSEMBL,exon,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,1.0,ENSE00003746039.1,,,HGNC:50039,,,,,
2433,chr1,ENSEMBL,gene,30366,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,HGNC:35294,,,,,
2434,chr1,ENSEMBL,transcript,30366,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,HGNC:35294,,,,,


### Save the preprocessed miRNA GTF data as CSV file

In [32]:
mirna_df.to_csv(miRNA_GTF_PATH, index=False)

## Reload created CSV file

In [43]:
df = pd.read_csv(miRNA_GTF_PATH)
df.shape

(5835, 26)

In [44]:
pd.set_option('display.max_colwidth', None)
df.describe()

Unnamed: 0,start,end,level,exon_number,transcript_support_level,havana_transcript,havana_gene,ont,protein_id,ccdsid,artif_dupl
count,5835.0,5835.0,5835.0,1945.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,73996840.0,73996920.0,3.0,1.0,,,,,,,
std,56395110.0,56395110.0,0.0,0.0,,,,,,,
min,2050.0,2164.0,3.0,1.0,,,,,,,
25%,28958580.0,28958660.0,3.0,1.0,,,,,,,
50%,61199800.0,61199880.0,3.0,1.0,,,,,,,
75%,110284900.0,110284900.0,3.0,1.0,,,,,,,
max,248826400.0,248826400.0,3.0,1.0,,,,,,,


In [45]:
df[['gene_name', 'gene_id', 'transcript_id']].head()

Unnamed: 0,gene_name,gene_id,transcript_id
0,MIR6859-1,ENSG00000278267.1,
1,MIR6859-1,ENSG00000278267.1,ENST00000619216.1
2,MIR6859-1,ENSG00000278267.1,ENST00000619216.1
3,MIR1302-2,ENSG00000284332.1,
4,MIR1302-2,ENSG00000284332.1,ENST00000607096.1


In [46]:
from pybiomart import Server

server = Server(host='http://www.ensembl.org')
mart = server['ENSEMBL_MART_ENSEMBL']
dataset = mart['hsapiens_gene_ensembl']

# Query con filtro sui geni miRNA
mapping = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name', 'mirbase_id'],
    filters={'biotype': 'miRNA'}  # filtra solo i miRNA
)

print(mapping.head())
mapping.shape

    Gene stable ID  Gene name      miRBase ID
0  ENSG00000283344  MIR1244-4  hsa-mir-1244-1
1  ENSG00000283344  MIR1244-4  hsa-mir-1244-2
2  ENSG00000283344  MIR1244-4  hsa-mir-1244-3
3  ENSG00000283344  MIR1244-4  hsa-mir-1244-4
4  ENSG00000292346    MIR6089  hsa-mir-6089-1


(2170, 3)

In [47]:
result = mapping[mapping.astype(str).apply(lambda x: x.str.contains('hsa-mir-103b-2', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,Gene stable ID,Gene name,miRBase ID
209,ENSG00000283320,MIR103B2,hsa-mir-103b-2


In [42]:
result = df[df.astype(str).apply(lambda x: x.str.contains('MIR103B2', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
4917,chr20,ENSEMBL,gene,3917502,3917563,.,-,.,ENSG00000283320.1,miRNA,...,,,,,HGNC:35385,,,,,
4918,chr20,ENSEMBL,transcript,3917502,3917563,.,-,.,ENSG00000283320.1,miRNA,...,,,,,HGNC:35385,,,,,
4919,chr20,ENSEMBL,exon,3917502,3917563,.,-,.,ENSG00000283320.1,miRNA,...,1.0,ENSE00003798546.1,,,HGNC:35385,,,,,


#  Homo sapiens GTF

In [10]:
HOMO_GTF_PATH = "../datasets/Homo_sapiens.GRCh38.115.gtf"
cols = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]

homo_df = pd.read_csv(GTF_PATH, sep='\t', comment='#', names=cols)
homo_df.shape

(8063229, 9)

In [None]:
def parse_attributes(attr):
    d = {}
    for a in attr.split(";"):
        a = a.strip()
        if a == "":
            continue
        if " " in a:
            key, val = a.split(" ", 1)
            d[key] = val.strip('"')
    return d

chunks = np.array_split(homo_df, 10)
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)} ...")    
    attr_df = chunk['attribute'].apply(parse_attributes).apply(pd.Series)
    merged = pd.concat([chunk.drop(columns=['attribute']), attr_df], axis=1)    
    results.append(merged)

homo_complete_gtf_df = pd.concat(results, ignore_index=True)

  return bound(*args, **kwds)


Processing chunk 1/10 ...
Processing chunk 2/10 ...


In [15]:
result = homo_complete_gtf_df[homo_complete_gtf_df.astype(str).apply(lambda x: x.str.contains('hsa-let-7a-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


No results found about 'hsa-let-7a-1'.