In [11]:
import pandas as pd
import numpy as np
import os

In [12]:
GTF_PATH = "../../datasets/gencode.v49.chr_patch_hapl_scaff.annotation.gtf"
miRNA_GTF_PATH = "../../datasets/preprocessed/miRNA_gtf.csv"

pd.set_option('display.max_colwidth', None)

In [13]:
cols = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]

gtf_df = pd.read_csv(GTF_PATH, sep='\t', comment='#', names=cols)
gtf_df.shape

(8063229, 9)

In [4]:
def parse_attributes(attr):
    d = {}
    for a in attr.split(";"):
        a = a.strip()
        if a == "":
            continue
        if " " in a:
            key, val = a.split(" ", 1)
            d[key] = val.strip('"')
    return d

chunks = np.array_split(gtf_df, 10)
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)} ...")    
    attr_df = chunk['attribute'].apply(parse_attributes).apply(pd.Series)
    merged = pd.concat([chunk.drop(columns=['attribute']), attr_df], axis=1)    
    results.append(merged)

complete_gtf_df = pd.concat(results, ignore_index=True)

  return bound(*args, **kwds)


Processing chunk 1/10 ...
Processing chunk 2/10 ...
Processing chunk 3/10 ...
Processing chunk 4/10 ...
Processing chunk 5/10 ...
Processing chunk 6/10 ...
Processing chunk 7/10 ...
Processing chunk 8/10 ...
Processing chunk 9/10 ...
Processing chunk 10/10 ...


In [5]:
complete_gtf_df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
0,chr1,HAVANA,gene,11121,24894,.,+,.,ENSG00000290825.2,lncRNA,...,,,,,,,,,,
1,chr1,HAVANA,transcript,11121,14413,.,+,.,ENSG00000290825.2,lncRNA,...,,,,,,,,,,
2,chr1,HAVANA,exon,11121,11211,.,+,.,ENSG00000290825.2,lncRNA,...,1.0,ENSE00004248723.1,,,,,,,,
3,chr1,HAVANA,exon,12010,12227,.,+,.,ENSG00000290825.2,lncRNA,...,2.0,ENSE00004248735.1,,,,,,,,
4,chr1,HAVANA,exon,12613,12721,.,+,.,ENSG00000290825.2,lncRNA,...,3.0,ENSE00003582793.1,,,,,,,,


### Try to search for a specific miRNA ID

In [8]:
result = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('hsa-let-7a-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


No results found about 'hsa-let-7a-1'. Try another miRNA ID.

In [9]:
result = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('hsa-mir-103b-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


No results found about 'hsa-mir-103b-1'.

In [55]:
result = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('MIR1254-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


In [7]:
result = complete_gtf_df[
    complete_gtf_df.astype(str).apply(lambda x: x.str.contains('MIR', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
2415,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,HGNC:50039,,,,,
2416,chr1,ENSEMBL,transcript,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,HGNC:50039,,,,,
2417,chr1,ENSEMBL,exon,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,1.0,ENSE00003746039.1,,,HGNC:50039,,,,,
2418,chr1,HAVANA,gene,28589,31109,.,+,.,ENSG00000243485.6,lncRNA,...,,,,,HGNC:52482,OTTHUMG00000000959.2,,,,
2419,chr1,HAVANA,transcript,28589,29345,.,+,.,ENSG00000243485.6,lncRNA,...,,,,,HGNC:52482,OTTHUMG00000000959.2,,,,


In [8]:
result = complete_gtf_df[
    complete_gtf_df.astype(str).apply(lambda x: x.str.contains('ENST00000408257.1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
4013594,chr10,ENSEMBL,transcript,68759318,68759414,.,+,.,ENSG00000221184.1,miRNA,...,,,,,,,,,,
4013595,chr10,ENSEMBL,exon,68759318,68759414,.,+,.,ENSG00000221184.1,miRNA,...,1.0,ENSE00001564892.1,,,,,,,,


### Filter miRNA entries by searching for 'miRNA' keyword

In [16]:
mirna_df = complete_gtf_df[complete_gtf_df.astype(str).apply(lambda x: x.str.contains('miRNA', case=False, na=False)).any(axis=1)]
mirna_df.shape

(5835, 26)

It is sufficient to filter only by **'gene_type'** column.

In [31]:
mirna_df = complete_gtf_df[complete_gtf_df['gene_type'] == 'miRNA']
mirna_df.shape

(5835, 26)

In [24]:
mirna_df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
2415,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,HGNC:50039,,,,,
2416,chr1,ENSEMBL,transcript,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,,,,HGNC:50039,,,,,
2417,chr1,ENSEMBL,exon,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,1.0,ENSE00003746039.1,,,HGNC:50039,,,,,
2433,chr1,ENSEMBL,gene,30366,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,HGNC:35294,,,,,
2434,chr1,ENSEMBL,transcript,30366,30503,.,+,.,ENSG00000284332.1,miRNA,...,,,,,HGNC:35294,,,,,


### Save the preprocessed miRNA GTF data as CSV file

In [32]:
mirna_df.to_csv(miRNA_GTF_PATH, index=False)

## Reload created CSV file

In [4]:
df = pd.read_csv(miRNA_GTF_PATH)
df.shape

(5835, 26)

In [5]:
pd.set_option('display.max_colwidth', None)
df.describe()

Unnamed: 0,start,end,level,exon_number,transcript_support_level,havana_transcript,havana_gene,ont,protein_id,ccdsid,artif_dupl
count,5835.0,5835.0,5835.0,1945.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,73996840.0,73996920.0,3.0,1.0,,,,,,,
std,56395110.0,56395110.0,0.0,0.0,,,,,,,
min,2050.0,2164.0,3.0,1.0,,,,,,,
25%,28958580.0,28958660.0,3.0,1.0,,,,,,,
50%,61199800.0,61199880.0,3.0,1.0,,,,,,,
75%,110284900.0,110284900.0,3.0,1.0,,,,,,,
max,248826400.0,248826400.0,3.0,1.0,,,,,,,


In [45]:
df[['gene_name', 'gene_id', 'transcript_id']].head()

Unnamed: 0,gene_name,gene_id,transcript_id
0,MIR6859-1,ENSG00000278267.1,
1,MIR6859-1,ENSG00000278267.1,ENST00000619216.1
2,MIR6859-1,ENSG00000278267.1,ENST00000619216.1
3,MIR1302-2,ENSG00000284332.1,
4,MIR1302-2,ENSG00000284332.1,ENST00000607096.1


## Map miRNA gene names to miRBase IDs using pybiomart

In [6]:
from pybiomart import Server

server = Server(host='http://www.ensembl.org')
mart = server['ENSEMBL_MART_ENSEMBL']
dataset = mart['hsapiens_gene_ensembl']

mapping = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name', 'mirbase_id', 'mirbase_accession'],
    filters={'biotype': 'miRNA'}
)
mapping = mapping[mapping['miRBase ID'].notnull()].drop_duplicates()

print(mapping.head())
mapping.shape

    Gene stable ID  Gene name      miRBase ID miRBase accession
0  ENSG00000283344  MIR1244-4  hsa-mir-1244-1         MI0006379
1  ENSG00000283344  MIR1244-4  hsa-mir-1244-2         MI0015974
2  ENSG00000283344  MIR1244-4  hsa-mir-1244-3         MI0015975
3  ENSG00000283344  MIR1244-4  hsa-mir-1244-4         MI0031511
4  ENSG00000292346    MIR6089  hsa-mir-6089-1         MI0020366


(2135, 4)

In [7]:
server = Server(host='http://grch37.ensembl.org')
mart = server['ENSEMBL_MART_ENSEMBL']
dataset = mart['hsapiens_gene_ensembl']

mapping_grch37 = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name', 'mirbase_id', 'mirbase_accession'],
    filters={'biotype': 'miRNA'}
)
mapping_grch37 = mapping_grch37[mapping_grch37['miRBase ID'].notnull()].drop_duplicates()

print(mapping_grch37.head())

     Gene stable ID Gene name    miRBase ID miRBase accession
2   ENSG00000252695   MIR2276  hsa-mir-2276         MI0011282
5   ENSG00000263399   MIR3170  hsa-mir-3170         MI0014201
7   ENSG00000207719    MIR623   hsa-mir-623         MI0003637
9   ENSG00000263615   MIR4306  hsa-mir-4306         MI0015836
12  ENSG00000265164   MIR2681  hsa-mir-2681         MI0012062


In [16]:
from Bio import SeqIO

records = SeqIO.parse("../../datasets/hairpin.fa", "fasta")
mirna_data = [
    {
        'mirbase_id': r.id,
        'description': r.description,
        'sequence': str(r.seq)
    }
    for r in records
]

mirna_data

[{'mirbase_id': 'cel-let-7',
  'description': 'cel-let-7 MI0000001 Caenorhabditis elegans let-7 stem-loop',
  'sequence': 'UACACUGUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA'},
 {'mirbase_id': 'cel-lin-4',
  'description': 'cel-lin-4 MI0000002 Caenorhabditis elegans lin-4 stem-loop',
  'sequence': 'AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAUUGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU'},
 {'mirbase_id': 'cel-mir-1',
  'description': 'cel-mir-1 MI0000003 Caenorhabditis elegans miR-1 stem-loop',
  'sequence': 'AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU'},
 {'mirbase_id': 'cel-mir-2',
  'description': 'cel-mir-2 MI0000004 Caenorhabditis elegans miR-2 stem-loop',
  'sequence': 'UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU'},
 {'mirbase_id': 'cel-mir-34',
  'description': 'cel-mir-34 MI0000005 Caenorhabditis elegans miR-34 stem-l

In [31]:
q = "hsa-mir-7641-1".lower()  # cerchiamo in minuscolo per sicurezza

found = [m for m in mirna_data if q.lower() in m['description'].lower()]
if found:
    print(f"✅ {q} trovato ({len(found)} record)")
else:
    print(f"❌ {q} non trovato")

❌ hsa-mir-7641-1 non trovato


Anyway there are some miRNA IDs that are not present.
We create a manual mapping for those missing miRNA IDs.

manual_mapping = {
    "hsa-mir-3607": "MIR3607",
    "hsa-mir-3653": "MIR3653",
    "hsa-mir-3687-1": "MIR3687-1",
    "hsa-mir-3687-2": "MIR3687-2",
    "hsa-mir-6087": "MIR6087",
    "hsa-mir-6723": "MIR6723",
    "hsa-mir-6827": "MIR6827",
    "hsa-mir-7641-1": "MIR7641-1",
    "hsa-mir-7641-2": "MIR7641-2",
    "hsa-mir-3656": "MIR3656",
    "hsa-mir-4788": "MIR4788"
}

In [8]:
result = mapping[mapping.astype(str).apply(lambda x: x.str.contains('hsa-mir-3607', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,Gene stable ID,Gene name,miRBase ID,miRBase accession


In [9]:
result = mapping_grch37[mapping_grch37.astype(str).apply(lambda x: x.str.contains('hsa-mir-3607', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,Gene stable ID,Gene name,miRBase ID,miRBase accession


We can see that not all miRNA ID are present in the mapping DataFrame of the latest Ensembl version. But they are present in the GRCh37 version.

In [10]:
result = df[df.astype(str).apply(lambda x: x.str.contains('ENSG00000266398', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


In [50]:
3917502 - 3917563

-61

In [49]:
# Add miRBase IDs to the miRNA GTF DataFrame
merged_df = pd.merge(df, mapping, left_on='gene_name', right_on='Gene name', how='left')
merged_df.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl,Gene stable ID,Gene name,miRBase ID
0,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,HGNC:50039,,,,,,ENSG00000278267,MIR6859-1,hsa-mir-6859-1
1,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,HGNC:50039,,,,,,ENSG00000278267,MIR6859-1,hsa-mir-6859-2
2,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,HGNC:50039,,,,,,ENSG00000278267,MIR6859-1,hsa-mir-6859-3
3,chr1,ENSEMBL,gene,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,HGNC:50039,,,,,,ENSG00000278267,MIR6859-1,hsa-mir-6859-4
4,chr1,ENSEMBL,transcript,17369,17436,.,-,.,ENSG00000278267.1,miRNA,...,,HGNC:50039,,,,,,ENSG00000278267,MIR6859-1,hsa-mir-6859-1


#  Homo sapiens GTF

In [56]:
HOMO_GTF_PATH = "../../datasets/Homo_sapiens.GRCh38.115.gtf"
cols = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]

homo_df = pd.read_csv(GTF_PATH, sep='\t', comment='#', names=cols)
homo_df.shape

(8063229, 9)

In [57]:
def parse_attributes(attr):
    d = {}
    for a in attr.split(";"):
        a = a.strip()
        if a == "":
            continue
        if " " in a:
            key, val = a.split(" ", 1)
            d[key] = val.strip('"')
    return d

chunks = np.array_split(homo_df, 10)
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)} ...")    
    attr_df = chunk['attribute'].apply(parse_attributes).apply(pd.Series)
    merged = pd.concat([chunk.drop(columns=['attribute']), attr_df], axis=1)    
    results.append(merged)

homo_complete_gtf_df = pd.concat(results, ignore_index=True)

  return bound(*args, **kwds)


Processing chunk 1/10 ...
Processing chunk 2/10 ...
Processing chunk 3/10 ...
Processing chunk 4/10 ...
Processing chunk 5/10 ...
Processing chunk 6/10 ...
Processing chunk 7/10 ...
Processing chunk 8/10 ...
Processing chunk 9/10 ...
Processing chunk 10/10 ...


In [15]:
result = homo_complete_gtf_df[homo_complete_gtf_df.astype(str).apply(lambda x: x.str.contains('hsa-let-7a-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl


No results found about 'hsa-let-7a-1'.

In [58]:
result = homo_complete_gtf_df[homo_complete_gtf_df.astype(str).apply(lambda x: x.str.contains('MIR1254-1', case=False, na=False)).any(axis=1)]
result.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,...,exon_number,exon_id,transcript_support_level,havana_transcript,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
