<a href="https://colab.research.google.com/github/gracetavares/mapedrMATS2isoformas/blob/main/mapedrMATS2transcriptID.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Bibliotecas**:
1. *gffutils*:

  1.1. Manipulação de arquivos GFF e GTF

  1.2. Cria um banco de dados do GTF (função create_db)

  1.3. Consulta regiões por coordenadas (função db.region) e filtra por feature

2. *pandas*:

  2.1. Manipulação de tabelas e dados

  2.2. lê e itera cada linha dos arquivos de splicing do rMATS (função pd.read_csv)

  2.3. Cria a tabela final de resultados (função pd.DataFrame)

3. *argparse*

  3.1. criação de argumentos de linha de comando







In [None]:
#!pip install gffutils

Collecting gffutils
  Downloading gffutils-0.13-py3-none-any.whl.metadata (1.5 kB)
Collecting pyfaidx>=0.5.5.2 (from gffutils)
  Downloading pyfaidx-0.8.1.4-py3-none-any.whl.metadata (25 kB)
Collecting argh>=0.26.2 (from gffutils)
  Downloading argh-0.31.3-py3-none-any.whl.metadata (7.4 kB)
Collecting argcomplete>=1.9.4 (from gffutils)
  Downloading argcomplete-3.6.2-py3-none-any.whl.metadata (16 kB)
Downloading gffutils-0.13-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argcomplete-3.6.2-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading argh-0.31.3-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyfaidx-0.8.1.4-py3-none-any.whl (28 kB)
Installing colle

In [None]:
#!/usr/bin/env python3
import gffutils as gu #manipulação de arquivos gff e gtf e criação de banco de dados
import pandas as pd #para criação do resultado final
import argparse #para criação de argumentos

##########################
# Command-line arguments #
##########################
# All four options are mandatory. --tipo_evento is restricted to the five
# event types that coords() accepts, so a mistyped value is rejected here,
# before the GTF database is built, instead of by a ValueError later on.
parser = argparse.ArgumentParser(description="Mapear eventos do rMATS com os transcript_id dos arquivos GTF")
parser.add_argument("--gtf", required=True, help="Entre com o arquivo GTF utilizado no mapeamento")
parser.add_argument("--rmats", required=True, help="Arquivo de saída do rMATS (.MATS.JC.txt, JCEC.txt). Informe um por vez")
parser.add_argument(
    "--tipo_evento",
    required=True,
    choices=["SE", "RI", "A3SS", "A5SS", "MXE"],
    help="Tipo de evento: SE, RI, A3SS, A5SS ou MXE. Informe conforme o arquivo do rMATS fornecido",
)
parser.add_argument("--out", required=True, help="Arquivo de saída com mapeamento")
args = parser.parse_args()

usage: colab_kernel_launcher.py [-h] --gtf GTF --rmats RMATS --tipo_evento
                                TIPO_EVENTO --out OUT
colab_kernel_launcher.py: error: the following arguments are required: --gtf, --rmats, --tipo_evento, --out


SystemExit: 2

In [None]:
############################################
# Build the database from the GTF file     #
############################################

print("Criando o banco de dados a partir do arquivo GTF...")

# Keyword options forwarded unchanged to gffutils.create_db.
db_options = {
    "dbfn": "gtf.db",  # path of the database file that will be created
    "force": True,  # overwrite an existing database
    "keep_order": True,
    "merge_strategy": "merge",  # merge so the transcript_ids stay on the same record
    "sort_attribute_values": True,  # features returned by the db get sorted attribute values
}
db = gu.create_db(args.gtf, **db_options)


In [None]:
##############################
# Read the rMATS event table #
##############################

# The f-prefix is required so the message shows the actual input path
# instead of the literal text "{args.rmats}".
print(f"Lendo o arquivo rMATS: {args.rmats}")
# The rMATS file is read as tab-separated text.
rmats = pd.read_csv(args.rmats, sep="\t")

In [None]:
################################################################################
# Criação de uma função para pegar as coordenadas de cada evento: SE, RI, A3SS,#
# A5SS e MXE                                                                   #
################################################################################

def coords(line, type):
    """Return the genomic interval of one rMATS event row.

    ``line`` is indexed by column name and ``type`` is the event type, which
    selects the pair of start/end columns to read. The start column (named
    ``*_0base``) is shifted by +1; the end column is returned unchanged.

    Returns:
        A ``(chr, start, end, strand)`` tuple.

    Raises:
        ValueError: if ``type`` is not one of SE, RI, MXE, A3SS or A5SS.
    """
    # (event types, start column, end column). A3SS and A5SS read the same
    # "long exon" columns. For MXE only the first exon's columns are used.
    column_table = (
        (("SE",), 'exonStart_0base', 'exonEnd'),
        (("RI",), 'riExonStart_0base', 'riExonEnd'),
        (("MXE",), '1stExonStart_0base', '1stExonEnd'),
        (("A3SS", "A5SS"), 'longExonStart_0base', 'longExonEnd'),
    )
    for event_types, start_col, end_col in column_table:
        if type in event_types:
            return (line['chr'], line[start_col] + 1, line[end_col], line['strand'])
    raise ValueError("Por favor informe um evento válido: SE, RI, A3SS, A5SS e MXE")

In [None]:
################################################################################
# Map each rMATS event to gene_id / transcript_id                              #
################################################################################

print("Mapeando os os genes vs. transcript_id.")

# One dict per (event, exon feature) pair; turned into a table afterwards.
results = []

for _, event in rmats.iterrows():
    chrom, start, end, strand = coords(event, args.tipo_evento)

    # Exon features that db.region returns for this interval and strand.
    exon_hits = db.region(
        region=(chrom, start, end),
        strand=strand,
        featuretype='exon',
    )
    for exon in exon_hits:
        attrs = exon.attributes
        # Take the first value of each attribute; 'NA' when it is absent.
        transcript_id = attrs.get('transcript_id', ['NA'])[0]
        gene_id = attrs.get('gene_id', ['NA'])[0]
        record = {
            "Evento": event['ID'],
            "Gene_ID": gene_id,
            "Transcript_ID": transcript_id,
            "Chr": chrom,
            "Start": start,
            "End": end,
            "Strand": strand,
        }
        results.append(record)

In [None]:
####################
# Write the result #
####################

# Fixing the column order keeps the header in the output file even when no
# event matched any exon (an empty list would otherwise give a column-less
# table). For non-empty results this is the same order as the dict keys.
output_columns = ["Evento", "Gene_ID", "Transcript_ID", "Chr", "Start", "End", "Strand"]
saida_df = pd.DataFrame(results, columns=output_columns).drop_duplicates()
# The output path comes from --out; the parser defines no "saida" attribute.
saida_df.to_csv(args.out, sep="\t", index=False)
# The f-prefix is required so the message shows the actual output path.
print(f"Mapeamento salvo em: {args.out}")