In [1]:
import sys
sys.path.append('../')

from lib import libdata
import numpy as np
import pandas as pd

In [2]:
# Input files: riboCirc downloads, given as paths relative to the working directory.
# The two metadata tables (condition-dependent "cd" / condition-independent "ci") are
# read as tab-separated files further down; the FASTA files hold the circRNA sequences
# and the cORF sequences (the "_NA" suffix presumably means nucleic acid - TODO confirm).
ribocirc_metadata_file_cd = 'riboCirc/Human_condition_dependent_meta.txt'
ribocirc_metadata_file_ci = 'riboCirc/Human_condition_independent_meta.txt'
ribocirc_circrnas_file = 'riboCirc/Human_circ_Sequences.fasta'
ribocirc_orfs_fasta_file = 'riboCirc/Human_cORF_Sequences_NA.fasta'

# Output files: the dataset FASTA and GTF that later cells pass to the libdata helpers.
# NOTE(review): built by plain string concatenation, so this assumes
# libdata.CIRCODAN_PATH is a str ending with a path separator - confirm in libdata.
circrnas_file = libdata.CIRCODAN_PATH + 'datasets/04_riboCirc_hsa.fa'
gtf_file = libdata.CIRCODAN_PATH + 'datasets/04_riboCirc_hsa.gtf'

In [3]:
# Read the two tab-separated riboCirc metadata tables (condition-dependent first,
# then condition-independent) and stack them into a single frame with a fresh
# 0..n-1 index.
metadata_parts = [
    pd.read_csv(ribocirc_metadata_file_cd, sep='\t'),
    pd.read_csv(ribocirc_metadata_file_ci, sep='\t'),
]
df_ribocirc_metadata_cd, df_ribocirc_metadata_ci = metadata_parts
df_ribocirc_metadata = pd.concat(metadata_parts, ignore_index=True)
df_ribocirc_metadata

Unnamed: 0,Condition,Species,riboCIRC_ID,Genome_assembly,Genome_pos,Strand,Spliced_length,Transcript,Symbol,Ensembl,Dataset_count,Evidences,Algorithm,Other_ids
0,Condition-dependent,Human,hsa_circPRDM2_001,hg38,chr1:13773077-13782831,+,4525,NM_012231,PRDM2,ENSG00000116731,1,"3: RPF,cORF,m6A","CIRCexplorer2,DCC",
1,Condition-dependent,Human,hsa_circNBPF1_001,hg38,chr1:16564806-16567351,-,515,NM_017940,NBPF1,ENSG00000219481,2,"3: RPF,cORF,m6A","CIRCexplorer2,DCC",
2,Condition-dependent,Human,hsa_circNUDC_001,hg38,chr1:26942659-26943065,+,312,NM_006600,NUDC,ENSG00000090273,2,"3: RPF,cORF,m6A","CIRCexplorer2,DCC",
3,Condition-dependent,Human,hsa_circMAN1A2_003,hg38,chr1:117402185-117420649,+,553,NM_006699,MAN1A2,ENSG00000198162,14,"4: RPF,cORF,MS,m6A","CIRCexplorer2,DCC,CIRI2",
4,Condition-dependent,Human,hsa_circMAN1A2_004,hg38,chr1:117402185-117442325,+,648,NM_006699,MAN1A2,ENSG00000198162,2,"3: RPF,cORF,m6A",CIRCexplorer2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1287,Condition-independent,Human,hsa_circBCORL1_001,hg38,chrX:130005187-130021150,+,3651,NM_001184772,BCORL1,ENSG00000085185,1,"4: RPF,cORF,MS,m6A",,CIRCpedia:HSA_CIRCpedia_65182
1288,Condition-independent,Human,hsa_circFIRRE_001,hg38,chrX:131749305-131794466,-,845,NR_026975,FIRRE,ENSG00000213468,3,"3: RPF,cORF,IRES",,"circAtlas:hsa-FIRRE_0001,circBank:hsa_circTCON..."
1289,Condition-independent,Human,hsa_circAFF2_001,hg38,chrX:148661907-148662768,+,861,NM_001169125,AFF2,ENSG00000155966,1,"3: RPF,cORF,m6A",,"circAtlas:hsa-AFF2_0002,circBank:hsa_circAFF2_..."
1290,Condition-independent,Human,hsa_circPASD1_001,hg38,chrX:151601526-151611753,+,234,NM_173493,PASD1,ENSG00000166049,1,"2: RPF,cORF",,"CIRCpedia:HSA_CIRCpedia_351670,CSCD:chrX:15160..."


In [4]:
# Keep only the first row seen for each riboCIRC_ID; later rows with an ID that
# has already appeared are discarded. Original index labels are retained.
is_repeated_id = df_ribocirc_metadata.duplicated(subset='riboCIRC_ID')
df_ribocirc_metadata = df_ribocirc_metadata[~is_repeated_id]
df_ribocirc_metadata.shape

(1205, 14)

In [5]:
# Build the dataset FASTA from the riboCirc sequence file, passing the IDs kept in
# the metadata frame, then show how many sequences the resulting file contains.
selected_id_column = df_ribocirc_metadata['riboCIRC_ID']
circ_ids = np.array(selected_id_column)
libdata.create_fasta_file_riboCirc(circ_ids, ribocirc_circrnas_file, circrnas_file)
n_sequences_written = libdata.count_sequences_in_file(circrnas_file)
n_sequences_written

1205

In [6]:
# Generate the GTF annotation from the dataset FASTA and the riboCirc cORF FASTA.
libdata.create_gtf_file_riboCirc(circrnas_file, ribocirc_orfs_fasta_file, gtf_file)

# Load the GTF back for inspection: the file is tab-separated with no header row,
# so the column labels are taken from libdata.
# NOTE(review): some rows in the rendered table have start > end - presumably cORFs
# that run across the back-splice junction; confirm against libdata.
df_gtf = pd.read_csv(gtf_file, header=None, sep='\t')
gtf_column_names = libdata.get_gtf_columns()
df_gtf.columns = gtf_column_names
df_gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,hsa_circPRDM2_001,riboCirc v1.0,cORF,93,25,.,+,2,translation_cycles=1;length=4458
1,hsa_circNBPF1_001,riboCirc v1.0,cORF,217,259,.,+,0,translation_cycles=1;length=558
2,hsa_circNBPF1_001,riboCirc v1.0,cORF,344,95,.,+,1,translation_cycles=1;length=267
3,hsa_circNUDC_001,riboCirc v1.0,cORF,206,141,.,+,1,translation_cycles=1;length=342
4,hsa_circNUDC_001,riboCirc v1.0,cORF,257,141,.,+,1,translation_cycles=2;length=624
...,...,...,...,...,...,...,...,...,...
1614,hsa_circAFF2_001,riboCirc v1.0,cORF,61,60,.,+,0,translation_cycles=2;length=1722
1615,hsa_circAFF2_001,riboCirc v1.0,cORF,794,64,.,+,1,translation_cycles=1;length=132
1616,hsa_circPASD1_001,riboCirc v1.0,cORF,28,24,.,+,0,translation_cycles=1;length=231
1617,hsa_circFLNA_001,riboCirc v1.0,cORF,265,264,.,+,0,translation_cycles=2;length=9486
