In [1]:
import sys
sys.path.append('../')

from lib import libdata
import numpy as np
import pandas as pd

In [2]:
# --- Inputs: riboCirc mouse files, given as paths relative to the working directory ---
# Metadata comes as two tables (condition-dependent / condition-independent).
ribocirc_metadata_file_cd, ribocirc_metadata_file_ci = (
    'riboCirc/Mouse_condition_dependent_meta.txt',
    'riboCirc/Mouse_condition_independent_meta.txt',
)
ribocirc_circrnas_file = 'riboCirc/Mouse_circ_Sequences.fasta'
ribocirc_orfs_fasta_file = 'riboCirc/Mouse_cORF_Sequences_NA.fasta'

# --- Outputs: files produced by this notebook, rooted at libdata.CIRCODAN_PATH ---
# NOTE(review): plain string concatenation, so CIRCODAN_PATH presumably ends with a
# path separator -- confirm in libdata.
circrnas_file = libdata.CIRCODAN_PATH + 'datasets/05_riboCirc_mmu.fa'
gtf_file = libdata.CIRCODAN_PATH + 'datasets/05_riboCirc_mmu.gtf'

In [3]:
# Read both tab-separated metadata tables (condition-dependent first, then
# condition-independent); each frame keeps its own name for later inspection.
df_ribocirc_metadata_cd, df_ribocirc_metadata_ci = (
    pd.read_csv(metadata_path, sep='\t')
    for metadata_path in (ribocirc_metadata_file_cd, ribocirc_metadata_file_ci)
)

# Stack the two tables row-wise; ignore_index=True renumbers the rows 0..n-1
# instead of carrying over each table's original index.
df_ribocirc_metadata = pd.concat(
    [df_ribocirc_metadata_cd, df_ribocirc_metadata_ci],
    ignore_index=True,
)
df_ribocirc_metadata

Unnamed: 0,Condition,Species,riboCIRC_ID,Genome_assembly,Genome_pos,Strand,Spliced_length,Transcript,Symbol,Ensembl,Dataset_count,Evidences,Algorithm,Other_ids
0,Condition-dependent,Mouse,mmu_circRims1_001,mm10,chr1:22346292-22379540,-,516,NM_183018,Rims1,ENSMUSG00000041670,1,"3: RPF,cORF,m6A",CIRCexplorer2,
1,Condition-dependent,Mouse,mmu_circPrim2_001,mm10,chr1:33628394-33630433,-,234,NM_008922,Prim2,ENSMUSG00000026134,1,"2: RPF,cORF","CIRI2,CIRCexplorer2,DCC",
2,Condition-dependent,Mouse,mmu_circUxs1_001,mm10,chr1:43797244-43807377,-,182,NM_001368288,Uxs1,ENSMUSG00000057363,1,"3: RPF,cORF,m6A","CIRCexplorer2,DCC",
3,Condition-dependent,Mouse,mmu_circNdufs1_001,mm10,chr1:63148395-63148571,-,176,NM_001160039,Ndufs1,ENSMUSG00000025968,1,"2: RPF,cORF",CIRCexplorer2,
4,Condition-dependent,Mouse,mmu_circPid1_001,mm10,chr1:84159231-84159378,-,147,NM_001003948,Pid1,ENSMUSG00000045658,1,"3: RPF,cORF,MS","CIRI2,CIRCexplorer2",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,Condition-independent,Mouse,mmu_circTmsb15b1_001,mm10,chrX:136975397-136975502,-,105,NM_001081983,Tmsb15b1,ENSMUSG00000089768,1,"3: RPF,cORF,m6A",,CIRCpedia:MMU_CIRCpedia_212151
900,Condition-independent,Mouse,mmu_circAcsl4_001,mm10,chrX:142338883-142351615,-,1324,NM_001033600,Acsl4,ENSMUSG00000031278,1,"3: RPF,cORF,m6A",,circAtlas:mmu-Acsl4_0004
901,Condition-independent,Mouse,mmu_circRps6ka3_001,mm10,chrX:159337917-159352433,+,857,NM_001346675,Rps6ka3,ENSMUSG00000031309,1,"3: RPF,cORF,m6A",,circAtlas:mmu-Rps6ka3_0040
902,Condition-independent,Mouse,mmu_circPdha1_001,mm10,chrX:160132068-160133232,-,234,NM_008810,Pdha1,ENSMUSG00000031299,1,"4: RPF,cORF,MS,m6A",,CIRCpedia:MMU_CIRCpedia_212249


In [4]:
# Keep a single row per riboCIRC_ID. With keep='first' (the pandas default, spelled
# out here) the earliest row wins, i.e. the condition-dependent entry when an ID
# appears in both concatenated tables. The original row index is left untouched.
df_ribocirc_metadata = df_ribocirc_metadata.drop_duplicates(
    subset=['riboCIRC_ID'],
    keep='first',
)
df_ribocirc_metadata.shape

(763, 14)

In [5]:
# Build the FASTA file restricted to the circRNAs selected above.
# NOTE(review): argument order looks like (ids to keep, source FASTA, output FASTA)
# -- confirm against libdata.create_fasta_file_riboCirc.
circ_ids = np.array(df_ribocirc_metadata.loc[:, 'riboCIRC_ID'])
libdata.create_fasta_file_riboCirc(
    circ_ids,
    ribocirc_circrnas_file,
    circrnas_file,
)

# Display the number of sequences found in the file that was just written.
n_sequences = libdata.count_sequences_in_file(circrnas_file)
n_sequences

763

In [6]:
# Generate the GTF annotation file.
# NOTE(review): arguments look like (circRNA FASTA written earlier, riboCirc cORF
# FASTA, output GTF path) -- confirm against libdata.create_gtf_file_riboCirc.
libdata.create_gtf_file_riboCirc(
    circrnas_file,
    ribocirc_orfs_fasta_file,
    gtf_file,
)

# Load the result for inspection: the file is read as header-less, tab-separated
# text, then labelled with the column names supplied by libdata. set_axis raises
# if the number of names does not match the number of columns read.
df_gtf = (
    pd.read_csv(gtf_file, sep='\t', header=None)
    .set_axis(libdata.get_gtf_columns(), axis='columns')
)
df_gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,mmu_circRims1_001,riboCirc v1.0,cORF,33,32,.,+,2,translation_cycles=2;length=1032
1,mmu_circPrim2_001,riboCirc v1.0,cORF,40,39,.,+,0,translation_cycles=2;length=468
2,mmu_circUxs1_001,riboCirc v1.0,cORF,24,27,.,+,2,translation_cycles=1;length=186
3,mmu_circNdufs1_001,riboCirc v1.0,cORF,27,117,.,+,2,translation_cycles=1;length=267
4,mmu_circPid1_001,riboCirc v1.0,cORF,13,12,.,+,0,translation_cycles=2;length=294
...,...,...,...,...,...,...,...,...,...
1056,mmu_circAcsl4_001,riboCirc v1.0,cORF,10,32,.,+,0,translation_cycles=1;length=1347
1057,mmu_circRps6ka3_001,riboCirc v1.0,cORF,93,171,.,+,2,translation_cycles=1;length=936
1058,mmu_circPdha1_001,riboCirc v1.0,cORF,142,141,.,+,0,translation_cycles=2;length=468
1059,mmu_circPdha1_001,riboCirc v1.0,cORF,227,82,.,+,1,translation_cycles=1;length=90
