### Creating a dataset of circRNAs with translation evidence from ribosome and polysome binding (RP/PP) in the TransCirc database

In [1]:
import sys
sys.path.append('../')

from lib import libdata
import numpy as np
import pandas as pd

In [2]:
# Input files (TransCirc downloads, given as paths relative to the working directory):
# the metadata table, the circRNA sequences and the ORF sequences.
transcirc_metadata_file = 'TransCirc/transcirc_metadata.tsv.gz'
transcirc_circrnas_file = 'TransCirc/transcirc_sequence.fa.bgz'
transcirc_orfs_file = 'TransCirc/transcirc_orf_na.fa.bgz'

# Output files: the FASTA and GTF of the dataset built by this notebook,
# placed under libdata.CIRCODAN_PATH + 'datasets/'.
# NOTE(review): plain string concatenation assumes CIRCODAN_PATH ends with a path separator — confirm.
circrnas_file = libdata.CIRCODAN_PATH + 'datasets/03_TransCirc_RP_PP_evidence_hsa.fa'
gtf_file = libdata.CIRCODAN_PATH + 'datasets/03_TransCirc_RP_PP_evidence_hsa.gtf'

In [3]:
# Load the gzip-compressed, tab-separated TransCirc metadata table (all circRNAs)
# and show its (rows, columns) shape.
df_tc = pd.read_table(transcirc_metadata_file, compression='gzip')
df_tc.shape

(328080, 19)

In [4]:
# Drop the circRNAs that have no ORF annotation (rows whose 'ORF' value is 0).
without_orf = df_tc['ORF'] == 0
df_tc = df_tc.drop(index=df_tc.index[without_orf])
df_tc.shape

(305016, 19)

In [5]:
# Drop the circRNAs that have no RP/PP evidence (rows whose 'RP/RP' value is 0).
# NOTE(review): the column key is spelled 'RP/RP' while the notebook talks about
# RP/PP evidence — confirm it matches the header of the metadata file.
without_rp_pp = df_tc['RP/RP'] == 0
df_tc = df_tc.drop(index=df_tc.index[without_rp_pp])
df_tc.shape

(4035, 19)

In [6]:
# Drop the circRNAs that have MS evidence (rows whose 'MS' value is not 0),
# so only entries with 'MS' equal to 0 remain.
with_ms = df_tc['MS'] != 0
df_tc = df_tc.drop(index=df_tc.index[with_ms])
df_tc.shape

(4033, 19)

In [7]:
# Collect the TransCirc IDs of the selected circRNAs as a NumPy array, hand them to the
# libdata helper together with the TransCirc sequence file and the output FASTA path,
# then display the number of sequences counted in the resulting file.
circ_ids = df_tc['TransCirc_ID'].to_numpy(copy=True)
libdata.create_fasta_file_TransCirc(circ_ids, transcirc_circrnas_file, circrnas_file)
libdata.count_sequences_in_file(circrnas_file)

4033

In [8]:
# Remove repeated sequences from the circRNA FASTA file through the libdata helper
# and display the value it returns, kept here as the number of unique sequences.
# NOTE(review): presumably the helper rewrites circrnas_file in place — confirm in libdata.
n_unique_seqs = libdata.remove_repeated_sequences_in_file(circrnas_file)
n_unique_seqs

3924

In [9]:
# Create the GTF file through the libdata helper, passing the circRNA FASTA file,
# the TransCirc ORF file and the output GTF path.
libdata.create_gtf_file_TransCirc(circrnas_file, transcirc_orfs_file, gtf_file)

# Read the generated tab-separated file back (it has no header row), label its
# columns with the names provided by libdata, and display the table.
gtf_columns = libdata.get_gtf_columns()
df_gtf = pd.read_csv(gtf_file, header=None, sep='\t')
df_gtf.columns = gtf_columns
df_gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,TC-hsa-SDF4_0002,TransCirc v1.0,cORF,41,11,.,+,1,translation_cycles=1;length=222
1,TC-hsa-SDF4_0002,TransCirc v1.0,cORF,204,18,.,+,2,translation_cycles=1;length=66
2,TC-hsa-NADK_0011,TransCirc v1.0,cORF,41,40,.,+,1,translation_cycles=3;length=909
3,TC-hsa-USP48_0035,TransCirc v1.0,cORF,184,58,.,+,0,translation_cycles=1;length=87
4,TC-hsa-USP48_0035,TransCirc v1.0,cORF,111,111,.,+,2,translation_cycles=1;length=213
...,...,...,...,...,...,...,...,...,...
5508,TC-hsa-DKC1_0007,TransCirc v1.0,cORF,87,15,.,+,2,translation_cycles=1;length=204
5509,TC-hsa-MPP1_0012,TransCirc v1.0,cORF,37,36,.,+,0,translation_cycles=3;length=1134
5510,TC-hsa-UTP14A_0010,TransCirc v1.0,cORF,119,17,.,+,1,translation_cycles=1;length=495
5511,TC-hsa-UTP14A_0010,TransCirc v1.0,cORF,564,54,.,+,2,translation_cycles=1;length=87
