### Creating a dataset of circRNAs with translation evidence from proteomics mass spectrometry (MS) in the TransCirc database

In [1]:
import sys
sys.path.append('../')

from lib import libdata
import numpy as np
import pandas as pd

In [2]:
# TransCirc inputs (relative paths, resolved against the current working directory)
transcirc_metadata_file = 'TransCirc/transcirc_metadata.tsv.gz'
transcirc_circrnas_file = 'TransCirc/transcirc_sequence.fa.bgz'
transcirc_orfs_file = 'TransCirc/transcirc_orf_na.fa.bgz'

# Files produced by this notebook, placed under the project's CIRCODAN_PATH
circrnas_file = f'{libdata.CIRCODAN_PATH}datasets/02_TransCirc_MS_evidence_hsa.fa'
gtf_file = f'{libdata.CIRCODAN_PATH}datasets/02_TransCirc_MS_evidence_hsa.gtf'

In [3]:
# Load the full TransCirc metadata table (gzip-compressed, tab-separated)
df_tc = pd.read_table(transcirc_metadata_file, compression='gzip')
df_tc.shape

(328080, 19)

In [4]:
# Keep only circRNAs with an ORF annotation: rows where ORF == 0 are discarded
rows_without_orf = df_tc.loc[df_tc['ORF'].eq(0)].index
df_tc = df_tc.drop(index=rows_without_orf)
df_tc.shape

(305016, 19)

In [5]:
# Keep only circRNAs with mass-spectrometry (MS) evidence: rows where MS == 0 are discarded
rows_without_ms = df_tc.loc[df_tc['MS'].eq(0)].index
df_tc = df_tc.drop(index=rows_without_ms)
df_tc.shape

(167, 19)

In [6]:
# Discard circRNAs flagged in the 'RP/RP' column (any value other than 0)
# NOTE(review): the column is spelled 'RP/RP' here; presumably it holds the
# ribosome/polysome profiling (RP/PP) evidence -- confirm against the metadata header.
rows_with_rp = df_tc.loc[df_tc['RP/RP'].ne(0)].index
df_tc = df_tc.drop(index=rows_with_rp)
df_tc.shape

(165, 19)

In [7]:
# Export the selected circRNAs to the output FASTA file, looked up by TransCirc ID,
# then report how many sequences the file contains
circ_ids = df_tc['TransCirc_ID'].to_numpy(copy=True)
libdata.create_fasta_file_TransCirc(circ_ids, transcirc_circrnas_file, circrnas_file)
libdata.count_sequences_in_file(circrnas_file)

165

In [8]:
# Remove repeated sequences from the circRNA FASTA file.
# NOTE(review): presumably the helper rewrites `circrnas_file` in place and
# returns the number of sequences kept -- confirm in lib/libdata.
n_unique_seqs = libdata.remove_repeated_sequences_in_file(circrnas_file)
n_unique_seqs

137

In [9]:
# Build the GTF annotation file from the circRNA FASTA file and the TransCirc ORF sequences
libdata.create_gtf_file_TransCirc(circrnas_file, transcirc_orfs_file, gtf_file)

# Read the GTF back (tab-separated, no header row) and label it with the
# project's GTF column names for inspection
df_gtf = pd.read_table(gtf_file, header=None)
df_gtf.columns = libdata.get_gtf_columns()
df_gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,TC-hsa-CLSTN1_0036,TransCirc v1.0,cORF,21,4,.,+,2,translation_cycles=1;length=333
1,TC-hsa-GNB1_0007,TransCirc v1.0,cORF,124,23,.,+,0,translation_cycles=1;length=156
2,TC-hsa-GNB1_0007,TransCirc v1.0,cORF,150,127,.,+,2,translation_cycles=1;length=234
3,TC-hsa-MACF1_0262,TransCirc v1.0,cORF,73,4,.,+,0,translation_cycles=1;length=1233
4,TC-hsa-MACF1_0270,TransCirc v1.0,cORF,833,48,.,+,1,translation_cycles=1;length=3975
...,...,...,...,...,...,...,...,...,...
198,TC-hsa-UBA1_0019,TransCirc v1.0,cORF,547,4,.,+,0,translation_cycles=2;length=612
199,TC-hsa-UBA1_0019,TransCirc v1.0,cORF,66,4,.,+,2,translation_cycles=1;length=516
200,TC-hsa-USP9X_0058,TransCirc v1.0,cORF,193,192,.,+,0,translation_cycles=3;length=7110
201,TC-hsa-OCRL_0024,TransCirc v1.0,cORF,56,12,.,+,1,translation_cycles=1;length=999
