In [1]:
import os
import pandas as pd
import libcirctis

# Inputs
transcirc_circrnas_fasta_file = 'data_raw/transcirc_sequence.fa.bgz'
mapped_tis_tsv_file = 'data_raw/circRNA_TIS_all.tsv'

# Outputs
output_path_main_dataset = 'datasets/main/'
# makedirs(..., exist_ok=True) also creates the parent 'datasets/' directory and
# does not raise when the target already exists, so this cell survives a
# fresh checkout as well as a Restart & Run All.
os.makedirs(output_path_main_dataset, exist_ok=True)

In [2]:
# Load the tab-separated table of mapped TIS; the first line of the file holds
# the column names (read_table uses a tab separator by default).
df_tis = pd.read_table(mapped_tis_tsv_file, header=0)
df_tis

Unnamed: 0,transcirc_id,circ_strand,circ_chrom,tis_coordinate_hg38,tis_position_in_circrna
0,TC-hsa-GNB1_0028,-,chr1,1825453,47
1,TC-hsa-GNB1_0038,-,chr1,1825453,47
2,TC-hsa-GNB1_0041,-,chr1,1825453,47
3,TC-hsa-GNB1_0076,-,chr1,1825453,47
4,TC-hsa-GNB1_0065,-,chr1,1825453,47
...,...,...,...,...,...
10631,TC-hsa-KMT5B_0036,-,chr11,68190104,49
10632,TC-hsa-TOP2A_0045,-,chr17,40416754,142
10633,TC-hsa-RECQL_0005,-,chr12,21499570,46
10634,TC-hsa-SET_0008,+,chr9,128693791,372


In [3]:
# Count the distinct circRNA identifiers present in the TIS table
transcirc_ids = df_tis['transcirc_id']
transcirc_ids.nunique()

9394

### Data cleaning

In [4]:
# Inspect every row recorded for circRNA TC-hsa-UBC_0003
is_ubc_0003 = df_tis['transcirc_id'] == 'TC-hsa-UBC_0003'
df_tis.loc[is_ubc_0003]

Unnamed: 0,transcirc_id,circ_strand,circ_chrom,tis_coordinate_hg38,tis_position_in_circrna
2791,TC-hsa-UBC_0003,-,chr12,124911874,77
2792,TC-hsa-UBC_0003,-,chr12,124913242,77
2793,TC-hsa-UBC_0003,-,chr12,124913698,77


In [5]:
# TC-hsa-UBC_0003 has three rows that share tis_position_in_circrna 77 and
# differ only in genomic coordinate; keep the first one and remove the two
# repetitions (rows 2792 and 2793 of the freshly loaded table).
# The rows are selected by value rather than by row label so that the cell is
# safe to re-run and does not depend on the row order of the input file.
is_ubc_0003_repetition = (
    (df_tis['transcirc_id'] == 'TC-hsa-UBC_0003')
    & df_tis['tis_coordinate_hg38'].isin([124913242, 124913698])
)
df_tis = df_tis[~is_ubc_0003_repetition]

In [6]:
# Inspect every row recorded for circRNA TC-hsa-UBB_0002
is_ubb_0002 = df_tis['transcirc_id'] == 'TC-hsa-UBB_0002'
df_tis.loc[is_ubb_0002]

Unnamed: 0,transcirc_id,circ_strand,circ_chrom,tis_coordinate_hg38,tis_position_in_circrna
3851,TC-hsa-UBB_0002,+,chr17,16382097,196
3852,TC-hsa-UBB_0002,+,chr17,16382325,196


In [7]:
# TC-hsa-UBB_0002 has two rows that share tis_position_in_circrna 196; keep the
# first one and remove the repetition (row 3852 of the freshly loaded table).
# The row is selected by value rather than by row label so that the cell is
# safe to re-run and does not depend on the row order of the input file.
is_ubb_0002_repetition = (
    (df_tis['transcirc_id'] == 'TC-hsa-UBB_0002')
    & (df_tis['tis_coordinate_hg38'] == 16382325)
)
df_tis = df_tis[~is_ubb_0002_repetition]

In [8]:
# Inspect every row recorded for circRNA TC-hsa-PHF6_0005
is_phf6_0005 = df_tis['transcirc_id'] == 'TC-hsa-PHF6_0005'
df_tis.loc[is_phf6_0005]

Unnamed: 0,transcirc_id,circ_strand,circ_chrom,tis_coordinate_hg38,tis_position_in_circrna
9643,TC-hsa-PHF6_0005,+,chrX,134377590,141
9644,TC-hsa-PHF6_0005,+,chrX,134377618,28


In [9]:
# TC-hsa-PHF6_0005 has two rows; the one at coordinate 134377590 (position 141)
# refers to a non-existent TIS and is removed (row 9643 of the freshly loaded
# table), leaving the row at coordinate 134377618.
# The row is selected by value rather than by row label so that the cell is
# safe to re-run and does not depend on the row order of the input file.
is_nonexistent_phf6_tis = (
    (df_tis['transcirc_id'] == 'TC-hsa-PHF6_0005')
    & (df_tis['tis_coordinate_hg38'] == 134377590)
)
df_tis = df_tis[~is_nonexistent_phf6_tis]

In [10]:
# Show the TIS table again after the manual clean-up cells above
df_tis

Unnamed: 0,transcirc_id,circ_strand,circ_chrom,tis_coordinate_hg38,tis_position_in_circrna
0,TC-hsa-GNB1_0028,-,chr1,1825453,47
1,TC-hsa-GNB1_0038,-,chr1,1825453,47
2,TC-hsa-GNB1_0041,-,chr1,1825453,47
3,TC-hsa-GNB1_0076,-,chr1,1825453,47
4,TC-hsa-GNB1_0065,-,chr1,1825453,47
...,...,...,...,...,...
10631,TC-hsa-KMT5B_0036,-,chr11,68190104,49
10632,TC-hsa-TOP2A_0045,-,chr17,40416754,142
10633,TC-hsa-RECQL_0005,-,chr12,21499570,46
10634,TC-hsa-SET_0008,+,chr9,128693791,372


In [11]:
# Destination files inside the main dataset directory
output_fasta_file = f'{output_path_main_dataset}seqs.fa'
output_tsv_file = f'{output_path_main_dataset}tis.tsv'

# Build the main dataset from the TransCirc FASTA and the cleaned TIS table.
# NOTE(review): presumably the helper writes both output files and returns the
# number of circRNAs and of TIS it kept -- confirm against libcirctis.
dataset_counts = libcirctis.create_main_dataset(
    transcirc_circrnas_fasta_file,
    df_tis,
    output_fasta_file,
    output_tsv_file,
)
n_circrnas, n_tis = dataset_counts

n_circrnas, n_tis

(6650, 7665)

### Checking

In [12]:
# Sanity check on the generated FASTA file: run the repeated-sequence removal
# first, then count the sequences in the file so the count can be compared by
# eye with the number of circRNAs reported when the dataset was created.
libcirctis.remove_repeated_sequences_in_fasta_file(output_fasta_file)
n_sequences_in_fasta = libcirctis.count_sequences_in_fasta_file(output_fasta_file)
n_sequences_in_fasta

6650