In [3]:
import pandas as pd
from Bio import SeqIO

In [4]:
def fasta_to_df(file):
    """Parse a FASTA file into a DataFrame of ids, sequences and labels.

    The label of a record is the second whitespace-separated token of its
    description line (with Biopython's FASTA parser the first token is
    normally the record id), e.g. ``>RF00001_X52048_1_2-120 5S_rRNA``.

    Parameters
    ----------
    file : str or file handle
        Anything accepted by ``SeqIO.parse(file, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``ids``, ``seq`` and ``labels``
        (in that order). An input without records yields an empty frame that
        still has these three columns.

    Raises
    ------
    IndexError
        If a record's description has no second token to use as label.
    """
    rows = []

    for seq_record in SeqIO.parse(file, "fasta"):
        # split() without an argument splits on any run of whitespace, so a
        # tab or a doubled space between id and label no longer produces an
        # empty or missing label the way split(' ') did.
        tokens = seq_record.description.split()
        if len(tokens) < 2:
            # Same exception type as the original bare index lookup, but with
            # enough context to locate the offending record.
            raise IndexError(
                "FASTA record %r has no label in its description: %r"
                % (seq_record.id, seq_record.description)
            )
        rows.append((seq_record.id, str(seq_record.seq), tokens[1]))

    # Passing the column names explicitly keeps the schema stable even when
    # no record was read.
    return pd.DataFrame(rows, columns=['ids', 'seq', 'labels'])
    

In [5]:
# Parse the two FASTA files into DataFrames: the 6320-sequence file is used as
# the training set and the 2600-sequence "validated" file as the test set.
df_train = fasta_to_df(file='./dataset/dataset_Rfam_6320_13classes-2.fasta')
df_test = fasta_to_df(file='./dataset/dataset_Rfam_validated_2600_13classes-2.fasta')

In [6]:
# Show the training DataFrame (ids, seq, labels) as the cell output.
df_train

Unnamed: 0,ids,seq,labels
0,RF00001_AF095839_1_346-228,GCGTACGGCCATACTATGGGGAATACACCTGATCCCGTCCGATTTC...,5S_rRNA
1,RF00001_AY245018_1_1-119,GCTATCGGCCATACTAAGCCAAATGCACCGGATCCCTTCCGAACTC...,5S_rRNA
2,RF00001_X52048_1_2-120,TGCTACGATCATACCACTTAGAAAGCACCCGGTCCCATCAGACCCC...,5S_rRNA
3,RF00001_M28193_1_1-119,AGTTACGGCCATACCTCAGAGAATATACCGTATCCCGTTCGATCTG...,5S_rRNA
4,RF00001_X14816_1_860-978,ACCAACGGCCATACCACGTTGAAAGTACCCAGTCTCGTCAGATCCT...,5S_rRNA
...,...,...,...
6315,RF02535_AFEY01343643_1_18075-17945,ACTTCCAATGCAATGGCTGCAGTGAAGCTATAATTATAGCCTTGTA...,IRES
6316,RF02535_AAPE02009951_1_24083-24245,ATTCCCAGTGCTGCACCGAGAGGACCTGTCTCCTGTGGACTGGAAG...,IRES
6317,RF02535_ABQO011108623_1_28-199,AGTGCAACGGCTGCACCGAAGGCACAATCGTAGCCTTGTATTTCAC...,IRES
6318,RF02535_AAPE02044716_1_11582-11441,ATTCCCGCTGCTGCACAGAGAGGACCCGTGTCCCGTGGACTGGGAG...,IRES


In [7]:
# Show the test DataFrame (ids, seq, labels) as the cell output.
df_test

Unnamed: 0,ids,seq,labels
0,RF00001_AGFK01083378_1_200-82,GGATGCGATCATGTCTGCACTAACACACCGGATCCCATCAGAACTC...,5S_rRNA
1,RF00001_ABIM01030413_1_269-151,GGGTGCGATCATACCAGCACTAATTCACCAGATCCCATCAGAACTC...,5S_rRNA
2,RF00001_FJ968867_1_23-141,GCTTACGGCCACACCGCCTGGATCACGCCCGATCTCGTTTGATCTC...,5S_rRNA
3,RF00001_AFEY01299926_1_3191-3073,AGCTACGGCCATATCACCCTGAACATGCCTGATCTCGTCTGATCTC...,5S_rRNA
4,RF00001_ACYM01068335_1_1131-1013,GGTTGCGTTCATACCAACTCACATGCACCGGATCCCATCTGAACTC...,5S_rRNA
...,...,...,...
2595,RF02535_ABQO010479098_1_2462-2621,AGTGCAACCGCTGCACCGAAGTCACCATCGTAGCCTTGTATCTCAC...,IRES
2596,RF02535_ABQO010931335_1_1612-1466,CAGGGCAGTGGCTGCACTGAAGGCAAAGTCGTAGCCTTGTATTGCA...,IRES
2597,RF02535_ABVD01851849_1_606-739,AGTCCTAAGGCTGCATTGAACAGGACCTGGAATTCCCAGAGAATTG...,IRES
2598,RF02535_AAGU03005870_1_8100-7981,GTTTCCACAGCTGGAGGAAGCCTACAAGTGGAATTTGGAATTCCCG...,IRES


In [8]:
# Disabled CSV export of the two DataFrames; uncomment both lines to write the files.
#df_train.to_csv('./nRC-training-set.csv')
#df_test.to_csv('./nRC-test-set.csv')

In [9]:
# Number of training sequences per class label.
df_train.loc[:, 'labels'].value_counts()

5_8S_rRNA      500
Intron_gpI     500
ribozyme       500
miRNA          500
HACA-box       500
scaRNA         500
CD-box         500
Intron_gpII    500
5S_rRNA        500
tRNA           500
leader         500
riboswitch     500
IRES           320
Name: labels, dtype: int64

In [10]:
# Sanity check: the per-class counts add up to the size of the training set.
df_train['labels'].value_counts().sum()

6320

In [11]:
# Number of test sequences per class label.
df_test.loc[:, 'labels'].value_counts()

miRNA          200
HACA-box       200
5_8S_rRNA      200
Intron_gpI     200
scaRNA         200
CD-box         200
ribozyme       200
Intron_gpII    200
5S_rRNA        200
IRES           200
tRNA           200
leader         200
riboswitch     200
Name: labels, dtype: int64

In [12]:
# Sanity check: the per-class counts add up to the size of the test set.
df_test['labels'].value_counts().sum()

2600

In [13]:
# Distinct class labels present in the training set.
set(df_train['labels'].unique())

{'5S_rRNA',
 '5_8S_rRNA',
 'CD-box',
 'HACA-box',
 'IRES',
 'Intron_gpI',
 'Intron_gpII',
 'leader',
 'miRNA',
 'riboswitch',
 'ribozyme',
 'scaRNA',
 'tRNA'}

In [14]:
# Total number of sequences across both splits, derived from the frames
# themselves instead of retyping the counts printed by earlier cells.
len(df_train) + len(df_test)

8920

## Data leakage: sequences shared between the training and test sets

In [15]:
# Count distinct sequences within the training set (seq values are plain
# strings produced by fasta_to_df, so there are no missing values to skip).
print(df_train['seq'].nunique(), 'unique sequences in the training set')

6320 unique sequences in the training set


In [16]:
# Count distinct sequences within the test set (seq values are plain
# strings produced by fasta_to_df, so there are no missing values to skip).
print(df_test['seq'].nunique(), 'unique sequences in the test set')

2600 unique sequences in the test set


In [17]:
# Stack the training rows on top of the test rows. The index is not reset,
# so each frame keeps its own row labels in the combined frame.
print('concatenating both sets')
df_full = pd.concat(objs=(df_train, df_test), axis=0)

concatenating both sets


In [18]:
# Count distinct sequences across the combined training + test data.
print(df_full['seq'].nunique(), 'unique sequences in the full dataset')

8573 unique sequences in the full dataset


In [19]:
# Report the size of the combined dataset and how many rows repeat a sequence
# that already occurred earlier in it. Both numbers are computed from df_full
# rather than copied by hand from previous cell outputs, so they stay correct
# if the input files change.
print('full dataset has', len(df_full), 'sequences')
# duplicated() flags every occurrence after the first, i.e. total - unique.
print(int(df_full['seq'].duplicated().sum()), 'sequences are not unique')

full dataset has 8920 sequences
347 sequences are not unique


In [20]:
# Inner-join the two sets on the sequence string: every resulting row is a
# sequence found in both sets, with the training columns suffixed _x and the
# test columns suffixed _y.
print('merging the training set and test set')
df_leak = df_train.merge(df_test, on='seq')

merging the training set and test set


In [21]:
# Show the merged frame of sequences found in both sets
# (ids_x/labels_x come from the training set, ids_y/labels_y from the test set).
df_leak

Unnamed: 0,ids_x,seq,labels_x,ids_y,labels_y
0,RF00001_AAQQ01594944_1_3072-2954,TCCTGAGCATAAACCCCAACCTGTGTACCTGATCTCATCAGATCTC...,5S_rRNA,RF00001_AAQQ01594944_1_3072-2954,5S_rRNA
1,RF00001_AFSA01509842_1_590-487,CGTGAGGGCAGCAGATCCCATAAGAACTCTGCAATTAAACGTGCCC...,5S_rRNA,RF00001_AFSA01509842_1_590-487,5S_rRNA
2,RF00001_ABRO01332170_1_3445-3562,TTCTATAATTTTACCATCCTGAATGTATCCAGTCTCCTCTGATCTT...,5S_rRNA,RF00001_ABRO01332170_1_3445-3562,5S_rRNA
3,RF00002_HM069474_1_174-327,AACCTTTGGCGACGGATATCTCGGCTCTTGCAACGATGACGAGCGC...,5_8S_rRNA,RF00002_HM069474_1_174-327,5_8S_rRNA
4,RF00002_GU941699_1_236-390,AACTTTTGTCGATGGATATCTTGGCATGAGCAACGATGAAGACCGC...,5_8S_rRNA,RF00002_GU941699_1_236-390,5_8S_rRNA
...,...,...,...,...,...
342,RF02535_AEYP01059957_1_11419-11566,TTTTACACTGCTGCACTGAGGGCACATGCATCTTTAGTGGGACTTG...,IRES,RF02535_AEYP01059957_1_11419-11566,IRES
343,RF02535_AAQQ01517481_1_186-328,ATTTCCACTGCTGGTCCAAGGACACGGGATTTGGAAATCCTGGAGA...,IRES,RF02535_AAQQ01517481_1_186-328,IRES
344,RF02535_AAGV020518517_1_2074-1915,GTTTCCACTGCTGCACCAAAGGCAGAAGTCTTTTTTTTTTTCCCCC...,IRES,RF02535_AAGV020518517_1_2074-1915,IRES
345,RF02535_AAGW02013247_1_72866-72738,ATTTCCACTGTAGCTCTGAGGATACACAGGACTTCAAATTCCTGAA...,IRES,RF02535_AAGW02013247_1_72866-72738,IRES


In [22]:
# Number of shared (leaked) sequences per class, using the training-side label.
df_leak['labels_x'].value_counts()

scaRNA         97
IRES           76
leader         55
Intron_gpII    40
Intron_gpI     26
5_8S_rRNA      16
ribozyme       12
riboswitch      7
HACA-box        6
CD-box          5
miRNA           4
5S_rRNA         3
Name: labels_x, dtype: int64

In [25]:
# Training-side ids of the sequences that also occur in the test set.
id_leak = df_leak['ids_x'].tolist()

In [26]:
# Print every training-set id that is listed in id_leak, in training-set order.
# isin() does hashed membership testing, replacing a linear scan of the
# id_leak list for each training row; the unused enumerate index is dropped.
for leaked_train_id in df_train.loc[df_train['ids'].isin(id_leak), 'ids']:
    print(leaked_train_id)

RF00001_AAQQ01594944_1_3072-2954
RF00001_AFSA01509842_1_590-487
RF00001_ABRO01332170_1_3445-3562
RF00002_HM069474_1_174-327
RF00002_GU941699_1_236-390
RF00002_HQ651786_1_1868-2019
RF00002_EU200772_1_507-661
RF00002_FJ821426_1_330-520
RF00002_AAKN02051266_1_33611-33456
RF00002_CABF01088313_1_1118-959
RF00002_AF133196_1_373-533
RF00002_AJ845126_3_256-398
RF00002_AF239414_1_268-391
RF00002_AY515188_1_99-247
RF00002_ABSP01054427_1_672-826
RF00002_ADGC01004738_1_818-972
RF00002_AACY023748301_1_1028-1164
RF00002_CP001959_1_1248710-1248559
RF00002_AAGH01030076_1_317-168
RF00008_AACY020666933_1_89-136
RF00009_AADM01000339_1_3746-3427
RF00009_ACSY01001980_1_3051-3394
RF00010_ADND01152599_1_4652-4305
RF00010_AACY023430652_1_22-398
RF00010_CP002364_1_3129794-3130164
RF00010_AEVM01000009_1_108474-108077
RF00010_CP001850_2_899033-898660
RF00011_ACCK01000073_1_34977-35289
RF00028_AM266921_1_821-1240
RF00028_EU249787_1_7-505
RF00028_AY437971_1_26-379
RF00028_DQ442703_1_2673-2974
RF00028_AM231291_1_49

In [56]:
# Print every test-set id that is listed in id_leak, in test-set order.
# isin() does hashed membership testing, replacing a linear scan of the
# id_leak list for each test row; the unused enumerate index is dropped.
for leaked_test_id in df_test.loc[df_test['ids'].isin(id_leak), 'ids']:
    print(leaked_test_id)

RF00001_AAQQ01594944_1_3072-2954
RF00001_AFSA01509842_1_590-487
RF00001_ABRO01332170_1_3445-3562
RF00002_HM069474_1_174-327
RF00002_GU941699_1_236-390
RF00002_HQ651786_1_1868-2019
RF00002_EU200772_1_507-661
RF00002_FJ821426_1_330-520
RF00002_AAKN02051266_1_33611-33456
RF00002_CABF01088313_1_1118-959
RF00002_AF133196_1_373-533
RF00002_AJ845126_3_256-398
RF00002_AF239414_1_268-391
RF00002_AY515188_1_99-247
RF00002_ABSP01054427_1_672-826
RF00002_ADGC01004738_1_818-972
RF00002_AACY023748301_1_1028-1164
RF00002_CP001959_1_1248710-1248559
RF00002_AAGH01030076_1_317-168
RF00008_AACY020666933_1_89-136
RF00009_AADM01000339_1_3746-3427
RF00009_ACSY01001980_1_3051-3394
RF00010_ADND01152599_1_4652-4305
RF00010_AACY023430652_1_22-398
RF00010_CP002364_1_3129794-3130164
RF00010_AEVM01000009_1_108474-108077
RF00010_CP001850_2_899033-898660
RF00011_ACCK01000073_1_34977-35289
RF00028_AM266921_1_821-1240
RF00028_EU249787_1_7-505
RF00028_AY437971_1_26-379
RF00028_DQ442703_1_2673-2974
RF00028_AM231291_1_49

In [27]:
# Collect, in test-set order, the ids of test sequences whose exact sequence
# string also occurs in the training set, and print each of them.
#
# The boolean mask from isin() replaces rebuilding list(df_train.seq) and
# scanning it for every test row. Selecting ids through the mask also avoids
# the label-based df_test.ids[n] lookup, which is only correct while df_test
# has a default 0..n-1 index.
leak_ids = df_test.loc[df_test['seq'].isin(df_train['seq']), 'ids'].tolist()

for leaked_id in leak_ids:
    print(leaked_id)
        

RF00001_AAQQ01594944_1_3072-2954
RF00001_AFSA01509842_1_590-487
RF00001_ABRO01332170_1_3445-3562
RF00002_HM069474_1_174-327
RF00002_GU941699_1_236-390
RF00002_HQ651786_1_1868-2019
RF00002_EU200772_1_507-661
RF00002_FJ821426_1_330-520
RF00002_AAKN02051266_1_33611-33456
RF00002_CABF01088313_1_1118-959
RF00002_AF133196_1_373-533
RF00002_AJ845126_3_256-398
RF00002_AF239414_1_268-391
RF00002_AY515188_1_99-247
RF00002_ABSP01054427_1_672-826
RF00002_ADGC01004738_1_818-972
RF00002_AACY023748301_1_1028-1164
RF00002_CP001959_1_1248710-1248559
RF00002_AAGH01030076_1_317-168
RF00008_AACY020666933_1_89-136
RF00009_AADM01000339_1_3746-3427
RF00009_ACSY01001980_1_3051-3394
RF00010_ADND01152599_1_4652-4305
RF00010_AACY023430652_1_22-398
RF00010_CP002364_1_3129794-3130164
RF00010_AEVM01000009_1_108474-108077
RF00010_CP001850_2_899033-898660
RF00011_ACCK01000073_1_34977-35289
RF00028_AM266921_1_821-1240
RF00028_EU249787_1_7-505
RF00028_AY437971_1_26-379
RF00028_DQ442703_1_2673-2974
RF00028_AM231291_1_49

In [28]:
# Number of test-set ids collected in leak_ids.
len(leak_ids)

347