In [2]:
import pandas as pd

In [3]:
df_train = pd.read_csv('../dataset/training-set-rfam-corrected.csv', index_col=0)

In [4]:
df_train

Unnamed: 0,index,id,seq,label
0,605,CM000941.1/96223541-96223663,ATCTGATAGCTTCTCTGGATCCCAAAAGTTGCCAAATACCATCTAA...,5S-rRNA
1,9424,KB097753.1/1354714-1354787,TTCACTGATGATGATTATATTACCGCCCCAGTCTGATTTTATATTC...,CD-box
2,20087,LRQV01000095.1/3352-3537,CCGGATTGTCGATACCGTCGCGCCGTCGCGGTAACGGGAAGCCGGT...,Riboswitch
3,14722,LGTL01000006.1/1071421-1071353,CGACTACAGCTCTCTGGATTCACTCTGGCCTTTCAATGCATGTGTT...,HACA-box
4,33739,JH668243.1/177327-177435,GCTCCTGTGGCCTAGTGGTGAGGGCATCCCTCTTATAAGGAATCTG...,tRNA
...,...,...,...,...
31796,20773,MJEH01000016.1/10722-10918,AACTTAATAGATGTTAGAAGTTCTCAAGTATGTTAAAAAACTTGAA...,Riboswitch
31797,11644,KK200027.1/2409-2319,TGGTTGGAGCCAGATCCAACGCCTCATAAGTTAACTCACTGGTTGT...,HACA-box
31798,15785,FQXR01000009.1/20507-20749,AAAAATTCTGTGATAGGAAAGAGTAGTTAGAATGAAGTATTTAAAG...,Leader
31799,28125,CM000865.1/12686370-12686489,TAGGGTGACCAGCTCTCCCAGTTTGGTCAGGACTGAGGAGTTTCCT...,miRNA


In [5]:
labels = list(set(list(df_train.label)))

In [6]:
labels.sort()
labels

['5.8S-rRNA',
 '5S-rRNA',
 'CD-box',
 'HACA-box',
 'Intron-gp-I',
 'Intron-gp-II',
 'Leader',
 'Riboswitch',
 'Ribozyme',
 'Y-RNA',
 'Y-RNA-like',
 'miRNA',
 'tRNA']

In [20]:
for l in labels:
    
    with open('{}.fasta'.format(l), 'w') as file:

        for n,s in enumerate(df_train.labels):
            if s == l:
                file.write('>{} {} \n {}\n'.format(df_train['index'][n], df_train['labels'][n], df_train['seq'][n]))
        

### 1 - MSA using clustal omega of each class in the training set
### 2 - save alignments as Stockholm format
### 3 - use alignments to build HMM profiles:
###     hmmbuild profile.hmm alignment_file.sto
### 4 - use profiles to search for homologs (test set)
###     nhmmer profile.hmm test-set.fasta > output_file.out
### 5 - merge results

In [7]:
df_test = pd.read_csv('../dataset/validation-set-rfam-corrected.csv', index_col=0)

In [8]:
df_test

Unnamed: 0,index,id,seq,label
0,21355,LGCL01000045.1/11910-11982,AATTAAAAAAGCGATGAGGCTCGCTTGAGATGTAATCAAACCGCCG...,Riboswitch
1,11684,ALWZ044445528.1/962-1083,ATCCTAGGTAAGCCCTTTAAACTACTTATCTGATTTATGGGAGACA...,HACA-box
2,14551,LT598460.1/687408-687646,AACTTTGTCCAAAAGTACTCCACCGCAATAGACAGTACGACGGCCG...,HACA-box
3,6725,AYZS02015661.1/13237-13329,TTGCCATGATGATTTCAAATTATTAGGAAAGCATTATGCTAATGCT...,CD-box
4,28222,KB846811.1/2403452-2403262,AATCTGGGAAGAGAGCTTCCTTCAGTCCACTCATGGGGAGTGGTAA...,miRNA
...,...,...,...,...
13641,18742,FCNA01000158.1/16563-16326,GAAAAAGCGTTGAAAAGATGAGTAGTTTTGGATAGACTTATAGAGA...,Leader
13642,13516,AFYH01058929.1/8670-8538,TCCCAAGGTGTTGAGTTCAGTTCAGGGCTGCTTCCCTGTTCTGGTA...,HACA-box
13643,28677,ABEG02003677.1/12241-12145,AAGGGGTGTCAAAGTAGAGGTAGGTGCGAGAAATGACGGGCATCGT...,miRNA
13644,6156,CM002815.1/56721418-56721311,CGTCATGATTTGGGCATTGCATTAACTTATCATTAAAGTATGTTGA...,CD-box


In [12]:
def get_output(label, df_test):
    
    df = pd.DataFrame()
    e_values = []
    scores = []
    biases = []
    indices = []
    labels = []

    with open('./profile-based/output/{}.out'.format(label), 'r') as file:
    
        lines = file.readlines()
        
        for n,line in enumerate(lines[14:]):
    
            if len(line.strip().split()) > 3:
        
                e_value = line.strip().split()[0]
                score = line.strip().split()[1]
                bias = line.strip().split()[2]
                seq = line.strip().split()[3]
        
                if '----' in e_value:
                    break
                else:
                    if float(e_value) < 10**(-3):
                        #print(e_value, score, bias, seq.split('_')[0], seq.split('_')[1])
                        e_values.append(e_value)
                        scores.append(score)
                        biases.append(bias)
                        indices.append(seq.split('_')[0])
                        labels.append(seq.split('_')[1])
                        
    
    df['index'] = indices
    df['label'] = labels
    df['e_value'] = e_values
    df['score'] = scores
    df['bias'] = biases
    
    df.to_csv('./{}-HMM-profile-results.csv'.format(label))
    
    df_test_f = df_test.loc[df_test['index'].isin(df['index'])]
    df_test_f = df_test_f.reset_index(drop=True)
    
    df_test_f['predicted'] = df['label']
    
    max_count = df_test['label'].value_counts()[label]
    hit_count = len(df.label)
    
    print('{0}/{1} ({2}%) {3} sequences not predicted'.format(max_count-hit_count, max_count, 
                                                              round(((max_count-hit_count)/max_count)*100,2),
                                                              label))
    
    return df_test_f

In [13]:
label_list = ['5.8S-rRNA', '5S-rRNA',
              'Intron-gp-I', 'Leader', 'Riboswitch', 
              'Ribozyme', 'Y-RNA', 'Y-RNA-like', 'tRNA']

In [14]:
for l in label_list:
    get_output(l, df_test)

5/126 (3.97%) 5.8S-rRNA sequences not predicted
556/1500 (37.07%) 5S-rRNA sequences not predicted
44/390 (11.28%) Intron-gp-I sequences not predicted
1514/1514 (100.0%) Leader sequences not predicted
1483/1483 (100.0%) Riboswitch sequences not predicted
828/1408 (58.81%) Ribozyme sequences not predicted
50/107 (46.73%) Y-RNA sequences not predicted
-5/41 (-12.2%) Y-RNA-like sequences not predicted
1532/1533 (99.93%) tRNA sequences not predicted


In [12]:
## CD-box, HACA-box, Intron-gp-II, miRNA, Leader e Riboswitch --> sem alinhamentos significativos

In [17]:
get_output('Y-RNA-like', df_test)

-5/41 (-12.2%) Y-RNA-like sequences not predicted


Unnamed: 0,index,id,seq,label,predicted
0,35035,CP015118.1/98213-98345,GCCACGAATGCGGGTGGAACTACAGCCTACGAAGCTCGTGGTCGCG...,Y-RNA-like,Y-RNA-like
1,35091,JSUQ01000002.1/188079-187891,CCGGCGAATGCCGGCGGAACTACAGCGCTGTTAACGCCCAGGTTGC...,Y-RNA-like,Y-RNA-like
2,35110,LT607413.1/5198265-5198368,ACGACGGATACTTCTAATCTATGGGTCGGGGGTTCGAATCCTCTCC...,Y-RNA-like,Y-RNA-like
3,35033,CP022117.1/350327-350445,GGCACGAATGCTGGTGGATCTACACCTTGTATGGCCAGACGCAGGT...,Y-RNA-like,Y-RNA-like
4,35005,CP011131.1/4801897-4801769,ACTGCGAATGCTGGCGAAACTACATCGGAACTCACAGACGCGGGTT...,Y-RNA-like,Y-RNA-like
5,35052,GG694028.1/93881-94006,CCGGCAAATGCGGAAGGAATTACATGGGGGTCGTAAATGCAGGTTC...,Y-RNA-like,Y-RNA-like
6,35024,LMRG01000027.1/174110-174218,ATGGCGAATGCAAAGGAAACTACATCGGCCGCGGCGGGTTCGAATC...,Y-RNA-like,Y-RNA-like
7,35071,CAGR02000059.1/52107-51999,TTGGCGAATGCCTGTGGATCTACATTTGTTGCGGGTTCGAGACCCG...,Y-RNA-like,Y-RNA-like
8,35048,FUYE01000002.1/319683-319541,GGGGCGAATGCCCGATGGAACTACATGATTCAAAAATCCTCAGGCC...,Y-RNA-like,Y-RNA-like
9,35039,CP011514.1/4308279-4308412,GCGATGAATGCCGGTGGAACTACAGCACTATTAATGCCCGTGTCGT...,Y-RNA-like,Y-RNA-like
