In [88]:
import pandas as pd
import arnie
from arnie.utils import *
from arnie.utils import _group_into_non_conflicting_bp

# import csv for pseudoknot predictions

def get_csv(csv_loc):
    df = pd.read_csv(csv_loc)
    return df 

# extract locations for each pseudoknot along with dotbracket structures

def get_info(df):
    
    starts = df['start'].to_list()
    ends = df['end'].to_list()
    sequences = df['sequence'].to_list()
    dotbrackets = df['struct'].to_list()
    
    return starts, ends, sequences, dotbrackets

# import shapeknots data and convert to list

def get_shape_data(filename):
    shape = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            shape.append(line)
            
    for i in range(len(shape)):
        shape[i] = (-1) if shape[i] == 'nan' else float(shape[i])
        
    return shape

# use Rachael's function to compare shape and dotbracket structure and return ranking

def evaluate_L1_shape_score(s,shape):
    score = 0
    for c,react in zip(s,shape):
        if (c=="." and react>0.25) or (c!="." and react<0.5):
            score += 1
    return score/len(s)

# get locations of pseudoknotted base pairs in a window

def get_groups(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    groups = _group_into_non_conflicting_bp(bp_list)
    return groups

def get_pk_bp_locs(groups):
    pk_bp_list = []
    pk_bp_locs = []
    for i, lists in enumerate(groups):
        if i == 0: 
            None
        else: 
            length = len(lists)
            for idx in range(length):
                bp = lists[idx]
                pk_bp_list.append(bp)
                pk_bp_locs.append(bp[0])
                pk_bp_locs.append(bp[1])             
    pk_bp_locs.sort()
    return pk_bp_locs, pk_bp_list

def get_pk_bp_struct(pk_bp_locs, dotbracket):
    pk_bp_struct = []
    for idx in pk_bp_locs:
        bracket = dotbracket[idx]
        pk_bp_struct.append(bracket)
    return pk_bp_struct

# rank PKs based on theoretically thermodynamic stability 

# ranking function operations: 
# add 1 for every additional base pair in a helix
# subtract 1 for every base pair adjacent to a separate helix 

def get_pk_rank(pk_bp_locs, dotbracket):
    pk_rank = 0
    for idx in pk_bp_locs:
        if (idx != 119) and (dotbracket[idx] == dotbracket[idx+1]):
            pk_rank += 0.5   
    for idx in pk_bp_locs: 
        if (idx != 119) and (dotbracket[idx] != dotbracket[idx+1]):
            if dotbracket[idx+1] == '.':
                pk_rank += 0
            else:
                pk_rank -= 1
    for idx in pk_bp_locs: 
        if (idx != 0) and (dotbracket[idx] != dotbracket[idx-1]):
            if dotbracket[idx-1] == '.':
                pk_rank += 0
            else:
                pk_rank -= 1
    return pk_rank

# rank PKs on consensus with other predictions

def get_bp_list(dotbracket):
    bp_list = convert_dotbracket_to_bp_list(dotbracket, allow_pseudoknots=True)
    return bp_list

def compare_bp_lists(bp_list1, bp_list2):
    bp_list_score = 0
    for bp1 in bp_list1: 
        for bp2 in bp_list2: 
            if bp1 == bp2: 
                bp_list_score += 1
        # divide by total number of base pairs in bp_list1 to normalize results
    return bp_list_score/len(bp_list1)

def get_consensus_scores(start_locs1, bp_lists1, start_locs2, bp_lists2):
    scores = []
    for i, loc1 in enumerate(start_locs1): 
        for idx, loc2 in enumerate(start_locs2):
            if loc1 == loc2:
                bp_list1 = bp_lists1[i]
                bp_list2 = bp_lists2[idx]
                bp_list_score = compare_bp_lists(bp_list1, bp_list2)
                scores.append(bp_list_score)
        else:
            scores.append(0)
    return scores

# create new dataframe with rankings

def get_df(starts, ends, sequences, dotbrackets, shape_scores, pk_bp_shape_scores, ranks, consensus_scores, pk_bp_consensus_scores):
    PK_list = zip(starts, ends, sequences, dotbrackets, shape_scores, pk_bp_shape_scores, ranks, consensus_scores, pk_bp_consensus_scores)
    df = pd.DataFrame(PK_list, columns = ['start', 'end', 'sequence', 'structure', 'shape_score', 'pk_bp_shape_score', 'rank', 'consensus_score', 'pk_bp_consensus_score'])
    ranked_df = df.sort_values('pk_bp_shape_score', ascending=False)
    return ranked_df

# get consensus score for only pk bps

def compare_bp_pks(struct1, struct2):
    bp_count = 0
    score = 0
    for i, char in enumerate(struct1):
        if char == '[' or char == ']':
            bp_count += 1
            if char == struct2[i]:
                score += 1
    return score/bp_count

def get_pk_bp_consensus_scores(starts, dotbrackets, starts2, dotbrackets2):
    bp_pk_consensus_scores = []
    for i, start in enumerate(starts):
        for i2, start2 in enumerate(starts2):
            if start == start2: 
                consensus_score = compare_bp_pks(dotbrackets[i], dotbrackets2[i2])
                bp_pk_consensus_scores.append(consensus_score)
        else: 
            bp_pk_consensus_scores.append(0)
    return bp_pk_consensus_scores
    
# put it all together

def score_pk_overall(csv, shape_file, csv2):
    df = get_csv(csv)
    starts, ends, sequences, dotbrackets = get_info(df)
    
    # get rough score for consensus with shape data for entire window
    
    full_shape = get_shape_data(shape_file)
    shapes = []
    for i, start in enumerate(starts):
        end = ends[i]
        shape_window = full_shape[start:end]
        shapes.append(shape_window)
    
    shape_scores = []
    for i, struct in enumerate(dotbrackets):
        shape = shapes[i]
        score = evaluate_L1_shape_score(struct, shape)
        shape_scores.append(score)

    # get score for shape consensus with only pk bps

    pk_bp_lists = []
    pk_bp_locs = []
    pk_bp_structs = []
    for i, struct in enumerate(dotbrackets):
        groups = get_groups(struct)
        pk_bp_loc, pk_bp_list = get_pk_bp_locs(groups)
        pk_bp_struct = get_pk_bp_struct(pk_bp_loc, struct)
        
        pk_bp_locs.append(pk_bp_loc)
        pk_bp_structs.append(pk_bp_struct)
        pk_bp_lists.append(pk_bp_list)
        
    pk_bp_shapes = []
    for i, locs in enumerate(pk_bp_locs):
        pk_bp_shapes_window = []
        shape_window = shapes[i]
        for idx in locs:
            shape = shape_window[idx]
            pk_bp_shapes_window.append(shape)
        pk_bp_shapes.append(pk_bp_shapes_window)
        
    pk_bp_shape_scores = []
    for i, struct in enumerate(pk_bp_structs):
        pk_bp_shape_window = pk_bp_shapes[i]
        score = evaluate_L1_shape_score(struct, pk_bp_shape_window)
        pk_bp_shape_scores.append(score)
    
    # get rough ranking for likelihood of PK
        
    ranks = []
    for i, struct in enumerate(dotbrackets): 
        rank = get_pk_rank(pk_bp_locs[i], struct)
        ranks.append(rank)
        
    # get consensus score with other predictions
    
    df2 = get_csv(csv2)
    starts2, ends2, sequences2, dotbrackets2 = get_info(df2)
    
    bp_lists1 = []
    for dotbracket in dotbrackets: 
        bp_list = get_bp_list(dotbracket)
        bp_lists1.append(bp_list)
        
    bp_lists2 = []
    for dotbracket in dotbrackets2: 
        bp_list = get_bp_list(dotbracket)
        bp_lists2.append(bp_list)
        
    consensus_scores = get_consensus_scores(starts, bp_lists1, starts2, bp_lists2)
    
    # get consensus scores for pk bps only 
    
    groups2 = []
    for dotbracket in dotbrackets2:
        groups = get_groups(dotbracket)
        groups2.append(groups)
        
    pk_bp_lists2 = []
    for group in groups2:
        pk_bp_loc, pk_bp_list = get_pk_bp_locs(group)
        pk_bp_lists2.append(pk_bp_list)
    
    pk_bp_consensus_scores = get_consensus_scores(starts, pk_bp_lists, starts2, pk_bp_lists2)
    
    # put it all together into a dataframe
        
    df = get_df(starts, ends, sequences, dotbrackets, shape_scores, pk_bp_shape_scores, ranks, consensus_scores, pk_bp_consensus_scores)
    return df

In [89]:
knotty_df = score_pk_overall('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv', '/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv')

In [110]:
knotty_df.to_csv('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_analysis_scores.csv')

In [111]:
pknots_df = score_pk_overall('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pk_predictor_output.csv', '/home/gnye8/Desktop/PK_research/SSRP_work/shape_data/incarnato_invivo_reactivity-Copy1.csv', '/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv')

In [112]:
pknots_df.to_csv('/home/gnye8/Desktop/PK_research/pipeline_results/pknots/pknots_analysis_scores.csv')

In [90]:
knotty_df

# pseudoknot specific shape ranking
# pseudoknot specific consensus score
# visualize interesting pseudoknots

Unnamed: 0,start,end,sequence,structure,shape_score,pk_bp_shape_score,rank,consensus_score,pk_bp_consensus_score
500,21680,21800,ACAAAGUUUUCAGAUCCUCAGUUUUACAUUCAACUCAGGACUUGUU...,...(((....(((.((((.((((..[[[[..)))).)))).))).....,0.650000,1.000000,16.0,0.000000,0.0
415,18160,18280,ACAUACCUGGCAUACCUAAGGACAUGACCUAUAGAAGACUCAUCUC...,.....((([[[[[.....)))..]]].]].(((((.........))...,0.541667,1.000000,3.5,0.000000,0.0
460,20080,20200,CUGUAGGUCCCAAACAAGCUAGUCUUAAUGGAGUCACAUUAAUUGG...,((((..((.....((..(((.(.((...[[[)).)....[[[[[[[...,0.725000,1.000000,21.0,0.000000,0.0
414,18120,18240,AGUGUUGACACUAAAUUCAAAACUGAAGGUUUAUGUGUUGACAUAC...,.((((((((((((((((..........)))))).)))))))))).(...,0.591667,1.000000,3.5,0.133333,0.0
435,19000,19120,UAUUAGCAGACAAAUUCCCAGUUCUUCACGACAUUGGUAACCCUAA...,((.(((([[[....[[[[..[[[[[.....[[[[[[[[...........,0.725000,1.000000,26.5,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
147,6760,6880,ACUAAUUAUAUGCCUUAUUUCUUUACUUUAUUGCUACAAUUGUGUA...,.........((((.(((.((((..[[[[[.[[[[[[.((((.((((...,0.450000,0.288462,17.0,0.264706,0.0
420,18400,18520,CUACAGGUUAUGUUGAUACACCUAAUAAUACAGAUUUUUCCAGAGU...,....((((..[[[......))))......]]]((((..(((((.((...,0.600000,0.285714,4.0,0.000000,0.0
464,20240,20360,UCAAAUGGAAAUUGAUUUCUUAGAAUUAGCUAUGGAUGAAUUCAUU...,((((..[[[[[))))]]]]]...((((.(((((((.((((.((..(...,0.658333,0.222222,3.0,0.000000,0.0
546,23600,23720,CUCCUCGGCGGGCACGUAGUGUAGCUAGUCAAUCCAUCAUUGCCUA...,(([[[.)).]]].(((((((((((.....(((.......))).)))...,0.708333,0.200000,1.0,0.000000,0.0


In [109]:
pknots_df

Unnamed: 0,start,end,sequence,structure,shape_score,pk_shape_score,pk_bp_shape_score,rank,consensus_score,pk_consensus_score,bp_pk_consensus_score
237,21200,21320,UAAGCUCAUGGGACACUUCGCAUGGUGGACAGCCUUUGUUACUAAU...,...........(((...................................,0.541667,0.857143,0.000000,-2,0.0,0.454545,0.428571
255,22520,22640,UCCAACCAACAGAAUCUAUUGUUAGAUUUCCUAAUAUUACAAACUU...,...........(((((((....)))))))........((((....)...,0.691667,1.000000,0.750000,-3,0.0,0.000000,0.000000
222,20120,20240,AAUUGGAGAAGCCGUAAAAACACAGUUCAAUUAUUAUAAGAAAGUU...,...(((.....)))....(((((...((((((..........))))...,0.508333,0.833333,0.750000,-3,0.0,0.000000,0.000000
115,10320,10440,UAAGGUUGAUACAGCCAAUCCUAAGACACCUAAGUAUAAGUUUGUU...,...((((.....))))........(((((..(((...(((((((((...,0.750000,0.538462,0.750000,-3,0.0,0.000000,0.000000
1,120,240,CACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGA...,.((((((((..((((.....))))))))).)))((.(((((.((((...,0.700000,0.741935,1.000000,-3,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
4,440,560,GCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUC...,((.....))((((((.((((((........)))))).....[[[.....,0.716667,0.684211,0.700000,-23,0.2,0.000000,0.000000
92,8800,8920,ACUAAUGACAAAGCUUGCCCAUUGAUUGCUGCAGUCAUAACAAGAG...,....((((([[[[.[[[[[.[[[[.[[[.[[[.))))).....(((...,0.683333,0.626263,0.473684,-24,0.0,0.000000,0.000000
30,3000,3120,UGAGUUUAAAUUGGCUUCACAUAUGUAUUGUUCUUUCUACCCUCCA...,...........((.[[[[[[[....[[[[.[[[[[[....[[[[))...,0.750000,0.698413,0.404762,-25,0.0,0.000000,0.000000
49,4480,4600,UAUAAGGGUAUUAAAAUACAAGAGGGUGUGGUUGAUUAUGGUGCUA...,..((((((((...((((..[[[[[[[[[[[.[[[[[..[[[[[[[[...,0.791667,0.782178,0.520833,-27,0.0,0.000000,0.000000


In [97]:
dotbracket1 = '...(((..[[..)))..]]...'
shape1 = '1110001100110001100111'
start1 = 0

def get_sp_pk_locs(dotbracket, start):
    locs = []
    struct = []
    for i, char in enumerate(dotbracket):
        if char == '[':
            locs.append(i)
            struct.append(char)
        elif char == ']':
            locs.append(i)
            struct.append(char)
    return locs, struct

locs1, struct1 = get_pk_locs(dotbracket1, start1)
print(locs1)
print(struct1)

def get_sp_pk_shape(full_shape, locs):
    shapes = []
    for loc in locs: 
        shape = full_shape[loc]
        shape = float(shape)
        shapes.append(shape)
    return shapes

pk_shape = get_pk_shape(shape1, locs1)
print(pk_shape)

score = evaluate_L1_shape_score(struct1, pk_shape)
print(score)

[8, 9, 17, 18]
['[', '[', ']', ']']
[0.0, 0.0, 0.0, 0.0]
1.0


In [100]:
struct1 = '...(((..[[..)))..]]...)))'
struct2 = '...(((..[...)))...]...)))'
def compare_sp_pks(struct1, struct2):
    bp_count = 0
    score = 0
    for i, char in enumerate(struct1):
        if char == '[' or char == ']':
            bp_count += 1
            if char == struct2[i]:
                score += 1
    return score/bp_count

compare_pks(struct1, struct2)

0.5

In [2]:
knotty = pd.read_csv('/home/gnye8/Desktop/PK_research/pipeline_results/knotty/knotty_output.csv')
knotty

Unnamed: 0.1,Unnamed: 0,start,end,sequence,struct
0,0,0,120,AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGA...,......(((((.(((((....)))))..)))))...(((.....[[...
1,1,40,160,UUUCGAUCUCUUGUAGAUCUGUUCUCUAAACGAACUUUAAAAUCUG...,....((((.......)))).((((.......))))........(((...
2,2,80,200,AAUCUGUGUGGCUGUCACUCGGCUGCAUGCUUAGUGCACUCACGCA...,....((((((((((.....)))))))))).(((((....[[[[[[[...
3,3,120,240,CACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGA...,.((((((((...[[[.........))))).)))((.(((((.((((...
4,4,200,320,UUCGUCCGUGUUGCAGCCGAUCAUCAGCACAUCUAGGUUUCGUCCG...,.(((.(.[[[[[[..).)))....]]]]]].(((..(((....(((...
...,...,...,...,...,...
677,677,29600,29720,GUCUACUCUUGUGCAGAAUGAAUUCUCGUAACUACAUAGCACAAGU...,.((((((..((((([[[.[[[[......[[[[[[[[..))))))))...
678,678,29640,29760,ACAAGUAGAUGUAGUUAACUUUAAUCUCACAUAGCAAUCUUUAAUC...,..(((([[[[.......))))..]]]](((((.................
679,679,29680,29800,UUAAUCAGUGUGUAACAUUAGGGAGGACUUGAAAGAGCCACCACAU...,...((.[[[[[)).]]]]]..((.((.(((....))))).)).(((...
680,680,29720,29840,CCACAUUUUCACCGAGGCCACGCGGAGUACGAUCGAGUGUACAGUG...,.((((((((((...(((((((.((([[[[[..))).)))..........


In [63]:
test_struct = knotty.loc[0, 'struct']
print(test_struct)
print(type(test_struct))
test_struct[119]

......(((((.(((((....)))))..)))))...(((.....[[[[[.))).]]]]].((((.......)))).........((((((((((.....))))))))))...........
<class 'str'>


'.'

In [24]:
test_groups = get_groups(test_struct)
print(len(test_groups))
print(test_groups[0])
print(test_groups[1])
print(test_groups[2])

3
[[6, 32], [7, 31], [8, 30], [9, 29], [10, 28], [12, 25], [13, 24], [14, 23], [15, 22], [16, 21], [60, 74], [61, 73], [62, 72], [63, 71], [84, 108], [85, 107], [86, 106], [87, 105], [88, 104], [89, 103], [90, 102], [91, 101], [92, 100], [93, 99]]
[[36, 52], [37, 51], [38, 50]]
[[44, 58], [45, 57], [46, 56], [47, 55], [48, 54]]


In [43]:
# pseudocode
# discard the first list in groups
# append all other bps to a new list called 'pk_bp_locs'

def get_pk_bp_locs(groups, start):
    pk_bp_list = []
    pk_bp_locs = []
    locs = []
    for i, lists in enumerate(groups):
        if i == 0: 
            None
        else: 
            length = len(lists)
            for idx in range(length):
                bp = lists[idx]
                pk_bp_list.append(bp)
                pk_bp_locs.append(bp[0])
                
                pk_bp_locs.append(bp[1])
                locs.append(bp[0]+start)
                locs.append(bp[1]+start)
                
    pk_bp_locs.sort()
    return pk_bp_list, pk_bp_locs, locs
                

pk_bp_list, pk_bp_locs, locs = get_pk_bp_locs(test_groups, 380)
print(pk_bp_locs)
print(locs)

[36, 37, 38, 44, 45, 46, 47, 48, 50, 51, 52, 54, 55, 56, 57, 58]
[416, 432, 417, 431, 418, 430, 424, 438, 425, 437, 426, 436, 427, 435, 428, 434]


In [46]:
def get_pk_bp_struct(pk_bp_locs, dotbracket):
    pk_bp_struct = []
    for idx in pk_bp_locs:
        bracket = dotbracket[idx]
        pk_bp_struct.append(bracket)
    return pk_bp_struct

get_pk_bp_struct(pk_bp_locs, test_struct)

['(',
 '(',
 '(',
 '[',
 '[',
 '[',
 '[',
 '[',
 ')',
 ')',
 ')',
 ']',
 ']',
 ']',
 ']',
 ']']

In [66]:
def get_pk_rank(pk_bp_locs, dotbracket):
    pk_rank = 0
    for idx in pk_bp_locs:
        if dotbracket[idx] == dotbracket[idx+1]:
            pk_rank += 0.5   
    for idx in pk_bp_locs: 
        if (idx != 119) and (dotbracket[idx] != dotbracket[idx+1]):
            if dotbracket[idx+1] == '.':
                None
            else:
                pk_rank -= 1
    for idx in pk_bp_locs: 
        if (idx != 0) and (dotbracket[idx] != dotbracket[idx-1]):
            if dotbracket[idx-1] == '.':
                None
            else:
                pk_rank -= 1
                
    print(pk_rank)
                
get_pk_rank(pk_bp_locs, test_struct)

6.0
