In [73]:
import pickle
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import normalize

In [74]:
# Put the local deepDegron checkout at the front of the import path so `train_nn` can be imported.
# NOTE(review): this absolute path is machine-specific; the notebook will not run on another
# machine until it is adjusted.
sys.path.insert(0, '/Users/cynthiachen/Downloads/Internship2019/degron_mutation/deepDegron')
# Star import: later cells call compute_feature_matrix, which presumably comes from train_nn — TODO confirm.
from train_nn import *

In [75]:
# Load pre-trained neural network models from pickle files.
# The context managers close each file handle as soon as its model has been read.
# NOTE(security): pickle.load can execute arbitrary code; only load model files from a trusted source.
with open("../neural_network_bag_of_words_v2.pickle", "rb") as model_file:
    model_bag_of_words = pickle.load(model_file)
with open("../neural_network_pos_specific_v2.pickle", "rb") as model_file:
    model_pos_specific = pickle.load(model_file)

# Create pd dataframe of sequence information (the first CSV column is used as the index)
degron_pred = pd.read_csv("../data/degron_pred.csv", index_col=0)
# Select column of input sequences by position
# NOTE(review): column 10 is presumably 'Peptide amino acid sequence' — confirm against the CSV header
input_seq = degron_pred.iloc[:, 10]

In [76]:
# Encode the wild-type input sequences with compute_feature_matrix
features = compute_feature_matrix(input_seq, 6, True)
# The bag of words model is fed only the first 20 columns
# (per the original note, the remaining columns are trailing zeros)
condensed_features = features[:, :20]

In [77]:
# Score the wild-type sequences with both pre-trained models
pred_bow_wt = model_bag_of_words.predict_proba(condensed_features)  # bag of words prediction
pred_ps_wt = model_pos_specific.predict_proba(features)  # position-specific prediction

# Wild-type degron regulatory potential (DRP): position-specific minus bag of words
drp_wt = pred_ps_wt - pred_bow_wt

In [78]:
# def mutate(seq, index, newchar):
#     char_list = list(seq)
#     char_list[index] = newchar
#     return "".join(char_list)

In [79]:
# Break every sequence string into its characters.
# Result: a 2D list with one inner list per sequence and one entry per character.
seqlist = [list(sequence) for sequence in input_seq]

### Mutation 1: Shift mutation
Shifts the characters at a certain position by 1 sequence: each sequence receives the character of the sequence after it, and the last sequence is filled with the character of the first sequence.

In [80]:
seq_length = len(seqlist[0]) # length of the first sequence (23 in the original run)
num_seq = len(seqlist)       # number of total sequences
shift = 1                    # distance to shift characters by

# List that contains the delta DRP for each position when a shift mutation occurs at that position
diff = []

# Loop to iterate through all sequence positions
for pos in range(seq_length):
    # Rotate the column of characters at `pos`: sequence i receives the character of
    # sequence i+shift, wrapping around so the last sequences receive the first ones.
    column = [chars[pos] for chars in seqlist]
    offset = shift % num_seq
    rotated_column = column[offset:] + column[:offset]

    # Build the mutated sequences from copies so that `seqlist` stays wild-type.
    # Each position is therefore mutated on its own (mutations do not accumulate across
    # positions) and the cell gives the same result when it is re-run.
    mutated_seqs = [
        "".join(chars[:pos] + [new_char] + chars[pos + 1:])
        for chars, new_char in zip(seqlist, rotated_column)
    ]

    # Use compute_feature_matrix function to encode sequences.
    # Separate names keep the wild-type `features` / `condensed_features` intact.
    features_mut = compute_feature_matrix(pd.Series(mutated_seqs), 6, True)
    condensed_features_mut = features_mut[:, 0:20] # First 20 columns only, for bag of words prediction

    # Use pre-trained models to predict degradation of the mutated sequences
    pred_bow_mut = model_bag_of_words.predict_proba(condensed_features_mut) # bag of words prediction
    pred_ps_mut = model_pos_specific.predict_proba(features_mut)            # position-specific prediction
    drp_mut = pred_ps_mut - pred_bow_mut                                    # mutated degron regulatory potential

    # Calculate DRP difference between wild-type (normal) and mutated sequences and add this to the list
    diff.append(drp_wt - drp_mut)

In [82]:
# Mean delta DRP for each entry of `diff` (one entry per mutated position)
average_delta = [np.average(delta) for delta in diff]

In [84]:
# # Plot histogram showing distribution of average DRP deltas

# sns.set(color_codes=True)
# x = np.random.normal(size=200)
# sns.distplot(diff)
# plt.show()

# sns.set(color_codes=True)
# x = np.random.normal(size=200)
# sns.distplot(average_delta, bins = 8)
# plt.show()

### Mutation 2: Random shuffle mutation (considering only Top & Bottom scores)
Shuffles the characters at a certain position at random across sequences, and then considers only the selected top- and bottom-scoring sequences

In [93]:
# Rank the degron sequence information from highest to lowest regulatory potential
degron_pred_sorted = degron_pred.sort_values(
    by='regulatory potential',
    ascending=False,
)

In [98]:
# Keep only the sequence and regulatory potential columns, as a list of [sequence, score] rows
degron_seq_scores = (
    degron_pred_sorted[['Peptide amino acid sequence', 'regulatory potential']]
    .values
    .tolist()
)

In [None]:
def motif(cutoff = 50, motif_length = 2):
    """Collect the k-mers of the top-scoring sequences and print how often each occurs.

    Reads the module-level ``degron_seq_scores`` (rows of ``[sequence, score]``,
    sorted from highest to lowest score) and slides a window of ``motif_length``
    characters over each of the first ``cutoff`` sequences.

    Parameters
    ----------
    cutoff : int
        Number of rows taken from the start of ``degron_seq_scores``.
    motif_length : int
        Length of each k-mer; must be at least 1.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_kmers, 2)``: column 0 holds the k-mer and
        column 1 the min-max normalized score of its source sequence. An empty
        ``(0, 2)`` array is returned when no k-mer can be formed.

    Raises
    ------
    ValueError
        If ``motif_length`` is smaller than 1.
    """
    if motif_length < 1:
        raise ValueError("motif_length must be at least 1")

    # NOTE(review): the original also sliced the `cutoff` lowest-scoring rows into an unused
    # `bottom` list; only the top rows are analysed here, as before.
    top = degron_seq_scores[0:cutoff]

    kmers = []
    for sequence, score in top:
        # Use each sequence's own length instead of a notebook-level seq_length
        for start in range(len(sequence) - motif_length + 1):
            kmers.append((sequence[start:start + motif_length], score)) # add k-mer and corresponding score

    if not kmers:
        return np.empty((0, 2), dtype=str)

    # Mixed (str, float) tuples become a 2-column array of strings
    kmers = np.asarray(kmers)

    # Normalize drp scores to [0, 1] to emphasize differences
    scores = kmers[:, 1].astype(float)
    lowest, highest = scores.min(), scores.max()
    if highest > lowest:
        normalized = (scores - lowest) / (highest - lowest)
    else:
        # All scores are equal: avoid dividing by zero
        normalized = np.zeros_like(scores)
    kmers[:, 1] = normalized

    # Print the frequency of each k-mer, in order of first appearance
    for kmer, count in Counter(kmers[:, 0].tolist()).items():
        print('Frequency of', kmer, 'is :', count)

    return kmers

In [164]:
# Show all k-mers (first column of `kmers`) as one space-separated string.
# NOTE(review): in the visible cells `kmers` is only assigned inside motif(), so this presumably
# relies on a module-level `kmers` created by a cell that is not shown — confirm before re-running.
" ".join(kmers[:, 0])

'FN NS SE EA AH HL LL LP PI IL LP PK KD DK KK KE EV VE EI IR RE EE GH HS SH HH HS SH HH HG GH HP PS SH HQ QS SH HS SL LP PN NR RR RH VK KD DK KF FS SE EF FW WD DL LD DP PE EV VR RP PT TS SA AV VA AA NQ QG GG GG GG GL LG GL LE ET TL LP PA AL LE EE EG GL LT TR RE EE TY YQ QS SP PP PF FT TE ET TL LD DT TS SP PK KG GY YQ QV VP PA AY AT TS SC CK KP PH HT TQ QH HK KE EC CQ QT TE EC CP PV VR RA AV VC EL LE EN NL LA AA AM MD DL LE EL LQ QK KI IA AE EK KF FS SQ QR RG NY YN NP PL LP PE EE ER RP PG GG GF FA AW WG GE EG GQ QR RL LG GG ES SL LS SA AI IE EA AE EL LE EK KV VA AH HQ QL LQ QA AL LR RR RG QF FV VQ QW WD DE EL LL LC CQ QL LE EA AA AT TQ QV VK KP PA AE EE AN NP PQ QD DR RP PD DA AF FE EL LE ET TR RM MD DQ QV VT TC CA AA GG GQ QP PF FS SE EV VG GE EV VK KD DF FP PD DL LA AV VL LG GA AA PQ QQ QT TS SS SG GT TN NN NK KP PY YR RP PW WG GT TE EV VG GA AF YD DK KL LE EE ER RP PH HL LP PS ST TF FN NY YN NP PA AQ QQ QA AF IG GN NI IG GA AL LM MG GY YA AT TH HK KY YL LD DS SE EE ED DE EE GQ QG GR 

In [165]:
def freq(str):
    """Print how often each whitespace-separated word occurs in ``str``.

    One line of the form ``Frequency of <word> is : <count>`` is printed per
    distinct word, in order of first appearance. Nothing is printed for an
    empty or whitespace-only string.

    Parameters
    ----------
    str : str
        Text to analyse. The parameter name shadows the builtin ``str``; it is
        kept so that existing keyword calls continue to work.

    Returns
    -------
    None
    """
    # break the string into list of words
    words = str.split()

    # Counter tallies every word in a single pass and, being a dict, keeps the
    # words in first-appearance order.
    for word, count in Counter(words).items():
        print('Frequency of', word, 'is :', count)
  

In [166]:
# Print the frequency of every k-mer in the first column of `kmers`.
# NOTE(review): in the visible cells `kmers` is only assigned inside motif(), so this presumably
# relies on a module-level `kmers` created by a cell that is not shown — confirm before re-running.
freq(" ".join(kmers[:, 0]))

Frequency of FN is : 3
Frequency of NS is : 2
Frequency of SE is : 9
Frequency of EA is : 6
Frequency of AH is : 4
Frequency of HL is : 5
Frequency of LL is : 10
Frequency of LP is : 11
Frequency of PI is : 3
Frequency of IL is : 2
Frequency of PK is : 4
Frequency of KD is : 6
Frequency of DK is : 5
Frequency of KK is : 4
Frequency of KE is : 7
Frequency of EV is : 10
Frequency of VE is : 4
Frequency of EI is : 3
Frequency of IR is : 1
Frequency of RE is : 3
Frequency of EE is : 19
Frequency of GH is : 2
Frequency of HS is : 7
Frequency of SH is : 8
Frequency of HH is : 5
Frequency of HG is : 3
Frequency of HP is : 3
Frequency of PS is : 5
Frequency of HQ is : 3
Frequency of QS is : 5
Frequency of SL is : 10
Frequency of PN is : 4
Frequency of NR is : 2
Frequency of RR is : 8
Frequency of RH is : 1
Frequency of VK is : 8
Frequency of KF is : 3
Frequency of FS is : 5
Frequency of EF is : 3
Frequency of FW is : 1
Frequency of WD is : 2
Frequency of DL is : 6
Frequency of LD is : 3
Freque