In [367]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from scipy import stats 
import sys
import sklearn
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [74]:
# Make the local deepDegron checkout importable by putting it first on sys.path.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is changed.
sys.path.insert(0, '/Users/cynthiachen/Downloads/Internship2019/degron_mutation/deepDegron')
# Wildcard import: compute_feature_matrix (used below) is not imported anywhere
# else in this notebook, so it presumably comes from train_nn — TODO confirm and
# import the needed names explicitly.
from train_nn import *

In [401]:
# Project working directory (absolute, machine-specific path).
wd = '/Users/cynthiachen/Downloads/Internship2019/degron_mutation/'

In [167]:
# Load pre-trained neural network models from pickle files.
# `with` guarantees the file handles are closed once the models are loaded.
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open("../models/neural_network_bag_of_words_v2.pickle", "rb") as model_file:
    model_bag_of_words = pickle.load(model_file)
with open("../models/neural_network_pos_specific_v2.pickle", "rb") as model_file:
    model_pos_specific = pickle.load(model_file)

# Create pd dataframe of sequence information
degron_pred = pd.read_csv("../data/degron_pred.csv", index_col=0)
# Select column of input sequences (positional column 10, counted after the
# index column has been consumed by index_col=0).
# NOTE(review): presumably this is the 'Peptide amino acid sequence' column --
# confirm, and prefer selecting it by name.
input_seq = degron_pred.iloc[:, 10]

In [76]:
# Encode the wild-type sequences with compute_feature_matrix
features = compute_feature_matrix(input_seq, 6, True)

# The bag-of-words model only takes the first 20 feature columns
# (the trailing zero columns are dropped)
condensed_features = features[:, :20]

In [77]:
# Use the pre-trained models to predict degradation for the wild-type sequences
pred_bow_wt = model_bag_of_words.predict_proba(condensed_features) # wild-type bag of words prediction
pred_ps_wt = model_pos_specific.predict_proba(features)            # wild-type position-specific prediction
drp_wt = pred_ps_wt-pred_bow_wt                                    # wild-type degron regulatory potential (position-specific minus bag of words)

In [79]:
# Split every sequence string into its characters:
# seqlist[i][j] is character j of sequence i
seqlist = [list(sequence) for sequence in input_seq]

In [528]:
seq_length = len(seqlist[0]) # length of the first sequence (23 per the original note); presumably all sequences share it -- TODO confirm
num_seq = len(seqlist)       # number of total sequences

### Mutation 1: Shift mutation
Shifts the characters at a certain position by 1 sequence: each sequence takes the character found at that position in the next sequence, and the last sequence is filled from the first sequence

In [80]:
shift = 1                    # distance (in sequences) to shift characters by

# diff[pos] holds the delta DRP (wild-type minus mutant) of every sequence
# when the shift mutation is applied at position `pos` only
diff = []

# Loop to iterate through all sequence positions
for pos in range(seq_length):
    # Characters at this position across all sequences, rotated so that
    # sequence i receives the character of sequence i+shift; the final
    # `shift` sequences wrap around and take theirs from the first ones.
    column = [chars[pos] for chars in seqlist]
    shifted_column = column[shift:] + column[:shift]

    # Build the mutated sequences from the untouched wild-type `seqlist`.
    # Mutating `seqlist` in place would make the shifts accumulate from one
    # position to the next and would give different results on every re-run.
    mutated_seqs = [
        "".join(chars[:pos] + [new_char] + chars[pos + 1:])
        for chars, new_char in zip(seqlist, shifted_column)
    ]

    # Use compute_feature_matrix function to encode the mutated sequences.
    # Separate *_mut names keep the wild-type `features` and
    # `condensed_features` intact.
    features_mut = compute_feature_matrix(pd.Series(mutated_seqs), 6, True)
    condensed_features_mut = features_mut[:, 0:20] # Remove zeros at the end for bag of words prediction

    # Use pre-trained models to predict degradation of the mutated sequences
    pred_bow_mut = model_bag_of_words.predict_proba(condensed_features_mut) # bag of words prediction
    pred_ps_mut = model_pos_specific.predict_proba(features_mut)            # position-specific prediction
    drp_mut = pred_ps_mut-pred_bow_mut                                      # mutated degron regulatory potential

    # Calculate DRP difference in mutated and wild-type (normal) sequence and add this to a list
    diff.append(drp_wt-drp_mut)

In [82]:
# Mean delta DRP over all sequences, one value per entry of `diff`
average_delta = [np.average(delta) for delta in diff]

In [84]:
# # Plot histogram showing distribution of average DRP deltas

# sns.set(color_codes=True)
# x = np.random.normal(size=200)
# sns.distplot(diff)
# plt.show()

# sns.set(color_codes=True)
# x = np.random.normal(size=200)
# sns.distplot(average_delta, bins = 8)
# plt.show()

### Mutation 2: Random shuffle mutation (considering only Top & Bottom scores)
Shifts all the characters at a certain position using random shuffling, and then considers only the selected top- and bottom-scoring sequences

In [243]:
# Order the degron sequence table from highest to lowest regulatory potential
degron_pred_sorted = degron_pred.sort_values(
    by='regulatory potential',
    ascending=False,
)

In [244]:
# Keep just the sequence and potential columns, as a list of
# [sequence, potential] rows
degron_seq_scores = degron_pred_sorted.loc[
    :, ['Peptide amino acid sequence', 'regulatory potential']
].values.tolist()