# Sorting Protein Mutations Using STRIDE

This notebook divides the dataset into seperate datasets depended on a mutations secondary structure assignment. It is used for Part 1 of the project.

In [4]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide
import random
import itertools
import more_itertools as mit

In [5]:
# setting jupyter notebook viewing options
max_rows = 1000
max_cols = 1000
pd.set_option("display.max_rows", max_rows, "display.max_columns", max_cols)

### Methods Used to Format Data

Formatting protein sequence into form for machine learning:

In [6]:
# parameters:
#      "uniprot_id" - string representing uniprot id of desired protein.
# This method uses a given uniprot id to query the uniprot data and 
# return a string respresention of the protein sequence. 
# E.g. MADIT
def get_protein_seq(uniprot_id):
    
    # importing fasta file from uniprot.org and getting protein sequence
    # taken from StackOverflow: 
    # https://stackoverflow.com/questions/52569622/protein-sequence-from-uniprot-protein-id-python
    url = "http://www.uniprot.org/uniprot/"
    complete_url = url + uniprot_id + ".fasta"
    response = requests.post(complete_url)
    data =''.join(response.text)
    sequence =StringIO(data)
    protein_seq=list(SeqIO.parse(sequence,'fasta'))

    # protein sequence as string (single-letter amino acids)
    string_seq = str(protein_seq[0].seq)
    
    # protein sequence w/ three-letter convention
    protein_seq = get_expanded_seq(string_seq)
    return protein_seq

Expanding protein sequence (1 letter AA -> 3 letter AA):

In [7]:
# parameter:
#      "seq" - string representing protein sequence in 1-letter convention.
# This method takes protein sequence string with 1-letter convention and returns
# a protein sequence with 3-letter convention.
# E.g. ADE -> ALA ASP GLU
def get_expanded_seq(seq):
    expanded_list = []
    split_seq = list(seq)
    for letter in split_seq:
        three_letter_abbr = Bio.PDB.Polypeptide.one_to_three(letter)
        expanded_list.append(three_letter_abbr)
    exanded_string = " ".join(expanded_list)
    return(exanded_string)

Returning index range of protein domain within protein:

In [8]:
# parameters: 
#      "full_protein_split" - list of amino acids in full protein in 3 letter convention.
#                             E.g. ["ALA", "GLY", "TYR"]
#      "domain_split" - list of amino acids in protein domain in 3 letter convention.
#                       E.g. ["ALA", "GLY"]
# This method prints the index of the given domain within the given protein.
# Starting value is inclusive and the ending value is exclusive. 
# E.g. [(0, 3)]
def get_index_range(full_protein_split, domain_split):
    indexes = []
    for i in range(len(full_protein_split)):
        if full_protein_split[i:i+len(domain_split)] == domain_split:
            indexes.append((i, i+len(domain_split)))
    print(indexes)
    indexes.clear()

Get variant in mutation-position form from wild-type-position-mutation form: (E.g. G126A -> 126ALA)

In [9]:
# parameter: 
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with wild-type residue (first letter) from variant.
def get_wild_type(split_mutation_column):
    wild_type_list = []
    w_letters = []
    for string in split_mutation_column:
        if "wild-type" in string[0]:
            wild_type = "wild_type"
        elif "-" in string[0] or len(string) == 0:
            wild_type = np.nan
        else:
            for val in string:
                mutation_name = val.strip(" ")
                w_letters.append(mutation_name[0])
                wild_type = ",".join(w_letters)
        wild_type_list.append(wild_type)
        w_letters.clear()
    return wild_type_list


# parameter: 
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with mutation residue (last letter) from variant.
def get_mutation_type(split_mutation_column):
    mutation_list = []
    m_letters = []
    for string in split_mutation_column:
        if "wild-type" in string[0]:
            mutation = "wild-type"
        elif "-" in string[0] or len(string) == 0:
            mutation = np.nan
        else:
            for val in string:
                mutation_name = val.strip(" ")
                m_letters.append(mutation_name[-1])
                mutation = ",".join(m_letters)
        mutation_list.append(mutation)
        m_letters.clear()
    return mutation_list


# parameter: 
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with the position of mutation (number) from variant.
def get_position(split_mutation_column):
    position_list = []
    p_letters = []
    for string in split_mutation_column:
        if "wild-type" in string[0]:
            position = "wild-type"
        elif "-" in string[0] or len(string) == 0:
            position = np.nan
        else:
            for val in string:
                mutation_name = val.strip(" ")
                p_letters.append(mutation_name[1:-1])
                position = ",".join(p_letters)
        position_list.append(position)
        p_letters.clear()
    return(position_list)


# parameter:
#      "df" - dataframe of protein data with "MUTATED_RES" and "POSITION" columns.
# This method returns a list with the correctly formatted variant (mutation-position form).
def get_mutations_names_list(df):
    formatted_list = []
    expanded_abbv = []
    for mutation, position in zip(df["MUTATED_RES"], df["POSITION"]):
        split_mutations = mutation.split(",")
        split_positions = position.split(",")
        if "wild-type" in split_mutations[0].lower() or "wild-type" in split_positions[0].lower():
            abbv_names = "WT"
        else:  
            for mut, pos in zip(split_mutations, split_positions):
                three_letter_mut = Bio.PDB.Polypeptide.one_to_three(mut.upper())
                position = str(int(pos))
                combined_name = position + three_letter_mut
                expanded_abbv.append(combined_name)
                abbv_names = ", ".join(expanded_abbv)
        expanded_abbv.clear()
        formatted_list.append(abbv_names)
    return(formatted_list)

Splits positions in intermediary "POSITION" column to help remove mutations with a certain position

In [10]:
# Parameters:
#      "df" - protein data dataframe with "POSITION" column 
# This method takes the position column in the dataframe and splits it in order
# to help remove or keep mutatations depending on their position.
def get_positions_split(df):
    position_list_split = []

    for item in df["POSITION"]:
        item = item.split(",") # splits positions into list
        int_item = [int(i) for i in item]
        position_list_split.append(int_item)
    
    return position_list_split

Getting Secondary Structure assignment from STRIDE file

In [11]:
# Parameters: 
#      "stride file" - stride file of protein
#      "is_sec_struc" - list of boolean values for each secondary structure value
#                       if it is, true, else false
# returns list of boolean values indicating if position is secondary strcuture or not
def get_sec_struc_boolean(stride_file):
    is_sec_struc = []
    sec_struc_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            sec_struc_assign.append(split_line[5])

    for sec_struc in sec_struc_assign:
        if (sec_struc =='C' or sec_struc =='T'):
            is_sec_struc.append(False)
        else:
            is_sec_struc.append(True)
            
    return is_sec_struc

Getting Dataset of Mutations within Domain in PDB File

In [12]:
# Parameters:
#      "orig_df" - 
#      "start" -
#      "end" - 
#      "not_included_list"
# This method does even more helpful stuff
def get_domain_dataset(orig_df, start, end, not_included_list):
    in_domain_list = []
    
    for val in orig_df["positions_split"]:
        for position in val:
            if not_included_list.count(position - start) == 0: # if value is not in the list of values to exclude
                if position >= start and position < end:
                    in_domain = True
                else:
                    in_domain = False
            else:
                in_domain = False
        in_domain_list.append(in_domain)
    
    orig_df['in_domain'] = in_domain_list
    # print(in_domain_list)
    condition = orig_df['in_domain'] == True
    rows = orig_df.loc[condition, :]
    
    in_domain_df = pd.DataFrame(columns=orig_df.columns)
    in_domain_df = in_domain_df.append(rows, ignore_index=True)
    in_domain_df = in_domain_df.drop(['in_domain'], axis=1)
    return in_domain_df

Getting Dataset of Mutations _in_ Secondary Structures

In [13]:
# Parameters:
# - orig_df: original dataframe with all mutations and "positions_split" column which has mutation positions in split list
#            as ints
# - sec_st_df: new dataframe with all rows that have mutations in the secondary structure of protein
# - mixed_df: new dataframe with all rows that have mutations in both in and out of the secondary stucture of the protein
# - start: (inclusive) index where the domain of the protein in PDB file starts
# - end: (inclusive) index where the domain of the protein in PDB file ends
def get_ss_dataset(orig_df, bool_ss_list, domain_start_index):
    
    has_sec_str = []
    
    for val in orig_df["positions_split"]:
        # list of boolean values that are true if all mutation positions in line are sec. strc.
        all_pos_sec_struc = []
        
        for position in val:
            # print(position - domain_start_index)
            # print(str(position) + " " + str(domain_start_index))
            if (bool_ss_list[position - domain_start_index] == False): # line up ss_indexes w/ position
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)
        
        # all pos sec struc should match val list
        # if there's a value in all_pos_sec_struc that's false, append false
        # otherwise, append true
        # print("val")
        # print(val)
        # print("bool")
        # print(all_pos_sec_struc)
        if (all_pos_sec_struc.count(False) == 0):
            has_only_sec_str = True
        else:
            has_only_sec_str = False
        
        # print(has_only_sec_str)
        has_sec_str.append(has_only_sec_str)
        all_pos_sec_struc.clear()
        
        
    # print(len(has_sec_str)) # should match dataframe length
    orig_df['has_sec_str'] = has_sec_str
    
    condition = orig_df['has_sec_str'] == True
    rows = orig_df.loc[condition, :]
    
    sec_str_df = pd.DataFrame(columns=orig_df.columns)
    sec_str_df = sec_str_df.append(rows, ignore_index=True)
    # print(sec_str_df.head)
    sec_str_df = sec_str_df.drop(['has_sec_str'], axis=1)
    orig_df = orig_df.drop(['has_sec_str'], axis=1)
    
    return sec_str_df

Getting Dataset of Mutations _not_ in Secondary Structures

In [14]:
def get_not_ss_dataset(orig_df, bool_ss_list, domain_start_index):
    is_not_sec_str = []
    
    for val in orig_df["positions_split"]:
        
        all_pos_sec_struc = []
        
        for position in val:
            # print(position - domain_start_index)
            # print(str(position) + " " + str(domain_start_index))
            if (bool_ss_list[position - domain_start_index] == False):
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)
    
        
        if (all_pos_sec_struc.count(True) == 0):
            has_no_sec_str = True
        else:
            has_no_sec_str = False
        
        is_not_sec_str.append(has_no_sec_str)
        all_pos_sec_struc.clear()
        
    orig_df['is_not_sec_str'] = is_not_sec_str
     
    condition = orig_df['is_not_sec_str'] == True
    rows = orig_df.loc[condition, :]
    
    not_sec_str_df = pd.DataFrame(columns=orig_df.columns)
    not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)
    not_sec_str_df = not_sec_str_df.drop(['is_not_sec_str'], axis=1)
    orig_df = orig_df.drop(['is_not_sec_str'], axis=1)
    
    return not_sec_str_df

Writing formatted data to txt file:

In [15]:
# parameters:
#      "txt_name" - desired name of formatted txt file for network. E.g. "pab1"
#      "protein_seq" - string of protein sequence in 3 letter convention. E.g. ALA GLU TYR
#      "df" - dataframe with cleaned protein data. Must contain "variant" and "score" 
#             columns.
# This method cleans the protein data and formats it into a txt that can be processed by the 
# network. It also prints the name of the file out for reference.
def write_data_file(txt_name, protein_seq, df):
    file_name = txt_name + ".txt"
    path_name = "../ML Script Data Files/" + file_name
    print("Filename: " + file_name)
    
    datafile = open(path_name, "w+")
    datafile.write(protein_seq + "\n")
    for index in range(len(df)-1):
        datafile.write(df["variant"].iloc[index] + ": " + str(df["score"].iloc[index]) + "\n")
    datafile.write(df["variant"].iloc[len(df) - 1] + ": " + str(df["score"].iloc[len(df) - 1]))
    datafile.close()

Getting dataset of mutations that are in alpha helices: (H, G, I)

In [16]:
# Parameters: 
#      "stride file" - stride file of protein
#      "is_sec_struc" - list of boolean values for each secondary structure value
#                       if it is, true, else false
# returns list of boolean values indicating if position is in an alpha helix or not
def get_alpha_boolean(stride_file):
    # print('hi')
    is_alpha = []
    alpha_assign = []

    for line in stride_file:
        # print(line)
        # print("why isn't this working")
        if line.startswith('ASG'):
            split_line = line.split();
            # print(split_line[5])
            alpha_assign.append(split_line[5])
    
#     print(alpha_assign)
    
    alpha_letters = ['H','G','I']
    for alpha in alpha_assign:
        if (alpha_letters.count(alpha) != 0):
            is_alpha.append(True)
        else:
            is_alpha.append(False)
    
#     print(alpha_assign)
#     print(is_alpha)
    
    return is_alpha

Getting dataset of mutations that are in beta sheets: (E, B or b)

In [305]:
def get_beta_boolean(stride_file):
    is_beta = []
    beta_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            beta_assign.append(split_line[5])
    
    beta_letters = ['E','B','b']
    for beta in beta_assign:
        if (beta_letters.count(beta) != 0):
            is_beta.append(True)
        else:
            is_beta.append(False)
    
#     print(beta_assign)
#     print(is_beta)
#     print(len(is_beta))
    
    return is_beta

Getting dataset of mutations that are turns: (T)

In [15]:

def get_turns_boolean(stride_file):
    is_turn = []
    turn_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            turn_assign.append(split_line[5])

    for turn in turn_assign:
        if (turn == "T"):
            is_turn.append(True)
        else:
            is_turn.append(False)
    
    print(turn_assign)
    print(is_turn)
    
    return is_turn

Getting dataset of mutations in secondary structure **including turns**

In [32]:
# Parameters: 
#      "stride file" - stride file of protein
#      "is_sec_struc" - list of boolean values for each secondary structure value
#                       if it is, true, else false
# returns list of boolean values indicating if position is secondary strcuture or not
def get_all_sec_struc_boolean(stride_file):
    is_sec_struc = []
    sec_struc_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            sec_struc_assign.append(split_line[5])

    for sec_struc in sec_struc_assign:
        if (sec_struc =='C'):
            is_sec_struc.append(False)
        else:
            is_sec_struc.append(True)
            
    return is_sec_struc

Matching Segments of Non Secondary Structure to Secondary Structure

In [17]:
# limit number of mutations to some number
# **use after get_domain dataset

# Parameters:
#    "indexes" - a boolean list indicating positions with secondary structure (True - in ss, False - not in ss)
# This method returns a list of indexes to exclude in order to match the number of positions in secondary structure
# and out of secondary structure
def get_excluded_res(indexes):
    
    # find the groups of secondary structure
    ss_ind = [i for i,val in enumerate(indexes) if val==True]
    ss_ind_groups = list(find_index_range(ss_ind))
    
    # find the groups of non secondary structure
    not_ss_ind = [i for i,val in enumerate(indexes) if val==False]
    not_ss_ind_groups = list(find_index_range(not_ss_ind))

    ind_to_remove = []
    
    num_false = indexes.count(False)
    num_true = indexes.count(True)
    
    if (num_false < num_true): #is mostly ss
        ind_to_remove = remove_indices_helper(not_ss_ind_groups, ss_ind_groups) # chunk with not_ss groups
    elif (num_false > num_true): # NOT mostly ss
        ind_to_remove = remove_indices_helper(ss_ind_groups, not_ss_ind_groups)
    
    print("Num True Indices: " + str(num_true))
    print("Num False Indices: " +  str(num_false))
    print("Difference: " + str(abs(num_true - num_false)))
    print("Num Indices to Remove: " + str(len(ind_to_remove)))

    
    return ind_to_remove
    # return list of indices to NOT include

In [18]:
# Parameters:
#    "chunked_list" - list of ints/tuples representing either ss/not-ss regions that should be matched by corresponding
#                     ss/not-ss regions
#    "to_chunk_list" - list of regions representing regions with excess values that is matched to regions in chunked list
# This method is a helper method that returns a list of indices to remove in order to match the groups of secondary 
# structure and non-secondary structure
def remove_indices_helper(chunked_list, to_chunk_list):
    remainder = []
    count_to_remove = 0
    
#     print("chunked len: " + str(len(chunked_list)))

#     print("to_chunk len: " + str(len(to_chunk_list)))

    for chunk, to_chunk in zip(chunked_list, to_chunk_list): # zip goes through the smallest of the lists
        
        chunk_exp_list = expand_list(chunk)   
        to_chunk_exp_list = expand_list(to_chunk)
        
        if (len(chunk_exp_list) < len(to_chunk_exp_list)):
            remainder.append(to_chunk_exp_list[len(chunk_exp_list):]) # will add indices to remove to remainder list
        elif (len(chunk_exp_list) > len(to_chunk_exp_list)): 
            count_to_remove =  count_to_remove + (len(chunk_exp_list) - len(to_chunk_exp_list))
            
    if (len(chunked_list) > len(to_chunk_list)): #idk if this works
         count_to_remove =  count_to_remove + len(expand_list(chunked_list[-1]))
    
    
    remainder = list(itertools.chain.from_iterable(remainder))
    if (len(to_chunk_list) > len(chunked_list)):
        remainder_copy = remainder.copy()
        print("remainder before: " + str(len(remainder_copy)))
        remainder.extend(expand_list(to_chunk_list[-1]))
        print("remainder after: " + str(len(remainder)))
    
    
    remainder = delete_random_elems(remainder, count_to_remove)
    
    return remainder         
    # returns indices of values that are not to be included

In [19]:
# Parameters:
#    "val" - integer or tuple to be cast as a list
# This method is a helper method that either casts a single integer as a list or expands the range of a tuple
# (inclusive, inclusive)
def expand_list(val):
    val_list = []
    if isinstance(val, int):
        val_list.append(val)
    else:
        val_list = list(range(val[0], val[-1]))
        val_list.append(val[-1])
    
    return val_list

In [20]:
# https://www.codegrepper.com/code-examples/python/python+remove+n+random+elements+from+a+list
# Parameters:
#    "input_list" - list of values
#    "n" - number of random elements to delete from the list
# This method is a helper method that removes a given number of random elements from a list
def delete_random_elems(input_list, n):
    to_delete = set(random.sample(range(len(input_list)), n))
    return [x for i,x in enumerate(input_list) if not i in to_delete]

In [21]:
# determining the ranges of false values 
# https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list

# Parameters:
#    "int_indexes" - list containing a location values for a protein
# This method is a helper method which determines consecutive values in list in order to group regions of 
# secondary structure and non-secondary structure. It returns a list with integers and tuples (inclusive, inclusive)
# representing where a given type of region starts and stops in the protein.
def find_index_range(int_indexes):
    for segment in mit.consecutive_groups(int_indexes):
        segment = list(segment)
        if len(segment) == 1:
            yield segment[0] # yield is like return, except that it
                             # retains state to enable function to resume where
                             # it left off (sequenve of vals vs. 1)
        else:
            yield segment[0], segment[-1]

## Pab1

Formatting Pab1 Data to Split Dataset into Values in Secondary Structure and NOT in Secondary Structure

In [100]:
# NOTE - stride files + jupyter notebook in winter dir.

In [101]:
path = "../PDB and STRIDE Files/" + 'pab1_stride.txt'
pab1_stride_file = open(path, 'r')

In [102]:
pab1_ss_indexes = get_sec_struc_boolean(pab1_stride_file) # boolean list of secondary structure assignements

In [103]:
print(len(pab1_ss_indexes)) # <- domain is 75 AA long
print(pab1_ss_indexes)

75
[False, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, False, False, False, True, True, True, True, True, True, True, True, False, False, False, False, False, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, False, False, True, True, True, True, True, True, False]


In [104]:
# number of mutations not in secondary structure
count_false = pab1_ss_indexes.count(False)
print(count_false)
count_true = pab1_ss_indexes.count(True)
print(count_true)

23
52


Getting Alpha Helices and Beta Sheets Datasets

In [26]:
# pab1_alpha_indices = get_alpha_boolean(pab1_stride_file)

In [27]:
# # alpha helices 
# # pab1_alpha_indices = get_alpha_boolean(pab1_stride_file)
# is_alpha = pab1_alpha_indices.count(True)
# not_alpha = pab1_alpha_indices.count(False)
# print(is_alpha)
# print(not_alpha)

In [29]:
# print(get_excluded_res(pab1_alpha_indices))

In [30]:
# # beta sheets
# pab1_beta_indices = get_beta_boolean(pab1_stride_file)

In [31]:
# is_beta = pab1_beta_indices.count(True)
# not_beta = pab1_beta_indices.count(False)
# print(is_beta)
# print(not_beta)

- Pab1 has 23 Mutations not in Secondary Structure, so limiting Number of Secondary Structure Mutations to 23

In [32]:
# # index of 23rd true

# highest_true_index = [i for i, n in enumerate(pab1_ss_indexes) if n == True][23]
# print(highest_true_index)
# # need list of indices in secondary structure past this index in order to remove them from dataset

# true_indices = [i for i,val in enumerate(pab1_ss_indexes) if val==True]
# print(true_indices)

# not_included_pab1 = [i for i in true_indices if i > 39]
# print(not_included_pab1)

In [105]:
# changing not included to matching secondary structure + random elements

not_included_pab1 = get_excluded_res(pab1_ss_indexes)

Num True Indices: 52
Num False Indices: 23
Difference: 29
Num Indices to Remove: 29


Limiting Number of Secondary Structure Mutations and Number in alpha helices versus out of it

### Sorting Pab1 Mutations Into 2 Datasets (w & w/o mutations)

In [117]:
# importing pab1 data from Gelman et al.
pab1_df1 = pd.read_csv("../Raw Data/pab1.tsv.txt", sep="\t")
pab1_df = pab1_df1.dropna()
print(len(pab1_df))
print(pab1_df.columns)

40852
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [118]:
# rounding score column to 2 decimal points
pab1_df["score"] = pab1_df["score"].round(6)
print(len(pab1_df))

# remove values with wildcard star thing cause idk what it means
pab1_df = pab1_df[pab1_df["variant"].str.contains("\*") == False]

# pab1_df = pab1_df.head(37600)
print(len(pab1_df))

40852
37710


In [119]:
# split variant name into wild-type, position, and mutation type
pab1_mut = pab1_df["variant"].str.split(",")
pab1_df["WILD_TYPE_RES"] = get_wild_type(pab1_mut)
pab1_df["MUTATED_RES"] = get_mutation_type(pab1_mut)
pab1_df["POSITION"] = get_position(pab1_mut)
pab1_df["positions_split"] = get_positions_split(pab1_df)
# pab1_df["positions_split"] = positions_split_subtracted

positions_split_subtracted = []
for pos_list in pab1_df["positions_split"]:
    pos_list = [x - 126 for x in pos_list]
    positions_split_subtracted.append(pos_list)  

pab1_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pab1_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    # print(pos_string)
    new_positions.append(pos_string)
    pos_string = ""
# print(len(new_positions))
# print(len(pab1_df["POSITION"]))

pab1_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
pab1_df["variant"] = get_mutations_names_list(pab1_df)

In [120]:
print(pab1_df["positions_split"].head(10))

0    [0]
1    [0]
2    [0]
3    [0]
4    [0]
5    [0]
6    [0]
7    [0]
8    [0]
9    [1]
Name: positions_split, dtype: object


In [124]:
print(pab1_df["POSITION"].tail(10))

40842    25,38
40843    38,43
40844    33,38
40845    29,38
40846    38,40
40847    38,49
40848    38,48
40849    38,48
40850    38,47
40851    35,38
Name: POSITION, dtype: object


In [98]:
# print(pab1_df.head)
print(pab1_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION', 'positions_split'],
      dtype='object')


Moving rows with Secondary Structure position into a different dataframe

In [121]:
pab1_df["positions_split"] = get_positions_split(pab1_df)
print(pab1_df["positions_split"].head(20))

0     [0]
1     [0]
2     [0]
3     [0]
4     [0]
5     [0]
6     [0]
7     [0]
8     [0]
9     [1]
10    [1]
11    [1]
12    [1]
13    [1]
14    [1]
15    [1]
16    [1]
17    [1]
18    [1]
19    [1]
Name: positions_split, dtype: object


In [125]:
pab_in_domain_df = get_domain_dataset(pab1_df, 0, 75, not_included_pab1) # now that positions split has changed, domain should not matter

In [126]:
print(len(pab_in_domain_df))

24507


In [127]:
pab1_ss_df = get_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 0)
print(len(pab1_ss_df))
# 5828 values

7501


In [128]:
pab1_not_ss_df = get_not_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 0)
print(len(pab1_not_ss_df))

3927


1000 Value Test dataset in SS

In [129]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pab1_ss_1000_test_df = pab1_ss_df.sample(n=1000)

In [130]:
pab1_temp_df = pd.concat([pab1_ss_1000_test_df, pab1_ss_df])
print(len(pab1_temp_df))
pab1_ss_df = pab1_temp_df[~pab1_temp_df.index.duplicated(keep=False)]
print(len(pab1_ss_df))

8501
6501


1000 Value Test dataset not in SS

In [131]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pab1_not_ss_1000_test_df = pab1_not_ss_df.sample(n=1000)

In [132]:
pab1_temp_df = pd.concat([pab1_not_ss_1000_test_df, pab1_not_ss_df])
print(len(pab1_temp_df))
pab1_not_ss_df = pab1_temp_df[~pab1_temp_df.index.duplicated(keep=False)]
print(len(pab1_not_ss_df))

4927
2927


Training Data

In [133]:
pab1_ss_df_500_t1 = pab1_ss_df.sample(n=500)
pab1_ss_df_500_t2 = pab1_ss_df.sample(n=500)
pab1_ss_df_500_t3 = pab1_ss_df.sample(n=500)

In [134]:
pab1_ss_df_1000_t1 = pab1_ss_df.sample(n=1000)
pab1_ss_df_1000_t2 = pab1_ss_df.sample(n=1000)
pab1_ss_df_1000_t3 = pab1_ss_df.sample(n=1000)

In [None]:
pab1_ss_df_2000_t1 = pab1_ss_df.sample(n=2000)
pab1_ss_df_2000_t2 = pab1_ss_df.sample(n=2000)
pab1_ss_df_2000_t3 = pab1_ss_df.sample(n=2000)

In [135]:
pab1_not_ss_df_500_t1 = pab1_not_ss_df.sample(n=500)
pab1_not_ss_df_500_t2 = pab1_not_ss_df.sample(n=500)
pab1_not_ss_df_500_t3 = pab1_not_ss_df.sample(n=500)

In [136]:
pab1_not_ss_df_1000_t1 = pab1_not_ss_df.sample(n=1000)
pab1_not_ss_df_1000_t2 = pab1_not_ss_df.sample(n=1000)
pab1_not_ss_df_1000_t3 = pab1_not_ss_df.sample(n=1000)

In [None]:
pab1_not_ss_df_2000_t1 = pab1_not_ss_df.sample(n=2000)
pab1_not_ss_df_2000_t2 = pab1_not_ss_df.sample(n=2000)
pab1_not_ss_df_2000_t3 = pab1_not_ss_df.sample(n=2000)

### Putting Pab1 Datasets into Files

In [71]:
# protein_seq_pab1 = get_protein_seq("P04147")
# protein_seq_pab1_split = protein_seq_pab1.split()

In [74]:
# print(protein_seq_pab1)
# print(protein_seq_pab1_split[126])

MET ALA ASP ILE THR ASP LYS THR ALA GLU GLN LEU GLU ASN LEU ASN ILE GLN ASP ASP GLN LYS GLN ALA ALA THR GLY SER GLU SER GLN SER VAL GLU ASN SER SER ALA SER LEU TYR VAL GLY ASP LEU GLU PRO SER VAL SER GLU ALA HIS LEU TYR ASP ILE PHE SER PRO ILE GLY SER VAL SER SER ILE ARG VAL CYS ARG ASP ALA ILE THR LYS THR SER LEU GLY TYR ALA TYR VAL ASN PHE ASN ASP HIS GLU ALA GLY ARG LYS ALA ILE GLU GLN LEU ASN TYR THR PRO ILE LYS GLY ARG LEU CYS ARG ILE MET TRP SER GLN ARG ASP PRO SER LEU ARG LYS LYS GLY SER GLY ASN ILE PHE ILE LYS ASN LEU HIS PRO ASP ILE ASP ASN LYS ALA LEU TYR ASP THR PHE SER VAL PHE GLY ASP ILE LEU SER SER LYS ILE ALA THR ASP GLU ASN GLY LYS SER LYS GLY PHE GLY PHE VAL HIS PHE GLU GLU GLU GLY ALA ALA LYS GLU ALA ILE ASP ALA LEU ASN GLY MET LEU LEU ASN GLY GLN GLU ILE TYR VAL ALA PRO HIS LEU SER ARG LYS GLU ARG ASP SER GLN LEU GLU GLU THR LYS ALA HIS TYR THR ASN LEU TYR VAL LYS ASN ILE ASN SER GLU THR THR ASP GLU GLN PHE GLN GLU LEU PHE ALA LYS PHE GLY PRO ILE VAL SER ALA SER LEU 

In [137]:
string_seq_pab1 = "GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAP"
protein_seq_pab1 = get_expanded_seq(string_seq_pab1)
print(protein_seq_pab1)

GLY ASN ILE PHE ILE LYS ASN LEU HIS PRO ASP ILE ASP ASN LYS ALA LEU TYR ASP THR PHE SER VAL PHE GLY ASP ILE LEU SER SER LYS ILE ALA THR ASP GLU ASN GLY LYS SER LYS GLY PHE GLY PHE VAL HIS PHE GLU GLU GLU GLY ALA ALA LYS GLU ALA ILE ASP ALA LEU ASN GLY MET LEU LEU ASN GLY GLN GLU ILE TYR VAL ALA PRO


In [307]:
# NOTE - 3000 vals is actually 2880

In [138]:
pab1_ss_500_df_t1 = pd.concat([pab1_ss_df_500_t1, pab1_ss_1000_test_df])
pab1_not_ss_500_df_t1 = pd.concat([pab1_not_ss_df_500_t1, pab1_not_ss_1000_test_df])
pab1_ss_500_df_t2 = pd.concat([pab1_ss_df_500_t2, pab1_ss_1000_test_df])
pab1_not_ss_500_df_t2 = pd.concat([pab1_not_ss_df_500_t2, pab1_not_ss_1000_test_df])
pab1_ss_500_df_t3 = pd.concat([pab1_ss_df_500_t3, pab1_ss_1000_test_df])
# print(len(pab1_ss_500_df_t3))
pab1_not_ss_500_df_t3 = pd.concat([pab1_not_ss_df_500_t3, pab1_not_ss_1000_test_df])
# print(len(pab1_not_ss_500_df_t3))

1500
1500


In [139]:
pab1_ss_1000_df_t1 = pd.concat([pab1_ss_df_1000_t1, pab1_ss_1000_test_df])
pab1_not_ss_1000_df_t1 = pd.concat([pab1_not_ss_df_1000_t1, pab1_not_ss_1000_test_df])
pab1_ss_1000_df_t2 = pd.concat([pab1_ss_df_1000_t2, pab1_ss_1000_test_df])
pab1_not_ss_1000_df_t2 = pd.concat([pab1_not_ss_df_1000_t2, pab1_not_ss_1000_test_df])
pab1_ss_1000_df_t3 = pd.concat([pab1_ss_df_1000_t3, pab1_ss_1000_test_df])
pab1_not_ss_1000_df_t3 = pd.concat([pab1_not_ss_df_1000_t3, pab1_not_ss_1000_test_df])

In [None]:
pab1_ss_2000_df_t1 = pd.concat([pab1_ss_df_2000_t1, pab1_ss_1000_test_df])
pab1_not_ss_2000_df_t1 = pd.concat([pab1_not_ss_df_2000_t1, pab1_not_ss_1000_test_df])
pab1_ss_2000_df_t2 = pd.concat([pab1_ss_df_2000_t2, pab1_ss_1000_test_df])
pab1_not_ss_2000_df_t2 = pd.concat([pab1_not_ss_df_2000_t2, pab1_not_ss_1000_test_df])
pab1_ss_2000_df_t3 = pd.concat([pab1_ss_df_2000_t3, pab1_ss_1000_test_df])
# print(len(pab1_ss_2000_df_t3))
pab1_not_ss_2000_df_t3 = pd.concat([pab1_not_ss_df_2000_t3, pab1_not_ss_1000_test_df])
# print(len(pab1_not_ss_2000_df_t3))

In [140]:
# write data to formatted txt file

write_data_file("pab1_MLformat_ss_500_train_1000_test_t1", protein_seq_pab1, pab1_ss_500_df_t1)
write_data_file("pab1_MLformat_not_ss_500_train_1000_test_t1", protein_seq_pab1, pab1_not_ss_500_df_t1)

Filename: pab1_MLformat_ss_500_train_1000_test_t1.txt
Filename: pab1_MLformat_not_ss_500_train_1000_test_t1.txt


## Bgl3

Formatting Bgl3 Data to Split Dataset into Values in Secondary Structure and NOT in Secondary Structure

In [61]:
path = "../PDB and STRIDE Files/" + 'bgl3_stride.txt'
bgl3_stride_file = open(path, 'r')

In [62]:
bgl3_ss_indexes = get_sec_struc_boolean(bgl3_stride_file)

In [63]:
print(len(bgl3_ss_indexes))
print(bgl3_ss_indexes)

501
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, True, True, True, True, True, False, False, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, False, False, False, False, False, True, True, True, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, True, True, True, True, True, False, False, True, True, True, True, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, False, False, False, False, True, True, True, True, True, True, True, False, False, False, False, False, True, True, True, True, True, True

Getting Alpha and Beta Indices

In [22]:
# bgl3_alpha_indices = get_alpha_boolean(bgl3_stride_file)

In [23]:
# bgl3_beta_indices = get_beta_boolean(bgl3_stride_file)

In [24]:
# is_alpha_bgl3 = bgl3_alpha_indices.count(True)
# not_alpha_bgl3 = bgl3_alpha_indices.count(False)
# print(is_alpha_bgl3)
# print(not_alpha_bgl3) # diff of 115

# is_beta_bgl3 = bgl3_beta_indices.count(True)
# not_beta_bgl3 = bgl3_beta_indices.count(False)
# print(is_beta_bgl3)
# # print(not_beta_bgl3) # diff of 343

In [25]:
# # get residues to exlude

# not_included_alpha_bgl3 = get_excluded_res(bgl3_alpha_indices)
# not_included_beta_bgl3 = get_excluded_res(bgl3_beta_indices)

In [26]:
# number of mutations in secondary structure (True), and not in secondary structure (False)
count_false = bgl3_ss_indexes.count(False)
print(count_false)

count_true = bgl3_ss_indexes.count(True)
print(count_true)

229
272


In [27]:
# # index of 416 true

# highest_true_index = [i for i, n in enumerate(bgl3_ss_indexes) if n == True][229]
# print(highest_true_index)
# # need list of indices past this index

# true_indices = [i for i,val in enumerate(bgl3_ss_indexes) if val==True]
# # print(true_indices)

# not_included_bgl3 = [i for i in true_indices if i > highest_true_index]
# # print(not_included_bgl3)

In [64]:
# changing not included to matching secondary structure + random elements
not_included_bgl3 = get_excluded_res(bgl3_ss_indexes)

Num True Indices: 272
Num False Indices: 229
Difference: 43
Num Indices to Remove: 43


In [65]:
# importing bgl3 data from Gelman et al.
bgl3_df1 = pd.read_csv("../Raw Data/bgl3.tsv.txt", sep="\t")
bgl3_df = bgl3_df1.dropna()
print(len(bgl3_df))
print(bgl3_df.columns)

26653
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [66]:
# rounding score column to 6 decimal points
bgl3_df["score"] = bgl3_df["score"].round(6)
print(len(bgl3_df))

# remove values with wildcard star
bgl3_df = bgl3_df[bgl3_df["variant"].str.contains("\*") == False]
# bgl3_df = bgl3_df.head(25600)
print(len(bgl3_df))

26653
25737


In [67]:
# bgl3 protein sequence
string_seq = "MVPAAQQTAMAPDAALTFPEGFLWGSATASYQIEGAAAEDGRTPSIWDTYARTPGRVRNGDTGDVATDHYHRWREDVALMAELGLGAYRFSLAWPRIQPTGRGPALQKGLDFYRRLADELLAKGIQPVATLYHWDLPQELENAGGWPERATAERFAEYAAIAADALGDRVKTWTTLNEPWCSAFLGYGSGVHAPGRTDPVAALRAAHHLNLGHGLAVQALRDRLPADAQCSVTLNIHHVRPLTDSDADADAVRRIDALANRVFTGPMLQGAYPEDLVKDTAGLTDWSFVRDGDLRLAHQKLDFLGVNYYSPTLVSEADGSGTHNSDGHGRSAHSPWPGADRVAFHQPPGETTAMGWAVDPSGLYELLRRLSSDFPALPLVITENGAAFHDYADPEGNVNDPERIAYVRDHLAAVHRAIKDGSDVRGYFLWSLLDNFEWAHGYSKRFGAVYVDYPTGTRIPKASARWYAEVARTGVLPTAGDPNSSSVDKLAAALEHHHHHH"

In [68]:
print(len(string_seq))

501


In [69]:
protein_seq_bgl3 = get_expanded_seq(string_seq)
print(protein_seq_bgl3)

MET VAL PRO ALA ALA GLN GLN THR ALA MET ALA PRO ASP ALA ALA LEU THR PHE PRO GLU GLY PHE LEU TRP GLY SER ALA THR ALA SER TYR GLN ILE GLU GLY ALA ALA ALA GLU ASP GLY ARG THR PRO SER ILE TRP ASP THR TYR ALA ARG THR PRO GLY ARG VAL ARG ASN GLY ASP THR GLY ASP VAL ALA THR ASP HIS TYR HIS ARG TRP ARG GLU ASP VAL ALA LEU MET ALA GLU LEU GLY LEU GLY ALA TYR ARG PHE SER LEU ALA TRP PRO ARG ILE GLN PRO THR GLY ARG GLY PRO ALA LEU GLN LYS GLY LEU ASP PHE TYR ARG ARG LEU ALA ASP GLU LEU LEU ALA LYS GLY ILE GLN PRO VAL ALA THR LEU TYR HIS TRP ASP LEU PRO GLN GLU LEU GLU ASN ALA GLY GLY TRP PRO GLU ARG ALA THR ALA GLU ARG PHE ALA GLU TYR ALA ALA ILE ALA ALA ASP ALA LEU GLY ASP ARG VAL LYS THR TRP THR THR LEU ASN GLU PRO TRP CYS SER ALA PHE LEU GLY TYR GLY SER GLY VAL HIS ALA PRO GLY ARG THR ASP PRO VAL ALA ALA LEU ARG ALA ALA HIS HIS LEU ASN LEU GLY HIS GLY LEU ALA VAL GLN ALA LEU ARG ASP ARG LEU PRO ALA ASP ALA GLN CYS SER VAL THR LEU ASN ILE HIS HIS VAL ARG PRO LEU THR ASP SER ASP ALA ASP ALA ASP 

In [33]:
split = protein_seq_bgl3.split()
print(len(split))

501


In [70]:
# split variant name into wild-type, position, and mutation type
bgl3_mut = bgl3_df["variant"].str.split(",")
bgl3_df["WILD_TYPE_RES"] = get_wild_type(bgl3_mut)
bgl3_df["MUTATED_RES"] = get_mutation_type(bgl3_mut)
bgl3_df["POSITION"] = get_position(bgl3_mut)
bgl3_df["variant"] = get_mutations_names_list(bgl3_df)

In [71]:
bgl3_df["positions_split"] = get_positions_split(bgl3_df)

In [36]:
print(bgl3_df["positions_split"].head(5))

0         [104]
1    [104, 142]
2    [104, 152]
3    [104, 170]
4         [104]
Name: positions_split, dtype: object


In [120]:
print(bgl3_df.head(5))

          variant  num_mutations   inp    sel     score WILD_TYPE_RES  \
0          104GLU              1  90.0  248.0 -0.339828             A   
1  104GLU, 142GLU              2   0.0    5.0  1.047974           A,A   
2  104GLU, 152VAL              2   1.0    9.0  0.495906           A,E   
3  104GLU, 170ARG              2   0.0    7.0  1.358129           A,K   
4          104GLY              1  35.0   90.0 -0.414104             A   

  MUTATED_RES POSITION positions_split  in_domain  
0           E      104           [104]       True  
1         E,E  104,142      [104, 142]       True  
2         E,V  104,152      [104, 152]       True  
3         E,R  104,170      [104, 170]       True  
4           G      104           [104]       True  


In [74]:
bgl3_in_domain_df = get_domain_dataset(bgl3_df, 0, 550, not_included_bgl3) # ending is larger than sequence length bc. all mutations inside
print(len(bgl3_in_domain_df))

23768


In [None]:
print()

In [408]:
# bgl3_in_domain_alpha_df = get_domain_dataset(bgl3_df, 0, 550, not_included_alpha_bgl3)
# print(len(bgl3_in_domain_alpha_df))

18781


In [423]:
# bgl3_in_domain_beta_df = get_domain_dataset(bgl3_df, 0, 550, not_included_beta_bgl3)
# print(len(bgl3_in_domain_beta_df))

6915


In [38]:
# bgl3_alpha_df = get_ss_dataset(bgl3_in_domain_alpha_df, bgl3_alpha_indices, 0)
# print(len(bgl3_alpha_df))
# bgl3_alpha_df_2880 = bgl3_alpha_df.sample(n=2880)

In [39]:
# bgl3_beta_df = get_ss_dataset(bgl3_in_domain_beta_df, bgl3_beta_indices, 0)
# print(len(bgl3_beta_df))
# bgl3_beta_df_2880 = bgl3_beta_df.sample(n=800)

In [418]:
# bgl3_not_alpha_df = get_not_ss_dataset(bgl3_in_domain_alpha_df, bgl3_alpha_indices, 0)
# print(len(bgl3_not_alpha_df))
# bgl3_not_alpha_df_2880 = bgl3_not_alpha_df.sample(n=2880)

5979


In [128]:
bgl3_ss_df = get_ss_dataset(bgl3_in_domain_df, bgl3_ss_indexes, 0)
print(len(bgl3_ss_df))
# bgl3_ss_df_3000 = bgl3_ss_df.sample(n=2880)

<bound method NDFrame.head of              variant num_mutations    inp    sel     score WILD_TYPE_RES  \
0     116THR, 128THR             2    1.0   30.0  1.662341           A,A   
1     116THR, 118VAL             2    1.0    6.0  0.116416           A,E   
2     116THR, 122GLU             2    0.0    5.0  1.047974           A,K   
3     116THR, 125PRO             2    1.0    8.0  0.384680           A,Q   
4     116THR, 137ARG             2    0.0    9.0  1.594518           A,Q   
...              ...           ...    ...    ...       ...           ...   
6734   87PHE, 129ALA             2    0.0    8.0  1.483292           Y,T   
6735           87HIS             1  576.0  383.0 -1.757557             Y   
6736           87ASN             1  425.0  211.0 -2.048961             Y   
6737   87ASN, 136LEU             2    2.0    4.0 -0.762134           Y,P   
6738           87SER             1  116.0  260.0 -0.545209             Y   

     MUTATED_RES POSITION positions_split has_sec_str is_

In [134]:
print(bgl3_ss_df.columns)
print(bgl3_ss_df['is_not_sec_str'].value_counts())

Index(['variant', 'num_mutations', 'inp', 'sel', 'score', 'WILD_TYPE_RES',
       'MUTATED_RES', 'POSITION', 'positions_split', 'is_not_sec_str'],
      dtype='object')
False    6739
Name: is_not_sec_str, dtype: int64


In [79]:
bgl3_not_ss_df = get_not_ss_dataset(bgl3_in_domain_df, bgl3_ss_indexes, 0)
print(len(bgl3_not_ss_df))
# bgl3_not_ss_df_3000 = bgl3_not_ss_df.sample(n=2880)

7537


1000 Value Test dataset in SS

In [80]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
bgl3_ss_1000_test_df = bgl3_ss_df.sample(n=1000)

In [81]:
bgl3_temp_df = pd.concat([bgl3_ss_1000_test_df, bgl3_ss_df])
print(len(bgl3_temp_df))
bgl3_ss_df = bgl3_temp_df[~bgl3_temp_df.index.duplicated(keep=False)]
print(len(bgl3_ss_df))

7739
5739


1000 Value Test dataset not in SS

In [82]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
bgl3_not_ss_1000_test_df = bgl3_not_ss_df.sample(n=1000)

In [83]:
bgl3_temp_df = pd.concat([bgl3_not_ss_1000_test_df, bgl3_not_ss_df])
print(len(bgl3_temp_df))
bgl3_not_ss_df = bgl3_temp_df[~bgl3_temp_df.index.duplicated(keep=False)]
print(len(bgl3_not_ss_df))

8537
6537


Training Data

In [92]:
bgl3_ss_df_500_t1 = bgl3_ss_df.sample(n=500)
bgl3_ss_df_500_t2 = bgl3_ss_df.sample(n=500)
bgl3_ss_df_500_t3 = bgl3_ss_df.sample(n=500)

In [93]:
bgl3_ss_df_1000_t1 = bgl3_ss_df.sample(n=1000)
bgl3_ss_df_1000_t2 = bgl3_ss_df.sample(n=1000)
bgl3_ss_df_1000_t3 = bgl3_ss_df.sample(n=1000)

In [94]:
bgl3_ss_df_2000_t1 = bgl3_ss_df.sample(n=2000)
bgl3_ss_df_2000_t2 = bgl3_ss_df.sample(n=2000)
bgl3_ss_df_2000_t3 = bgl3_ss_df.sample(n=2000)

In [95]:
bgl3_ss_df_3000_t1 = bgl3_ss_df.sample(n=3000)
bgl3_ss_df_3000_t2 = bgl3_ss_df.sample(n=3000)
bgl3_ss_df_3000_t3 = bgl3_ss_df.sample(n=3000)

In [96]:
bgl3_not_ss_df_500_t1 = bgl3_not_ss_df.sample(n=500)
bgl3_not_ss_df_500_t2 = bgl3_not_ss_df.sample(n=500)
bgl3_not_ss_df_500_t3 = bgl3_not_ss_df.sample(n=500)

In [97]:
bgl3_not_ss_df_1000_t1 = bgl3_not_ss_df.sample(n=1000)
bgl3_not_ss_df_1000_t2 = bgl3_not_ss_df.sample(n=1000)
bgl3_not_ss_df_1000_t3 = bgl3_not_ss_df.sample(n=1000)

In [98]:
bgl3_not_ss_df_2000_t1 = bgl3_not_ss_df.sample(n=2000)
bgl3_not_ss_df_2000_t2 = bgl3_not_ss_df.sample(n=2000)
bgl3_not_ss_df_2000_t3 = bgl3_not_ss_df.sample(n=2000)

In [99]:
bgl3_not_ss_df_3000_t1 = bgl3_not_ss_df.sample(n=3000)
bgl3_not_ss_df_3000_t2 = bgl3_not_ss_df.sample(n=3000)
bgl3_not_ss_df_3000_t3 = bgl3_not_ss_df.sample(n=3000)

In [100]:
bgl3_ss_500_df_t1 = pd.concat([bgl3_ss_df_500_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_500_df_t1 = pd.concat([bgl3_not_ss_df_500_t1, bgl3_not_ss_1000_test_df])
bgl3_ss_500_df_t2 = pd.concat([bgl3_ss_df_500_t2, bgl3_ss_1000_test_df])
bgl3_not_ss_500_df_t2 = pd.concat([bgl3_not_ss_df_500_t2, bgl3_not_ss_1000_test_df])
bgl3_ss_500_df_t3 = pd.concat([bgl3_ss_df_500_t3, bgl3_ss_1000_test_df])
bgl3_not_ss_500_df_t3 = pd.concat([bgl3_not_ss_df_500_t3, bgl3_not_ss_1000_test_df])

In [101]:
bgl3_ss_1000_df_t1 = pd.concat([bgl3_ss_df_1000_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_1000_df_t1 = pd.concat([bgl3_not_ss_df_1000_t1, bgl3_not_ss_1000_test_df])
bgl3_ss_1000_df_t2 = pd.concat([bgl3_ss_df_1000_t2, bgl3_ss_1000_test_df])
bgl3_not_ss_1000_df_t2 = pd.concat([bgl3_not_ss_df_1000_t2, bgl3_not_ss_1000_test_df])
bgl3_ss_1000_df_t3 = pd.concat([bgl3_ss_df_1000_t3, bgl3_ss_1000_test_df])
bgl3_not_ss_1000_df_t3 = pd.concat([bgl3_not_ss_df_1000_t3, bgl3_not_ss_1000_test_df])

In [102]:
bgl3_ss_2000_df_t1 = pd.concat([bgl3_ss_df_2000_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_2000_df_t1 = pd.concat([bgl3_not_ss_df_2000_t1, bgl3_not_ss_1000_test_df])
bgl3_ss_2000_df_t2 = pd.concat([bgl3_ss_df_2000_t2, bgl3_ss_1000_test_df])
bgl3_not_ss_2000_df_t2 = pd.concat([bgl3_not_ss_df_2000_t2, bgl3_not_ss_1000_test_df])
bgl3_ss_2000_df_t3 = pd.concat([bgl3_ss_df_2000_t3, bgl3_ss_1000_test_df])
bgl3_not_ss_2000_df_t3 = pd.concat([bgl3_not_ss_df_2000_t3, bgl3_not_ss_1000_test_df])

In [103]:
bgl3_ss_3000_df_t1 = pd.concat([bgl3_ss_df_3000_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_3000_df_t1 = pd.concat([bgl3_not_ss_df_3000_t1, bgl3_not_ss_1000_test_df])
bgl3_ss_3000_df_t2 = pd.concat([bgl3_ss_df_3000_t2, bgl3_ss_1000_test_df])
bgl3_not_ss_3000_df_t2 = pd.concat([bgl3_not_ss_df_3000_t2, bgl3_not_ss_1000_test_df])
bgl3_ss_3000_df_t3 = pd.concat([bgl3_ss_df_3000_t3, bgl3_ss_1000_test_df])
bgl3_not_ss_3000_df_t3 = pd.concat([bgl3_not_ss_df_3000_t3, bgl3_not_ss_1000_test_df])

In [115]:
##### write data to formatted txt file

write_data_file("bgl3_MLformat_ss_3000_train_1000_test_t3", protein_seq_bgl3, bgl3_ss_3000_df_t3)
write_data_file("bgl3_MLformat_not_ss_3000_train_1000_test_t3", protein_seq_bgl3, bgl3_not_ss_3000_df_t3)

Filename: bgl3_MLformat_ss_3000_train_1000_test_t3.txt
Filename: bgl3_MLformat_not_ss_3000_train_1000_test_t3.txt


## Ube4B

In [161]:
path = "../PDB and STRIDE Files/" + 'ube4b_stride.txt'
ube4b_stride_file = open(path, 'r')

In [162]:
ube4b_ss_indexes = get_sec_struc_boolean(ube4b_stride_file)

In [163]:
print(len(ube4b_ss_indexes))
print(ube4b_ss_indexes)

102
[False, False, False, False, False, False, False, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, False, False, False, False, True, True, False, False, False, True, True, True, True, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, True, True, False, False, False, False, True, True, False, False, False, False, False, False, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False]


In [201]:
# # index of 23rd true

# highest_true_index = [i for i, n in enumerate(ube4b_ss_indexes) if n == True][49]
# print(highest_true_index)
# # need list of indices past this index

# true_indices = [i for i,val in enumerate(ube4b_ss_indexes) if val==True]
# print(true_indices)

# not_included_ube4b = [i for i in true_indices if i > highest_true_index]
# # [x for x in a if x <= 5]
# print(not_included_ube4b)

95
[7, 8, 9, 10, 11, 12, 13, 29, 30, 31, 32, 33, 38, 39, 43, 44, 45, 46, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 66, 67, 72, 73, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98]
[96, 97, 98]


Getting Alpha and Beta Indices Datasets

In [428]:
# ube4b_alpha_indices = get_alpha_boolean(ube4b_stride_file)

In [430]:
# ube4b_beta_indices = get_beta_boolean(ube4b_stride_file)

In [137]:
# not_included_alpha_ube4b = get_excluded_res(ube4b_alpha_indices)
# not_included_beta_ube4b = get_excluded_res(ube4b_beta_indices)

In [164]:
# changing not included to matching secondary structure + random elements
not_included_ube4b = get_excluded_res(ube4b_ss_indexes)

Num True Indices: 53
Num False Indices: 49
Difference: 4
Num Indices to Remove: 4


In [139]:
# number of mutations in secondary structure (True), and not in secondary structure (False)
count_false = ube4b_ss_indexes.count(False)
print(count_false)

count_true = ube4b_ss_indexes.count(True)
print(count_true)

49
53


In [165]:
# importing Ube4b data from Gelman et al.
ube4b_df1 = pd.read_csv("../Raw Data/ube4b.tsv.txt", sep="\t")
ube4b_df = ube4b_df1.dropna()
print(len(ube4b_df))
print(ube4b_df.columns)

98297
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [166]:
# rounding score column to 6 decimal points
ube4b_df["score"] = ube4b_df["score"].round(6)
print(len(ube4b_df))

# remove values with wildcard star
ube4b_df = ube4b_df[ube4b_df["variant"].str.contains("\*") == False]
print(len(ube4b_df))

98297
91031


In [163]:
# protein_seq_ube4b = get_protein_seq("Q9ES00")
# split_entire = protein_seq_ube4b.split()
# # print(len(split_entire))

1173


In [146]:
string_seq = "IEKFKLLAEKVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDPFNRQMLTESMLEPVPELKEQIQAWMREKQSSDH"
print(len(string_seq))

102


In [167]:
protein_seq_ube4b = get_expanded_seq(string_seq)
print(protein_seq_ube4b)

ILE GLU LYS PHE LYS LEU LEU ALA GLU LYS VAL GLU GLU ILE VAL ALA LYS ASN ALA ARG ALA GLU ILE ASP TYR SER ASP ALA PRO ASP GLU PHE ARG ASP PRO LEU MET ASP THR LEU MET THR ASP PRO VAL ARG LEU PRO SER GLY THR VAL MET ASP ARG SER ILE ILE LEU ARG HIS LEU LEU ASN SER PRO THR ASP PRO PHE ASN ARG GLN MET LEU THR GLU SER MET LEU GLU PRO VAL PRO GLU LEU LYS GLU GLN ILE GLN ALA TRP MET ARG GLU LYS GLN SER SER ASP HIS


In [152]:
# split = protein_seq_ube4b_domain.split()
# print(len(split)) 
# print(split[96])

In [168]:
ube4b_mut = ube4b_df["variant"].str.split(",")

ube4b_df["WILD_TYPE_RES"] = get_wild_type(ube4b_mut)
ube4b_df["MUTATED_RES"] = get_mutation_type(ube4b_mut)
ube4b_df["POSITION"] = get_position(ube4b_mut)

ube4b_df["variant"] = get_mutations_names_list(ube4b_df)
print(ube4b_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION'],
      dtype='object')


In [153]:
ube4b_df["positions_split"] = get_positions_split(ube4b_df)

In [154]:
# ube4b_in_domain_df = get_domain_dataset(ube4b_df, 0, 1200)
ube4b_in_domain_df = get_domain_dataset(ube4b_df, 0, 2000, not_included_ube4b)
print(len(ube4b_in_domain_df))

87554


In [155]:
# ube4b_in_domain_alpha_df = get_domain_dataset(ube4b_df, 0, 2000, not_included_alpha_ube4b)
# print(len(ube4b_in_domain_alpha_df))

In [156]:
# ube4b_in_domain_beta_df = get_domain_dataset(ube4b_df, 0, 2000, not_included_beta_ube4b)
# print(len(ube4b_in_domain_beta_df))

In [169]:
ube4b_ss_df = get_ss_dataset(ube4b_in_domain_df, ube4b_ss_indexes, 0)
print(len(ube4b_ss_df))

13483


In [170]:
ube4b_not_ss_df = get_not_ss_dataset(ube4b_in_domain_df, ube4b_ss_indexes, 0)
print(len(ube4b_not_ss_df))

20301


In [449]:
# ube4b_alpha_df = get_ss_dataset(ube4b_in_domain_alpha_df, ube4b_alpha_indices, 0)
# print(len(ube4b_alpha_df))
# ube4b_alpha_df_2880 = ube4b_alpha_df.sample(n=2880)

6947


In [453]:
# ube4b_beta_df = get_ss_dataset(ube4b_in_domain_beta_df, ube4b_beta_indices, 0)
# print(len(ube4b_beta_df))
# ube4b_beta_df_800 = ube4b_beta_df.sample(n=800)

939


1000 Value Test dataset in SS

In [171]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
ube4b_ss_1000_test_df = ube4b_ss_df.sample(n=1000)

In [172]:
ube4b_temp_df = pd.concat([ube4b_ss_1000_test_df, ube4b_ss_df])
print(len(ube4b_temp_df))
ube4b_ss_df = ube4b_temp_df[~ube4b_temp_df.index.duplicated(keep=False)]
print(len(ube4b_ss_df))

14483
12483


1000 Value Test dataset not in SS

In [173]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
ube4b_not_ss_1000_test_df = ube4b_not_ss_df.sample(n=1000)

In [174]:
ube4b_temp_df = pd.concat([ube4b_not_ss_1000_test_df, ube4b_not_ss_df])
print(len(ube4b_temp_df))
ube4b_not_ss_df = ube4b_temp_df[~ube4b_temp_df.index.duplicated(keep=False)]
print(len(ube4b_not_ss_df))

21301
19301


Training Data

In [185]:
ube4b_ss_df_500_t1 = ube4b_ss_df.sample(n=500)
ube4b_ss_df_500_t2 = ube4b_ss_df.sample(n=500)
ube4b_ss_df_500_t3 = ube4b_ss_df.sample(n=500)

In [186]:
ube4b_ss_df_1000_t1 = ube4b_ss_df.sample(n=1000)
ube4b_ss_df_1000_t2 = ube4b_ss_df.sample(n=1000)
ube4b_ss_df_1000_t3 = ube4b_ss_df.sample(n=1000)

In [187]:
ube4b_ss_df_2000_t1 = ube4b_ss_df.sample(n=2000)
ube4b_ss_df_2000_t2 = ube4b_ss_df.sample(n=2000)
ube4b_ss_df_2000_t3 = ube4b_ss_df.sample(n=2000)

In [188]:
ube4b_ss_df_3000_t1 = ube4b_ss_df.sample(n=3000)
ube4b_ss_df_3000_t2 = ube4b_ss_df.sample(n=3000)
ube4b_ss_df_3000_t3 = ube4b_ss_df.sample(n=3000)

In [189]:
ube4b_not_ss_df_500_t1 = ube4b_not_ss_df.sample(n=500)
ube4b_not_ss_df_500_t2 = ube4b_not_ss_df.sample(n=500)
ube4b_not_ss_df_500_t3 = ube4b_not_ss_df.sample(n=500)

In [190]:
ube4b_not_ss_df_1000_t1 = ube4b_not_ss_df.sample(n=1000)
ube4b_not_ss_df_1000_t2 = ube4b_not_ss_df.sample(n=1000)
ube4b_not_ss_df_1000_t3 = ube4b_not_ss_df.sample(n=1000)

In [191]:
ube4b_not_ss_df_2000_t1 = ube4b_not_ss_df.sample(n=2000)
ube4b_not_ss_df_2000_t2 = ube4b_not_ss_df.sample(n=2000)
ube4b_not_ss_df_2000_t3 = ube4b_not_ss_df.sample(n=2000)

In [192]:
ube4b_not_ss_df_3000_t1 = ube4b_not_ss_df.sample(n=3000)
ube4b_not_ss_df_3000_t2 = ube4b_not_ss_df.sample(n=3000)
ube4b_not_ss_df_3000_t3 = ube4b_not_ss_df.sample(n=3000)

In [193]:
ube4b_ss_500_df_t1 = pd.concat([ube4b_ss_df_500_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_500_df_t1 = pd.concat([ube4b_not_ss_df_500_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_500_df_t2 = pd.concat([ube4b_ss_df_500_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_500_df_t2 = pd.concat([ube4b_not_ss_df_500_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_500_df_t3 = pd.concat([ube4b_ss_df_500_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_500_df_t3 = pd.concat([ube4b_not_ss_df_500_t3, ube4b_not_ss_1000_test_df])

In [194]:
ube4b_ss_1000_df_t1 = pd.concat([ube4b_ss_df_1000_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_1000_df_t1 = pd.concat([ube4b_not_ss_df_1000_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_1000_df_t2 = pd.concat([ube4b_ss_df_1000_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_1000_df_t2 = pd.concat([ube4b_not_ss_df_1000_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_1000_df_t3 = pd.concat([ube4b_ss_df_1000_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_1000_df_t3 = pd.concat([ube4b_not_ss_df_1000_t3, ube4b_not_ss_1000_test_df])

In [195]:
ube4b_ss_2000_df_t1 = pd.concat([ube4b_ss_df_2000_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_2000_df_t1 = pd.concat([ube4b_not_ss_df_2000_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_2000_df_t2 = pd.concat([ube4b_ss_df_2000_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_2000_df_t2 = pd.concat([ube4b_not_ss_df_2000_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_2000_df_t3 = pd.concat([ube4b_ss_df_2000_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_2000_df_t3 = pd.concat([ube4b_not_ss_df_2000_t3, ube4b_not_ss_1000_test_df])

In [196]:
ube4b_ss_3000_df_t1 = pd.concat([ube4b_ss_df_3000_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_3000_df_t1 = pd.concat([ube4b_not_ss_df_3000_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_3000_df_t2 = pd.concat([ube4b_ss_df_3000_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_3000_df_t2 = pd.concat([ube4b_not_ss_df_3000_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_3000_df_t3 = pd.concat([ube4b_ss_df_3000_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_3000_df_t3 = pd.concat([ube4b_not_ss_df_3000_t3, ube4b_not_ss_1000_test_df])

In [208]:
# write data to formatted txt file

write_data_file("ube4b_MLformat_ss_3000_train_1000_test_t3", protein_seq_ube4b, ube4b_ss_3000_df_t3)
write_data_file("ube4b_MLformat_not_ss_3000_train_1000_test_t3", protein_seq_ube4b, ube4b_not_ss_3000_df_t3)

Filename: ube4b_MLformat_ss_3000_train_1000_test_t3.txt
Filename: ube4b_MLformat_not_ss_3000_train_1000_test_t3.txt
