# Sorting Protein Mutations Using STRIDE

This notebook divides the dataset into seperate datasets depended on a mutations secondary structure assignment. It is used for Part 1 of the project.

In [3]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide
import random
import itertools
import more_itertools as mit

In [4]:
# setting jupyter notebook viewing options
max_rows = 1000
max_cols = 1000
pd.set_option("display.max_rows", max_rows, "display.max_columns", max_cols)

In [5]:
import secStrucFormatting as ssf

### Methods Used to Format Data

Formatting protein sequence into form for machine learning:

In [6]:
# parameters:
#      "uniprot_id" - string representing uniprot id of desired protein.
# This method uses a given uniprot id to query the uniprot data and 
# return a string respresention of the protein sequence. 
# E.g. MADIT
def get_protein_seq(uniprot_id):
    
    # importing fasta file from uniprot.org and getting protein sequence
    # taken from StackOverflow: 
    # https://stackoverflow.com/questions/52569622/protein-sequence-from-uniprot-protein-id-python
    url = "http://www.uniprot.org/uniprot/"
    complete_url = url + uniprot_id + ".fasta"
    response = requests.post(complete_url)
    data =''.join(response.text)
    sequence =StringIO(data)
    protein_seq=list(SeqIO.parse(sequence,'fasta'))

    # protein sequence as string (single-letter amino acids)
    string_seq = str(protein_seq[0].seq)
    
    # protein sequence w/ three-letter convention
    protein_seq = get_expanded_seq(string_seq)
    return protein_seq

Expanding protein sequence (1 letter AA -> 3 letter AA):

In [7]:
# parameter:
#      "seq" - string representing protein sequence in 1-letter convention.
# This method takes protein sequence string with 1-letter convention and returns
# a protein sequence with 3-letter convention.
# E.g. ADE -> ALA ASP GLU
def get_expanded_seq(seq):
    expanded_list = []
    split_seq = list(seq)
    for letter in split_seq:
        three_letter_abbr = Bio.PDB.Polypeptide.one_to_three(letter)
        expanded_list.append(three_letter_abbr)
    exanded_string = " ".join(expanded_list)
    return(exanded_string)

Returning index range of protein domain within protein:

In [8]:
# parameters: 
#      "full_protein_split" - list of amino acids in full protein in 3 letter convention.
#                             E.g. ["ALA", "GLY", "TYR"]
#      "domain_split" - list of amino acids in protein domain in 3 letter convention.
#                       E.g. ["ALA", "GLY"]
# This method prints the index of the given domain within the given protein.
# Starting value is inclusive and the ending value is exclusive. 
# E.g. [(0, 3)]
def get_index_range(full_protein_split, domain_split):
    indexes = []
    for i in range(len(full_protein_split)):
        if full_protein_split[i:i+len(domain_split)] == domain_split:
            indexes.append((i, i+len(domain_split)))
    print(indexes)
    indexes.clear()

Get variant in mutation-position form from wild-type-position-mutation form: (E.g. G126A -> 126ALA)

In [9]:
# parameter: 
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with wild-type residue (first letter) from variant.
def get_wild_type(split_mutation_column):
    wild_type_list = []
    w_letters = []
    for string in split_mutation_column:
        if "wild-type" in string[0]:
            wild_type = "wild_type"
        elif "-" in string[0] or len(string) == 0:
            wild_type = np.nan
        else:
            for val in string:
                mutation_name = val.strip(" ")
                w_letters.append(mutation_name[0])
                wild_type = ",".join(w_letters)
        wild_type_list.append(wild_type)
        w_letters.clear()
    return wild_type_list


# parameter: 
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with mutation residue (last letter) from variant.
def get_mutation_type(split_mutation_column):
    mutation_list = []
    m_letters = []
    for string in split_mutation_column:
        if "wild-type" in string[0]:
            mutation = "wild-type"
        elif "-" in string[0] or len(string) == 0:
            mutation = np.nan
        else:
            for val in string:
                mutation_name = val.strip(" ")
                m_letters.append(mutation_name[-1])
                mutation = ",".join(m_letters)
        mutation_list.append(mutation)
        m_letters.clear()
    return mutation_list


# parameter: 
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with the position of mutation (number) from variant.
def get_position(split_mutation_column):
    position_list = []
    p_letters = []
    for string in split_mutation_column:
        if "wild-type" in string[0]:
            position = "wild-type"
        elif "-" in string[0] or len(string) == 0:
            position = np.nan
        else:
            for val in string:
                mutation_name = val.strip(" ")
                p_letters.append(mutation_name[1:-1])
                position = ",".join(p_letters)
        position_list.append(position)
        p_letters.clear()
    return(position_list)


# parameter:
#      "df" - dataframe of protein data with "MUTATED_RES" and "POSITION" columns.
# This method returns a list with the correctly formatted variant (mutation-position form).
def get_mutations_names_list(df):
    formatted_list = []
    expanded_abbv = []
    for mutation, position in zip(df["MUTATED_RES"], df["POSITION"]):
        split_mutations = mutation.split(",")
        split_positions = position.split(",")
        if "wild-type" in split_mutations[0].lower() or "wild-type" in split_positions[0].lower():
            abbv_names = "WT"
        else:  
            for mut, pos in zip(split_mutations, split_positions):
                three_letter_mut = Bio.PDB.Polypeptide.one_to_three(mut.upper())
                position = str(int(pos))
                combined_name = position + three_letter_mut
                expanded_abbv.append(combined_name)
                abbv_names = ", ".join(expanded_abbv)
        expanded_abbv.clear()
        formatted_list.append(abbv_names)
    return(formatted_list)

Splits positions in intermediary "POSITION" column to help remove mutations with a certain position

In [10]:
# Parameters:
#      "df" - protein data dataframe with "POSITION" column 
# This method takes the position column in the dataframe and splits it in order
# to help remove or keep mutatations depending on their position.
def get_positions_split(df):
    position_list_split = []

    for item in df["POSITION"]:
        item = item.split(",") # splits positions into list
        int_item = [int(i) for i in item]
        position_list_split.append(int_item)
    
    return position_list_split

Getting Secondary Structure assignment from STRIDE file

In [11]:
# Parameters: 
#      "stride file" - stride file of protein
#      "is_sec_struc" - list of boolean values for each secondary structure value
#                       if it is, true, else false
# returns list of boolean values indicating if position is secondary strcuture or not
def get_sec_struc_boolean(stride_file):
    is_sec_struc = []
    sec_struc_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            sec_struc_assign.append(split_line[5])

    for sec_struc in sec_struc_assign:
        if (sec_struc =='C' or sec_struc =='T'):
            is_sec_struc.append(False)
        else:
            is_sec_struc.append(True)
            
    return is_sec_struc

In [12]:
# getting confidence values from data (ONLY FOR STRIDE FILES W/ ALPHAFOLD CONFIDENCE LEVELS)
# removes indices less than Very low (pLDDT < 50)

def get_low_confidence_indices(pdb_file):
    indices_to_remove = []
    confidence_num = []

    for line in pdb_file:
        if line.startswith('ATOM'):
            split_line = line.split();
            # confidence_num.append(split_line[10])
            confidence_num = float(split_line[10])
            if (confidence_num < 50.00):
                indices_to_remove.append(int(split_line[5]))

#     for val in confidence_num:
#        #  print(confidence_num)
#         if (float(val) < 50.00):
#             is_low_confidence.append(True)
#         else:
#             is_low_confidence.append(False)
#     # get indices that are low confidence
    
    return sorted(list(set(indices_to_remove)))

In [None]:
# def find(lst, a, b):
#     return [i for i, x in enumerate(lst) if x<a or x>b]

In [13]:
path = "../PDB and STRIDE Files/" + 'glucokinase.pdb'
glucokinase_pdb = open(path, 'r')

In [14]:
path = "../PDB and STRIDE Files/" + 'modifier_1.pdb.pdb'
modifier_1_pdb = open(path, 'r')

In [15]:
path = "../PDB and STRIDE Files/" + 'thermonuclease_pdb.txt'
nuclease_pdb = open(path, 'r')

In [16]:
print(get_low_confidence_indices(modifier_1_pdb)) # 1-17, 99-100

[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 99, 101]


In [17]:
print(get_low_confidence_indices(glucokinase_pdb)) # just take out end value from the pdb

[465]


In [18]:
print(get_low_confidence_indices(nuclease_pdb)) # 59-85, 226-231

[59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 226, 227, 228, 229, 230, 231]


Getting Dataset of Mutations within Domain in PDB File

In [19]:
# Parameters:
#      "orig_df" - 
#      "start" -
#      "end" - 
#      "not_included_list"
# This method does even more helpful stuff
def get_domain_dataset(orig_df, start, end, not_included_list):
    in_domain_list = []
    
    for val in orig_df["positions_split"]:
        for position in val:
            if not_included_list.count(position - start) == 0: # if value is not in the list of values to exclude
                if position >= start and position < end:
                    in_domain = True
                else:
                    in_domain = False
            else:
                in_domain = False
        in_domain_list.append(in_domain)
    
    orig_df['in_domain'] = in_domain_list
    # print(in_domain_list)
    condition = orig_df['in_domain'] == True
    rows = orig_df.loc[condition, :]
    
    in_domain_df = pd.DataFrame(columns=orig_df.columns)
    in_domain_df = in_domain_df.append(rows, ignore_index=True)
    in_domain_df = in_domain_df.drop(['in_domain'], axis=1)
    return in_domain_df

In [20]:
# Parameters:
#      "orig_df" - 
#      "start" -
#      "end" - 
#      "not_included_list"
# This method does even more helpful stuff
def get_domain_dataset_v2(orig_df, start, end, not_included_list):
    in_domain_list = []
    
    for val in orig_df["positions_split"]:
        all_vals_in_dom = []
        for position in val:
            if not_included_list.count(position - start) == 0: # if value is not in the list of values to exclude
                if position >= start and position < end:
                    all_vals_in_dom.append(True)
                else:
                    all_vals_in_dom.append(False)
            else:
                all_vals_in_dom.append(False)
    
        if (all_vals_in_dom.count(False) == 0):
            in_domain_list.append(True)
        else:
            in_domain_list.append(False)
        

    orig_df['in_domain'] = in_domain_list

    condition = orig_df['in_domain'] == True
    rows = orig_df.loc[condition, :]
    
    in_domain_df = pd.DataFrame(columns=orig_df.columns)
    in_domain_df = in_domain_df.append(rows, ignore_index=True)
    in_domain_df = in_domain_df.drop(['in_domain'], axis=1)
    return in_domain_df

In [39]:
# finally works the way it's supposed to (texting domain_2)

not_included_list = [1, 3, 5, 7, 9, 11, 13, 15]
positions_split = [[1,2],[2,1],[2,6],[9, 10, 11],[6, 36, 48],[1, 9],[2,6],[1,3],[3,7]]

df = pd.DataFrame()
df["positions_split"] = positions_split
print(df)
domain_df_2 = get_domain_dataset_v2(df, 0, 100, not_included_list)
print(domain_df_2)

  positions_split
0          [1, 2]
1          [2, 1]
2          [2, 6]
3     [9, 10, 11]
4     [6, 36, 48]
5          [1, 9]
6          [2, 6]
7          [1, 3]
8          [3, 7]
  positions_split
0          [2, 6]
1     [6, 36, 48]
2          [2, 6]


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [41]:
# finally works the way it's supposed to (testing ss and not ss)

ss_indexes = [False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True]
positions_split = [[1,2],[2,1],[2,6],[9, 10, 11],[6, 8, 10],[1, 9],[2,6],[1,3],[3,7]]
# ss_ind = [1, 3, 5, 7, 9, 11, 13, 15]
# not_ss_ind = [0, 2, 4, 6, 8, 10, 12, 14]

df = pd.DataFrame()
df["positions_split"] = positions_split
# print(df)
# print()
ss_df = get_ss_dataset(df, ss_indexes, 0)
not_ss_df = get_not_ss_dataset(df, ss_indexes, 0)
print(ss_df)
print(not_ss_df)

  positions_split  has_sec_str
0          [1, 2]        False
1          [2, 1]        False
2          [2, 6]        False
3     [9, 10, 11]        False
4      [6, 8, 10]        False
5          [1, 9]         True
6          [2, 6]        False
7          [1, 3]         True
8          [3, 7]         True
  positions_split
0          [1, 9]
1          [1, 3]
2          [3, 7]
  positions_split has_sec_str
0          [2, 6]       False
1      [6, 8, 10]       False
2          [2, 6]       False


  sec_str_df = sec_str_df.append(rows, ignore_index=True)
  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


Getting Dataset of Mutations _in_ Secondary Structures

In [24]:
# Parameters:
# - orig_df: original dataframe with all mutations and "positions_split" column which has mutation positions in split list
#            as ints
# - sec_st_df: new dataframe with all rows that have mutations in the secondary structure of protein
# - mixed_df: new dataframe with all rows that have mutations in both in and out of the secondary stucture of the protein
# - start: (inclusive) index where the domain of the protein in PDB file starts
# - end: (inclusive) index where the domain of the protein in PDB file ends
def get_ss_dataset(orig_df, bool_ss_list, domain_start_index):
    
    has_sec_str = []
    
    for val in orig_df["positions_split"]:
        # list of boolean values that are true if all mutation positions in line are sec. strc.
        all_pos_sec_struc = []
        
        for position in val:
            if (bool_ss_list[position - domain_start_index] == False): # line up ss_indexes w/ position
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)
        
        if (all_pos_sec_struc.count(False) == 0):
            has_only_sec_str = True
        else:
            has_only_sec_str = False
        
        has_sec_str.append(has_only_sec_str)
        all_pos_sec_struc.clear()
        
    orig_df['has_sec_str'] = has_sec_str
    print(orig_df)
    condition = orig_df['has_sec_str'] == True
    rows = orig_df.loc[condition, :]
    
    sec_str_df = pd.DataFrame(columns=orig_df.columns)
    sec_str_df = sec_str_df.append(rows, ignore_index=True)
    sec_str_df = sec_str_df.drop(['has_sec_str'], axis=1)
    orig_df = orig_df.drop(['has_sec_str'], axis=1)
    
    return sec_str_df

Getting Dataset of Mutations _not_ in Secondary Structures

In [25]:
def get_not_ss_dataset(orig_df, bool_ss_list, domain_start_index):
    is_not_sec_str = []
    
    for val in orig_df["positions_split"]:
        
        all_pos_sec_struc = []
        
        for position in val:
            # print(position - domain_start_index)
            # print(str(position) + " " + str(domain_start_index))
            if (bool_ss_list[position - domain_start_index] == False):
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)
    
        
        if (all_pos_sec_struc.count(True) == 0):
            has_no_sec_str = True
        else:
            has_no_sec_str = False
        
        is_not_sec_str.append(has_no_sec_str)
        all_pos_sec_struc.clear()
        
    orig_df['is_not_sec_str'] = is_not_sec_str
     
    condition = orig_df['is_not_sec_str'] == True
    rows = orig_df.loc[condition, :]
    
    not_sec_str_df = pd.DataFrame(columns=orig_df.columns)
    not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)
    not_sec_str_df = not_sec_str_df.drop(['is_not_sec_str'], axis=1)
    orig_df = orig_df.drop(['is_not_sec_str'], axis=1)
    
    return not_sec_str_df

Writing formatted data to txt file:

In [26]:
# parameters:
#      "txt_name" - desired name of formatted txt file for network. E.g. "pab1"
#      "protein_seq" - string of protein sequence in 3 letter convention. E.g. ALA GLU TYR
#      "df" - dataframe with cleaned protein data. Must contain "variant" and "score" 
#             columns.
# This method cleans the protein data and formats it into a txt that can be processed by the 
# network. It also prints the name of the file out for reference.
def write_data_file(txt_name, protein_seq, df):
    file_name = txt_name + ".txt"
    path_name = "../ML Script Data Files/" + file_name
    print("Filename: " + file_name)
    
    datafile = open(path_name, "w+")
    datafile.write(protein_seq + "\n")
    for index in range(len(df)-1):
        datafile.write(df["variant"].iloc[index] + ": " + str(df["score"].iloc[index]) + "\n")
    datafile.write(df["variant"].iloc[len(df) - 1] + ": " + str(df["score"].iloc[len(df) - 1]))
    datafile.close()

Getting dataset of mutations that are in alpha helices: (H, G, I)

In [27]:
# Parameters: 
#      "stride file" - stride file of protein
#      "is_sec_struc" - list of boolean values for each secondary structure value
#                       if it is, true, else false
# returns list of boolean values indicating if position is in an alpha helix or not
def get_alpha_boolean(stride_file):
    # print('hi')
    is_alpha = []
    alpha_assign = []

    for line in stride_file:
        # print(line)
        # print("why isn't this working")
        if line.startswith('ASG'):
            split_line = line.split();
            # print(split_line[5])
            alpha_assign.append(split_line[5])
    
#     print(alpha_assign)
    
    alpha_letters = ['H','G','I']
    for alpha in alpha_assign:
        if (alpha_letters.count(alpha) != 0):
            is_alpha.append(True)
        else:
            is_alpha.append(False)
    
#     print(alpha_assign)
#     print(is_alpha)
    
    return is_alpha

Getting dataset of mutations that are in beta sheets: (E, B or b)

In [20]:
def get_beta_boolean(stride_file):
    is_beta = []
    beta_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            beta_assign.append(split_line[5])
    
    beta_letters = ['E','B','b']
    for beta in beta_assign:
        if (beta_letters.count(beta) != 0):
            is_beta.append(True)
        else:
            is_beta.append(False)
    
#     print(beta_assign)
#     print(is_beta)
#     print(len(is_beta))
    
    return is_beta

Getting dataset of mutations that are turns: (T)

In [41]:
def get_turns_boolean(stride_file):
    is_turn = []
    turn_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            turn_assign.append(split_line[5])

    for turn in turn_assign:
        if (turn == "T"):
            is_turn.append(True)
        else:
            is_turn.append(False)
    
    print(turn_assign)
    print(is_turn)
    
    return is_turn

Getting dataset of mutations in secondary structure **including turns**

In [28]:
# Parameters: 
#      "stride file" - stride file of protein
#      "is_sec_struc" - list of boolean values for each secondary structure value
#                       if it is, true, else false
# returns list of boolean values indicating if position is secondary strcuture or not
def get_all_sec_struc_boolean(stride_file):
    is_sec_struc = []
    sec_struc_assign = []

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split();
            sec_struc_assign.append(split_line[5])

    for sec_struc in sec_struc_assign:
        if (sec_struc =='C'):
            is_sec_struc.append(False)
        else:
            is_sec_struc.append(True)
            
    return is_sec_struc

Matching Segments of Non Secondary Structure to Secondary Structure

In [29]:
# limit number of mutations to some number
# **use after get_domain dataset

# Parameters:
#    "indexes" - a boolean list indicating positions with secondary structure (True - in ss, False - not in ss)
# This method returns a list of indexes to exclude in order to match the number of positions in secondary structure
# and out of secondary structure
def get_excluded_res(indexes):
    
    # find the groups of secondary structure
    ss_ind = [i for i,val in enumerate(indexes) if val==True]
    ss_ind_groups = list(find_index_range(ss_ind))
    
    # find the groups of non secondary structure
    not_ss_ind = [i for i,val in enumerate(indexes) if val==False]
    not_ss_ind_groups = list(find_index_range(not_ss_ind))

    ind_to_remove = []
    
    num_false = indexes.count(False)
    num_true = indexes.count(True)
    
    if (num_false < num_true): #is mostly ss
        ind_to_remove = remove_indices_helper(not_ss_ind_groups, ss_ind_groups) # chunk with not_ss groups
    elif (num_false > num_true): # NOT mostly ss
        ind_to_remove = remove_indices_helper(ss_ind_groups, not_ss_ind_groups)
    
    print("Num True Indices: " + str(num_true))
    print("Num False Indices: " +  str(num_false))
    print("Difference: " + str(abs(num_true - num_false)))
    print("Num Indices to Remove: " + str(len(ind_to_remove)))

    
    return ind_to_remove
    # return list of indices to NOT include

In [30]:
# Parameters:
#    "chunked_list" - list of ints/tuples representing either ss/not-ss regions that should be matched by corresponding
#                     ss/not-ss regions
#    "to_chunk_list" - list of regions representing regions with excess values that is matched to regions in chunked list
# This method is a helper method that returns a list of indices to remove in order to match the groups of secondary 
# structure and non-secondary structure
def remove_indices_helper(chunked_list, to_chunk_list):
    remainder = []
    count_to_remove = 0
    
#     print("chunked len: " + str(len(chunked_list)))

#     print("to_chunk len: " + str(len(to_chunk_list)))

    for chunk, to_chunk in zip(chunked_list, to_chunk_list): # zip goes through the smallest of the lists
        
        chunk_exp_list = expand_list(chunk)   
        to_chunk_exp_list = expand_list(to_chunk)
        
        if (len(chunk_exp_list) < len(to_chunk_exp_list)):
            remainder.append(to_chunk_exp_list[len(chunk_exp_list):]) # will add indices to remove to remainder list
        elif (len(chunk_exp_list) > len(to_chunk_exp_list)): 
            count_to_remove =  count_to_remove + (len(chunk_exp_list) - len(to_chunk_exp_list))
            
    if (len(chunked_list) > len(to_chunk_list)): #idk if this works
         count_to_remove =  count_to_remove + len(expand_list(chunked_list[-1]))
    
    
    remainder = list(itertools.chain.from_iterable(remainder))
    if (len(to_chunk_list) > len(chunked_list)):
        remainder_copy = remainder.copy()
        print("remainder before: " + str(len(remainder_copy)))
        remainder.extend(expand_list(to_chunk_list[-1]))
        print("remainder after: " + str(len(remainder)))
    
    
    remainder = delete_random_elems(remainder, count_to_remove)
    
    return remainder         
    # returns indices of values that are not to be included

In [31]:
# Parameters:
#    "val" - integer or tuple to be cast as a list
# This method is a helper method that either casts a single integer as a list or expands the range of a tuple
# (inclusive, inclusive)
def expand_list(val):
    val_list = []
    if isinstance(val, int):
        val_list.append(val)
    else:
        val_list = list(range(val[0], val[-1]))
        val_list.append(val[-1])
    
    return val_list

In [32]:
# https://www.codegrepper.com/code-examples/python/python+remove+n+random+elements+from+a+list
# Parameters:
#    "input_list" - list of values
#    "n" - number of random elements to delete from the list
# This method is a helper method that removes a given number of random elements from a list
def delete_random_elems(input_list, n):
    to_delete = set(random.sample(range(len(input_list)), n))
    return [x for i,x in enumerate(input_list) if not i in to_delete]

In [33]:
# determining the ranges of false values 
# https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list

# Parameters:
#    "int_indexes" - list containing a location values for a protein
# This method is a helper method which determines consecutive values in list in order to group regions of 
# secondary structure and non-secondary structure. It returns a list with integers and tuples (inclusive, inclusive)
# representing where a given type of region starts and stops in the protein.
def find_index_range(int_indexes):
    for segment in mit.consecutive_groups(int_indexes):
        segment = list(segment)
        if len(segment) == 1:
            yield segment[0] # yield is like return, except that it
                             # retains state to enable function to resume where
                             # it left off (sequenve of vals vs. 1)
        else:
            yield segment[0], segment[-1]

In [34]:
# adds boolean column to dataframe to indicate whether value is in secondary structure
# needs positions split column
def add_sec_str_col(df, bool_ss_list, domain_start_index):
    has_sec_str = []
    for val in df["positions_split"]:
        # list of boolean values that are true if all mutation positions in line are sec. strc.
        all_pos_sec_struc = []

        for position in val:
            if (bool_ss_list[position - domain_start_index] == False):  # line up ss_indexes w/ position
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)

        # all pos sec struc should match val list
        # if there's a value in all_pos_sec_struc that's false, append false
        # otherwise, append true
        if (all_pos_sec_struc.count(False) == 0):
            has_only_sec_str = True
        else:
            has_only_sec_str = False

        has_sec_str.append(has_only_sec_str)
        all_pos_sec_struc.clear()

    # print(len(has_sec_str)) # should match dataframe length
    df['in_sec_str'] = has_sec_str
    return df

In [35]:
def get_seq_from_stride(stride_file):    
    aa_str = ""

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split()
            aa_str =  aa_str + split_line[1] + " "

    return aa_str.rstrip()

In [36]:
import re

def format_mavedb_variant(df, variant_col_name, offset):
    new_var_col = []
    for variant in df[variant_col_name]:
        wild_type = Bio.PDB.Polypeptide.three_to_one(variant[2:5].upper())
        position = int(re.findall("[0-9]+", variant)[0]) + offset
        mut_type = Bio.PDB.Polypeptide.three_to_one(variant[-3:].upper())
        new_var_col.append(wild_type + str(position) + mut_type)
    return new_var_col

In [37]:
# def remove_mixed_pos(df, bool_ss_list, offset):
#     is_completely_one_type = []
    
#     for val in orig_df["positions_split"]:
#         sec_str_assign = []
#         for position in val:
#             if bool_ss_list[position - offset] == 0: # if value is not in the list of values to exclude
#                 if position >= start and position < end:
#                     all_vals_in_dom.append(True)
#                 else:
#                     all_vals_in_dom.append(False)
#             else:
#                 all_vals_in_dom.append(False)
    
#         if (all_vals_in_dom.count(False) == 0):
#             in_domain_list.append(True)
#         else:
#             in_domain_list.append(False)
        

#     orig_df['in_domain'] = in_domain_list

#     condition = orig_df['in_domain'] == True
#     rows = orig_df.loc[condition, :]
    
#     in_domain_df = pd.DataFrame(columns=orig_df.columns)
#     in_domain_df = in_domain_df.append(rows, ignore_index=True)
#     in_domain_df = in_domain_df.drop(['in_domain'], axis=1)

## Pab1

Formatting Pab1 Data to Split Dataset into Values in Secondary Structure and NOT in Secondary Structure

In [31]:
# NOTE - stride files + jupyter notebook in winter dir.

In [197]:
path = "../PDB and STRIDE Files/" + 'pab1_stride.txt'
pab1_stride_file = open(path, 'r')

In [198]:
pab1_ss_indexes = get_all_sec_struc_boolean(pab1_stride_file) # w/ turns
# pab1_ss_indexes = get_sec_struc_boolean(pab1_stride_file) # boolean list of secondary structure assignements

In [34]:
print(len(pab1_ss_indexes)) # <- domain is 75 AA long
print(pab1_ss_indexes)

75
[False, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False]


In [35]:
# number of mutations not in secondary structure
count_false = pab1_ss_indexes.count(False)
print(count_false)
count_true = pab1_ss_indexes.count(True)
print(count_true)

7
68


Getting Alpha Helices and Beta Sheets Datasets

In [26]:
# pab1_alpha_indices = get_alpha_boolean(pab1_stride_file)

In [27]:
# # alpha helices 
# # pab1_alpha_indices = get_alpha_boolean(pab1_stride_file)
# is_alpha = pab1_alpha_indices.count(True)
# not_alpha = pab1_alpha_indices.count(False)
# print(is_alpha)
# print(not_alpha)

In [29]:
# print(get_excluded_res(pab1_alpha_indices))

In [30]:
# # beta sheets
# pab1_beta_indices = get_beta_boolean(pab1_stride_file)

In [31]:
# is_beta = pab1_beta_indices.count(True)
# not_beta = pab1_beta_indices.count(False)
# print(is_beta)
# print(not_beta)

- Pab1 has 23 Mutations not in Secondary Structure, so limiting Number of Secondary Structure Mutations to 23

In [32]:
# # index of 23rd true

# highest_true_index = [i for i, n in enumerate(pab1_ss_indexes) if n == True][23]
# print(highest_true_index)
# # need list of indices in secondary structure past this index in order to remove them from dataset

# true_indices = [i for i,val in enumerate(pab1_ss_indexes) if val==True]
# print(true_indices)

# not_included_pab1 = [i for i in true_indices if i > 39]
# print(not_included_pab1)

In [36]:
# changing not included to matching secondary structure + random elements

not_included_pab1 = get_excluded_res(pab1_ss_indexes)

Num True Indices: 68
Num False Indices: 7
Difference: 61
Num Indices to Remove: 61


Limiting Number of Secondary Structure Mutations and Number in alpha helices versus out of it

### Sorting Pab1 Mutations Into 2 Datasets (w & w/o mutations)

In [37]:
# importing pab1 data from Gelman et al.
pab1_df1 = pd.read_csv("../Raw Data/pab1.tsv.txt", sep="\t")
pab1_df = pab1_df1.dropna()
print(len(pab1_df))
print(pab1_df.columns)

40852
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [38]:
# rounding score column to 2 decimal points
pab1_df["score"] = pab1_df["score"].round(6)
print(len(pab1_df))

# remove values with wildcard star thing cause idk what it means
pab1_df = pab1_df[pab1_df["variant"].str.contains("\*") == False]

# pab1_df = pab1_df.head(37600)
print(len(pab1_df))

40852
37710


In [39]:
# split variant name into wild-type, position, and mutation type
pab1_mut = pab1_df["variant"].str.split(",")
pab1_df["WILD_TYPE_RES"] = get_wild_type(pab1_mut)
pab1_df["MUTATED_RES"] = get_mutation_type(pab1_mut)
pab1_df["POSITION"] = get_position(pab1_mut)
pab1_df["positions_split"] = get_positions_split(pab1_df)
# pab1_df["positions_split"] = positions_split_subtracted

positions_split_subtracted = []
for pos_list in pab1_df["positions_split"]:
    pos_list = [x - 126 for x in pos_list]
    positions_split_subtracted.append(pos_list)  

pab1_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pab1_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    # print(pos_string)
    new_positions.append(pos_string)
    pos_string = ""
# print(len(new_positions))
# print(len(pab1_df["POSITION"]))

pab1_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
pab1_df["variant"] = get_mutations_names_list(pab1_df)

In [213]:
print(pab1_df["positions_split"].head(10))

0    [0]
1    [0]
2    [0]
3    [0]
4    [0]
5    [0]
6    [0]
7    [0]
8    [0]
9    [1]
Name: positions_split, dtype: object


In [124]:
print(pab1_df["POSITION"].tail(10))

40842    25,38
40843    38,43
40844    33,38
40845    29,38
40846    38,40
40847    38,49
40848    38,48
40849    38,48
40850    38,47
40851    35,38
Name: POSITION, dtype: object


In [98]:
# print(pab1_df.head)
print(pab1_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION', 'positions_split'],
      dtype='object')


Moving rows with Secondary Structure position into a different dataframe

In [91]:
pab1_df["positions_split"] = get_positions_split(pab1_df)
print(pab1_df["positions_split"].head(20))

0     [0]
1     [0]
2     [0]
3     [0]
4     [0]
5     [0]
6     [0]
7     [0]
8     [0]
9     [1]
10    [1]
11    [1]
12    [1]
13    [1]
14    [1]
15    [1]
16    [1]
17    [1]
18    [1]
19    [1]
Name: positions_split, dtype: object


In [40]:
pab_in_domain_df = get_domain_dataset_v2(pab1_df, 0, 75, not_included_pab1) # now that positions split has changed, domain should not matter

  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [41]:
print(len(pab_in_domain_df))

1648


In [42]:
pab1_ss_df = get_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 0)
print(len(pab1_ss_df))

           variant num_mutations     score WILD_TYPE_RES MUTATED_RES POSITION  \
0             0ALA             1 -0.490571             G           A        0   
1             0CYS             1 -1.155127             G           C        0   
2             0ASP             1 -2.121218             G           D        0   
3             0GLU             1 -0.763836             G           E        0   
4             0ASN             1 -0.557593             G           N        0   
...            ...           ...       ...           ...         ...      ...   
1643  38ASN, 39PHE             2 -0.503557           K,S         N,F    38,39   
1644  26LEU, 38ASN             2 -1.027143           I,K         L,N    26,38   
1645  29ARG, 38ASN             2 -2.859673           S,K         R,N    29,38   
1646  25GLU, 38ASN             2 -0.351314           D,K         E,N    25,38   
1647  29GLY, 38ASN             2 -0.338770           S,K         G,N    29,38   

     positions_split  has_s

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [43]:
pab1_not_ss_df = get_not_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 0)
print(len(pab1_not_ss_df))

362


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


1000 Value Test dataset in SS

In [199]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pab1_ss_50_test_df = pab1_ss_df.sample(n=50)

In [200]:
pab1_temp_df = pd.concat([pab1_ss_50_test_df, pab1_ss_df])
print(len(pab1_temp_df))
pab1_ss_df = pab1_temp_df[~pab1_temp_df.index.duplicated(keep=False)]
print(len(pab1_ss_df))

573
473


1000 Value Test dataset not in SS

In [201]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pab1_not_ss_50_test_df = pab1_not_ss_df.sample(n=50)

In [202]:
pab1_temp_df = pd.concat([pab1_not_ss_50_test_df, pab1_not_ss_df])
print(len(pab1_temp_df))
pab1_not_ss_df = pab1_temp_df[~pab1_temp_df.index.duplicated(keep=False)]
print(len(pab1_not_ss_df))

412
312


Training Data

In [None]:
# diff test set for 50 vals

In [41]:
pab1_ss_df_50_t1 = pab1_ss_df.sample(n=50)
pab1_ss_df_50_t2 = pab1_ss_df.sample(n=50)
pab1_ss_df_50_t3 = pab1_ss_df.sample(n=50)

In [203]:
pab1_ss_df_200_t1 = pab1_ss_df.sample(n=200)
pab1_ss_df_200_t2 = pab1_ss_df.sample(n=200)
pab1_ss_df_200_t3 = pab1_ss_df.sample(n=200)

In [68]:
pab1_ss_df_500_t1 = pab1_ss_df.sample(n=500)
# pab1_ss_df_500_t2 = pab1_ss_df.sample(n=500)
# pab1_ss_df_500_t3 = pab1_ss_df.sample(n=500)

In [134]:
pab1_ss_df_1000_t1 = pab1_ss_df.sample(n=1000)
pab1_ss_df_1000_t2 = pab1_ss_df.sample(n=1000)
pab1_ss_df_1000_t3 = pab1_ss_df.sample(n=1000)

In [146]:
pab1_ss_df_2000_t1 = pab1_ss_df.sample(n=2000)
pab1_ss_df_2000_t2 = pab1_ss_df.sample(n=2000)
pab1_ss_df_2000_t3 = pab1_ss_df.sample(n=2000)

In [42]:
pab1_not_ss_df_50_t1 = pab1_not_ss_df.sample(n=50)
pab1_not_ss_df_50_t2 = pab1_not_ss_df.sample(n=50)
pab1_not_ss_df_50_t3 = pab1_not_ss_df.sample(n=50)

In [204]:
pab1_not_ss_df_200_t1 = pab1_not_ss_df.sample(n=200)
pab1_not_ss_df_200_t2 = pab1_not_ss_df.sample(n=200)
pab1_not_ss_df_200_t3 = pab1_not_ss_df.sample(n=200)

In [69]:
pab1_not_ss_df_500_t1 = pab1_not_ss_df.sample(n=500)
# pab1_not_ss_df_500_t2 = pab1_not_ss_df.sample(n=500)
# pab1_not_ss_df_500_t3 = pab1_not_ss_df.sample(n=500)

In [136]:
pab1_not_ss_df_1000_t1 = pab1_not_ss_df.sample(n=1000)
pab1_not_ss_df_1000_t2 = pab1_not_ss_df.sample(n=1000)
pab1_not_ss_df_1000_t3 = pab1_not_ss_df.sample(n=1000)

In [147]:
pab1_not_ss_df_2000_t1 = pab1_not_ss_df.sample(n=2000)
pab1_not_ss_df_2000_t2 = pab1_not_ss_df.sample(n=2000)
pab1_not_ss_df_2000_t3 = pab1_not_ss_df.sample(n=2000)

### Putting Pab1 Datasets into Files

In [71]:
# protein_seq_pab1 = get_protein_seq("P04147")
# protein_seq_pab1_split = protein_seq_pab1.split()

In [74]:
# print(protein_seq_pab1)
# print(protein_seq_pab1_split[126])

MET ALA ASP ILE THR ASP LYS THR ALA GLU GLN LEU GLU ASN LEU ASN ILE GLN ASP ASP GLN LYS GLN ALA ALA THR GLY SER GLU SER GLN SER VAL GLU ASN SER SER ALA SER LEU TYR VAL GLY ASP LEU GLU PRO SER VAL SER GLU ALA HIS LEU TYR ASP ILE PHE SER PRO ILE GLY SER VAL SER SER ILE ARG VAL CYS ARG ASP ALA ILE THR LYS THR SER LEU GLY TYR ALA TYR VAL ASN PHE ASN ASP HIS GLU ALA GLY ARG LYS ALA ILE GLU GLN LEU ASN TYR THR PRO ILE LYS GLY ARG LEU CYS ARG ILE MET TRP SER GLN ARG ASP PRO SER LEU ARG LYS LYS GLY SER GLY ASN ILE PHE ILE LYS ASN LEU HIS PRO ASP ILE ASP ASN LYS ALA LEU TYR ASP THR PHE SER VAL PHE GLY ASP ILE LEU SER SER LYS ILE ALA THR ASP GLU ASN GLY LYS SER LYS GLY PHE GLY PHE VAL HIS PHE GLU GLU GLU GLY ALA ALA LYS GLU ALA ILE ASP ALA LEU ASN GLY MET LEU LEU ASN GLY GLN GLU ILE TYR VAL ALA PRO HIS LEU SER ARG LYS GLU ARG ASP SER GLN LEU GLU GLU THR LYS ALA HIS TYR THR ASN LEU TYR VAL LYS ASN ILE ASN SER GLU THR THR ASP GLU GLN PHE GLN GLU LEU PHE ALA LYS PHE GLY PRO ILE VAL SER ALA SER LEU 

In [207]:
string_seq_pab1 = "GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAP"
protein_seq_pab1 = get_expanded_seq(string_seq_pab1)
print(protein_seq_pab1)

GLY ASN ILE PHE ILE LYS ASN LEU HIS PRO ASP ILE ASP ASN LYS ALA LEU TYR ASP THR PHE SER VAL PHE GLY ASP ILE LEU SER SER LYS ILE ALA THR ASP GLU ASN GLY LYS SER LYS GLY PHE GLY PHE VAL HIS PHE GLU GLU GLU GLY ALA ALA LYS GLU ALA ILE ASP ALA LEU ASN GLY MET LEU LEU ASN GLY GLN GLU ILE TYR VAL ALA PRO


In [307]:
# NOTE - 3000 vals is actually 2880

In [70]:
pab1_ss_50_df_t1 = pd.concat([pab1_ss_df_50_t1, pab1_ss_1000_test_df])
pab1_not_ss_50_df_t1 = pd.concat([pab1_not_ss_df_50_t1, pab1_not_ss_1000_test_df])
# pab1_ss_50_df_t2 = pd.concat([pab1_ss_df_50_t2, pab1_ss_1000_test_df])
# pab1_not_ss_50_df_t2 = pd.concat([pab1_not_ss_df_50_t2, pab1_not_ss_1000_test_df])
# pab1_ss_50_df_t3 = pd.concat([pab1_ss_df_50_t3, pab1_ss_1000_test_df])
# # print(len(pab1_ss_50_df_t3))
# pab1_not_ss_50_df_t3 = pd.concat([pab1_not_ss_df_50_t3, pab1_not_ss_1000_test_df])
# # print(len(pab1_not_ss_50_df_t3))

In [205]:
pab1_ss_200_df_t1 = pd.concat([pab1_ss_df_200_t1, pab1_ss_50_test_df])
pab1_not_ss_200_df_t1 = pd.concat([pab1_not_ss_df_200_t1, pab1_ss_50_test_df])
pab1_ss_200_df_t2 = pd.concat([pab1_ss_df_200_t2, pab1_ss_50_test_df])
pab1_not_ss_200_df_t2 = pd.concat([pab1_not_ss_df_200_t2, pab1_ss_50_test_df])
pab1_ss_200_df_t3 = pd.concat([pab1_ss_df_200_t3, pab1_ss_50_test_df])
pab1_not_ss_200_df_t3 = pd.concat([pab1_not_ss_df_200_t3, pab1_ss_50_test_df])

In [72]:
pab1_ss_500_df_t1 = pd.concat([pab1_ss_df_500_t1, pab1_ss_1000_test_df])
pab1_not_ss_500_df_t1 = pd.concat([pab1_not_ss_df_500_t1, pab1_not_ss_1000_test_df])
# pab1_ss_500_df_t2 = pd.concat([pab1_ss_df_500_t2, pab1_ss_1000_test_df])
# # pab1_not_ss_500_df_t2 = pd.concat([pab1_not_ss_df_500_t2, pab1_not_ss_1000_test_df])
# # pab1_ss_500_df_t3 = pd.concat([pab1_ss_df_500_t3, pab1_ss_1000_test_df])
# # # print(len(pab1_ss_500_df_t3))
# # pab1_not_ss_500_df_t3 = pd.concat([pab1_not_ss_df_500_t3, pab1_not_ss_1000_test_df])
# # # print(len(pab1_not_ss_500_df_t3))

In [139]:
pab1_ss_1000_df_t1 = pd.concat([pab1_ss_df_1000_t1, pab1_ss_1000_test_df])
pab1_not_ss_1000_df_t1 = pd.concat([pab1_not_ss_df_1000_t1, pab1_not_ss_1000_test_df])
pab1_ss_1000_df_t2 = pd.concat([pab1_ss_df_1000_t2, pab1_ss_1000_test_df])
pab1_not_ss_1000_df_t2 = pd.concat([pab1_not_ss_df_1000_t2, pab1_not_ss_1000_test_df])
pab1_ss_1000_df_t3 = pd.concat([pab1_ss_df_1000_t3, pab1_ss_1000_test_df])
pab1_not_ss_1000_df_t3 = pd.concat([pab1_not_ss_df_1000_t3, pab1_not_ss_1000_test_df])

In [148]:
pab1_ss_2000_df_t1 = pd.concat([pab1_ss_df_2000_t1, pab1_ss_1000_test_df])
pab1_not_ss_2000_df_t1 = pd.concat([pab1_not_ss_df_2000_t1, pab1_not_ss_1000_test_df])
pab1_ss_2000_df_t2 = pd.concat([pab1_ss_df_2000_t2, pab1_ss_1000_test_df])
pab1_not_ss_2000_df_t2 = pd.concat([pab1_not_ss_df_2000_t2, pab1_not_ss_1000_test_df])
pab1_ss_2000_df_t3 = pd.concat([pab1_ss_df_2000_t3, pab1_ss_1000_test_df])
# print(len(pab1_ss_2000_df_t3))
pab1_not_ss_2000_df_t3 = pd.concat([pab1_not_ss_df_2000_t3, pab1_not_ss_1000_test_df])
# print(len(pab1_not_ss_2000_df_t3))

In [228]:
# write data to formatted txt file


write_data_file("pab1_MLformat_ss_200_train_50_test_turns1", protein_seq_pab1, pab1_ss_200_df_t1)
write_data_file("pab1_MLformat_not_ss_200_train_50_test_turns1", protein_seq_pab1, pab1_not_ss_200_df_t1)
write_data_file("pab1_MLformat_ss_200_train_50_test_turns2", protein_seq_pab1, pab1_ss_200_df_t1)
write_data_file("pab1_MLformat_not_ss_200_train_50_test_turns2", protein_seq_pab1, pab1_not_ss_200_df_t1)
write_data_file("pab1_MLformat_ss_200_train_50_test_turns3", protein_seq_pab1, pab1_ss_200_df_t1)
write_data_file("pab1_MLformat_not_ss_200_train_50_test_turns3", protein_seq_pab1, pab1_not_ss_200_df_t1)
# write_data_file("pab1_MLformat_ss_200_train_50_test_v3", protein_seq_pab1, pab1_ss_200_df_t3)
# write_data_file("pab1_MLformat_not_ss_200_train_50_test_v3", protein_seq_pab1, pab1_not_ss_200_df_t3)

Filename: pab1_MLformat_ss_200_train_50_test_turns1.txt
Filename: pab1_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: pab1_MLformat_ss_200_train_50_test_turns2.txt
Filename: pab1_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: pab1_MLformat_ss_200_train_50_test_turns3.txt
Filename: pab1_MLformat_not_ss_200_train_50_test_turns3.txt


## Bgl3

Formatting Bgl3 Data to Split Dataset into Values in Secondary Structure and NOT in Secondary Structure

In [44]:
path = "../PDB and STRIDE Files/" + 'bgl3_stride.txt'
bgl3_stride_file = open(path, 'r')

In [45]:
# bgl3_ss_indexes = get_sec_struc_boolean(bgl3_stride_file)
bgl3_ss_indexes = get_all_sec_struc_boolean(bgl3_stride_file)

In [46]:
print(len(bgl3_ss_indexes))
print(bgl3_ss_indexes)

501
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, False, True, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, 

In [120]:
ind = 0
false_ind = []
true_ind = []
for val in bgl3_ss_indexes:
    if val == False:
        false_ind.append(ind)
    else:
        true_ind.append(ind)
    ind+= 1

In [121]:
print(false_ind)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 27, 28, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 83, 84, 85, 91, 92, 97, 98, 99, 100, 101, 102, 103, 104, 105, 123, 124, 132, 133, 134, 135, 143, 144, 145, 146, 147, 167, 168, 169, 170, 177, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 223, 224, 225, 226, 227, 235, 236, 237, 241, 242, 243, 244, 269, 270, 271, 272, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 297, 298, 299, 300, 301, 307, 308, 309, 310, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 344, 345, 346, 347, 348, 349, 351, 352, 353, 354, 355, 357, 358, 373, 374, 375, 376, 377, 384, 385, 386, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 420, 421, 422, 429, 433, 434, 435, 441, 442, 443, 445, 452, 453, 454, 455, 460, 473, 476, 477, 478, 479, 480, 481, 482, 483, 484, 

In [126]:
str_exp = "("

for val in false_ind:
    str_exp += str(val) + "|"

In [127]:
print(str_exp)

(0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|17|18|19|20|21|27|28|34|35|36|37|38|39|40|41|42|43|52|53|54|55|56|60|61|62|63|64|65|66|67|68|69|70|83|84|85|91|92|97|98|99|100|101|102|103|104|105|123|124|132|133|134|135|143|144|145|146|147|167|168|169|170|177|188|189|190|191|192|193|194|195|196|197|223|224|225|226|227|235|236|237|241|242|243|244|269|270|271|272|280|281|282|283|284|285|286|287|288|289|290|291|297|298|299|300|301|307|308|309|310|314|315|316|317|318|319|320|321|322|323|324|325|326|327|328|329|330|331|332|333|334|335|336|337|338|339|340|341|342|344|345|346|347|348|349|351|352|353|354|355|357|358|373|374|375|376|377|384|385|386|388|389|390|391|392|393|394|395|396|397|398|399|420|421|422|429|433|434|435|441|442|443|445|452|453|454|455|460|473|476|477|478|479|480|481|482|483|484|485|486|487|488|489|490|491|492|493|494|495|496|497|498|499|500|


Getting Alpha and Beta Indices

In [22]:
# bgl3_alpha_indices = get_alpha_boolean(bgl3_stride_file)

In [23]:
# bgl3_beta_indices = get_beta_boolean(bgl3_stride_file)

In [24]:
# is_alpha_bgl3 = bgl3_alpha_indices.count(True)
# not_alpha_bgl3 = bgl3_alpha_indices.count(False)
# print(is_alpha_bgl3)
# print(not_alpha_bgl3) # diff of 115

# is_beta_bgl3 = bgl3_beta_indices.count(True)
# not_beta_bgl3 = bgl3_beta_indices.count(False)
# print(is_beta_bgl3)
# # print(not_beta_bgl3) # diff of 343

In [25]:
# # get residues to exlude

# not_included_alpha_bgl3 = get_excluded_res(bgl3_alpha_indices)
# not_included_beta_bgl3 = get_excluded_res(bgl3_beta_indices)

In [47]:
# number of mutations in secondary structure (True), and not in secondary structure (False)
count_false = bgl3_ss_indexes.count(False)
print(count_false)

count_true = bgl3_ss_indexes.count(True)
print(count_true)

130
371


In [27]:
# # index of 416 true

# highest_true_index = [i for i, n in enumerate(bgl3_ss_indexes) if n == True][229]
# print(highest_true_index)
# # need list of indices past this index

# true_indices = [i for i,val in enumerate(bgl3_ss_indexes) if val==True]
# # print(true_indices)

# not_included_bgl3 = [i for i in true_indices if i > highest_true_index]
# # print(not_included_bgl3)

In [48]:
# changing not included to matching secondary structure + random elements
not_included_bgl3 = get_excluded_res(bgl3_ss_indexes)

Num True Indices: 371
Num False Indices: 130
Difference: 241
Num Indices to Remove: 241


In [49]:
# importing bgl3 data from Gelman et al.
bgl3_df1 = pd.read_csv("../Raw Data/bgl3.tsv.txt", sep="\t")
bgl3_df = bgl3_df1.dropna()
print(len(bgl3_df))
print(bgl3_df.columns)

26653
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [50]:
# rounding score column to 6 decimal points
bgl3_df["score"] = bgl3_df["score"].round(6)
print(len(bgl3_df))

# remove values with wildcard star
bgl3_df = bgl3_df[bgl3_df["variant"].str.contains("\*") == False]
# bgl3_df = bgl3_df.head(25600)
print(len(bgl3_df))

26653
25737


In [209]:
# bgl3 protein sequence
string_seq = "MVPAAQQTAMAPDAALTFPEGFLWGSATASYQIEGAAAEDGRTPSIWDTYARTPGRVRNGDTGDVATDHYHRWREDVALMAELGLGAYRFSLAWPRIQPTGRGPALQKGLDFYRRLADELLAKGIQPVATLYHWDLPQELENAGGWPERATAERFAEYAAIAADALGDRVKTWTTLNEPWCSAFLGYGSGVHAPGRTDPVAALRAAHHLNLGHGLAVQALRDRLPADAQCSVTLNIHHVRPLTDSDADADAVRRIDALANRVFTGPMLQGAYPEDLVKDTAGLTDWSFVRDGDLRLAHQKLDFLGVNYYSPTLVSEADGSGTHNSDGHGRSAHSPWPGADRVAFHQPPGETTAMGWAVDPSGLYELLRRLSSDFPALPLVITENGAAFHDYADPEGNVNDPERIAYVRDHLAAVHRAIKDGSDVRGYFLWSLLDNFEWAHGYSKRFGAVYVDYPTGTRIPKASARWYAEVARTGVLPTAGDPNSSSVDKLAAALEHHHHHH"

In [68]:
print(len(string_seq))

501


In [210]:
protein_seq_bgl3 = get_expanded_seq(string_seq)
print(protein_seq_bgl3)

MET VAL PRO ALA ALA GLN GLN THR ALA MET ALA PRO ASP ALA ALA LEU THR PHE PRO GLU GLY PHE LEU TRP GLY SER ALA THR ALA SER TYR GLN ILE GLU GLY ALA ALA ALA GLU ASP GLY ARG THR PRO SER ILE TRP ASP THR TYR ALA ARG THR PRO GLY ARG VAL ARG ASN GLY ASP THR GLY ASP VAL ALA THR ASP HIS TYR HIS ARG TRP ARG GLU ASP VAL ALA LEU MET ALA GLU LEU GLY LEU GLY ALA TYR ARG PHE SER LEU ALA TRP PRO ARG ILE GLN PRO THR GLY ARG GLY PRO ALA LEU GLN LYS GLY LEU ASP PHE TYR ARG ARG LEU ALA ASP GLU LEU LEU ALA LYS GLY ILE GLN PRO VAL ALA THR LEU TYR HIS TRP ASP LEU PRO GLN GLU LEU GLU ASN ALA GLY GLY TRP PRO GLU ARG ALA THR ALA GLU ARG PHE ALA GLU TYR ALA ALA ILE ALA ALA ASP ALA LEU GLY ASP ARG VAL LYS THR TRP THR THR LEU ASN GLU PRO TRP CYS SER ALA PHE LEU GLY TYR GLY SER GLY VAL HIS ALA PRO GLY ARG THR ASP PRO VAL ALA ALA LEU ARG ALA ALA HIS HIS LEU ASN LEU GLY HIS GLY LEU ALA VAL GLN ALA LEU ARG ASP ARG LEU PRO ALA ASP ALA GLN CYS SER VAL THR LEU ASN ILE HIS HIS VAL ARG PRO LEU THR ASP SER ASP ALA ASP ALA ASP 

In [86]:
split = protein_seq_bgl3.split()
print(len(split))

501


In [53]:
# split variant name into wild-type, position, and mutation type
bgl3_mut = bgl3_df["variant"].str.split(",")
bgl3_df["WILD_TYPE_RES"] = get_wild_type(bgl3_mut)
bgl3_df["MUTATED_RES"] = get_mutation_type(bgl3_mut)
bgl3_df["POSITION"] = get_position(bgl3_mut)
bgl3_df["variant"] = get_mutations_names_list(bgl3_df)

In [54]:
bgl3_df["positions_split"] = get_positions_split(bgl3_df)
bgl3_df = add_sec_str_col(bgl3_df, bgl3_ss_indexes, 0)

# print(len(bgl3_df))
# col_length = add_sec_str_col(bgl3_df, bgl3_ss_indexes, 0)
# print(len(col_length))
# # print(col_length.head(10000))

In [55]:
print(bgl3_df["positions_split"].head(5))

0         [104]
1    [104, 142]
2    [104, 152]
3    [104, 170]
4         [104]
Name: positions_split, dtype: object


In [56]:
print(bgl3_df.head(5))

          variant  num_mutations   inp    sel     score WILD_TYPE_RES  \
0          104GLU              1  90.0  248.0 -0.339828             A   
1  104GLU, 142GLU              2   0.0    5.0  1.047974           A,A   
2  104GLU, 152VAL              2   1.0    9.0  0.495906           A,E   
3  104GLU, 170ARG              2   0.0    7.0  1.358129           A,K   
4          104GLY              1  35.0   90.0 -0.414104             A   

  MUTATED_RES POSITION positions_split  in_sec_str  
0           E      104           [104]       False  
1         E,E  104,142      [104, 142]       False  
2         E,V  104,152      [104, 152]       False  
3         E,R  104,170      [104, 170]       False  
4           G      104           [104]       False  


In [57]:
bgl3_in_domain_df = get_domain_dataset_v2(bgl3_df, 0, 550, not_included_bgl3) # ending is larger than sequence length bc. all mutations inside
print(len(bgl3_in_domain_df))

8632


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [None]:
print(len())

In [408]:
# bgl3_in_domain_alpha_df = get_domain_dataset(bgl3_df, 0, 550, not_included_alpha_bgl3)
# print(len(bgl3_in_domain_alpha_df))

18781


In [423]:
# bgl3_in_domain_beta_df = get_domain_dataset(bgl3_df, 0, 550, not_included_beta_bgl3)
# print(len(bgl3_in_domain_beta_df))

6915


In [38]:
# bgl3_alpha_df = get_ss_dataset(bgl3_in_domain_alpha_df, bgl3_alpha_indices, 0)
# print(len(bgl3_alpha_df))
# bgl3_alpha_df_2880 = bgl3_alpha_df.sample(n=2880)

In [39]:
# bgl3_beta_df = get_ss_dataset(bgl3_in_domain_beta_df, bgl3_beta_indices, 0)
# print(len(bgl3_beta_df))
# bgl3_beta_df_2880 = bgl3_beta_df.sample(n=800)

In [418]:
# bgl3_not_alpha_df = get_not_ss_dataset(bgl3_in_domain_alpha_df, bgl3_alpha_indices, 0)
# print(len(bgl3_not_alpha_df))
# bgl3_not_alpha_df_2880 = bgl3_not_alpha_df.sample(n=2880)

5979


In [58]:
bgl3_ss_df = get_ss_dataset(bgl3_in_domain_df, bgl3_ss_indexes, 0)
print(len(bgl3_ss_df))
print(bgl3_ss_df["in_sec_str"].value_counts())
# bgl3_ss_df_3000 = bgl3_ss_df.sample(n=2880)

             variant num_mutations    inp    sel     score WILD_TYPE_RES  \
0             104GLU             1   90.0  248.0 -0.339828             A   
1     104GLU, 170ARG             2    0.0    7.0  1.358129           A,K   
2             104GLY             1   35.0   90.0 -0.414104             A   
3             104PRO             1   32.0   56.0 -0.796920             A   
4             104SER             1  120.0  502.0  0.078025             A   
...              ...           ...    ...    ...       ...           ...   
8627           87ASN             1  425.0  211.0 -2.048961             Y   
8628    87ASN, 89LEU             2    1.0    5.0 -0.050638           Y,F   
8629   87ASN, 136LEU             2    2.0    4.0 -0.762134           Y,P   
8630           87SER             1  116.0  260.0 -0.545209             Y   
8631   87SER, 133ARG             2    2.0    4.0 -0.762134           Y,W   

     MUTATED_RES POSITION positions_split in_sec_str  has_sec_str  
0              E   

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [93]:
# print(bgl3_ss_df.columns)
# print(bgl3_ss_df['is_not_sec_str'].value_counts())

In [59]:
bgl3_not_ss_df = get_not_ss_dataset(bgl3_in_domain_df, bgl3_ss_indexes, 0)
print(len(bgl3_not_ss_df))
# bgl3_not_ss_df_3000 = bgl3_not_ss_df.sample(n=2880)

3450


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


50 Value Test dataset in SS

In [211]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
bgl3_ss_50_test_df = bgl3_ss_df.sample(n=50)

In [212]:
bgl3_temp_df = pd.concat([bgl3_ss_50_test_df, bgl3_ss_df])
print(len(bgl3_temp_df))
bgl3_ss_df = bgl3_temp_df[~bgl3_temp_df.index.duplicated(keep=False)]
print(len(bgl3_ss_df))

2206
2106


50 Value Test dataset not in SS

In [213]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
bgl3_not_ss_50_test_df = bgl3_not_ss_df.sample(n=50)

In [214]:
bgl3_temp_df = pd.concat([bgl3_not_ss_50_test_df, bgl3_not_ss_df])
print(len(bgl3_temp_df))
bgl3_not_ss_df = bgl3_temp_df[~bgl3_temp_df.index.duplicated(keep=False)]
print(len(bgl3_not_ss_df))

3500
3400


Training Data

In [101]:
bgl3_ss_df_500_t1 = bgl3_ss_df.sample(n=500)
# bgl3_ss_df_500_t2 = bgl3_ss_df.sample(n=500)
# bgl3_ss_df_500_t3 = bgl3_ss_df.sample(n=500)

In [215]:
bgl3_ss_df_200_t1 = bgl3_ss_df.sample(n=200)
bgl3_ss_df_200_t2 = bgl3_ss_df.sample(n=200)
bgl3_ss_df_200_t3 = bgl3_ss_df.sample(n=200)

In [94]:
bgl3_ss_df_2000_t1 = bgl3_ss_df.sample(n=2000)
bgl3_ss_df_2000_t2 = bgl3_ss_df.sample(n=2000)
bgl3_ss_df_2000_t3 = bgl3_ss_df.sample(n=2000)

In [95]:
bgl3_ss_df_3000_t1 = bgl3_ss_df.sample(n=3000)
bgl3_ss_df_3000_t2 = bgl3_ss_df.sample(n=3000)
bgl3_ss_df_3000_t3 = bgl3_ss_df.sample(n=3000)

In [102]:
bgl3_not_ss_df_500_t1 = bgl3_not_ss_df.sample(n=500)
# bgl3_not_ss_df_500_t2 = bgl3_not_ss_df.sample(n=500)
# bgl3_not_ss_df_500_t3 = bgl3_not_ss_df.sample(n=500)

In [216]:
bgl3_not_ss_df_200_t1 = bgl3_not_ss_df.sample(n=200)
bgl3_not_ss_df_200_t2 = bgl3_not_ss_df.sample(n=200)
bgl3_not_ss_df_200_t3 = bgl3_not_ss_df.sample(n=200)

In [98]:
bgl3_not_ss_df_2000_t1 = bgl3_not_ss_df.sample(n=2000)
bgl3_not_ss_df_2000_t2 = bgl3_not_ss_df.sample(n=2000)
bgl3_not_ss_df_2000_t3 = bgl3_not_ss_df.sample(n=2000)

In [99]:
bgl3_not_ss_df_3000_t1 = bgl3_not_ss_df.sample(n=3000)
bgl3_not_ss_df_3000_t2 = bgl3_not_ss_df.sample(n=3000)
bgl3_not_ss_df_3000_t3 = bgl3_not_ss_df.sample(n=3000)

In [103]:
bgl3_ss_500_df_t1 = pd.concat([bgl3_ss_df_500_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_500_df_t1 = pd.concat([bgl3_not_ss_df_500_t1, bgl3_not_ss_1000_test_df])
# bgl3_ss_500_df_t2 = pd.concat([bgl3_ss_df_500_t2, bgl3_ss_1000_test_df])
# bgl3_not_ss_500_df_t2 = pd.concat([bgl3_not_ss_df_500_t2, bgl3_not_ss_1000_test_df])
# bgl3_ss_500_df_t3 = pd.concat([bgl3_ss_df_500_t3, bgl3_ss_1000_test_df])
# bgl3_not_ss_500_df_t3 = pd.concat([bgl3_not_ss_df_500_t3, bgl3_not_ss_1000_test_df])

In [101]:
bgl3_ss_1000_df_t1 = pd.concat([bgl3_ss_df_1000_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_1000_df_t1 = pd.concat([bgl3_not_ss_df_1000_t1, bgl3_not_ss_1000_test_df])
bgl3_ss_1000_df_t2 = pd.concat([bgl3_ss_df_1000_t2, bgl3_ss_1000_test_df])
bgl3_not_ss_1000_df_t2 = pd.concat([bgl3_not_ss_df_1000_t2, bgl3_not_ss_1000_test_df])
bgl3_ss_1000_df_t3 = pd.concat([bgl3_ss_df_1000_t3, bgl3_ss_1000_test_df])
bgl3_not_ss_1000_df_t3 = pd.concat([bgl3_not_ss_df_1000_t3, bgl3_not_ss_1000_test_df])

In [217]:
bgl3_ss_200_df_t1 = pd.concat([bgl3_ss_df_200_t1, bgl3_ss_50_test_df])
bgl3_not_ss_200_df_t1 = pd.concat([bgl3_not_ss_df_200_t1, bgl3_not_ss_50_test_df])
bgl3_ss_200_df_t2 = pd.concat([bgl3_ss_df_200_t2, bgl3_ss_50_test_df])
bgl3_not_ss_200_df_t2 = pd.concat([bgl3_not_ss_df_200_t2, bgl3_not_ss_50_test_df])
bgl3_ss_200_df_t3 = pd.concat([bgl3_ss_df_200_t3, bgl3_ss_50_test_df])
bgl3_not_ss_200_df_t3 = pd.concat([bgl3_not_ss_df_200_t3, bgl3_not_ss_50_test_df])

In [103]:
bgl3_ss_3000_df_t1 = pd.concat([bgl3_ss_df_3000_t1, bgl3_ss_1000_test_df])
bgl3_not_ss_3000_df_t1 = pd.concat([bgl3_not_ss_df_3000_t1, bgl3_not_ss_1000_test_df])
bgl3_ss_3000_df_t2 = pd.concat([bgl3_ss_df_3000_t2, bgl3_ss_1000_test_df])
bgl3_not_ss_3000_df_t2 = pd.concat([bgl3_not_ss_df_3000_t2, bgl3_not_ss_1000_test_df])
bgl3_ss_3000_df_t3 = pd.concat([bgl3_ss_df_3000_t3, bgl3_ss_1000_test_df])
bgl3_not_ss_3000_df_t3 = pd.concat([bgl3_not_ss_df_3000_t3, bgl3_not_ss_1000_test_df])

In [298]:
##### write data to formatted txt file

write_data_file("bgl3_MLformat_ss_200_train_50_test_turns1", protein_seq_bgl3, bgl3_ss_200_df_t1)
# write_data_file("bgl3_MLformat_not_ss_200_train_50_test_turns1", protein_seq_bgl3, bgl3_not_ss_200_df_t1)
# write_data_file("bgl3_MLformat_ss_200_train_50_test_test_turns2", protein_seq_bgl3, bgl3_ss_200_df_t2)
# write_data_file("bgl3_MLformat_not_ss_200_train_50_test_test_turns2", protein_seq_bgl3, bgl3_not_ss_200_df_t2)
# write_data_file("bgl3_MLformat_ss_200_train_50_test_test_turns3", protein_seq_bgl3, bgl3_ss_200_df_t3)
# write_data_file("bgl3_MLformat_not_ss_200_train_50_test_test_turns3", protein_seq_bgl3, bgl3_not_ss_200_df_t3)

Filename: bgl3_MLformat_ss_200_train_50_test_turns1.txt


## Ube4B

In [219]:
path = "../PDB and STRIDE Files/" + 'ube4b_stride.txt'
ube4b_stride_file = open(path, 'r')

In [61]:
ube4b_ss_indexes = get_all_sec_struc_boolean(ube4b_stride_file)

In [62]:
print(len(ube4b_ss_indexes))
print(ube4b_ss_indexes)

102
[True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, True, True, True, False, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False]


Getting Alpha and Beta Indices Datasets

In [63]:
# changing not included to matching secondary structure + random elements
not_included_ube4b = get_excluded_res(ube4b_ss_indexes)

Num True Indices: 93
Num False Indices: 9
Difference: 84
Num Indices to Remove: 84


In [64]:
# number of mutations in secondary structure (True), and not in secondary structure (False)
count_false = ube4b_ss_indexes.count(False)
print(count_false)

count_true = ube4b_ss_indexes.count(True)
print(count_true)

9
93


In [65]:
# importing Ube4b data from Gelman et al.
ube4b_df1 = pd.read_csv("../Raw Data/ube4b.tsv.txt", sep="\t")
ube4b_df = ube4b_df1.dropna()
print(len(ube4b_df))
print(ube4b_df.columns)

98297
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [66]:
# rounding score column to 6 decimal points
ube4b_df["score"] = ube4b_df["score"].round(6)
print(len(ube4b_df))

# remove values with wildcard star
ube4b_df = ube4b_df[ube4b_df["variant"].str.contains("\*") == False]
print(len(ube4b_df))

98297
91031


In [67]:
string_seq = "IEKFKLLAEKVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDPFNRQMLTESMLEPVPELKEQIQAWMREKQSSDH"
print(len(string_seq))

102


In [68]:
protein_seq_ube4b = get_expanded_seq(string_seq)
print(protein_seq_ube4b)

ILE GLU LYS PHE LYS LEU LEU ALA GLU LYS VAL GLU GLU ILE VAL ALA LYS ASN ALA ARG ALA GLU ILE ASP TYR SER ASP ALA PRO ASP GLU PHE ARG ASP PRO LEU MET ASP THR LEU MET THR ASP PRO VAL ARG LEU PRO SER GLY THR VAL MET ASP ARG SER ILE ILE LEU ARG HIS LEU LEU ASN SER PRO THR ASP PRO PHE ASN ARG GLN MET LEU THR GLU SER MET LEU GLU PRO VAL PRO GLU LEU LYS GLU GLN ILE GLN ALA TRP MET ARG GLU LYS GLN SER SER ASP HIS


In [69]:
ube4b_mut = ube4b_df["variant"].str.split(",")

ube4b_df["WILD_TYPE_RES"] = get_wild_type(ube4b_mut)
ube4b_df["MUTATED_RES"] = get_mutation_type(ube4b_mut)
ube4b_df["POSITION"] = get_position(ube4b_mut)

ube4b_df["variant"] = get_mutations_names_list(ube4b_df)
print(ube4b_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION'],
      dtype='object')


In [70]:
ube4b_df["positions_split"] = get_positions_split(ube4b_df)
ube4b_df = add_sec_str_col(ube4b_df, ube4b_ss_indexes, 0)

In [71]:
# ube4b_in_domain_df = get_domain_dataset(ube4b_df, 0, 1200)
ube4b_in_domain_df = get_domain_dataset_v2(ube4b_df, 0, 2000, not_included_ube4b)
print(len(ube4b_in_domain_df))

2449


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [72]:
ube4b_ss_df = get_ss_dataset(ube4b_in_domain_df, ube4b_ss_indexes, 0)
print(len(ube4b_ss_df))

                   variant num_mutations     score WILD_TYPE_RES MUTATED_RES  \
0     16THR, 84ALA, 100ASN             3  4.261533         K,E,D       T,A,N   
1             82MET, 83THR             2  4.257914           V,P         M,T   
2             16MET, 75SER             2  4.216601           K,T         M,S   
3            29VAL, 100GLU             2  3.778562           D,D         V,E   
4             82LEU, 83ALA             2  3.699182           V,P         L,A   
...                    ...           ...       ...           ...         ...   
2444          15GLU, 28THR             2 -6.337782           A,P         E,T   
2445          16ASN, 74ARG             2 -5.986310           K,L         N,R   
2446          1VAL, 101LEU             2 -5.986310           E,H         V,L   
2447         85PRO, 100ASN             2 -6.128329           L,D         P,N   
2448   16ARG, 85GLN, 99GLY             3 -6.257612         K,L,S       R,Q,G   

       POSITION positions_split in_sec_

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [73]:
ube4b_not_ss_df = get_not_ss_dataset(ube4b_in_domain_df, ube4b_ss_indexes, 0)
print(len(ube4b_not_ss_df))

641


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


50 Value Test dataset in SS

In [220]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
ube4b_ss_50_test_df = ube4b_ss_df.sample(n=50)

In [221]:
ube4b_temp_df = pd.concat([ube4b_ss_50_test_df, ube4b_ss_df])
print(len(ube4b_temp_df))
ube4b_ss_df = ube4b_temp_df[~ube4b_temp_df.index.duplicated(keep=False)]
print(len(ube4b_ss_df))

581
481


50 Value Test dataset not in SS

In [222]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
ube4b_not_ss_50_test_df = ube4b_not_ss_df.sample(n=50)

In [223]:
ube4b_temp_df = pd.concat([ube4b_not_ss_50_test_df, ube4b_not_ss_df])
print(len(ube4b_temp_df))
ube4b_not_ss_df = ube4b_temp_df[~ube4b_temp_df.index.duplicated(keep=False)]
print(len(ube4b_not_ss_df))

691
591


Training Data

In [224]:
ube4b_ss_df_200_t1 = ube4b_ss_df.sample(n=200)
ube4b_ss_df_200_t2 = ube4b_ss_df.sample(n=200)
ube4b_ss_df_200_t3 = ube4b_ss_df.sample(n=200)

In [186]:
ube4b_ss_df_1000_t1 = ube4b_ss_df.sample(n=1000)
ube4b_ss_df_1000_t2 = ube4b_ss_df.sample(n=1000)
ube4b_ss_df_1000_t3 = ube4b_ss_df.sample(n=1000)

In [187]:
ube4b_ss_df_2000_t1 = ube4b_ss_df.sample(n=2000)
ube4b_ss_df_2000_t2 = ube4b_ss_df.sample(n=2000)
ube4b_ss_df_2000_t3 = ube4b_ss_df.sample(n=2000)

In [188]:
ube4b_ss_df_3000_t1 = ube4b_ss_df.sample(n=3000)
ube4b_ss_df_3000_t2 = ube4b_ss_df.sample(n=3000)
ube4b_ss_df_3000_t3 = ube4b_ss_df.sample(n=3000)

In [225]:
ube4b_not_ss_df_200_t1 = ube4b_not_ss_df.sample(n=200)
ube4b_not_ss_df_200_t2 = ube4b_not_ss_df.sample(n=200)
ube4b_not_ss_df_200_t3 = ube4b_not_ss_df.sample(n=200)

In [190]:
ube4b_not_ss_df_1000_t1 = ube4b_not_ss_df.sample(n=1000)
ube4b_not_ss_df_1000_t2 = ube4b_not_ss_df.sample(n=1000)
ube4b_not_ss_df_1000_t3 = ube4b_not_ss_df.sample(n=1000)

In [191]:
ube4b_not_ss_df_2000_t1 = ube4b_not_ss_df.sample(n=2000)
ube4b_not_ss_df_2000_t2 = ube4b_not_ss_df.sample(n=2000)
ube4b_not_ss_df_2000_t3 = ube4b_not_ss_df.sample(n=2000)

In [192]:
ube4b_not_ss_df_3000_t1 = ube4b_not_ss_df.sample(n=3000)
ube4b_not_ss_df_3000_t2 = ube4b_not_ss_df.sample(n=3000)
ube4b_not_ss_df_3000_t3 = ube4b_not_ss_df.sample(n=3000)

In [226]:
ube4b_ss_200_df_t1 = pd.concat([ube4b_ss_df_200_t1, ube4b_ss_50_test_df])
ube4b_not_ss_200_df_t1 = pd.concat([ube4b_not_ss_df_200_t1, ube4b_not_ss_50_test_df])
ube4b_ss_200_df_t2 = pd.concat([ube4b_ss_df_200_t2, ube4b_ss_50_test_df])
ube4b_not_ss_200_df_t2 = pd.concat([ube4b_not_ss_df_200_t2, ube4b_not_ss_50_test_df])
ube4b_ss_200_df_t3 = pd.concat([ube4b_ss_df_200_t3, ube4b_ss_50_test_df])
ube4b_not_ss_200_df_t3 = pd.concat([ube4b_not_ss_df_200_t3, ube4b_not_ss_50_test_df])

In [194]:
ube4b_ss_1000_df_t1 = pd.concat([ube4b_ss_df_1000_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_1000_df_t1 = pd.concat([ube4b_not_ss_df_1000_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_1000_df_t2 = pd.concat([ube4b_ss_df_1000_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_1000_df_t2 = pd.concat([ube4b_not_ss_df_1000_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_1000_df_t3 = pd.concat([ube4b_ss_df_1000_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_1000_df_t3 = pd.concat([ube4b_not_ss_df_1000_t3, ube4b_not_ss_1000_test_df])

In [195]:
ube4b_ss_2000_df_t1 = pd.concat([ube4b_ss_df_2000_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_2000_df_t1 = pd.concat([ube4b_not_ss_df_2000_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_2000_df_t2 = pd.concat([ube4b_ss_df_2000_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_2000_df_t2 = pd.concat([ube4b_not_ss_df_2000_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_2000_df_t3 = pd.concat([ube4b_ss_df_2000_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_2000_df_t3 = pd.concat([ube4b_not_ss_df_2000_t3, ube4b_not_ss_1000_test_df])

In [196]:
ube4b_ss_3000_df_t1 = pd.concat([ube4b_ss_df_3000_t1, ube4b_ss_1000_test_df])
ube4b_not_ss_3000_df_t1 = pd.concat([ube4b_not_ss_df_3000_t1, ube4b_not_ss_1000_test_df])
ube4b_ss_3000_df_t2 = pd.concat([ube4b_ss_df_3000_t2, ube4b_ss_1000_test_df])
ube4b_not_ss_3000_df_t2 = pd.concat([ube4b_not_ss_df_3000_t2, ube4b_not_ss_1000_test_df])
ube4b_ss_3000_df_t3 = pd.concat([ube4b_ss_df_3000_t3, ube4b_ss_1000_test_df])
ube4b_not_ss_3000_df_t3 = pd.concat([ube4b_not_ss_df_3000_t3, ube4b_not_ss_1000_test_df])

In [229]:
# write data to formatted txt file

write_data_file("ube4b_MLformat_ss_200_train_50_test_turns1", protein_seq_ube4b, ube4b_ss_200_df_t1)
write_data_file("ube4b_MLformat_not_ss_200_train_50_test_turns1", protein_seq_ube4b, ube4b_not_ss_200_df_t1)
write_data_file("ube4b_MLformat_ss_200_train_50_test_turns2", protein_seq_ube4b, ube4b_ss_200_df_t2)
write_data_file("ube4b_MLformat_not_ss_200_train_50_test_turns2", protein_seq_ube4b, ube4b_not_ss_200_df_t2)
write_data_file("ube4b_MLformat_ss_200_train_50_test_turns3", protein_seq_ube4b, ube4b_ss_200_df_t3)
write_data_file("ube4b_MLformat_not_ss_200_train_50_test_turns3", protein_seq_ube4b, ube4b_not_ss_200_df_t3)

Filename: ube4b_MLformat_ss_200_train_50_test_turns1.txt
Filename: ube4b_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: ube4b_MLformat_ss_200_train_50_test_turns2.txt
Filename: ube4b_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: ube4b_MLformat_ss_200_train_50_test_turns3.txt
Filename: ube4b_MLformat_not_ss_200_train_50_test_turns3.txt


## avGFP

In [74]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'avgfp_stride.txt'
avgfp_stride_file = open(path, 'r')

In [75]:
avgfp_ss_indexes = get_all_sec_struc_boolean(avgfp_stride_file) # boolean list of secondary structure assignements

In [76]:
ss = avgfp_ss_indexes.count(True)
not_ss = avgfp_ss_indexes.count(False)
print(ss)
print(not_ss)

209
28


Formatting Data

In [77]:
# importing avGFP data from Gelman et al.
avgfp_df1 = pd.read_csv("../Raw Data/avgfp.tsv.txt", sep="\t")
avgfp_df = avgfp_df1.dropna()
print(len(avgfp_df))
print(avgfp_df.columns)

54024
Index(['variant', 'num_mutations', 'score', 'score_wt_norm'], dtype='object')


In [78]:
# rounding score column to 2 decimal points
avgfp_df["score"] = avgfp_df["score"].round(6)
print(len(avgfp_df))

# remove values with wildcard star thing cause idk what it means
avgfp_df = avgfp_df[avgfp_df["variant"].str.contains("\*") == False]

# pab1_df = pab1_df.head(37600)
# avgfp_df = avgfp_df.sample(n=160)
print(len(avgfp_df))

54024
51714


In [79]:
# getting dataset size to run

string_seq = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(len(string_seq)) 
protein_seq_avgfp = get_expanded_seq(string_seq)
print(protein_seq_avgfp)

237
SER LYS GLY GLU GLU LEU PHE THR GLY VAL VAL PRO ILE LEU VAL GLU LEU ASP GLY ASP VAL ASN GLY HIS LYS PHE SER VAL SER GLY GLU GLY GLU GLY ASP ALA THR TYR GLY LYS LEU THR LEU LYS PHE ILE CYS THR THR GLY LYS LEU PRO VAL PRO TRP PRO THR LEU VAL THR THR LEU SER TYR GLY VAL GLN CYS PHE SER ARG TYR PRO ASP HIS MET LYS GLN HIS ASP PHE PHE LYS SER ALA MET PRO GLU GLY TYR VAL GLN GLU ARG THR ILE PHE PHE LYS ASP ASP GLY ASN TYR LYS THR ARG ALA GLU VAL LYS PHE GLU GLY ASP THR LEU VAL ASN ARG ILE GLU LEU LYS GLY ILE ASP PHE LYS GLU ASP GLY ASN ILE LEU GLY HIS LYS LEU GLU TYR ASN TYR ASN SER HIS ASN VAL TYR ILE MET ALA ASP LYS GLN LYS ASN GLY ILE LYS VAL ASN PHE LYS ILE ARG HIS ASN ILE GLU ASP GLY SER VAL GLN LEU ALA ASP HIS TYR GLN GLN ASN THR PRO ILE GLY ASP GLY PRO VAL LEU LEU PRO ASP ASN HIS TYR LEU SER THR GLN SER ALA LEU SER LYS ASP PRO ASN GLU LYS ARG ASP HIS MET VAL LEU LEU GLU PHE VAL THR ALA ALA GLY ILE THR HIS GLY MET ASP GLU LEU TYR LYS


In [80]:
protein_seq_avgfp_split = protein_seq_avgfp.split()
print(len(protein_seq_avgfp_split))
print(protein_seq_avgfp_split[60])

# 165VAL -> ILE

237
THR


In [81]:
# splitting variant list if there are multiple mutations
avgfp_mut = avgfp_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
avgfp_df["WILD_TYPE_RES"] = get_wild_type(avgfp_mut)

# get mutated residue and place in seperate col
avgfp_df["MUTATED_RES"] = get_mutation_type(avgfp_mut)

# get position and place in seperate col
avgfp_df["POSITION"] = get_position(avgfp_mut)

# replace variant column with reformatted variant name
avgfp_df["variant"] = get_mutations_names_list(avgfp_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# avgfp_df = avgfp_df.drop(columns=to_drop)

In [82]:

# need positionssplit
avgfp_df["positions_split"] = get_positions_split(avgfp_df)

not_included_avgfp = get_excluded_res(avgfp_ss_indexes)
print(not_included_avgfp)
print(len(not_included_avgfp))
# add in_sec_str_col
avgfp_df = add_sec_str_col(avgfp_df, avgfp_ss_indexes, 0)

Num True Indices: 209
Num False Indices: 28
Difference: 181
Num Indices to Remove: 181
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 130, 131, 132, 133, 134, 135, 136, 137, 141, 142, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 159, 160, 161, 162, 163, 164, 165, 166, 168, 169, 170, 171, 172, 185, 198, 199, 200, 201, 202, 203, 204, 205, 206, 211, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235]
181


In [88]:
print(avgfp_df['in_sec_str'].value_counts())

True     31353
False    20361
Name: in_sec_str, dtype: int64


In [83]:
# avgfp_in_domain_df = get_domain_dataset(avgfp_df, 0, 1200)
avgfp_in_domain_df = get_domain_dataset_v2(avgfp_df, 0, 2000, not_included_avgfp)
print(len(avgfp_in_domain_df))

1173


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [84]:
avgfp_ss_df = get_ss_dataset(avgfp_in_domain_df, avgfp_ss_indexes, 0)
print(len(avgfp_ss_df))

                    variant num_mutations     score  score_wt_norm  \
0                    177PRO             1  1.618027      -2.101185   
1                    177SER             1  3.604242      -0.114970   
2                    177THR             1  3.629315      -0.089897   
3                    177VAL             1  3.691308      -0.027904   
4            177VAL, 188GLY             2  3.636740      -0.082472   
...                     ...           ...       ...            ...   
1168          72HIS, 212GLU             2  3.021625      -0.697587   
1169          72HIS, 103ASP             2  1.624842      -2.094370   
1170          72HIS, 157TYR             2  1.301030      -2.418182   
1171  72HIS, 175ARG, 196ILE             3  2.114350      -1.604862   
1172          72ASN, 175ARG             2  1.301030      -2.418182   

     WILD_TYPE_RES MUTATED_RES    POSITION positions_split in_sec_str  \
0                A           P         177           [177]       True   
1            

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [85]:
# not all of not in ss set bc. of mixed mutation numbers

avgfp_not_ss_df = get_not_ss_dataset(avgfp_in_domain_df, avgfp_ss_indexes, 0)
print(len(avgfp_not_ss_df))

392


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


50 Value Test dataset SS

In [230]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
avgfp_ss_50_test_df = avgfp_ss_df.sample(n=50)

In [231]:
avgfp_temp_df = pd.concat([avgfp_ss_50_test_df, avgfp_ss_df])
print(len(avgfp_temp_df))
avgfp_ss_df = avgfp_temp_df[~avgfp_temp_df.index.duplicated(keep=False)]
print(len(avgfp_ss_df))

314
214


50 Value Test dataset not in SS

In [232]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
avgfp_not_ss_50_test_df = avgfp_not_ss_df.sample(n=50)

In [233]:
avgfp_temp_df = pd.concat([avgfp_not_ss_50_test_df, avgfp_not_ss_df])
print(len(avgfp_temp_df))
avgfp_not_ss_df = avgfp_temp_df[~avgfp_temp_df.index.duplicated(keep=False)]
print(len(avgfp_not_ss_df))

442
342


Training Data

In [234]:
avgfp_ss_df_200_t1 = avgfp_ss_df.sample(n=200)
avgfp_ss_df_200_t2 = avgfp_ss_df.sample(n=200)
avgfp_ss_df_200_t3 = avgfp_ss_df.sample(n=200)

In [235]:
avgfp_not_ss_df_200_t1 = avgfp_not_ss_df.sample(n=200)
avgfp_not_ss_df_200_t2 = avgfp_not_ss_df.sample(n=200)
avgfp_not_ss_df_200_t3 = avgfp_not_ss_df.sample(n=200)

In [40]:
avgfp_ss_df_1000_t1 = avgfp_ss_df.sample(n=1000)
avgfp_ss_df_1000_t2 = avgfp_ss_df.sample(n=1000)
avgfp_ss_df_1000_t3 = avgfp_ss_df.sample(n=1000)

In [42]:
avgfp_not_ss_df_1000_t1 = avgfp_not_ss_df.sample(n=1000)
avgfp_not_ss_df_1000_t2 = avgfp_not_ss_df.sample(n=1000)
avgfp_not_ss_df_1000_t3 = avgfp_not_ss_df.sample(n=1000)

In [236]:
avgfp_ss_200_df_t1 = pd.concat([avgfp_ss_df_200_t1, avgfp_ss_50_test_df])
avgfp_not_ss_200_df_t1 = pd.concat([avgfp_not_ss_df_200_t1, avgfp_not_ss_50_test_df])
avgfp_ss_200_df_t2 = pd.concat([avgfp_ss_df_200_t2, avgfp_ss_50_test_df])
avgfp_not_ss_200_df_t2 = pd.concat([avgfp_not_ss_df_200_t2, avgfp_not_ss_50_test_df])
avgfp_ss_200_df_t3 = pd.concat([avgfp_ss_df_200_t3, avgfp_ss_50_test_df])
avgfp_not_ss_200_df_t3 = pd.concat([avgfp_not_ss_df_200_t3, avgfp_not_ss_50_test_df])

In [44]:
avgfp_ss_1000_df_t1 = pd.concat([avgfp_ss_df_1000_t1, avgfp_ss_1000_test_df])
avgfp_not_ss_1000_df_t1 = pd.concat([avgfp_not_ss_df_1000_t1, avgfp_not_ss_1000_test_df])
avgfp_ss_1000_df_t2 = pd.concat([avgfp_ss_df_1000_t2, avgfp_ss_1000_test_df])
avgfp_not_ss_1000_df_t2 = pd.concat([avgfp_not_ss_df_1000_t2, avgfp_not_ss_1000_test_df])
avgfp_ss_1000_df_t3 = pd.concat([avgfp_ss_df_1000_t3, avgfp_ss_1000_test_df])
avgfp_not_ss_1000_df_t3 = pd.concat([avgfp_not_ss_df_1000_t3, avgfp_not_ss_1000_test_df])

In [237]:
# write data to formatted txt file

write_data_file("avgfp_MLformat_ss_200_train_50_test_turns1", protein_seq_avgfp, avgfp_ss_200_df_t1)
write_data_file("avgfp_MLformat_not_ss_200_train_50_test_turns1", protein_seq_avgfp, avgfp_not_ss_200_df_t1)
write_data_file("avgfp_MLformat_ss_200_train_50_test_turns2", protein_seq_avgfp, avgfp_ss_200_df_t2)
write_data_file("avgfp_MLformat_not_ss_200_train_50_test_turns2", protein_seq_avgfp, avgfp_not_ss_200_df_t2)
write_data_file("avgfp_MLformat_ss_200_train_50_test_turns3", protein_seq_avgfp, avgfp_ss_200_df_t3)
write_data_file("avgfp_MLformat_not_ss_200_train_50_test_turns3", protein_seq_avgfp, avgfp_not_ss_200_df_t3)

Filename: avgfp_MLformat_ss_200_train_50_test_turns1.txt
Filename: avgfp_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: avgfp_MLformat_ss_200_train_50_test_turns2.txt
Filename: avgfp_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: avgfp_MLformat_ss_200_train_50_test_turns3.txt
Filename: avgfp_MLformat_not_ss_200_train_50_test_turns3.txt


## GB1

In [50]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gb1_stride.txt'
gb1_stride_file = open(path, 'r')

In [51]:
gb1_ss_indexes = get_all_sec_struc_boolean(gb1_stride_file) # boolean list of secondary structure assignements

In [52]:
ss = gb1_ss_indexes.count(True)
not_ss = gb1_ss_indexes.count(False)
print(ss)
print(not_ss)

47
9


In [53]:
# importing pab1 data from Gelman et al.
gb1_df1 = pd.read_csv("../Raw Data/gb1.tsv.txt", sep="\t")
gb1_df = gb1_df1.dropna()
print(len(gb1_df))
# gb1_df = gb1_df.sample(n=480)
print(gb1_df.columns)
gb1_df = gb1_df.sample(frac=1)

536084
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [54]:
# rounding score column to 2 decimal points
gb1_df["score"] = gb1_df["score"].round(6)
print(len(gb1_df))

# remove values with wildcard star thing cause idk what it means
gb1_df = gb1_df[gb1_df["variant"].str.contains("\*") == False]

# gb1_df = gb1_df.sample(n=40)
# pab1_df = pab1_df.head(37600)
print(len(gb1_df))

536084
536084


In [55]:
# getting dataset size to run

string_seq = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
print(len(string_seq)) # <- domain length of 75
protein_seq_gb1 = get_expanded_seq(string_seq)
print(protein_seq_gb1)

56
MET GLN TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA THR ALA GLU LYS VAL PHE LYS GLN TYR ALA ASN ASP ASN GLY VAL ASP GLY GLU TRP THR TYR ASP ASP ALA THR LYS THR PHE THR VAL THR GLU


In [56]:
# splitting variant list if there are multiple mutations
gb1_mut = gb1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
gb1_df["WILD_TYPE_RES"] = ssf.get_wild_type(gb1_mut)

# get mutated residue and place in seperate col
gb1_df["MUTATED_RES"] = ssf.get_mutation_type(gb1_mut)

# get position and place in seperate col
gb1_df["POSITION"] = ssf.get_position(gb1_mut)

# replace variant column with reformatted variant name
gb1_df["variant"] = ssf.get_mutations_names_list(gb1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# gb1_df = gb1_df.drop(columns=to_drop)

In [57]:
# need positionssplit
gb1_df["positions_split"] = ssf.get_positions_split(gb1_df)

# add in_sec_str_col
gb1_df = add_sec_str_col(gb1_df, gb1_ss_indexes, 0)

In [94]:
# gb1_in_domain_df = get_domain_dataset(gb1_df, 0, 1200)
not_included_gb1 = get_excluded_res(gb1_ss_indexes)
gb1_in_domain_df = get_domain_dataset_v2(gb1_df, 0, 2000, not_included_gb1)
print(len(gb1_in_domain_df))

Num True Indices: 47
Num False Indices: 9
Difference: 38
Num Indices to Remove: 38
49389


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [95]:
gb1_ss_df = get_ss_dataset(gb1_in_domain_df, gb1_ss_indexes, 0)
print(len(gb1_ss_df))

            variant num_mutations      inp      sel     score WILD_TYPE_RES  \
0      43PHE, 42HIS             2    374.0      0.0 -7.166099           T,W   
1      40ILE, 38LEU             2   1227.0      5.0 -5.955347           G,V   
2       22MET, 1HIS             2    427.0     24.0 -3.406641           A,Q   
3      37ASP, 20SER             2    322.0    257.0 -0.772443           G,V   
4       1GLY, 43CYS             2    186.0    366.0  0.128207           Q,T   
...             ...           ...      ...      ...       ...           ...   
49384  22ASP, 39VAL             2    256.0    289.0 -0.426333           A,D   
49385  39CYS, 41ALA             2     41.0     71.0 -0.003356           D,E   
49386  37GLN, 42CYS             2    466.0      6.0 -4.820816           G,W   
49387  39ASN, 41PHE             2    126.0    384.0  0.564342           D,E   
49388          1ASP             1  11488.0  18085.0 -0.093596             Q   

      MUTATED_RES POSITION positions_split in_sec_s

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [96]:
# print(gb1_ss_df.head(3000))
print(len(gb1_ss_df))
print(gb1_ss_df["in_sec_str"].value_counts())

13164
True    13164
Name: in_sec_str, dtype: int64


In [97]:
gb1_not_ss_df = get_not_ss_dataset(gb1_in_domain_df, gb1_ss_indexes, 0)
print(len(gb1_not_ss_df))

10245


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


50 Value Test dataset in SS

In [238]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
gb1_ss_50_test_df = gb1_ss_df.sample(n=50)

In [239]:
gb1_temp_df = pd.concat([gb1_ss_50_test_df, gb1_ss_df])
print(len(gb1_temp_df))
gb1_ss_df = gb1_temp_df[~gb1_temp_df.index.duplicated(keep=False)]
print(len(gb1_ss_df))

13214
13114


50 Value Test dataset not in SS

In [240]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
gb1_not_ss_50_test_df = gb1_not_ss_df.sample(n=50)

In [241]:
gb1_temp_df = pd.concat([gb1_not_ss_50_test_df, gb1_not_ss_df])
print(len(gb1_temp_df))
gb1_not_ss_df = gb1_temp_df[~gb1_temp_df.index.duplicated(keep=False)]
print(len(gb1_not_ss_df))

10295
10195


Training Data

In [242]:
gb1_ss_df_200_t1 = gb1_ss_df.sample(n=200)
gb1_ss_df_200_t2 = gb1_ss_df.sample(n=200)
gb1_ss_df_200_t3 = gb1_ss_df.sample(n=200)

In [243]:
gb1_not_ss_df_200_t1 = gb1_not_ss_df.sample(n=200)
gb1_not_ss_df_200_t2 = gb1_not_ss_df.sample(n=200)
gb1_not_ss_df_200_t3 = gb1_not_ss_df.sample(n=200)

In [84]:
gb1_ss_df_1000_t1 = gb1_ss_df.sample(n=1000)
gb1_ss_df_1000_t2 = gb1_ss_df.sample(n=1000)
gb1_ss_df_1000_t3 = gb1_ss_df.sample(n=1000)

In [85]:
gb1_ss_df_2000_t1 = gb1_ss_df.sample(n=2000)
gb1_ss_df_2000_t2 = gb1_ss_df.sample(n=2000)
gb1_ss_df_2000_t3 = gb1_ss_df.sample(n=2000)

In [86]:
gb1_ss_df_3000_t1 = gb1_ss_df.sample(n=3000)
gb1_ss_df_3000_t2 = gb1_ss_df.sample(n=3000)
gb1_ss_df_3000_t3 = gb1_ss_df.sample(n=3000)

In [88]:
gb1_not_ss_df_1000_t1 = gb1_not_ss_df.sample(n=1000)
gb1_not_ss_df_1000_t2 = gb1_not_ss_df.sample(n=1000)
gb1_not_ss_df_1000_t3 = gb1_not_ss_df.sample(n=1000)

In [89]:
gb1_not_ss_df_2000_t1 = gb1_not_ss_df.sample(n=2000)
gb1_not_ss_df_2000_t2 = gb1_not_ss_df.sample(n=2000)
gb1_not_ss_df_2000_t3 = gb1_not_ss_df.sample(n=2000)

In [90]:
gb1_not_ss_df_3000_t1 = gb1_not_ss_df.sample(n=3000)
gb1_not_ss_df_3000_t2 = gb1_not_ss_df.sample(n=3000)
gb1_not_ss_df_3000_t3 = gb1_not_ss_df.sample(n=3000)

In [244]:
gb1_ss_200_df_t1 = pd.concat([gb1_ss_df_200_t1, gb1_ss_50_test_df])
gb1_not_ss_200_df_t1 = pd.concat([gb1_not_ss_df_200_t1, gb1_not_ss_50_test_df])
gb1_ss_200_df_t2 = pd.concat([gb1_ss_df_200_t2, gb1_ss_50_test_df])
gb1_not_ss_200_df_t2 = pd.concat([gb1_not_ss_df_200_t2, gb1_not_ss_50_test_df])
gb1_ss_200_df_t3 = pd.concat([gb1_ss_df_200_t3, gb1_ss_50_test_df])
gb1_not_ss_200_df_t3 = pd.concat([gb1_not_ss_df_200_t3, gb1_not_ss_50_test_df])

In [92]:
gb1_ss_1000_df_t1 = pd.concat([gb1_ss_df_1000_t1, gb1_ss_1000_test_df])
gb1_not_ss_1000_df_t1 = pd.concat([gb1_not_ss_df_1000_t1, gb1_not_ss_1000_test_df])
gb1_ss_1000_df_t2 = pd.concat([gb1_ss_df_1000_t2, gb1_ss_1000_test_df])
gb1_not_ss_1000_df_t2 = pd.concat([gb1_not_ss_df_1000_t2, gb1_not_ss_1000_test_df])
gb1_ss_1000_df_t3 = pd.concat([gb1_ss_df_1000_t3, gb1_ss_1000_test_df])
gb1_not_ss_1000_df_t3 = pd.concat([gb1_not_ss_df_1000_t3, gb1_not_ss_1000_test_df])

In [93]:
gb1_ss_2000_df_t1 = pd.concat([gb1_ss_df_2000_t1, gb1_ss_1000_test_df])
gb1_not_ss_2000_df_t1 = pd.concat([gb1_not_ss_df_2000_t1, gb1_not_ss_1000_test_df])
gb1_ss_2000_df_t2 = pd.concat([gb1_ss_df_2000_t2, gb1_ss_1000_test_df])
gb1_not_ss_2000_df_t2 = pd.concat([gb1_not_ss_df_2000_t2, gb1_not_ss_1000_test_df])
gb1_ss_2000_df_t3 = pd.concat([gb1_ss_df_2000_t3, gb1_ss_1000_test_df])
gb1_not_ss_2000_df_t3 = pd.concat([gb1_not_ss_df_2000_t3, gb1_not_ss_1000_test_df])

In [94]:
gb1_ss_3000_df_t1 = pd.concat([gb1_ss_df_3000_t1, gb1_ss_1000_test_df])
gb1_not_ss_3000_df_t1 = pd.concat([gb1_not_ss_df_3000_t1, gb1_not_ss_1000_test_df])
gb1_ss_3000_df_t2 = pd.concat([gb1_ss_df_3000_t2, gb1_ss_1000_test_df])
gb1_not_ss_3000_df_t2 = pd.concat([gb1_not_ss_df_3000_t2, gb1_not_ss_1000_test_df])
gb1_ss_3000_df_t3 = pd.concat([gb1_ss_df_3000_t3, gb1_ss_1000_test_df])
gb1_not_ss_3000_df_t3 = pd.concat([gb1_not_ss_df_3000_t3, gb1_not_ss_1000_test_df])

In [245]:
# write data to formatted txt file

write_data_file("gb1_MLformat_ss_200_train_50_test_turns1", protein_seq_gb1, gb1_ss_200_df_t1)
write_data_file("gb1_MLformat_not_ss_200_train_50_test_turns1", protein_seq_gb1, gb1_not_ss_200_df_t1)
write_data_file("gb1_MLformat_ss_200_train_50_test_turns2", protein_seq_gb1, gb1_ss_200_df_t2)
write_data_file("gb1_MLformat_not_ss_200_train_50_test_turns2", protein_seq_gb1, gb1_not_ss_200_df_t2)
write_data_file("gb1_MLformat_ss_200_train_50_test_turns3", protein_seq_gb1, gb1_ss_200_df_t3)
write_data_file("gb1_MLformat_not_ss_200_train_50_test_turns3", protein_seq_gb1, gb1_not_ss_200_df_t3)

# write_data_file("gb1_MLformat_ss_50_train_50_testv1", protein_seq_gb1, gb1_ss_50_dfv1)
# write_data_file("gb1_MLformat_not_ss_50_train_50_testv1", protein_seq_gb1, gb1_not_ss_50_dfv1)
# write_data_file("gb1_MLformat_ss_50_train_50_test_t2", protein_seq_gb1, gb1_ss_50_df_t2)
# write_data_file("gb1_MLformat_not_ss_50_train_50_test_t2", protein_seq_gb1, gb1_not_ss_50_df_t2)
# write_data_file("gb1_MLformat_ss_50_train_50_test_t3", protein_seq_gb1, gb1_ss_50_df_t3)
# write_data_file("gb1_MLformat_not_ss_50_train_50_test_t3", protein_seq_gb1, gb1_not_ss_50_df_t3)

# write_data_file("gb1_MLformat_ss_2000_train_50_testv1", protein_seq_gb1, gb1_ss_2000_dfv1)
# write_data_file("gb1_MLformat_not_ss_2000_train_50_testv1", protein_seq_gb1, gb1_not_ss_2000_dfv1)
# write_data_file("gb1_MLformat_ss_2000_train_50_test_t2", protein_seq_gb1, gb1_ss_2000_df_t2)
# write_data_file("gb1_MLformat_not_ss_2000_train_50_test_t2", protein_seq_gb1, gb1_not_ss_2000_df_t2)
# write_data_file("gb1_MLformat_ss_2000_train_50_test_t3", protein_seq_gb1, gb1_ss_2000_df_t3)
# write_data_file("gb1_MLformat_not_ss_2000_train_50_test_t3", protein_seq_gb1, gb1_not_ss_2000_df_t3)

Filename: gb1_MLformat_ss_200_train_50_test_turns1.txt
Filename: gb1_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: gb1_MLformat_ss_200_train_50_test_turns2.txt
Filename: gb1_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: gb1_MLformat_ss_200_train_50_test_turns3.txt
Filename: gb1_MLformat_not_ss_200_train_50_test_turns3.txt


## Ind4

In [108]:
# loading data
pro_1nd4_df1 = pd.read_csv("../Raw Data/1nd4.txt", sep=",")

# renaming mutated residue column
pro_1nd4_df1 = pro_1nd4_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [109]:
# reorganizing data to other form
col_list_1nd4 = list(pro_1nd4_df1)
col_list_1nd4 = col_list_1nd4[1:]

# mutations going down from the leftmost column
mutations_1nd4 = []

for column in col_list_1nd4:
    for mutation in pro_1nd4_df1["mutated_res"]:
        mutations_1nd4.append(column + mutation)

# getting scores
scores_1nd4 = []

for column in pro_1nd4_df1.drop('mutated_res', axis=1):
    for val in pro_1nd4_df1[column]:
        scores_1nd4.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_1nd4_df = pd.DataFrame(list(zip(mutations_1nd4, scores_1nd4)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [110]:
# drop nans
print(len(pro_1nd4_df))
pro_1nd4_df = pro_1nd4_df.dropna(axis=0)
pro_1nd4_df = pro_1nd4_df[pro_1nd4_df['score'] != 0.0]
print(len(pro_1nd4_df))
# print(pro_1nd4_df.tail(30))

5100
5095


In [111]:
# rounding score column to 6 decimal points
pro_1nd4_df["score"] = pro_1nd4_df["score"].round(6)

# shuffling
pro_1nd4_df = pro_1nd4_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_1nd4_mut = pro_1nd4_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_1nd4_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_1nd4_mut)

# get mutated residue and place in seperate col
pro_1nd4_df["MUTATED_RES"] = ssf.get_mutation_type(pro_1nd4_mut)

# get position and place in seperate col
pro_1nd4_df["POSITION"] = ssf.get_position(pro_1nd4_mut)

# need positionssplit
pro_1nd4_df["positions_split"] = ssf.get_positions_split(pro_1nd4_df)

positions_split_subtracted = []
for pos_list in pro_1nd4_df["positions_split"]:
    pos_list = [x - 10 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_1nd4_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_1nd4_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_1nd4_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_1nd4_df["variant"] = ssf.get_mutations_names_list(pro_1nd4_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [112]:
print(len(pro_1nd4_df))

5095


In [113]:
string_seq_1nd4 = "GSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNELQDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAMRRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPDGEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRFLVLYGIAAPDSQRIAFYRLLDEFFGSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNELQDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAMRRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPDGEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRFLVLYGIAAPDSQRIAFYRLLDEFF"
protein_seq_1nd4 = ssf.get_expanded_seq(string_seq_1nd4)
protein_seq_1nd4_split = protein_seq_1nd4.split()
print(len(protein_seq_1nd4_split))
print(protein_seq_1nd4_split[38])

510
PHE


In [114]:
path = "../PDB and STRIDE Files/" + '1nd4_stride.txt'
pro_1nd4_stride_file = open(path, 'r')

pro_1nd4_ss_indexes = ssf.get_all_sec_struc_boolean(pro_1nd4_stride_file)
print(len(pro_1nd4_ss_indexes))
print(pro_1nd4_ss_indexes.count(True))
print(pro_1nd4_ss_indexes.count(False))

# add in_sec_str_col
pro_1nd4_df = add_sec_str_col(pro_1nd4_df, pro_1nd4_ss_indexes, 0)

510
415
95


In [115]:
# pro_1nd4_in_domain_df = get_domain_dataset(pro_1nd4_df, 0, 1200)
not_included_pro_1nd4 = get_excluded_res(pro_1nd4_ss_indexes)
pro_1nd4_in_domain_df = get_domain_dataset_v2(pro_1nd4_df, 0, 2000, not_included_pro_1nd4)
print(len(pro_1nd4_in_domain_df))

Num True Indices: 415
Num False Indices: 95
Difference: 320
Num Indices to Remove: 320
1738


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [116]:
pro_1nd4_ss_df = get_ss_dataset(pro_1nd4_in_domain_df, pro_1nd4_ss_indexes, 0)
print(len(pro_1nd4_ss_df))

     variant     score WILD_TYPE_RES MUTATED_RES POSITION positions_split  \
0     174GLN -0.015248             L           Q      174           [174]   
1     182VAL  0.071293             C           V      182           [182]   
2      88ARG  0.036349             P           R       88            [88]   
3     126HIS -0.125402             Q           H      126           [126]   
4     241ALA  0.659317             S           A      241           [241]   
...      ...       ...           ...         ...      ...             ...   
1733   89PRO -0.666150             G           P       89            [89]   
1734  240TRP -1.058489             D           W      240           [240]   
1735   44THR  0.139848             S           T       44            [44]   
1736  143GLN -0.958213             V           Q      143           [143]   
1737  122GLN -0.058986             P           Q      122           [122]   

     in_sec_str  has_sec_str  
0          True         True  
1         Fal

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [117]:
pro_1nd4_not_ss_df = get_not_ss_dataset(pro_1nd4_in_domain_df, pro_1nd4_ss_indexes, 0)
print(len(pro_1nd4_not_ss_df))

880


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


1000 Value Test dataset in SS

In [262]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
# pro_1nd4_ss_1000_test_df = pro_1nd4_ss_df.sample(n=1000)
pro_1nd4_ss_50_test_df = pro_1nd4_ss_df.sample(n=50)

In [263]:
pro_1nd4_temp_df = pd.concat([pro_1nd4_ss_50_test_df, pro_1nd4_ss_df])
print(len(pro_1nd4_temp_df))
pro_1nd4_ss_df = pro_1nd4_temp_df[~pro_1nd4_temp_df.index.duplicated(keep=False)]
print(len(pro_1nd4_ss_df))

908
808


1000 Value Test dataset not in SS

In [264]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
# pro_1nd4_not_ss_1000_test_df = pro_1nd4_not_ss_df.sample(n=1000)
pro_1nd4_not_ss_50_test_df = pro_1nd4_not_ss_df.sample(n=50)

In [265]:
pro_1nd4_temp_df = pd.concat([pro_1nd4_not_ss_50_test_df, pro_1nd4_not_ss_df])
print(len(pro_1nd4_temp_df))
pro_1nd4_not_ss_df = pro_1nd4_temp_df[~pro_1nd4_temp_df.index.duplicated(keep=False)]
print(len(pro_1nd4_not_ss_df))

930
830


Training Data

In [266]:
pro_1nd4_ss_df_200_t1 = pro_1nd4_ss_df.sample(n=200)
pro_1nd4_ss_df_200_t2 = pro_1nd4_ss_df.sample(n=200)
pro_1nd4_ss_df_200_t3 = pro_1nd4_ss_df.sample(n=200)

In [267]:
pro_1nd4_not_ss_df_200_t1 = pro_1nd4_not_ss_df.sample(n=200)
pro_1nd4_not_ss_df_200_t2 = pro_1nd4_not_ss_df.sample(n=200)
pro_1nd4_not_ss_df_200_t3 = pro_1nd4_not_ss_df.sample(n=200)

In [269]:
pro_1nd4_ss_200_df_t1 = pd.concat([pro_1nd4_ss_df_200_t1, pro_1nd4_ss_50_test_df])
pro_1nd4_not_ss_200_df_t1 = pd.concat([pro_1nd4_not_ss_df_200_t1, pro_1nd4_not_ss_50_test_df])
pro_1nd4_ss_200_df_t2 = pd.concat([pro_1nd4_ss_df_200_t2, pro_1nd4_ss_50_test_df])
pro_1nd4_not_ss_200_df_t2 = pd.concat([pro_1nd4_not_ss_df_200_t2, pro_1nd4_not_ss_50_test_df])
pro_1nd4_ss_200_df_t3 = pd.concat([pro_1nd4_ss_df_200_t3, pro_1nd4_ss_50_test_df])
pro_1nd4_not_ss_200_df_t3 = pd.concat([pro_1nd4_not_ss_df_200_t3, pro_1nd4_not_ss_50_test_df])

In [271]:
# write data to formatted txt file

write_data_file("pro_1nd4_MLformat_ss_200_train_50_test_turns1", protein_seq_1nd4, pro_1nd4_ss_200_df_t1)
write_data_file("pro_1nd4_MLformat_not_ss_200_train_50_test_turns1", protein_seq_1nd4, pro_1nd4_not_ss_200_df_t1)
write_data_file("pro_1nd4_MLformat_ss_200_train_50_test_turns2", protein_seq_1nd4, pro_1nd4_ss_200_df_t2)
write_data_file("pro_1nd4_MLformat_not_ss_200_train_50_test_turns2", protein_seq_1nd4, pro_1nd4_not_ss_200_df_t2)
write_data_file("pro_1nd4_MLformat_ss_200_train_50_test_turns3", protein_seq_1nd4, pro_1nd4_ss_200_df_t3)
write_data_file("pro_1nd4_MLformat_not_ss_200_train_50_test_turns3", protein_seq_1nd4, pro_1nd4_not_ss_200_df_t3)

Filename: pro_1nd4_MLformat_ss_200_train_50_test_turns1.txt
Filename: pro_1nd4_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: pro_1nd4_MLformat_ss_200_train_50_test_turns2.txt
Filename: pro_1nd4_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: pro_1nd4_MLformat_ss_200_train_50_test_turns3.txt
Filename: pro_1nd4_MLformat_not_ss_200_train_50_test_turns3.txt


## 4bz3

In [129]:
# loading data
pro_4bz3_df1 = pd.read_csv("../Raw Data/4bz3.txt", sep=",")

# renaming mutated residue column
pro_4bz3_df1 = pro_4bz3_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [130]:
# reorganizing data to other form
col_list_4bz3 = list(pro_4bz3_df1)
col_list_4bz3 = col_list_4bz3[1:]

# mutations going down from the leftmost column
mutations_4bz3 = []

for column in col_list_4bz3:
    for mutation in pro_4bz3_df1["mutated_res"]:
        mutations_4bz3.append(column + mutation)

# getting scores
scores_4bz3 = []

for column in pro_4bz3_df1.drop('mutated_res', axis=1):
    for val in pro_4bz3_df1[column]:
        scores_4bz3.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_4bz3_df = pd.DataFrame(list(zip(mutations_4bz3, scores_4bz3)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [131]:
print(len(pro_4bz3_df))
pro_4bz3_df = pro_4bz3_df.dropna(axis=0)
print(len(pro_4bz3_df))

4620
4554


In [132]:
# rounding score column to 6 decimal points
pro_4bz3_df["score"] = pro_4bz3_df["score"].round(6)

# shuffling
pro_4bz3_df = pro_4bz3_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_4bz3_mut = pro_4bz3_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_4bz3_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_4bz3_mut)

# get mutated residue and place in seperate col
pro_4bz3_df["MUTATED_RES"] = ssf.get_mutation_type(pro_4bz3_mut)

# get position and place in seperate col
pro_4bz3_df["POSITION"] = ssf.get_position(pro_4bz3_mut)

# need positionssplit
pro_4bz3_df["positions_split"] = ssf.get_positions_split(pro_4bz3_df)

positions_split_subtracted = []
for pos_list in pro_4bz3_df["positions_split"]:
    pos_list = [x - 32 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_4bz3_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_4bz3_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_4bz3_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_4bz3_df["variant"] = ssf.get_mutations_names_list(pro_4bz3_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [133]:
print(len(pro_4bz3_df))

4554


In [134]:
path = "../PDB and STRIDE Files/" + '4bz3_stride.txt'
pro_4bz3_stride_file = open(path, 'r')
protein_seq_4bz3 = get_seq_from_stride(pro_4bz3_stride_file)

In [136]:
protein_seq_4bz3_split = protein_seq_4bz3.split()
print(len(protein_seq_4bz3_split))
print(protein_seq_4bz3_split[51])

463
ILE


In [137]:
path = "../PDB and STRIDE Files/" + '4bz3_stride.txt'
pro_4bz3_stride_file = open(path, 'r')

pro_4bz3_ss_indexes = ssf.get_all_sec_struc_boolean(pro_4bz3_stride_file)
print(len(pro_4bz3_ss_indexes))
print(pro_4bz3_ss_indexes.count(True))
print(pro_4bz3_ss_indexes.count(False))

# add in_sec_str_col
pro_4bz3_df = add_sec_str_col(pro_4bz3_df, pro_4bz3_ss_indexes, 0)

463
380
83


In [138]:
# pro_4bz3_in_domain_df = get_domain_dataset(pro_4bz3_df, 0, 1200)
not_included_pro_4bz3 = get_excluded_res(pro_4bz3_ss_indexes)
pro_4bz3_in_domain_df = get_domain_dataset_v2(pro_4bz3_df, 0, 2000, not_included_pro_4bz3)
print(len(pro_4bz3_in_domain_df))

Num True Indices: 380
Num False Indices: 83
Difference: 297
Num Indices to Remove: 297
1601


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [139]:
pro_4bz3_ss_df = get_ss_dataset(pro_4bz3_in_domain_df, pro_4bz3_ss_indexes, 0)
print(len(pro_4bz3_ss_df))

     variant     score WILD_TYPE_RES MUTATED_RES POSITION positions_split  \
0      60ASP -9.593820             T           D       60            [60]   
1     146PRO -4.749831             A           P      146           [146]   
2      53SER -5.391890             T           S       53            [53]   
3      53CYS -6.838109             T           C       53            [53]   
4     105ASN -0.724895             P           N      105           [105]   
...      ...       ...           ...         ...      ...             ...   
1596   43LEU  0.015557             R           L       43            [43]   
1597   88PRO -5.790531             V           P       88            [88]   
1598   53LEU -8.404293             T           L       53            [53]   
1599    4ALA  0.789339             V           A        4             [4]   
1600  129GLN  0.152708             S           Q      129           [129]   

     in_sec_str  has_sec_str  
0          True         True  
1          Tr

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [140]:
pro_4bz3_not_ss_df = get_not_ss_dataset(pro_4bz3_in_domain_df, pro_4bz3_ss_indexes, 0)
print(len(pro_4bz3_not_ss_df))

811


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


1000 Value Test dataset in SS

In [288]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pro_4bz3_ss_50_test_df = pro_4bz3_ss_df.sample(n=50)

In [290]:
pro_4bz3_temp_df = pd.concat([pro_4bz3_ss_50_test_df, pro_4bz3_ss_df])
print(len(pro_4bz3_temp_df))
pro_4bz3_ss_df = pro_4bz3_temp_df[~pro_4bz3_temp_df.index.duplicated(keep=False)]
print(len(pro_4bz3_ss_df))

840
740


1000 Value Test dataset not in SS

In [291]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pro_4bz3_not_ss_50_test_df = pro_4bz3_not_ss_df.sample(n=50)

In [292]:
pro_4bz3_temp_df = pd.concat([pro_4bz3_not_ss_50_test_df, pro_4bz3_not_ss_df])
print(len(pro_4bz3_temp_df))
pro_4bz3_not_ss_df = pro_4bz3_temp_df[~pro_4bz3_temp_df.index.duplicated(keep=False)]
print(len(pro_4bz3_not_ss_df))

861
761


Training Data

In [293]:
pro_4bz3_ss_df_200_t1 = pro_4bz3_ss_df.sample(n=200)
pro_4bz3_ss_df_200_t2 = pro_4bz3_ss_df.sample(n=200)
pro_4bz3_ss_df_200_t3 = pro_4bz3_ss_df.sample(n=200)

In [294]:
pro_4bz3_not_ss_df_200_t1 = pro_4bz3_not_ss_df.sample(n=200)
pro_4bz3_not_ss_df_200_t2 = pro_4bz3_not_ss_df.sample(n=200)
pro_4bz3_not_ss_df_200_t3 = pro_4bz3_not_ss_df.sample(n=200)

In [295]:
pro_4bz3_ss_200_df_t1 = pd.concat([pro_4bz3_ss_df_200_t1, pro_4bz3_ss_50_test_df])
pro_4bz3_not_ss_200_df_t1 = pd.concat([pro_4bz3_not_ss_df_200_t1, pro_4bz3_not_ss_50_test_df])
pro_4bz3_ss_200_df_t2 = pd.concat([pro_4bz3_ss_df_200_t2, pro_4bz3_ss_50_test_df])
pro_4bz3_not_ss_200_df_t2 = pd.concat([pro_4bz3_not_ss_df_200_t2, pro_4bz3_not_ss_50_test_df])
pro_4bz3_ss_200_df_t3 = pd.concat([pro_4bz3_ss_df_200_t3, pro_4bz3_ss_50_test_df])
pro_4bz3_not_ss_200_df_t3 = pd.concat([pro_4bz3_not_ss_df_200_t3, pro_4bz3_not_ss_50_test_df])

In [296]:
# write data to formatted txt file

write_data_file("pro_4bz3_MLformat_ss_200_train_50_test_turns1", protein_seq_4bz3, pro_4bz3_ss_200_df_t1)
write_data_file("pro_4bz3_MLformat_not_ss_200_train_50_test_turns1", protein_seq_4bz3, pro_4bz3_not_ss_200_df_t1)
write_data_file("pro_4bz3_MLformat_ss_200_train_50_test_turns2", protein_seq_4bz3, pro_4bz3_ss_200_df_t2)
write_data_file("pro_4bz3_MLformat_not_ss_200_train_50_test_turns2", protein_seq_4bz3, pro_4bz3_not_ss_200_df_t2)
write_data_file("pro_4bz3_MLformat_ss_200_train_50_test_turns3", protein_seq_4bz3, pro_4bz3_ss_200_df_t3)
write_data_file("pro_4bz3_MLformat_not_ss_200_train_50_test_turns3", protein_seq_4bz3, pro_4bz3_not_ss_200_df_t3)

Filename: pro_4bz3_MLformat_ss_200_train_50_test_turns1.txt
Filename: pro_4bz3_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: pro_4bz3_MLformat_ss_200_train_50_test_turns2.txt
Filename: pro_4bz3_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: pro_4bz3_MLformat_ss_200_train_50_test_turns3.txt
Filename: pro_4bz3_MLformat_not_ss_200_train_50_test_turns3.txt


## 3dqw

In [118]:
# loading data
pro_3dqw_df1 = pd.read_csv("../Raw Data/3dqw.txt", sep=",")

# renaming mutated residue column
pro_3dqw_df1 = pro_3dqw_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [119]:
# reorganizing data to other form
col_list_3dqw = list(pro_3dqw_df1)
col_list_3dqw = col_list_3dqw[1:]

# mutations going down from the leftmost column
mutations_3dqw = []

for column in col_list_3dqw:
    for mutation in pro_3dqw_df1["mutated_res"]:
        mutations_3dqw.append(column + mutation)

# getting scores
scores_3dqw = []

for column in pro_3dqw_df1.drop('mutated_res', axis=1):
    for val in pro_3dqw_df1[column]:
        scores_3dqw.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_3dqw_df = pd.DataFrame(list(zip(mutations_3dqw, scores_3dqw)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [120]:
pro_3dqw_df = pro_3dqw_df.dropna(axis=0)
print(len(pro_3dqw_df))

3315


In [121]:
# rounding score column to 6 decimal points
pro_3dqw_df["score"] = pro_3dqw_df["score"].round(6)

# shuffling
pro_3dqw_df = pro_3dqw_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_3dqw_mut = pro_3dqw_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_3dqw_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_3dqw_mut)

# get mutated residue and place in seperate col
pro_3dqw_df["MUTATED_RES"] = ssf.get_mutation_type(pro_3dqw_mut)

# get position and place in seperate col
pro_3dqw_df["POSITION"] = ssf.get_position(pro_3dqw_mut)

# need positionssplit
pro_3dqw_df["positions_split"] = ssf.get_positions_split(pro_3dqw_df)

positions_split_subtracted = []
for pos_list in pro_3dqw_df["positions_split"]:
    pos_list = [x - 255 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_3dqw_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_3dqw_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_3dqw_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_3dqw_df["variant"] = ssf.get_mutations_names_list(pro_3dqw_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [122]:
print(len(pro_3dqw_df))

3315


In [286]:
path = "../PDB and STRIDE Files/" + '3dqw_stride.txt'
pro_3dqw_stride_file = open(path, 'r')
protein_seq_pro_3dqw = get_seq_from_stride(pro_3dqw_stride_file)

In [284]:
protein_seq_pro_3dqw_split = protein_seq_3dqw.split()
print(len(protein_seq_3dqw_split))
print(protein_seq_3dqw_split[128])

1107
VAL


In [125]:
path = "../PDB and STRIDE Files/" + '3dqw_stride.txt'
pro_3dqw_stride_file = open(path, 'r')

pro_3dqw_ss_indexes = ssf.get_all_sec_struc_boolean(pro_3dqw_stride_file)
print(len(pro_3dqw_ss_indexes))
print(pro_3dqw_ss_indexes.count(True))
print(pro_3dqw_ss_indexes.count(False))

# add in_sec_str_col
pro_3dqw_df = add_sec_str_col(pro_3dqw_df, pro_3dqw_ss_indexes, 0)

1107
926
181


In [126]:
# pro_3dqw_in_domain_df = get_domain_dataset(pro_3dqw_df, 0, 1200)
not_included_pro_3dqw = get_excluded_res(pro_3dqw_ss_indexes)
pro_3dqw_in_domain_df = get_domain_dataset_v2(pro_3dqw_df, 0, 2000, not_included_pro_3dqw)
print(len(pro_3dqw_in_domain_df))

Num True Indices: 926
Num False Indices: 181
Difference: 745
Num Indices to Remove: 745
1126


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [127]:
pro_3dqw_ss_df = get_ss_dataset(pro_3dqw_in_domain_df, pro_3dqw_ss_indexes, 0)
print(len(pro_3dqw_ss_df))

     variant     score WILD_TYPE_RES MUTATED_RES POSITION positions_split  \
0      62ARG -1.607662             L           R       62            [62]   
1     227GLU -0.246321             P           E      227           [227]   
2     162HIS  0.646216             T           H      162           [162]   
3     232MET -0.521292             C           M      232           [232]   
4     182GLU -1.638408             G           E      182           [182]   
...      ...       ...           ...         ...      ...             ...   
1121  229ALA -0.405353             P           A      229           [229]   
1122  251PRO -1.587072             R           P      251           [251]   
1123  181GLU -0.546863             Y           E      181           [181]   
1124  171HIS -1.562270             I           H      171           [171]   
1125  174ILE -1.533342             T           I      174           [174]   

     in_sec_str  has_sec_str  
0         False        False  
1         Fal

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [128]:
pro_3dqw_not_ss_df = get_not_ss_dataset(pro_3dqw_in_domain_df, pro_3dqw_ss_indexes, 0)
print(len(pro_3dqw_not_ss_df))

579


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


50 Value Test dataset in SS

In [272]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pro_3dqw_ss_50_test_df = pro_3dqw_ss_df.sample(n=50)

In [273]:
pro_3dqw_temp_df = pd.concat([pro_3dqw_ss_50_test_df, pro_3dqw_ss_df])
print(len(pro_3dqw_temp_df))
pro_3dqw_ss_df = pro_3dqw_temp_df[~pro_3dqw_temp_df.index.duplicated(keep=False)]
print(len(pro_3dqw_ss_df))

597
497


50 Value Test dataset not in SS

In [274]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pro_3dqw_not_ss_50_test_df = pro_3dqw_not_ss_df.sample(n=50)

In [275]:
pro_3dqw_temp_df = pd.concat([pro_3dqw_not_ss_50_test_df, pro_3dqw_not_ss_df])
print(len(pro_3dqw_temp_df))
pro_3dqw_not_ss_df = pro_3dqw_temp_df[~pro_3dqw_temp_df.index.duplicated(keep=False)]
print(len(pro_3dqw_not_ss_df))

629
529


Training Data

In [276]:
pro_3dqw_ss_df_200_t1 = pro_3dqw_ss_df.sample(n=200)
pro_3dqw_ss_df_200_t2 = pro_3dqw_ss_df.sample(n=200)
pro_3dqw_ss_df_200_t3 = pro_3dqw_ss_df.sample(n=200)

In [277]:
pro_3dqw_not_ss_df_200_t1 = pro_3dqw_not_ss_df.sample(n=200)
pro_3dqw_not_ss_df_200_t2 = pro_3dqw_not_ss_df.sample(n=200)
pro_3dqw_not_ss_df_200_t3 = pro_3dqw_not_ss_df.sample(n=200)

In [84]:
pro_3dqw_ss_df_1000_t1 = pro_3dqw_ss_df.sample(n=1000)
pro_3dqw_ss_df_1000_t2 = pro_3dqw_ss_df.sample(n=1000)
pro_3dqw_ss_df_1000_t3 = pro_3dqw_ss_df.sample(n=1000)

In [85]:
pro_3dqw_ss_df_2000_t1 = pro_3dqw_ss_df.sample(n=2000)
pro_3dqw_ss_df_2000_t2 = pro_3dqw_ss_df.sample(n=2000)
pro_3dqw_ss_df_2000_t3 = pro_3dqw_ss_df.sample(n=2000)

In [86]:
pro_3dqw_ss_df_3000_t1 = pro_3dqw_ss_df.sample(n=3000)
pro_3dqw_ss_df_3000_t2 = pro_3dqw_ss_df.sample(n=3000)
pro_3dqw_ss_df_3000_t3 = pro_3dqw_ss_df.sample(n=3000)

In [88]:
pro_3dqw_not_ss_df_1000_t1 = pro_3dqw_not_ss_df.sample(n=1000)
pro_3dqw_not_ss_df_1000_t2 = pro_3dqw_not_ss_df.sample(n=1000)
pro_3dqw_not_ss_df_1000_t3 = pro_3dqw_not_ss_df.sample(n=1000)

In [89]:
pro_3dqw_not_ss_df_2000_t1 = pro_3dqw_not_ss_df.sample(n=2000)
pro_3dqw_not_ss_df_2000_t2 = pro_3dqw_not_ss_df.sample(n=2000)
pro_3dqw_not_ss_df_2000_t3 = pro_3dqw_not_ss_df.sample(n=2000)

In [90]:
pro_3dqw_not_ss_df_3000_t1 = pro_3dqw_not_ss_df.sample(n=3000)
pro_3dqw_not_ss_df_3000_t2 = pro_3dqw_not_ss_df.sample(n=3000)
pro_3dqw_not_ss_df_3000_t3 = pro_3dqw_not_ss_df.sample(n=3000)

In [278]:
pro_3dqw_ss_200_df_t1 = pd.concat([pro_3dqw_ss_df_200_t1, pro_3dqw_ss_50_test_df])
pro_3dqw_not_ss_200_df_t1 = pd.concat([pro_3dqw_not_ss_df_200_t1, pro_3dqw_not_ss_50_test_df])
pro_3dqw_ss_200_df_t2 = pd.concat([pro_3dqw_ss_df_200_t2, pro_3dqw_ss_50_test_df])
pro_3dqw_not_ss_200_df_t2 = pd.concat([pro_3dqw_not_ss_df_200_t2, pro_3dqw_not_ss_50_test_df])
pro_3dqw_ss_200_df_t3 = pd.concat([pro_3dqw_ss_df_200_t3, pro_3dqw_ss_50_test_df])
pro_3dqw_not_ss_200_df_t3 = pd.concat([pro_3dqw_not_ss_df_200_t3, pro_3dqw_not_ss_50_test_df])

In [92]:
pro_3dqw_ss_1000_df_t1 = pd.concat([pro_3dqw_ss_df_1000_t1, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_1000_df_t1 = pd.concat([pro_3dqw_not_ss_df_1000_t1, pro_3dqw_not_ss_1000_test_df])
pro_3dqw_ss_1000_df_t2 = pd.concat([pro_3dqw_ss_df_1000_t2, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_1000_df_t2 = pd.concat([pro_3dqw_not_ss_df_1000_t2, pro_3dqw_not_ss_1000_test_df])
pro_3dqw_ss_1000_df_t3 = pd.concat([pro_3dqw_ss_df_1000_t3, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_1000_df_t3 = pd.concat([pro_3dqw_not_ss_df_1000_t3, pro_3dqw_not_ss_1000_test_df])

In [93]:
pro_3dqw_ss_2000_df_t1 = pd.concat([pro_3dqw_ss_df_2000_t1, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_2000_df_t1 = pd.concat([pro_3dqw_not_ss_df_2000_t1, pro_3dqw_not_ss_1000_test_df])
pro_3dqw_ss_2000_df_t2 = pd.concat([pro_3dqw_ss_df_2000_t2, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_2000_df_t2 = pd.concat([pro_3dqw_not_ss_df_2000_t2, pro_3dqw_not_ss_1000_test_df])
pro_3dqw_ss_2000_df_t3 = pd.concat([pro_3dqw_ss_df_2000_t3, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_2000_df_t3 = pd.concat([pro_3dqw_not_ss_df_2000_t3, pro_3dqw_not_ss_1000_test_df])

In [94]:
pro_3dqw_ss_3000_df_t1 = pd.concat([pro_3dqw_ss_df_3000_t1, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_3000_df_t1 = pd.concat([pro_3dqw_not_ss_df_3000_t1, pro_3dqw_not_ss_1000_test_df])
pro_3dqw_ss_3000_df_t2 = pd.concat([pro_3dqw_ss_df_3000_t2, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_3000_df_t2 = pd.concat([pro_3dqw_not_ss_df_3000_t2, pro_3dqw_not_ss_1000_test_df])
pro_3dqw_ss_3000_df_t3 = pd.concat([pro_3dqw_ss_df_3000_t3, pro_3dqw_ss_1000_test_df])
pro_3dqw_not_ss_3000_df_t3 = pd.concat([pro_3dqw_not_ss_df_3000_t3, pro_3dqw_not_ss_1000_test_df])

In [287]:
# write data to formatted txt file

write_data_file("pro_3dqw_MLformat_ss_200_train_50_test_turns1", protein_seq_pro_3dqw, pro_3dqw_ss_200_df_t1)
write_data_file("pro_3dqw_MLformat_not_ss_200_train_50_test_turns1", protein_seq_pro_3dqw, pro_3dqw_not_ss_200_df_t1)
write_data_file("pro_3dqw_MLformat_ss_200_train_50_test_turns2", protein_seq_pro_3dqw, pro_3dqw_ss_200_df_t2)
write_data_file("pro_3dqw_MLformat_not_ss_200_train_50_test_turns2", protein_seq_pro_3dqw, pro_3dqw_not_ss_200_df_t2)
write_data_file("pro_3dqw_MLformat_ss_200_train_50_test_turns3", protein_seq_pro_3dqw, pro_3dqw_ss_200_df_t3)
write_data_file("pro_3dqw_MLformat_not_ss_200_train_50_test_turns3", protein_seq_pro_3dqw, pro_3dqw_not_ss_200_df_t3)

# write_data_file("pro_3dqw_MLformat_ss_50_train_50_test_t1", protein_seq_pro_3dqw, pro_3dqw_ss_50_df_t1)
# write_data_file("pro_3dqw_MLformat_not_ss_50_train_50_test_t1", protein_seq_pro_3dqw, pro_3dqw_not_ss_50_df_t1)
# write_data_file("pro_3dqw_MLformat_ss_50_train_50_test_t2", protein_seq_pro_3dqw, pro_3dqw_ss_50_df_t2)
# write_data_file("pro_3dqw_MLformat_not_ss_50_train_50_test_t2", protein_seq_pro_3dqw, pro_3dqw_not_ss_50_df_t2)
# write_data_file("pro_3dqw_MLformat_ss_50_train_50_test_t3", protein_seq_pro_3dqw, pro_3dqw_ss_50_df_t3)
# write_data_file("pro_3dqw_MLformat_not_ss_50_train_50_test_t3", protein_seq_pro_3dqw, pro_3dqw_not_ss_50_df_t3)

# write_data_file("pro_3dqw_MLformat_ss_2000_train_50_test_t1", protein_seq_pro_3dqw, pro_3dqw_ss_2000_df_t1)
# write_data_file("pro_3dqw_MLformat_not_ss_2000_train_50_test_t1", protein_seq_pro_3dqw, pro_3dqw_not_ss_2000_df_t1)
# write_data_file("pro_3dqw_MLformat_ss_2000_train_50_test_t2", protein_seq_pro_3dqw, pro_3dqw_ss_2000_df_t2)
# write_data_file("pro_3dqw_MLformat_not_ss_2000_train_50_test_t2", protein_seq_pro_3dqw, pro_3dqw_not_ss_2000_df_t2)
# write_data_file("pro_3dqw_MLformat_ss_2000_train_50_test_t3", protein_seq_pro_3dqw, pro_3dqw_ss_2000_df_t3)
# write_data_file("pro_3dqw_MLformat_not_ss_2000_train_50_test_t3", protein_seq_pro_3dqw, pro_3dqw_not_ss_2000_df_t3)

Filename: pro_3dqw_MLformat_ss_200_train_50_test_turns1.txt
Filename: pro_3dqw_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: pro_3dqw_MLformat_ss_200_train_50_test_turns2.txt
Filename: pro_3dqw_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: pro_3dqw_MLformat_ss_200_train_50_test_turns3.txt
Filename: pro_3dqw_MLformat_not_ss_200_train_50_test_turns3.txt


In [None]:
# not enough for 500, have to run smaller cells

## 1be9

In [98]:
# loading data
pro_1be9_df1 = pd.read_csv("../Raw Data/functional_1be9.csv", sep=",")

# renaming mutated residue column
pro_1be9_df1 = pro_1be9_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [99]:
# reorganizing data to other form
col_list_1be9 = list(pro_1be9_df1)
col_list_1be9 = col_list_1be9[1:]

# mutations going down from the leftmost column
mutations_1be9 = []

for column in col_list_1be9:
    for mutation in pro_1be9_df1["mutated_res"]:
        mutations_1be9.append(column + mutation)

# getting scores
scores_1be9 = []

for column in pro_1be9_df1.drop('mutated_res', axis=1):
    for val in pro_1be9_df1[column]:
        scores_1be9.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_1be9_df = pd.DataFrame(list(zip(mutations_1be9, scores_1be9)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [100]:
# rounding score column to 6 decimal points
pro_1be9_df["score"] = pro_1be9_df["score"].round(6)

# shuffling
pro_1be9_df = pro_1be9_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_1be9_mut = pro_1be9_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_1be9_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_1be9_mut)

# get mutated residue and place in seperate col
pro_1be9_df["MUTATED_RES"] = ssf.get_mutation_type(pro_1be9_mut)

# get position and place in seperate col
pro_1be9_df["POSITION"] = ssf.get_position(pro_1be9_mut)

# need positionssplit
pro_1be9_df["positions_split"] = ssf.get_positions_split(pro_1be9_df)

positions_split_subtracted = []
for pos_list in pro_1be9_df["positions_split"]:
    pos_list = [x - 301 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_1be9_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_1be9_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_1be9_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_1be9_df["variant"] = ssf.get_mutations_names_list(pro_1be9_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [101]:
string_seq_1be9 = "FLGEEDIPREPRRIVIHRGSTGLGFNIIGGEDGEGIFISFILAGGPADLSGELRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTIIAQYKPEEYSRFEANSRVNSSGRIVTNKQTSV"
protein_seq_1be9 = ssf.get_expanded_seq(string_seq_1be9)
protein_seq_1be9_split = protein_seq_1be9.split()

In [102]:
path = "../PDB and STRIDE Files/" + '1be9_stride.txt'
pro_1be9_stride_file = open(path, 'r')

pro_1be9_ss_indexes = ssf.get_all_sec_struc_boolean(pro_1be9_stride_file)
print(pro_1be9_ss_indexes.count(True))
print(pro_1be9_ss_indexes.count(False))

# add in_sec_str_col
pro_1be9_df = add_sec_str_col(pro_1be9_df, pro_1be9_ss_indexes, 0)

97
23


In [103]:
# pro_1be9_in_domain_df = get_domain_dataset(pro_1be9_df, 0, 1200)
not_included_pro_1be9 = get_excluded_res(pro_1be9_ss_indexes)
pro_1be9_in_domain_df = get_domain_dataset_v2(pro_1be9_df, 0, 2000, not_included_pro_1be9)
print(len(pro_1be9_in_domain_df))

Num True Indices: 97
Num False Indices: 23
Difference: 74
Num Indices to Remove: 74
560


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [105]:
pro_1be9_ss_df = get_ss_dataset(pro_1be9_in_domain_df, pro_1be9_ss_indexes, 0)
# print(len(pro_1be9_ss_df))

    variant     score WILD_TYPE_RES MUTATED_RES POSITION positions_split  \
0     16GLN  0.117077             H           Q       16            [16]   
1     54THR  0.061228             K           T       54            [54]   
2     82HIS  0.041419             G           H       82            [82]   
3     52LEU  0.047569             L           L       52            [52]   
4     53ARG  0.084396             R           R       53            [53]   
5     53PRO -1.076711             R           P       53            [53]   
6     11PRO -0.195228             R           P       11            [11]   
7     51LEU  0.088649             E           L       51            [51]   
8     50GLU  0.018137             G           E       50            [50]   
9     10SER  0.106889             P           S       10            [10]   
10    17LEU -0.265691             R           L       17            [17]   
11    12ILE -0.056989             R           I       12            [12]   
12    34ALA 

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [106]:
print(len(pro_1be9_ss_df))

340


In [107]:
pro_1be9_not_ss_df = get_not_ss_dataset(pro_1be9_in_domain_df, pro_1be9_ss_indexes, 0)
print(len(pro_1be9_not_ss_df))

220


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


1000 Value Test dataset in SS

In [170]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pro_1be9_ss_1000_test_df = pro_1be9_ss_df.sample(n=1000)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [80]:
pro_1be9_temp_df = pd.concat([pro_1be9_ss_1000_test_df, pro_1be9_ss_df])
print(len(pro_1be9_temp_df))
pro_1be9_ss_df = pro_1be9_temp_df[~pro_1be9_temp_df.index.duplicated(keep=False)]
print(len(pro_1be9_ss_df))

121761
119761


1000 Value Test dataset not in SS

In [81]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
pro_1be9_not_ss_1000_test_df = pro_1be9_not_ss_df.sample(n=1000)

In [82]:
pro_1be9_temp_df = pd.concat([pro_1be9_not_ss_1000_test_df, pro_1be9_not_ss_df])
print(len(pro_1be9_temp_df))
pro_1be9_not_ss_df = pro_1be9_temp_df[~pro_1be9_temp_df.index.duplicated(keep=False)]
print(len(pro_1be9_not_ss_df))

44600
42600


Training Data

In [83]:
pro_1be9_ss_df_500_t1 = pro_1be9_ss_df.sample(n=500)
pro_1be9_ss_df_500_t2 = pro_1be9_ss_df.sample(n=500)
pro_1be9_ss_df_500_t3 = pro_1be9_ss_df.sample(n=500)

In [84]:
pro_1be9_ss_df_1000_t1 = pro_1be9_ss_df.sample(n=1000)
pro_1be9_ss_df_1000_t2 = pro_1be9_ss_df.sample(n=1000)
pro_1be9_ss_df_1000_t3 = pro_1be9_ss_df.sample(n=1000)

In [85]:
pro_1be9_ss_df_2000_t1 = pro_1be9_ss_df.sample(n=2000)
pro_1be9_ss_df_2000_t2 = pro_1be9_ss_df.sample(n=2000)
pro_1be9_ss_df_2000_t3 = pro_1be9_ss_df.sample(n=2000)

In [86]:
pro_1be9_ss_df_3000_t1 = pro_1be9_ss_df.sample(n=3000)
pro_1be9_ss_df_3000_t2 = pro_1be9_ss_df.sample(n=3000)
pro_1be9_ss_df_3000_t3 = pro_1be9_ss_df.sample(n=3000)

In [87]:
pro_1be9_not_ss_df_500_t1 = pro_1be9_not_ss_df.sample(n=500)
pro_1be9_not_ss_df_500_t2 = pro_1be9_not_ss_df.sample(n=500)
pro_1be9_not_ss_df_500_t3 = pro_1be9_not_ss_df.sample(n=500)

In [88]:
pro_1be9_not_ss_df_1000_t1 = pro_1be9_not_ss_df.sample(n=1000)
pro_1be9_not_ss_df_1000_t2 = pro_1be9_not_ss_df.sample(n=1000)
pro_1be9_not_ss_df_1000_t3 = pro_1be9_not_ss_df.sample(n=1000)

In [89]:
pro_1be9_not_ss_df_2000_t1 = pro_1be9_not_ss_df.sample(n=2000)
pro_1be9_not_ss_df_2000_t2 = pro_1be9_not_ss_df.sample(n=2000)
pro_1be9_not_ss_df_2000_t3 = pro_1be9_not_ss_df.sample(n=2000)

In [90]:
pro_1be9_not_ss_df_3000_t1 = pro_1be9_not_ss_df.sample(n=3000)
pro_1be9_not_ss_df_3000_t2 = pro_1be9_not_ss_df.sample(n=3000)
pro_1be9_not_ss_df_3000_t3 = pro_1be9_not_ss_df.sample(n=3000)

In [91]:
pro_1be9_ss_500_df_t1 = pd.concat([pro_1be9_ss_df_500_t1, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_500_df_t1 = pd.concat([pro_1be9_not_ss_df_500_t1, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_500_df_t2 = pd.concat([pro_1be9_ss_df_500_t2, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_500_df_t2 = pd.concat([pro_1be9_not_ss_df_500_t2, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_500_df_t3 = pd.concat([pro_1be9_ss_df_500_t3, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_500_df_t3 = pd.concat([pro_1be9_not_ss_df_500_t3, pro_1be9_not_ss_1000_test_df])

In [92]:
pro_1be9_ss_1000_df_t1 = pd.concat([pro_1be9_ss_df_1000_t1, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_1000_df_t1 = pd.concat([pro_1be9_not_ss_df_1000_t1, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_1000_df_t2 = pd.concat([pro_1be9_ss_df_1000_t2, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_1000_df_t2 = pd.concat([pro_1be9_not_ss_df_1000_t2, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_1000_df_t3 = pd.concat([pro_1be9_ss_df_1000_t3, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_1000_df_t3 = pd.concat([pro_1be9_not_ss_df_1000_t3, pro_1be9_not_ss_1000_test_df])

In [93]:
pro_1be9_ss_2000_df_t1 = pd.concat([pro_1be9_ss_df_2000_t1, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_2000_df_t1 = pd.concat([pro_1be9_not_ss_df_2000_t1, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_2000_df_t2 = pd.concat([pro_1be9_ss_df_2000_t2, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_2000_df_t2 = pd.concat([pro_1be9_not_ss_df_2000_t2, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_2000_df_t3 = pd.concat([pro_1be9_ss_df_2000_t3, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_2000_df_t3 = pd.concat([pro_1be9_not_ss_df_2000_t3, pro_1be9_not_ss_1000_test_df])

In [94]:
pro_1be9_ss_3000_df_t1 = pd.concat([pro_1be9_ss_df_3000_t1, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_3000_df_t1 = pd.concat([pro_1be9_not_ss_df_3000_t1, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_3000_df_t2 = pd.concat([pro_1be9_ss_df_3000_t2, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_3000_df_t2 = pd.concat([pro_1be9_not_ss_df_3000_t2, pro_1be9_not_ss_1000_test_df])
pro_1be9_ss_3000_df_t3 = pd.concat([pro_1be9_ss_df_3000_t3, pro_1be9_ss_1000_test_df])
pro_1be9_not_ss_3000_df_t3 = pd.concat([pro_1be9_not_ss_df_3000_t3, pro_1be9_not_ss_1000_test_df])

In [95]:
# write data to formatted txt file

write_data_file("pro_1be9_MLformat_ss_500_train_1000_test_t1", protein_seq_pro_1be9, pro_1be9_ss_500_df_t1)
write_data_file("pro_1be9_MLformat_not_ss_500_train_1000_test_t1", protein_seq_pro_1be9, pro_1be9_not_ss_500_df_t1)
write_data_file("pro_1be9_MLformat_ss_500_train_1000_test_t2", protein_seq_pro_1be9, pro_1be9_ss_500_df_t2)
write_data_file("pro_1be9_MLformat_not_ss_500_train_1000_test_t2", protein_seq_pro_1be9, pro_1be9_not_ss_500_df_t2)
write_data_file("pro_1be9_MLformat_ss_500_train_1000_test_t3", protein_seq_pro_1be9, pro_1be9_ss_500_df_t3)
write_data_file("pro_1be9_MLformat_not_ss_500_train_1000_test_t3", protein_seq_pro_1be9, pro_1be9_not_ss_500_df_t3)

write_data_file("pro_1be9_MLformat_ss_1000_train_1000_test_t1", protein_seq_pro_1be9, pro_1be9_ss_1000_df_t1)
write_data_file("pro_1be9_MLformat_not_ss_1000_train_1000_test_t1", protein_seq_pro_1be9, pro_1be9_not_ss_1000_df_t1)
write_data_file("pro_1be9_MLformat_ss_1000_train_1000_test_t2", protein_seq_pro_1be9, pro_1be9_ss_1000_df_t2)
write_data_file("pro_1be9_MLformat_not_ss_1000_train_1000_test_t2", protein_seq_pro_1be9, pro_1be9_not_ss_1000_df_t2)
write_data_file("pro_1be9_MLformat_ss_1000_train_1000_test_t3", protein_seq_pro_1be9, pro_1be9_ss_1000_df_t3)
write_data_file("pro_1be9_MLformat_not_ss_1000_train_1000_test_t3", protein_seq_pro_1be9, pro_1be9_not_ss_1000_df_t3)

write_data_file("pro_1be9_MLformat_ss_2000_train_1000_test_t1", protein_seq_pro_1be9, pro_1be9_ss_2000_df_t1)
write_data_file("pro_1be9_MLformat_not_ss_2000_train_1000_test_t1", protein_seq_pro_1be9, pro_1be9_not_ss_2000_df_t1)
write_data_file("pro_1be9_MLformat_ss_2000_train_1000_test_t2", protein_seq_pro_1be9, pro_1be9_ss_2000_df_t2)
write_data_file("pro_1be9_MLformat_not_ss_2000_train_1000_test_t2", protein_seq_pro_1be9, pro_1be9_not_ss_2000_df_t2)
write_data_file("pro_1be9_MLformat_ss_2000_train_1000_test_t3", protein_seq_pro_1be9, pro_1be9_ss_2000_df_t3)
write_data_file("pro_1be9_MLformat_not_ss_2000_train_1000_test_t3", protein_seq_pro_1be9, pro_1be9_not_ss_2000_df_t3)

Filename: gb1_MLformat_ss_500_train_1000_test_t1.txt
Filename: gb1_MLformat_not_ss_500_train_1000_test_t1.txt
Filename: gb1_MLformat_ss_500_train_1000_test_t2.txt
Filename: gb1_MLformat_not_ss_500_train_1000_test_t2.txt
Filename: gb1_MLformat_ss_500_train_1000_test_t3.txt
Filename: gb1_MLformat_not_ss_500_train_1000_test_t3.txt
Filename: gb1_MLformat_ss_1000_train_1000_test_t1.txt
Filename: gb1_MLformat_not_ss_1000_train_1000_test_t1.txt
Filename: gb1_MLformat_ss_1000_train_1000_test_t2.txt
Filename: gb1_MLformat_not_ss_1000_train_1000_test_t2.txt
Filename: gb1_MLformat_ss_1000_train_1000_test_t3.txt
Filename: gb1_MLformat_not_ss_1000_train_1000_test_t3.txt
Filename: gb1_MLformat_ss_2000_train_1000_test_t1.txt
Filename: gb1_MLformat_not_ss_2000_train_1000_test_t1.txt
Filename: gb1_MLformat_ss_2000_train_1000_test_t2.txt
Filename: gb1_MLformat_not_ss_2000_train_1000_test_t2.txt
Filename: gb1_MLformat_ss_2000_train_1000_test_t3.txt
Filename: gb1_MLformat_not_ss_2000_train_1000_test_t3.tx

In [None]:
# not enough for pt. 1

## Small ubiquitin-related modifier 1

In [332]:
modifier_1_df1 = pd.read_csv("../Raw Data/modifier_1_mod.csv")

In [333]:
# modifier_1_df1.columns = modifier_1_df1.iloc[3]
print(modifier_1_df1.columns)
print(len(modifier_1_df1))

modifier_1_df = modifier_1_df1[(modifier_1_df1["hgvs_pro"].str.contains("=") == False) & (modifier_1_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (modifier_1_df1["hgvs_pro"].str.contains("Ter") == False)]
print(len(modifier_1_df))
print(modifier_1_df.head(1))

# shuffle values
modifier_1_df = modifier_1_df.sample(frac=1)

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'sd', 'se',
       'exp.score', 'exp.sd', 'df', 'pred.score'],
      dtype='object')
2020
1919
                   accession  hgvs_nt  hgvs_splice   hgvs_pro     score  \
0  urn:mavedb:00000001-b-1#1      NaN          NaN  p.Glu5Lys  1.311357   

         sd        se  exp.score    exp.sd   df  pred.score  
0  0.085569  0.042785    1.31651  0.024947  4.0    1.117086  


In [334]:
# getting uniprot to compare offset
protein_seq_modifier_1 = ssf.get_protein_seq("P63165")
# offset of 1
print(protein_seq_modifier_1)
print(len(protein_seq_modifier_1))
protein_seq_modifier_1_split = protein_seq_modifier_1.split()

MET SER ASP GLN GLU ALA LYS PRO SER THR GLU ASP LEU GLY ASP LYS LYS GLU GLY GLU TYR ILE LYS LEU LYS VAL ILE GLY GLN ASP SER SER GLU ILE HIS PHE LYS VAL LYS MET THR THR HIS LEU LYS LYS LEU LYS GLU SER TYR CYS GLN ARG GLN GLY VAL PRO MET ASN SER LEU ARG PHE LEU PHE GLU GLY GLN ARG ILE ALA ASP ASN HIS THR PRO LYS GLU LEU GLY MET GLU GLU GLU ASP VAL ILE GLU VAL TYR GLN GLU GLN THR GLY GLY HIS SER THR VAL
403


In [335]:
protein_seq_modifier_1 = protein_seq_modifier_1_split[17:98]

protein_seq_modifier_1_cut = ""
for residue in protein_seq_modifier_1:

    protein_seq_modifier_1_cut += residue + " "

protein_seq_modifier_1 = protein_seq_modifier_1_cut.rstrip()
print(protein_seq_modifier_1)

GLU GLY GLU TYR ILE LYS LEU LYS VAL ILE GLY GLN ASP SER SER GLU ILE HIS PHE LYS VAL LYS MET THR THR HIS LEU LYS LYS LEU LYS GLU SER TYR CYS GLN ARG GLN GLY VAL PRO MET ASN SER LEU ARG PHE LEU PHE GLU GLY GLN ARG ILE ALA ASP ASN HIS THR PRO LYS GLU LEU GLY MET GLU GLU GLU ASP VAL ILE GLU VAL TYR GLN GLU GLN THR GLY GLY HIS


In [336]:
protein_seq_modifier_1_split = protein_seq_modifier_1.split()
print(protein_seq_modifier_1_split[55:57])

['ASP', 'ASN']


In [337]:
modifier_1_df["variant"] = format_mavedb_variant(modifier_1_df, "hgvs_pro", -1)

In [338]:
# splitting variant list if there are multiple mutations
modifier_1_mut = modifier_1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
modifier_1_df["WILD_TYPE_RES"] = ssf.get_wild_type(modifier_1_mut)

# get mutated residue and place in seperate col
modifier_1_df["MUTATED_RES"] = ssf.get_mutation_type(modifier_1_mut)

# get position and place in seperate col
modifier_1_df["POSITION"] = ssf.get_position(modifier_1_mut)

# replace variant column with reformatted variant name
modifier_1_df["variant"] = ssf.get_mutations_names_list(modifier_1_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

AttributeError: 'float' object has no attribute 'split'

In [331]:
# getting training and test datasets

# get ss position indexes
# path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path, 'r')

modifier_1_ss_indexes = ssf.get_all_sec_struc_boolean(modifier_1_stride_file) # boolean list of secondary structure assignements
print(modifier_1_ss_indexes.count(True))
print(modifier_1_ss_indexes.count(False))

# need positionssplit
modifier_1_df["positions_split"] = ssf.get_positions_split(modifier_1_df)

# add in_sec_str_col
modifier_1_df = add_sec_str_col(modifier_1_df, modifier_1_ss_indexes, 0)

71
30


AttributeError: 'float' object has no attribute 'split'

In [193]:
not_included_modifier_1_v1 = get_excluded_res(modifier_1_ss_indexes)

path = "../PDB and STRIDE Files/" + 'modifier_1.pdb.pdb'
modifier_1_pdb = open(path, 'r')

low_confidence_indices = get_low_confidence_indices(modifier_1_pdb)
not_included_modifier_1_testing = not_included_modifier_1_v1+low_confidence_indices
not_included_modifier_1 = sorted(np.unique(not_included_modifier_1_v1+low_confidence_indices))

Num True Indices: 71
Num False Indices: 30
Difference: 41
Num Indices to Remove: 41


In [194]:
modifier_1_in_domain_df = get_domain_dataset_v2(modifier_1_df, 0, 2000, not_included_modifier_1)
print(len(modifier_1_in_domain_df))

817


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [195]:
modifier_1_ss_df = get_ss_dataset(modifier_1_in_domain_df, modifier_1_ss_indexes, 0)
print(len(modifier_1_ss_df))

                        accession  hgvs_nt  hgvs_splice     hgvs_pro  \
0     urn:mavedb:00000001-b-1#507      NaN          NaN   p.Thr76Val   
1    urn:mavedb:00000001-b-1#1433      NaN          NaN   p.Val26Asp   
2    urn:mavedb:00000001-b-1#1923      NaN          NaN   p.Leu65Ser   
3    urn:mavedb:00000001-b-1#1653      NaN          NaN   p.Pro77Leu   
4    urn:mavedb:00000001-b-1#1436      NaN          NaN   p.Gly97His   
5     urn:mavedb:00000001-b-1#385      NaN          NaN   p.Gln94Asn   
6    urn:mavedb:00000001-b-1#1784      NaN          NaN   p.Val57Leu   
7     urn:mavedb:00000001-b-1#760      NaN          NaN   p.Gly97Phe   
8    urn:mavedb:00000001-b-1#1491      NaN          NaN   p.Pro77Val   
9     urn:mavedb:00000001-b-1#514      NaN          NaN   p.Gln53Leu   
10   urn:mavedb:00000001-b-1#1845      NaN          NaN   p.Asp73Ala   
11   urn:mavedb:00000001-b-1#1441      NaN          NaN   p.His43Arg   
12    urn:mavedb:00000001-b-1#950      NaN          NaN   p.Lys7

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [196]:
modifier_1_not_ss_df = get_not_ss_dataset(modifier_1_in_domain_df, modifier_1_ss_indexes, 0)
print(len(modifier_1_not_ss_df))

323


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


500 Value Test dataset in SS

In [254]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
modifier_1_ss_50_test_df = modifier_1_ss_df.sample(n=50)

In [255]:
modifier_1_temp_df = pd.concat([modifier_1_ss_50_test_df, modifier_1_ss_df])
print(len(modifier_1_temp_df))
modifier_1_ss_df = modifier_1_temp_df[~modifier_1_temp_df.index.duplicated(keep=False)]
print(len(modifier_1_ss_df))

544
444


500 Value Test dataset not in SS

In [256]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
modifier_1_not_ss_50_test_df = modifier_1_not_ss_df.sample(n=50)

In [257]:
modifier_1_temp_df = pd.concat([modifier_1_not_ss_50_test_df, modifier_1_not_ss_df])
print(len(modifier_1_temp_df))
modifier_1_not_ss_df = modifier_1_temp_df[~modifier_1_temp_df.index.duplicated(keep=False)]
print(len(modifier_1_not_ss_df))

373
273


Training Data

In [40]:
modifier_1_ss_df_400_t1 = modifier_1_ss_df.sample(n=400)
modifier_1_ss_df_500_t2 = modifier_1_ss_df.sample(n=500)
modifier_1_ss_df_500_t3 = modifier_1_ss_df.sample(n=500)

In [258]:
modifier_1_ss_df_200_t1 = modifier_1_ss_df.sample(n=200)
modifier_1_ss_df_200_t2 = modifier_1_ss_df.sample(n=200)
modifier_1_ss_df_200_t3 = modifier_1_ss_df.sample(n=200)

In [259]:
modifier_1_not_ss_df_200_t1 = modifier_1_not_ss_df.sample(n=200)
modifier_1_not_ss_df_200_t2 = modifier_1_not_ss_df.sample(n=200)
modifier_1_not_ss_df_200_t3 = modifier_1_not_ss_df.sample(n=200)

In [85]:
modifier_1_ss_df_2000_t1 = modifier_1_ss_df.sample(n=2000)
modifier_1_ss_df_2000_t2 = modifier_1_ss_df.sample(n=2000)
modifier_1_ss_df_2000_t3 = modifier_1_ss_df.sample(n=2000)

In [86]:
modifier_1_ss_df_3000_t1 = modifier_1_ss_df.sample(n=3000)
modifier_1_ss_df_3000_t2 = modifier_1_ss_df.sample(n=3000)
modifier_1_ss_df_3000_t3 = modifier_1_ss_df.sample(n=3000)

In [41]:
modifier_1_not_ss_df_400_t1 = modifier_1_not_ss_df.sample(n=400)
# modifier_1_not_ss_df_500_t2 = modifier_1_not_ss_df.sample(n=500)
# modifier_1_not_ss_df_500_t3 = modifier_1_not_ss_df.sample(n=500)

In [89]:
modifier_1_not_ss_df_2000_t1 = modifier_1_not_ss_df.sample(n=2000)
modifier_1_not_ss_df_2000_t2 = modifier_1_not_ss_df.sample(n=2000)
modifier_1_not_ss_df_2000_t3 = modifier_1_not_ss_df.sample(n=2000)

In [90]:
modifier_1_not_ss_df_3000_t1 = modifier_1_not_ss_df.sample(n=3000)
modifier_1_not_ss_df_3000_t2 = modifier_1_not_ss_df.sample(n=3000)
modifier_1_not_ss_df_3000_t3 = modifier_1_not_ss_df.sample(n=3000)

In [42]:
modifier_1_ss_400_df_t1 = pd.concat([modifier_1_ss_df_400_t1, modifier_1_ss_100_test_df])
modifier_1_not_ss_400_df_t1 = pd.concat([modifier_1_not_ss_df_400_t1, modifier_1_not_ss_100_test_df])
# modifier_1_ss_500_df_t2 = pd.concat([modifier_1_ss_df_500_t2, modifier_1_ss_1000_test_df])
# modifier_1_not_ss_500_df_t2 = pd.concat([modifier_1_not_ss_df_500_t2, modifier_1_not_ss_1000_test_df])
# modifier_1_ss_500_df_t3 = pd.concat([modifier_1_ss_df_500_t3, modifier_1_ss_1000_test_df])
# modifier_1_not_ss_500_df_t3 = pd.concat([modifier_1_not_ss_df_500_t3, modifier_1_not_ss_1000_test_df])

In [92]:
modifier_1_ss_1000_df_t1 = pd.concat([modifier_1_ss_df_1000_t1, modifier_1_ss_1000_test_df])
modifier_1_not_ss_1000_df_t1 = pd.concat([modifier_1_not_ss_df_1000_t1, modifier_1_not_ss_1000_test_df])
modifier_1_ss_1000_df_t2 = pd.concat([modifier_1_ss_df_1000_t2, modifier_1_ss_1000_test_df])
modifier_1_not_ss_1000_df_t2 = pd.concat([modifier_1_not_ss_df_1000_t2, modifier_1_not_ss_1000_test_df])
modifier_1_ss_1000_df_t3 = pd.concat([modifier_1_ss_df_1000_t3, modifier_1_ss_1000_test_df])
modifier_1_not_ss_1000_df_t3 = pd.concat([modifier_1_not_ss_df_1000_t3, modifier_1_not_ss_1000_test_df])

In [260]:
modifier_1_ss_200_df_t1 = pd.concat([modifier_1_ss_df_200_t1, modifier_1_ss_50_test_df])
modifier_1_not_ss_200_df_t1 = pd.concat([modifier_1_not_ss_df_200_t1, modifier_1_not_ss_50_test_df])
modifier_1_ss_200_df_t2 = pd.concat([modifier_1_ss_df_200_t2, modifier_1_ss_50_test_df])
modifier_1_not_ss_200_df_t2 = pd.concat([modifier_1_not_ss_df_200_t2, modifier_1_not_ss_50_test_df])
modifier_1_ss_200_df_t3 = pd.concat([modifier_1_ss_df_200_t3, modifier_1_ss_50_test_df])
modifier_1_not_ss_200_df_t3 = pd.concat([modifier_1_not_ss_df_200_t3, modifier_1_not_ss_50_test_df])

In [94]:
modifier_1_ss_3000_df_t1 = pd.concat([modifier_1_ss_df_3000_t1, modifier_1_ss_1000_test_df])
modifier_1_not_ss_3000_df_t1 = pd.concat([modifier_1_not_ss_df_3000_t1, modifier_1_not_ss_1000_test_df])
modifier_1_ss_3000_df_t2 = pd.concat([modifier_1_ss_df_3000_t2, modifier_1_ss_1000_test_df])
modifier_1_not_ss_3000_df_t2 = pd.concat([modifier_1_not_ss_df_3000_t2, modifier_1_not_ss_1000_test_df])
modifier_1_ss_3000_df_t3 = pd.concat([modifier_1_ss_df_3000_t3, modifier_1_ss_1000_test_df])
modifier_1_not_ss_3000_df_t3 = pd.concat([modifier_1_not_ss_df_3000_t3, modifier_1_not_ss_1000_test_df])

In [261]:
# write data to formatted txt file

write_data_file("modifier_1_MLformat_ss_200_train_50_test_turns1", protein_seq_modifier_1, modifier_1_ss_200_df_t1)
write_data_file("modifier_1_MLformat_not_ss_200_train_50_test_turns1", protein_seq_modifier_1, modifier_1_not_ss_200_df_t1)
write_data_file("modifier_1_MLformat_ss_200_train_500_test_turns2", protein_seq_modifier_1, modifier_1_ss_200_df_t2)
write_data_file("modifier_1_MLformat_not_ss_200_train_50_test_turns2", protein_seq_modifier_1, modifier_1_not_ss_200_df_t2)
write_data_file("modifier_1_MLformat_ss_200_train_50_test_turns3", protein_seq_modifier_1, modifier_1_ss_200_df_t3)
write_data_file("modifier_1_MLformat_not_ss_200_train_50_test_turns3", protein_seq_modifier_1, modifier_1_not_ss_200_df_t3)

# write_data_file("modifier_1_MLformat_ss_500_train_500_test_t1", protein_seq_modifier_1, modifier_1_ss_500_df_t1)
# write_data_file("modifier_1_MLformat_not_ss_500_train_500_test_t1", protein_seq_modifier_1, modifier_1_not_ss_500_df_t1)
# write_data_file("modifier_1_MLformat_ss_500_train_500_test_t2", protein_seq_modifier_1, modifier_1_ss_500_df_t2)
# write_data_file("modifier_1_MLformat_not_ss_500_train_500_test_t2", protein_seq_modifier_1, modifier_1_not_ss_500_df_t2)
# write_data_file("modifier_1_MLformat_ss_500_train_500_test_t3", protein_seq_modifier_1, modifier_1_ss_500_df_t3)
# write_data_file("modifier_1_MLformat_not_ss_500_train_500_test_t3", protein_seq_modifier_1, modifier_1_not_ss_500_df_t3)

# write_data_file("modifier_1_MLformat_ss_2000_train_500_test_t1", protein_seq_modifier_1, modifier_1_ss_2000_df_t1)
# write_data_file("modifier_1_MLformat_not_ss_2000_train_500_test_t1", protein_seq_modifier_1, modifier_1_not_ss_2000_df_t1)
# write_data_file("modifier_1_MLformat_ss_2000_train_500_test_t2", protein_seq_modifier_1, modifier_1_ss_2000_df_t2)
# write_data_file("modifier_1_MLformat_not_ss_2000_train_500_test_t2", protein_seq_modifier_1, modifier_1_not_ss_2000_df_t2)
# write_data_file("modifier_1_MLformat_ss_2000_train_500_test_t3", protein_seq_modifier_1, modifier_1_ss_2000_df_t3)
# write_data_file("modifier_1_MLformat_not_ss_2000_train_500_test_t3", protein_seq_modifier_1, modifier_1_not_ss_2000_df_t3)

Filename: modifier_1_MLformat_ss_200_train_50_test_turns1.txt
Filename: modifier_1_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: modifier_1_MLformat_ss_200_train_500_test_turns2.txt
Filename: modifier_1_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: modifier_1_MLformat_ss_200_train_50_test_turns3.txt
Filename: modifier_1_MLformat_not_ss_200_train_50_test_turns3.txt


## Thermonuclease

In [141]:
# important df i need for this
path = "../Raw Data/" + 'all_data_clean.csv'
df = pd.read_csv(path)

In [142]:
# finding column with most values
# print(nuclease_df)
# print(df.loc[df['UNIPROT_ID'] == 'P00644'])
nuclease_df = df.loc[df['UNIPROT_ID'] == 'P00644']
# print(nuclease_df.head(50))
nuclease_df = nuclease_df.loc[nuclease_df['PROTEIN'].str.contains('Thiol:disulfide interchange protein DsbA') == False]
# print(nuclease_df.columns)
nuclease_df = nuclease_df[nuclease_df['ddG_(kcal/mol)'].notna()]
nuclease_df = nuclease_df[nuclease_df['MUTATION'].str.contains('wild-type') == False]
# print(nuclease_df['ddG_(kcal/mol)'].tail(40))

# rename ddG to score and mutation to variant
nuclease_df = nuclease_df.rename(columns={'MUTATION': 'variant', 'ddG_(kcal/mol)': 'score'})
# print(nuclease_df['score'])
nuclease_df['score'] = nuclease_df['score'].round(6)
# print(nuclease_df['variant'].head(30))

nuclease_df["positions_split"] = ssf.get_positions_split(nuclease_df)

positions_split_subtracted = []
for pos_list in nuclease_df["positions_split"]:
    pos_list = [x - 1 for x in pos_list]
    positions_split_subtracted.append(pos_list)  

nuclease_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in nuclease_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    # print(pos_string)
    new_positions.append(pos_string)
    pos_string = ""
# print(len(new_positions))
# print(len(nuclease_df["POSITION"]))

nuclease_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

In [143]:
# get protein from uniprot
protein_seq_nuclease = ssf.get_protein_seq('P00644')
print(protein_seq_nuclease)

MET LEU VAL MET THR GLU TYR LEU LEU SER ALA GLY ILE CYS MET ALA ILE VAL SER ILE LEU LEU ILE GLY MET ALA ILE SER ASN VAL SER LYS GLY GLN TYR ALA LYS ARG PHE PHE PHE PHE ALA THR SER CYS LEU VAL LEU THR LEU VAL VAL VAL SER SER LEU SER SER SER ALA ASN ALA SER GLN THR ASP ASN GLY VAL ASN ARG SER GLY SER GLU ASP PRO THR VAL TYR SER ALA THR SER THR LYS LYS LEU HIS LYS GLU PRO ALA THR LEU ILE LYS ALA ILE ASP GLY ASP THR VAL LYS LEU MET TYR LYS GLY GLN PRO MET THR PHE ARG LEU LEU LEU VAL ASP THR PRO GLU THR LYS HIS PRO LYS LYS GLY VAL GLU LYS TYR GLY PRO GLU ALA SER ALA PHE THR LYS LYS MET VAL GLU ASN ALA LYS LYS ILE GLU VAL GLU PHE ASP LYS GLY GLN ARG THR ASP LYS TYR GLY ARG GLY LEU ALA TYR ILE TYR ALA ASP GLY LYS MET VAL ASN GLU ALA LEU VAL ARG GLN GLY LEU ALA LYS VAL ALA TYR VAL TYR LYS PRO ASN ASN THR HIS GLU GLN HIS LEU ARG LYS SER GLU ALA GLN ALA LYS LYS GLU LYS LEU ASN ILE TRP SER GLU ASP ASN ALA ASP SER GLY GLN


In [144]:
protein_seq_nuclease_split = protein_seq_nuclease.split()
print(protein_seq_nuclease_split[221:224])
print(len(protein_seq_nuclease_split))

['TRP', 'SER', 'GLU']
231


In [145]:
nuclease_df["variant"] = ssf.get_mutations_names_list(nuclease_df)
# print(nuclease_df.head(10))

In [146]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'thermonuclease_stride.txt'
nuclease_stride_file = open(path, 'r')

In [147]:
nuclease_ss_indexes = ssf.get_all_sec_struc_boolean(nuclease_stride_file) # boolean list of secondary structure assignements from uniprot

In [148]:
nuclease_df = add_sec_str_col(nuclease_df, nuclease_ss_indexes, 0)

In [149]:
# print(nuclease_df.head(5))
print(len(nuclease_df))
nuclease_df = nuclease_df[nuclease_df['score'] != 0.0]
print(len(nuclease_df))

1280
1258


In [158]:
# nuclease_in_domain_df = get_domain_dataset(nuclease_df, 0, 1200)
not_included_nuclease_v1 = get_excluded_res(nuclease_ss_indexes)
# print(len(not_included_nuclease_v1))

path = "../PDB and STRIDE Files/" + 'thermonuclease_pdb.txt'
nuclease_pdb = open(path, 'r')

low_confidence_indices = get_low_confidence_indices(nuclease_pdb)
not_included_nuclease_testing = not_included_nuclease_v1+low_confidence_indices
not_included_nuclease = sorted(np.unique(not_included_nuclease_v1+low_confidence_indices))

Num True Indices: 193
Num False Indices: 38
Difference: 155
Num Indices to Remove: 155
155
33
188
176


In [159]:
nuclease_in_domain_df = get_domain_dataset_v2(nuclease_df, 0, 2000, not_included_nuclease)
print(len(nuclease_in_domain_df))

209


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [160]:
nuclease_ss_df = get_ss_dataset(nuclease_in_domain_df, nuclease_ss_indexes, 0)


      DATABASE                  PROTEIN UNIPROT_ID      variant  \
0     Protherm  Staphylococcal nuclease     P00644       160SER   
1     Protherm  Staphylococcal nuclease     P00644       169VAL   
2     Protherm  Staphylococcal nuclease     P00644       171SER   
3     Protherm  Staphylococcal nuclease     P00644       160SER   
4     Protherm  Staphylococcal nuclease     P00644       169VAL   
5     Protherm  Staphylococcal nuclease     P00644       171SER   
6     Protherm  Staphylococcal nuclease     P00644       170PHE   
7    ThermoMut           Thermonuclease     P00644       171GLY   
8    ThermoMut           Thermonuclease     P00644       171VAL   
9    ThermoMut           Thermonuclease     P00644       190GLY   
10   ThermoMut           Thermonuclease     P00644       190VAL   
11   ThermoMut           Thermonuclease     P00644       193VAL   
12   ThermoMut           Thermonuclease     P00644       123GLY   
13   ThermoMut           Thermonuclease     P00644       123AL

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [161]:
print(len(nuclease_ss_df))

137


In [162]:
nuclease_not_ss_df = get_not_ss_dataset(nuclease_in_domain_df, nuclease_ss_indexes, 0)
print(len(nuclease_not_ss_df))

72


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


1000 Value Test dataset in SS

In [216]:
# # find random test set, concat orig df and new test df, remove dups
# # DO NOT RERUN THIS BLOCK
# nuclease_ss_1000_test_df = nuclease_ss_df.sample(n=1000)

In [80]:
nuclease_temp_df = pd.concat([nuclease_ss_1000_test_df, nuclease_ss_df])
print(len(nuclease_temp_df))
nuclease_ss_df = nuclease_temp_df[~nuclease_temp_df.index.duplicated(keep=False)]
print(len(nuclease_ss_df))

121761
119761


1000 Value Test dataset not in SS

In [81]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
nuclease_not_ss_1000_test_df = nuclease_not_ss_df.sample(n=1000)

In [82]:
nuclease_temp_df = pd.concat([nuclease_not_ss_1000_test_df, nuclease_not_ss_df])
print(len(nuclease_temp_df))
nuclease_not_ss_df = nuclease_temp_df[~nuclease_temp_df.index.duplicated(keep=False)]
print(len(nuclease_not_ss_df))

44600
42600


Training Data

In [83]:
nuclease_ss_df_500_t1 = nuclease_ss_df.sample(n=500)
nuclease_ss_df_500_t2 = nuclease_ss_df.sample(n=500)
nuclease_ss_df_500_t3 = nuclease_ss_df.sample(n=500)

In [84]:
nuclease_ss_df_1000_t1 = nuclease_ss_df.sample(n=1000)
nuclease_ss_df_1000_t2 = nuclease_ss_df.sample(n=1000)
nuclease_ss_df_1000_t3 = nuclease_ss_df.sample(n=1000)

In [85]:
nuclease_ss_df_2000_t1 = nuclease_ss_df.sample(n=2000)
nuclease_ss_df_2000_t2 = nuclease_ss_df.sample(n=2000)
nuclease_ss_df_2000_t3 = nuclease_ss_df.sample(n=2000)

In [86]:
nuclease_ss_df_3000_t1 = nuclease_ss_df.sample(n=3000)
nuclease_ss_df_3000_t2 = nuclease_ss_df.sample(n=3000)
nuclease_ss_df_3000_t3 = nuclease_ss_df.sample(n=3000)

In [87]:
nuclease_not_ss_df_500_t1 = nuclease_not_ss_df.sample(n=500)
nuclease_not_ss_df_500_t2 = nuclease_not_ss_df.sample(n=500)
nuclease_not_ss_df_500_t3 = nuclease_not_ss_df.sample(n=500)

In [88]:
nuclease_not_ss_df_1000_t1 = nuclease_not_ss_df.sample(n=1000)
nuclease_not_ss_df_1000_t2 = nuclease_not_ss_df.sample(n=1000)
nuclease_not_ss_df_1000_t3 = nuclease_not_ss_df.sample(n=1000)

In [89]:
nuclease_not_ss_df_2000_t1 = nuclease_not_ss_df.sample(n=2000)
nuclease_not_ss_df_2000_t2 = nuclease_not_ss_df.sample(n=2000)
nuclease_not_ss_df_2000_t3 = nuclease_not_ss_df.sample(n=2000)

In [90]:
nuclease_not_ss_df_3000_t1 = nuclease_not_ss_df.sample(n=3000)
nuclease_not_ss_df_3000_t2 = nuclease_not_ss_df.sample(n=3000)
nuclease_not_ss_df_3000_t3 = nuclease_not_ss_df.sample(n=3000)

In [91]:
nuclease_ss_500_df_t1 = pd.concat([nuclease_ss_df_500_t1, nuclease_ss_1000_test_df])
nuclease_not_ss_500_df_t1 = pd.concat([nuclease_not_ss_df_500_t1, nuclease_not_ss_1000_test_df])
nuclease_ss_500_df_t2 = pd.concat([nuclease_ss_df_500_t2, nuclease_ss_1000_test_df])
nuclease_not_ss_500_df_t2 = pd.concat([nuclease_not_ss_df_500_t2, nuclease_not_ss_1000_test_df])
nuclease_ss_500_df_t3 = pd.concat([nuclease_ss_df_500_t3, nuclease_ss_1000_test_df])
nuclease_not_ss_500_df_t3 = pd.concat([nuclease_not_ss_df_500_t3, nuclease_not_ss_1000_test_df])

In [92]:
nuclease_ss_1000_df_t1 = pd.concat([nuclease_ss_df_1000_t1, nuclease_ss_1000_test_df])
nuclease_not_ss_1000_df_t1 = pd.concat([nuclease_not_ss_df_1000_t1, nuclease_not_ss_1000_test_df])
nuclease_ss_1000_df_t2 = pd.concat([nuclease_ss_df_1000_t2, nuclease_ss_1000_test_df])
nuclease_not_ss_1000_df_t2 = pd.concat([nuclease_not_ss_df_1000_t2, nuclease_not_ss_1000_test_df])
nuclease_ss_1000_df_t3 = pd.concat([nuclease_ss_df_1000_t3, nuclease_ss_1000_test_df])
nuclease_not_ss_1000_df_t3 = pd.concat([nuclease_not_ss_df_1000_t3, nuclease_not_ss_1000_test_df])

In [93]:
nuclease_ss_2000_df_t1 = pd.concat([nuclease_ss_df_2000_t1, nuclease_ss_1000_test_df])
nuclease_not_ss_2000_df_t1 = pd.concat([nuclease_not_ss_df_2000_t1, nuclease_not_ss_1000_test_df])
nuclease_ss_2000_df_t2 = pd.concat([nuclease_ss_df_2000_t2, nuclease_ss_1000_test_df])
nuclease_not_ss_2000_df_t2 = pd.concat([nuclease_not_ss_df_2000_t2, nuclease_not_ss_1000_test_df])
nuclease_ss_2000_df_t3 = pd.concat([nuclease_ss_df_2000_t3, nuclease_ss_1000_test_df])
nuclease_not_ss_2000_df_t3 = pd.concat([nuclease_not_ss_df_2000_t3, nuclease_not_ss_1000_test_df])

In [94]:
nuclease_ss_3000_df_t1 = pd.concat([nuclease_ss_df_3000_t1, nuclease_ss_1000_test_df])
nuclease_not_ss_3000_df_t1 = pd.concat([nuclease_not_ss_df_3000_t1, nuclease_not_ss_1000_test_df])
nuclease_ss_3000_df_t2 = pd.concat([nuclease_ss_df_3000_t2, nuclease_ss_1000_test_df])
nuclease_not_ss_3000_df_t2 = pd.concat([nuclease_not_ss_df_3000_t2, nuclease_not_ss_1000_test_df])
nuclease_ss_3000_df_t3 = pd.concat([nuclease_ss_df_3000_t3, nuclease_ss_1000_test_df])
nuclease_not_ss_3000_df_t3 = pd.concat([nuclease_not_ss_df_3000_t3, nuclease_not_ss_1000_test_df])

In [95]:
# write data to formatted txt file

write_data_file("nuclease_MLformat_ss_500_train_1000_test_t1", protein_seq_nuclease, nuclease_ss_500_df_t1)
write_data_file("nuclease_MLformat_not_ss_500_train_1000_test_t1", protein_seq_nuclease, nuclease_not_ss_500_df_t1)
write_data_file("nuclease_MLformat_ss_500_train_1000_test_t2", protein_seq_nuclease, nuclease_ss_500_df_t2)
write_data_file("nuclease_MLformat_not_ss_500_train_1000_test_t2", protein_seq_nuclease, nuclease_not_ss_500_df_t2)
write_data_file("nuclease_MLformat_ss_500_train_1000_test_t3", protein_seq_nuclease, nuclease_ss_500_df_t3)
write_data_file("nuclease_MLformat_not_ss_500_train_1000_test_t3", protein_seq_nuclease, nuclease_not_ss_500_df_t3)

write_data_file("nuclease_MLformat_ss_1000_train_1000_test_t1", protein_seq_nuclease, nuclease_ss_1000_df_t1)
write_data_file("nuclease_MLformat_not_ss_1000_train_1000_test_t1", protein_seq_nuclease, nuclease_not_ss_1000_df_t1)
write_data_file("nuclease_MLformat_ss_1000_train_1000_test_t2", protein_seq_nuclease, nuclease_ss_1000_df_t2)
write_data_file("nuclease_MLformat_not_ss_1000_train_1000_test_t2", protein_seq_nuclease, nuclease_not_ss_1000_df_t2)
write_data_file("nuclease_MLformat_ss_1000_train_1000_test_t3", protein_seq_nuclease, nuclease_ss_1000_df_t3)
write_data_file("nuclease_MLformat_not_ss_1000_train_1000_test_t3", protein_seq_nuclease, nuclease_not_ss_1000_df_t3)

write_data_file("nuclease_MLformat_ss_2000_train_1000_test_t1", protein_seq_nuclease, nuclease_ss_2000_df_t1)
write_data_file("nuclease_MLformat_not_ss_2000_train_1000_test_t1", protein_seq_nuclease, nuclease_not_ss_2000_df_t1)
write_data_file("nuclease_MLformat_ss_2000_train_1000_test_t2", protein_seq_nuclease, nuclease_ss_2000_df_t2)
write_data_file("nuclease_MLformat_not_ss_2000_train_1000_test_t2", protein_seq_nuclease, nuclease_not_ss_2000_df_t2)
write_data_file("nuclease_MLformat_ss_2000_train_1000_test_t3", protein_seq_nuclease, nuclease_ss_2000_df_t3)
write_data_file("nuclease_MLformat_not_ss_2000_train_1000_test_t3", protein_seq_nuclease, nuclease_not_ss_2000_df_t3)

Filename: gb1_MLformat_ss_500_train_1000_test_t1.txt
Filename: gb1_MLformat_not_ss_500_train_1000_test_t1.txt
Filename: gb1_MLformat_ss_500_train_1000_test_t2.txt
Filename: gb1_MLformat_not_ss_500_train_1000_test_t2.txt
Filename: gb1_MLformat_ss_500_train_1000_test_t3.txt
Filename: gb1_MLformat_not_ss_500_train_1000_test_t3.txt
Filename: gb1_MLformat_ss_1000_train_1000_test_t1.txt
Filename: gb1_MLformat_not_ss_1000_train_1000_test_t1.txt
Filename: gb1_MLformat_ss_1000_train_1000_test_t2.txt
Filename: gb1_MLformat_not_ss_1000_train_1000_test_t2.txt
Filename: gb1_MLformat_ss_1000_train_1000_test_t3.txt
Filename: gb1_MLformat_not_ss_1000_train_1000_test_t3.txt
Filename: gb1_MLformat_ss_2000_train_1000_test_t1.txt
Filename: gb1_MLformat_not_ss_2000_train_1000_test_t1.txt
Filename: gb1_MLformat_ss_2000_train_1000_test_t2.txt
Filename: gb1_MLformat_not_ss_2000_train_1000_test_t2.txt
Filename: gb1_MLformat_ss_2000_train_1000_test_t3.txt
Filename: gb1_MLformat_not_ss_2000_train_1000_test_t3.tx

## Human Glucokinase

In [163]:
glucokinase_df1 = pd.read_csv("../Raw Data/glucokinase_scores.csv")

In [164]:
# take note of offset

glucokinase_df1.columns = glucokinase_df1.iloc[3]
print(glucokinase_df1.columns)
print(len(glucokinase_df1))


glucokinase_df = glucokinase_df1[(glucokinase_df1["hgvs_pro"].str.contains("del") == False) & (glucokinase_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (glucokinase_df1["hgvs_pro"].str.contains("Ter") == False)]

glucokinase_df = glucokinase_df1[(glucokinase_df1["hgvs_pro"].str.contains("=") == False) & (glucokinase_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (glucokinase_df1["hgvs_pro"].str.contains("Ter") == False)]

glucokinase_df = glucokinase_df.sample(frac=1)

print(len(glucokinase_df))

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'sd', 'df',
       'se'],
      dtype='object', name=3)
9366
8570


In [165]:
# getting uniprot to compare offset
protein_seq_glucokinase = ssf.get_protein_seq("P35557")
# offset of 1
print(protein_seq_glucokinase)
print(len(protein_seq_glucokinase))

MET LEU ASP ASP ARG ALA ARG MET GLU ALA ALA LYS LYS GLU LYS VAL GLU GLN ILE LEU ALA GLU PHE GLN LEU GLN GLU GLU ASP LEU LYS LYS VAL MET ARG ARG MET GLN LYS GLU MET ASP ARG GLY LEU ARG LEU GLU THR HIS GLU GLU ALA SER VAL LYS MET LEU PRO THR TYR VAL ARG SER THR PRO GLU GLY SER GLU VAL GLY ASP PHE LEU SER LEU ASP LEU GLY GLY THR ASN PHE ARG VAL MET LEU VAL LYS VAL GLY GLU GLY GLU GLU GLY GLN TRP SER VAL LYS THR LYS HIS GLN MET TYR SER ILE PRO GLU ASP ALA MET THR GLY THR ALA GLU MET LEU PHE ASP TYR ILE SER GLU CYS ILE SER ASP PHE LEU ASP LYS HIS GLN MET LYS HIS LYS LYS LEU PRO LEU GLY PHE THR PHE SER PHE PRO VAL ARG HIS GLU ASP ILE ASP LYS GLY ILE LEU LEU ASN TRP THR LYS GLY PHE LYS ALA SER GLY ALA GLU GLY ASN ASN VAL VAL GLY LEU LEU ARG ASP ALA ILE LYS ARG ARG GLY ASP PHE GLU MET ASP VAL VAL ALA MET VAL ASN ASP THR VAL ALA THR MET ILE SER CYS TYR TYR GLU ASP HIS GLN CYS GLU VAL GLY MET ILE VAL GLY THR GLY CYS ASN ALA CYS TYR MET GLU GLU MET GLN ASN VAL GLU LEU VAL GLU GLY ASP GLU GLY ARG 

In [166]:
protein_seq_glucokinase_split = protein_seq_glucokinase.split()
print(len(protein_seq_glucokinase_split))
print(protein_seq_glucokinase_split[186:189])

465
['ASP', 'ALA', 'ILE']


In [167]:
glucokinase_df["variant"] = format_mavedb_variant(glucokinase_df, "hgvs_pro", -1)

In [168]:
# splitting variant list if there are multiple mutations
glucokinase_mut = glucokinase_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
glucokinase_df["WILD_TYPE_RES"] = ssf.get_wild_type(glucokinase_mut)

# get mutated residue and place in seperate col
glucokinase_df["MUTATED_RES"] = ssf.get_mutation_type(glucokinase_mut)

# get position and place in seperate col
glucokinase_df["POSITION"] = ssf.get_position(glucokinase_mut)

# replace variant column with reformatted variant name
glucokinase_df["variant"] = ssf.get_mutations_names_list(glucokinase_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

In [169]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'glucokinase_stride.txt'
glucokinase_stride_file = open(path, 'r')

glucokinase_ss_indexes = ssf.get_all_sec_struc_boolean(glucokinase_stride_file) # boolean list of secondary structure assignements

# need positionssplit
glucokinase_df["positions_split"] = ssf.get_positions_split(glucokinase_df)

# add in_sec_str_col
glucokinase_df = add_sec_str_col(glucokinase_df, glucokinase_ss_indexes, 0)

In [177]:
not_included_glucokinase_v1 = get_excluded_res(glucokinase_ss_indexes)

path = "../PDB and STRIDE Files/" + 'glucokinase.pdb'
glucokinase_pdb = open(path, 'r')

low_confidence_indices = get_low_confidence_indices(glucokinase_pdb)
not_included_glucokinase_testing = not_included_glucokinase_v1+low_confidence_indices
not_included_glucokinase = sorted(np.unique(not_included_glucokinase_v1+low_confidence_indices))

Num True Indices: 403
Num False Indices: 62
Difference: 341
Num Indices to Remove: 341


In [178]:
glucokinase_in_domain_df = get_domain_dataset_v2(glucokinase_df, 0, 2000, not_included_glucokinase)
print(len(glucokinase_in_domain_df))

2289


  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [179]:
glucokinase_ss_df = get_ss_dataset(glucokinase_in_domain_df, glucokinase_ss_indexes, 0)
print(len(glucokinase_ss_df))

3                        accession hgvs_nt hgvs_splice     hgvs_pro  \
0     urn:mavedb:00000096-a-1#2287     NaN         NaN   p.Ser64Trp   
1     urn:mavedb:00000096-a-1#3701     NaN         NaN  p.Leu314Lys   
2     urn:mavedb:00000096-a-1#5769     NaN         NaN  p.Gly294Trp   
3     urn:mavedb:00000096-a-1#1716     NaN         NaN   p.Pro66Glu   
4     urn:mavedb:00000096-a-1#1112     NaN         NaN  p.Thr255Met   
...                            ...     ...         ...          ...   
2284  urn:mavedb:00000096-a-1#5782     NaN         NaN  p.Gly294Cys   
2285  urn:mavedb:00000096-a-1#6302     NaN         NaN   p.Glu27Ala   
2286  urn:mavedb:00000096-a-1#4928     NaN         NaN  p.Ile293Cys   
2287  urn:mavedb:00000096-a-1#2533     NaN         NaN  p.Phe152Val   
2288  urn:mavedb:00000096-a-1#4675     NaN         NaN  p.Ile110Leu   

3            score           sd df           se variant WILD_TYPE_RES  \
0      2.466990316  0.055858365  2  0.039497829   63TRP             S   
1

  sec_str_df = sec_str_df.append(rows, ignore_index=True)


In [180]:
glucokinase_not_ss_df = get_not_ss_dataset(glucokinase_in_domain_df, glucokinase_ss_indexes, 0)
print(len(glucokinase_not_ss_df))

1154


  not_sec_str_df = not_sec_str_df.append(rows, ignore_index=True)


In [246]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
glucokinase_ss_50_test_df = glucokinase_ss_df.sample(n=50)

In [247]:
glucokinase_temp_df = pd.concat([glucokinase_ss_50_test_df, glucokinase_ss_df])
print(len(glucokinase_temp_df))
glucokinase_ss_df = glucokinase_temp_df[~glucokinase_temp_df.index.duplicated(keep=False)]
print(len(glucokinase_ss_df))

1185
1085


1000 Value Test dataset not in SS

In [248]:
# find random test set, concat orig df and new test df, remove dups
# DO NOT RERUN THIS BLOCK
glucokinase_not_ss_50_test_df = glucokinase_not_ss_df.sample(n=50)

In [249]:
glucokinase_temp_df = pd.concat([glucokinase_not_ss_50_test_df, glucokinase_not_ss_df])
print(len(glucokinase_temp_df))
glucokinase_not_ss_df = glucokinase_temp_df[~glucokinase_temp_df.index.duplicated(keep=False)]
print(len(glucokinase_not_ss_df))

1204
1104


In [250]:
glucokinase_ss_df_200_t1 = glucokinase_ss_df.sample(n=200)
glucokinase_ss_df_200_t2 = glucokinase_ss_df.sample(n=200)
glucokinase_ss_df_200_t3 = glucokinase_ss_df.sample(n=200)

In [251]:
glucokinase_not_ss_df_200_t1 = glucokinase_not_ss_df.sample(n=200)
glucokinase_not_ss_df_200_t2 = glucokinase_not_ss_df.sample(n=200)
glucokinase_not_ss_df_200_t3 = glucokinase_not_ss_df.sample(n=200)

In [255]:
glucokinase_ss_df_1000_t1 = glucokinase_ss_df.sample(n=1000)
glucokinase_ss_df_1000_t2 = glucokinase_ss_df.sample(n=1000)
glucokinase_ss_df_1000_t3 = glucokinase_ss_df.sample(n=1000)

In [258]:
glucokinase_not_ss_df_1000_t1 = glucokinase_not_ss_df.sample(n=1000)
glucokinase_not_ss_df_1000_t2 = glucokinase_not_ss_df.sample(n=1000)
glucokinase_not_ss_df_1000_t3 = glucokinase_not_ss_df.sample(n=1000)

In [252]:
glucokinase_ss_200_df_t1 = pd.concat([glucokinase_ss_df_200_t1, glucokinase_ss_50_test_df])
glucokinase_not_ss_200_df_t1 = pd.concat([glucokinase_not_ss_df_200_t1, glucokinase_not_ss_50_test_df])
glucokinase_ss_200_df_t2 = pd.concat([glucokinase_ss_df_200_t2, glucokinase_ss_50_test_df])
glucokinase_not_ss_200_df_t2 = pd.concat([glucokinase_not_ss_df_200_t2, glucokinase_not_ss_50_test_df])
glucokinase_ss_200_df_t3 = pd.concat([glucokinase_ss_df_200_t3, glucokinase_ss_50_test_df])
glucokinase_not_ss_200_df_t3 = pd.concat([glucokinase_not_ss_df_200_t3, glucokinase_not_ss_50_test_df])

In [260]:
glucokinase_ss_1000_df_t1 = pd.concat([glucokinase_ss_df_1000_t1, glucokinase_ss_1000_test_df])
glucokinase_not_ss_1000_df_t1 = pd.concat([glucokinase_not_ss_df_1000_t1, glucokinase_not_ss_1000_test_df])
glucokinase_ss_1000_df_t2 = pd.concat([glucokinase_ss_df_1000_t2, glucokinase_ss_1000_test_df])
glucokinase_not_ss_1000_df_t2 = pd.concat([glucokinase_not_ss_df_1000_t2, glucokinase_not_ss_1000_test_df])
glucokinase_ss_1000_df_t3 = pd.concat([glucokinase_ss_df_1000_t3, glucokinase_ss_1000_test_df])
glucokinase_not_ss_1000_df_t3 = pd.concat([glucokinase_not_ss_df_1000_t3, glucokinase_not_ss_1000_test_df])

In [253]:
# write data to formatted txt file

write_data_file("glucokinase_MLformat_ss_200_train_50_test_turns1", protein_seq_glucokinase, glucokinase_ss_200_df_t1)
write_data_file("glucokinase_MLformat_not_ss_200_train_50_test_turns1", protein_seq_glucokinase, glucokinase_not_ss_200_df_t1)
write_data_file("glucokinase_MLformat_ss_200_train_50_test_turns2", protein_seq_glucokinase, glucokinase_ss_200_df_t2)
write_data_file("glucokinase_MLformat_not_ss_200_train_50_test_turns2", protein_seq_glucokinase, glucokinase_not_ss_200_df_t2)
write_data_file("glucokinase_MLformat_ss_200_train_50_test_turns3", protein_seq_glucokinase, glucokinase_ss_200_df_t3)
write_data_file("glucokinase_MLformat_not_ss_200_train_50_test_turns3", protein_seq_glucokinase, glucokinase_not_ss_200_df_t3)

# write_data_file("glucokinase_MLformat_ss_50_train_50_test_t1", protein_seq_glucokinase, glucokinase_ss_50_df_t1)
# write_data_file("glucokinase_MLformat_not_ss_50_train_50_test_t1", protein_seq_glucokinase, glucokinase_not_ss_50_df_t1)
# write_data_file("glucokinase_MLformat_ss_50_train_50_test_t2", protein_seq_glucokinase, glucokinase_ss_50_df_t2)
# write_data_file("glucokinase_MLformat_not_ss_50_train_50_test_t2", protein_seq_glucokinase, glucokinase_not_ss_50_df_t2)
# write_data_file("glucokinase_MLformat_ss_50_train_50_test_t3", protein_seq_glucokinase, glucokinase_ss_50_df_t3)
# write_data_file("glucokinase_MLformat_not_ss_50_train_50_test_t3", protein_seq_glucokinase, glucokinase_not_ss_50_df_t3)

Filename: glucokinase_MLformat_ss_200_train_50_test_turns1.txt
Filename: glucokinase_MLformat_not_ss_200_train_50_test_turns1.txt
Filename: glucokinase_MLformat_ss_200_train_50_test_turns2.txt
Filename: glucokinase_MLformat_not_ss_200_train_50_test_turns2.txt
Filename: glucokinase_MLformat_ss_200_train_50_test_turns3.txt
Filename: glucokinase_MLformat_not_ss_200_train_50_test_turns3.txt
