# Formatting Protein Data for Network

This notebook formats the protein data from Gelman et al. into a txt file that can be processed by the machine learning script.

In [8]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide

In [9]:
# notebook display limits: pandas shows up to this many rows / columns before truncating
max_rows, max_cols = 1000, 1000
pd.set_option("display.max_rows", max_rows)
pd.set_option("display.max_columns", max_cols)

### Methods Used to Format Data

Formatting protein sequence into form for machine learning:

In [1]:
def get_protein_seq(uniprot_id):
    """Download a protein sequence from UniProt and return it in 3-letter form.

    Parameters:
        uniprot_id - string representing the UniProt id (accession) of the
                     desired protein, e.g. "P04147".

    Returns:
        The protein sequence as produced by get_expanded_seq: 3-letter residue
        codes separated by single spaces, e.g. "MET ALA ASP ILE THR".

    Raises:
        requests.HTTPError - UniProt answered with an error status
                             (e.g. unknown accession).
        ValueError         - the response did not contain any FASTA record.
    """
    # FASTA download approach adapted from StackOverflow:
    # https://stackoverflow.com/questions/52569622/protein-sequence-from-uniprot-protein-id-python
    url = "https://rest.uniprot.org/uniprotkb/"
    complete_url = url + uniprot_id + ".fasta"

    # a plain GET is the right verb for a download; the timeout keeps a stalled
    # connection from hanging the notebook forever
    response = requests.get(complete_url, timeout=30)
    response.raise_for_status()

    records = list(SeqIO.parse(StringIO(response.text), "fasta"))
    if not records:
        raise ValueError("no FASTA record returned for UniProt id " + repr(uniprot_id))

    # protein sequence as string (single-letter amino acids)
    string_seq = str(records[0].seq)

    # protein sequence w/ three-letter convention
    return get_expanded_seq(string_seq)

Expanding protein sequence (1 letter AA -> 3 letter AA):

In [2]:
# 1-letter -> 3-letter codes for the 20 standard amino acids.
# Kept local so the conversion does not depend on Bio.PDB.Polypeptide.one_to_three
# NOTE(review): newer Biopython releases reportedly no longer ship that helper -
# confirm against the installed version.
_ONE_TO_THREE = {
    "A": "ALA", "C": "CYS", "D": "ASP", "E": "GLU", "F": "PHE",
    "G": "GLY", "H": "HIS", "I": "ILE", "K": "LYS", "L": "LEU",
    "M": "MET", "N": "ASN", "P": "PRO", "Q": "GLN", "R": "ARG",
    "S": "SER", "T": "THR", "V": "VAL", "W": "TRP", "Y": "TYR",
}


def get_expanded_seq(seq):
    """Convert a 1-letter protein sequence into the 3-letter convention.

    Parameters:
        seq - string representing a protein sequence in 1-letter convention
              (upper-case standard amino acids only).

    Returns:
        String with one 3-letter code per residue, separated by single spaces.
        E.g. "ADE" -> "ALA ASP GLU". An empty input gives an empty string.

    Raises:
        KeyError - if seq contains a character that is not one of the 20
                   standard 1-letter codes.
    """
    expanded_list = []
    for position, letter in enumerate(seq):
        if letter not in _ONE_TO_THREE:
            raise KeyError(
                "unknown 1-letter amino acid code "
                + repr(letter)
                + " at index "
                + str(position)
            )
        expanded_list.append(_ONE_TO_THREE[letter])
    return " ".join(expanded_list)

Returning index range of protein domain within protein:

In [3]:
def get_index_range(full_protein_split, domain_split):
    """Locate a domain inside a full protein sequence.

    Parameters:
        full_protein_split - list of amino acids in full protein in 3 letter
                             convention. E.g. ["ALA", "GLY", "TYR"]
        domain_split       - list of amino acids in protein domain in 3 letter
                             convention. E.g. ["ALA", "GLY"]

    Prints the list of (start, end) index pairs at which the domain occurs in
    the protein and also returns that list. The starting value is inclusive
    and the ending value is exclusive. E.g. [(0, 2)]

    An empty domain yields an empty list (a zero-length slice would otherwise
    compare equal at every position).
    """
    domain_length = len(domain_split)
    indexes = []
    if domain_length > 0:
        # the last possible start leaves exactly domain_length residues
        last_start = len(full_protein_split) - domain_length
        for start in range(last_start + 1):
            end = start + domain_length
            if full_protein_split[start:end] == domain_split:
                indexes.append((start, end))
    print(indexes)
    return indexes

Get variant in mutation-position form from wild-type-position-mutation form: (E.g. G126A -> 126ALA)

In [4]:
def _extract_variant_part(split_mutation_column, pick):
    """Shared worker for get_wild_type, get_mutation_type and get_position.

    Parameters:
        split_mutation_column - iterable whose entries are lists of mutation
                                names (a variant string split on commas),
                                e.g. ["G126A", " D127E"].
        pick                  - function taking one stripped mutation name and
                                returning the piece of it to keep.

    Returns a list with one item per entry:
        * "wild-type" when the first name contains "wild-type",
        * np.nan when the entry is missing (None / NaN), empty, or its first
          name contains a "-",
        * otherwise the picked pieces of every name joined with ",".
    """
    results = []
    for mutations in split_mutation_column:
        # check for missing / empty entries BEFORE indexing into them
        if mutations is None or isinstance(mutations, float) or len(mutations) == 0:
            results.append(np.nan)
            continue

        first_name = mutations[0]
        if "wild-type" in first_name:
            # must be tested before the "-" rule: "wild-type" itself contains "-"
            results.append("wild-type")
        elif "-" in first_name:
            results.append(np.nan)
        else:
            stripped = (name.strip() for name in mutations)
            # blank names (e.g. from a trailing comma) carry no mutation
            pieces = [pick(name) for name in stripped if name]
            results.append(",".join(pieces) if pieces else np.nan)
    return results


# parameter:
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with wild-type residue (first letter) from variant.
# E.g. [["G126A", "D127E"]] -> ["G,D"]. Wild-type rows give "wild-type" (same
# marker as get_mutation_type / get_position); unparseable rows give np.nan.
def get_wild_type(split_mutation_column):
    return _extract_variant_part(split_mutation_column, lambda name: name[0])


# parameter:
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with mutation residue (last letter) from variant.
# E.g. [["G126A", "D127E"]] -> ["A,E"]. Wild-type rows give "wild-type";
# unparseable rows give np.nan.
def get_mutation_type(split_mutation_column):
    return _extract_variant_part(split_mutation_column, lambda name: name[-1])


# parameter:
#      "split_mutation_column" - list of mutations, split by comma if there are multiple.
# This method returns a list with the position of mutation (number) from variant.
# E.g. [["G126A", "D127E"]] -> ["126,127"]. Wild-type rows give "wild-type";
# unparseable rows give np.nan.
def get_position(split_mutation_column):
    return _extract_variant_part(split_mutation_column, lambda name: name[1:-1])

def get_mutations_names_list(df):
    """Build variant names in mutation-position form (e.g. G126A -> 126ALA).

    Parameter:
        df - dataframe of protein data with "MUTATED_RES" and "POSITION"
             columns. Each value is either "wild-type", a comma-separated
             string (e.g. "A,E" and "126,127"), or NaN for rows that could not
             be parsed.

    Returns:
        List with one item per row: "WT" for wild-type rows, the reformatted
        variant (multiple mutations joined with ", ", e.g. "126ALA, 127GLU")
        otherwise, and np.nan for rows whose inputs are not strings so the
        caller can drop them.

    Raises:
        ValueError - a row has a different number of mutations and positions,
                     a mutation is not a single letter, or a position is not
                     an integer.
    """
    formatted_list = []
    for mutation, position in zip(df["MUTATED_RES"], df["POSITION"]):
        # NaN markers are floats, not strings; pass them through untouched
        if not isinstance(mutation, str) or not isinstance(position, str):
            formatted_list.append(np.nan)
            continue

        split_mutations = mutation.split(",")
        split_positions = position.split(",")
        if "wild-type" in split_mutations[0].lower() or "wild-type" in split_positions[0].lower():
            formatted_list.append("WT")
            continue

        # zip() would silently drop the surplus items of the longer list
        if len(split_mutations) != len(split_positions):
            raise ValueError(
                "mismatched mutation/position counts: "
                + repr(mutation) + " vs " + repr(position)
            )

        expanded_abbv = []
        for mut, pos in zip(split_mutations, split_positions):
            one_letter = mut.strip().upper()
            if len(one_letter) != 1:
                raise ValueError("expected a single-letter residue, got " + repr(mut))
            # same 1-letter -> 3-letter conversion the sequences go through
            three_letter_mut = get_expanded_seq(one_letter)
            expanded_abbv.append(str(int(pos)) + three_letter_mut)
        formatted_list.append(", ".join(expanded_abbv))
    return formatted_list

Writing formatted data to txt file:

In [5]:
def write_data_file(txt_name, protein_seq, df, out_dir="../ML Script Data Files/"):
    """Write the protein sequence and variant scores to a txt file for the network.

    Parameters:
        txt_name    - desired name of formatted txt file for network, without
                      extension. E.g. "pab1"
        protein_seq - string of protein sequence in 3 letter convention.
                      E.g. ALA GLU TYR
        df          - dataframe with cleaned protein data. Must contain
                      "variant" and "score" columns.
        out_dir     - directory the file is written to; created if missing.
                      Defaults to the original hard-coded location.

    File layout: first line is the protein sequence, then one
    "<variant>: <score>" line per row, with no trailing newline after the last
    row. An empty dataframe produces a file holding only the sequence line.

    It also prints the name of the file out for reference.
    """
    file_name = txt_name + ".txt"
    path_name = os.path.join(out_dir, file_name)
    print("Filename: " + file_name)

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    variants = df["variant"]
    scores = df["score"]
    rows = [
        variants.iloc[index] + ": " + str(scores.iloc[index])
        for index in range(len(df))
    ]

    # the context manager closes the file even if a write fails part-way
    with open(path_name, "w") as datafile:
        datafile.write(protein_seq + "\n")
        datafile.write("\n".join(rows))

## Pab1

Formatting Pab1 data from _Gelman et al._ into txt file for machine learning script.

In [13]:
# load the tab-separated pab1 table from Gelman et al. and keep only complete rows
pab1_df1 = pd.read_csv("../Raw Data/pab1.tsv.txt", delimiter="\t")
pab1_df = pab1_df1.dropna(axis=0, how="any")
print(pab1_df.shape[0])
print(pab1_df.columns)

40852
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [14]:
# rounding score column to 6 decimal points
# (.assign builds a new frame instead of writing into the dropna() result)
pab1_df = pab1_df.assign(score=pab1_df["score"].round(6))
print(len(pab1_df))

# remove variants that contain a "*"
# NOTE(review): presumably "*" marks a stop codon / nonsense mutation - confirm.
# regex=False matches the literal character, so no "\*" escape is needed
pab1_df = pab1_df[~pab1_df["variant"].str.contains("*", regex=False)]
print(len(pab1_df))
# change this value depending on amount of data needed for dataset
# (.copy() so later column assignments do not write into a slice)
pab1_df = pab1_df.head(37600).copy()
print(len(pab1_df))

40852
37710
37600


In [11]:
# fetch the full-length pab1 sequence from UniProt (3-letter codes),
# then break it into a list with one entry per residue
protein_seq_pab1 = get_protein_seq(uniprot_id="P04147")
protein_seq_pab1_split = protein_seq_pab1.split()
print(len(protein_seq_pab1_split)) # protein length of 577

577


In [7]:
# pab1 domain from Gelman et al., written in 1-letter code
string_seq = "GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAP"
print(len(string_seq)) # <- domain length of 75
# expand to 3-letter codes; the per-residue list is used for the domain lookup
pab1_domain = get_expanded_seq(seq=string_seq)
pab1_domain_split = pab1_domain.split()
print(pab1_domain)

75
GLY ASN ILE PHE ILE LYS ASN LEU HIS PRO ASP ILE ASP ASN LYS ALA LEU TYR ASP THR PHE SER VAL PHE GLY ASP ILE LEU SER SER LYS ILE ALA THR ASP GLU ASN GLY LYS SER LYS GLY PHE GLY PHE VAL HIS PHE GLU GLU GLU GLY ALA ALA LYS GLU ALA ILE ASP ALA LEU ASN GLY MET LEU LEU ASN GLY GLN GLU ILE TYR VAL ALA PRO


In [13]:
# where does the domain sit inside the full protein? (start inclusive, end exclusive)
# the function prints the ranges itself; the trailing ";" keeps the cell from echoing anything else
get_index_range(full_protein_split=protein_seq_pab1_split, domain_split=pab1_domain_split);

[(125, 200)]


In [14]:
# splitting variant list if there are multiple mutations
pab1_mut = pab1_df["variant"].str.split(",")

# mutated residue and position go into a scratch frame, so no temporary columns
# are written onto (and later dropped from) the data frame itself; the
# wild-type residue is not needed to build the new variant name
pab1_parts = pd.DataFrame(
    {
        "MUTATED_RES": get_mutation_type(pab1_mut),
        "POSITION": get_position(pab1_mut),
    },
    index=pab1_df.index,
)

# replace variant column with reformatted variant name (e.g. G126A -> 126ALA)
pab1_df = pab1_df.assign(variant=get_mutations_names_list(pab1_parts))

In [34]:
# save the formatted pab1 data (sequence line + one "variant: score" line per row)
write_data_file(
    txt_name="pab1_MLformat_37600",
    protein_seq=protein_seq_pab1,
    df=pab1_df,
)

Filename: pab1_MLformat_37600.txt


## Ube4b

Formatting Ube4b data from _Gelman et al._ into txt file for machine learning script.

In [15]:
# load the tab-separated Ube4b table from Gelman et al. and keep only complete rows
ube4b_df1 = pd.read_csv("../Raw Data/ube4b.tsv.txt", delimiter="\t")
ube4b_df = ube4b_df1.dropna(axis=0, how="any")
print(ube4b_df.shape[0])
print(ube4b_df.columns)

98297
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [16]:
# rounding score column to 6 decimal points
# (.assign builds a new frame instead of writing into the dropna() result)
ube4b_df = ube4b_df.assign(score=ube4b_df["score"].round(6))

# remove variants that contain a "*"
# NOTE(review): presumably "*" marks a stop codon / nonsense mutation - confirm.
# regex=False matches the literal character, so no "\*" escape is needed
ube4b_df = ube4b_df[~ube4b_df["variant"].str.contains("*", regex=False)]
print(len(ube4b_df))
# change this value depending on amount of data needed for dataset
# (.copy() so later column assignments do not write into a slice)
ube4b_df = ube4b_df.head(80).copy()
print(len(ube4b_df))

91031
80


In [41]:
# fetch the full-length Ube4b sequence from UniProt (3-letter codes),
# then break it into a list with one entry per residue
protein_seq_ube4b = get_protein_seq(uniprot_id="Q9ES00")
protein_seq_ube4b_split = protein_seq_ube4b.split()
print(len(protein_seq_ube4b_split)) # protein length of 1173

1173


In [42]:
# Ube4b domain from Gelman et al., written in 1-letter code
string_seq = "IEKFKLLAEKVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDPFNRQMLTESMLEPVPELKEQIQAWMREKQSSDH"
print(len(string_seq)) # <- domain length of 102
# expand to 3-letter codes; the per-residue list is used for the domain lookup
ube4b_domain = get_expanded_seq(seq=string_seq)
ube4b_domain_split = ube4b_domain.split()

# NOTE (original author) - index in list corresponds exactly to location in domain

102


In [43]:
# where does the domain sit inside the full protein? (start inclusive, end exclusive)
# the function prints the ranges itself; the trailing ";" keeps the cell from echoing anything else
get_index_range(full_protein_split=protein_seq_ube4b_split, domain_split=ube4b_domain_split);

[(1071, 1173)]


In [44]:
# splitting variant list if there are multiple mutations
ube4b_mut = ube4b_df["variant"].str.split(",")

# mutated residue and position go into a scratch frame, so no temporary columns
# are written onto (and later dropped from) the data frame itself; the
# wild-type residue is not needed to build the new variant name
ube4b_parts = pd.DataFrame(
    {
        "MUTATED_RES": get_mutation_type(ube4b_mut),
        "POSITION": get_position(ube4b_mut),
    },
    index=ube4b_df.index,
)

# replace variant column with reformatted variant name (e.g. G126A -> 126ALA)
ube4b_df = ube4b_df.assign(variant=get_mutations_names_list(ube4b_parts))

In [45]:
# save the formatted Ube4b data (sequence line + one "variant: score" line per row)
write_data_file(
    txt_name="ube4b_MLformat_80",
    protein_seq=protein_seq_ube4b,
    df=ube4b_df,
)

Filename: ube4b_MLformat_80.txt


## Bgl3

Formatting Bgl3 data from _Gelman et al._ into txt file for machine learning script.

In [17]:
# load the tab-separated Bgl3 table from Gelman et al. and keep only complete rows
bgl3_df1 = pd.read_csv("../Raw Data/bgl3.tsv.txt", delimiter="\t")
bgl3_df = bgl3_df1.dropna(axis=0, how="any")
print(bgl3_df.shape[0])
print(bgl3_df.columns)

26653
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [18]:
# rounding score column to 6 decimal points
# (.assign builds a new frame instead of writing into the dropna() result)
bgl3_df = bgl3_df.assign(score=bgl3_df["score"].round(6))
print(len(bgl3_df))

# remove variants that contain a "*"
# NOTE(review): presumably "*" marks a stop codon / nonsense mutation - confirm.
# regex=False matches the literal character, so no "\*" escape is needed
bgl3_df = bgl3_df[~bgl3_df["variant"].str.contains("*", regex=False)]
print(len(bgl3_df))
# change this value depending on amount of data needed for dataset
# (.copy() so later column assignments do not write into a slice)
bgl3_df = bgl3_df.head(25600).copy()
print(len(bgl3_df))

26653
25737
25600


In [6]:
# NOTE - no protein domain for bgl3: the sequence below is used whole
# Bgl3 sequence from Gelman et al. (1-letter code), expanded to 3-letter codes
string_seq = "MVPAAQQTAMAPDAALTFPEGFLWGSATASYQIEGAAAEDGRTPSIWDTYARTPGRVRNGDTGDVATDHYHRWREDVALMAELGLGAYRFSLAWPRIQPTGRGPALQKGLDFYRRLADELLAKGIQPVATLYHWDLPQELENAGGWPERATAERFAEYAAIAADALGDRVKTWTTLNEPWCSAFLGYGSGVHAPGRTDPVAALRAAHHLNLGHGLAVQALRDRLPADAQCSVTLNIHHVRPLTDSDADADAVRRIDALANRVFTGPMLQGAYPEDLVKDTAGLTDWSFVRDGDLRLAHQKLDFLGVNYYSPTLVSEADGSGTHNSDGHGRSAHSPWPGADRVAFHQPPGETTAMGWAVDPSGLYELLRRLSSDFPALPLVITENGAAFHDYADPEGNVNDPERIAYVRDHLAAVHRAIKDGSDVRGYFLWSLLDNFEWAHGYSKRFGAVYVDYPTGTRIPKASARWYAEVARTGVLPTAGDPNSSSVDKLAAALEHHHHHH"
protein_seq_bgl3 = get_expanded_seq(seq=string_seq)
print(protein_seq_bgl3)

MET VAL PRO ALA ALA GLN GLN THR ALA MET ALA PRO ASP ALA ALA LEU THR PHE PRO GLU GLY PHE LEU TRP GLY SER ALA THR ALA SER TYR GLN ILE GLU GLY ALA ALA ALA GLU ASP GLY ARG THR PRO SER ILE TRP ASP THR TYR ALA ARG THR PRO GLY ARG VAL ARG ASN GLY ASP THR GLY ASP VAL ALA THR ASP HIS TYR HIS ARG TRP ARG GLU ASP VAL ALA LEU MET ALA GLU LEU GLY LEU GLY ALA TYR ARG PHE SER LEU ALA TRP PRO ARG ILE GLN PRO THR GLY ARG GLY PRO ALA LEU GLN LYS GLY LEU ASP PHE TYR ARG ARG LEU ALA ASP GLU LEU LEU ALA LYS GLY ILE GLN PRO VAL ALA THR LEU TYR HIS TRP ASP LEU PRO GLN GLU LEU GLU ASN ALA GLY GLY TRP PRO GLU ARG ALA THR ALA GLU ARG PHE ALA GLU TYR ALA ALA ILE ALA ALA ASP ALA LEU GLY ASP ARG VAL LYS THR TRP THR THR LEU ASN GLU PRO TRP CYS SER ALA PHE LEU GLY TYR GLY SER GLY VAL HIS ALA PRO GLY ARG THR ASP PRO VAL ALA ALA LEU ARG ALA ALA HIS HIS LEU ASN LEU GLY HIS GLY LEU ALA VAL GLN ALA LEU ARG ASP ARG LEU PRO ALA ASP ALA GLN CYS SER VAL THR LEU ASN ILE HIS HIS VAL ARG PRO LEU THR ASP SER ASP ALA ASP ALA ASP 

In [31]:
# splitting variant list if there are multiple mutations
bgl3_mut = bgl3_df["variant"].str.split(",")

# mutated residue and position go into a scratch frame, so no temporary columns
# are written onto (and later dropped from) the data frame itself; the
# wild-type residue is not needed to build the new variant name
bgl3_parts = pd.DataFrame(
    {
        "MUTATED_RES": get_mutation_type(bgl3_mut),
        "POSITION": get_position(bgl3_mut),
    },
    index=bgl3_df.index,
)

# replace variant column with reformatted variant name (e.g. G126A -> 126ALA)
bgl3_df = bgl3_df.assign(variant=get_mutations_names_list(bgl3_parts))

In [38]:
# save the formatted Bgl3 data (sequence line + one "variant: score" line per row)
# NOTE(review): the file name says "all", but the cleaning cell keeps only the
# first 25600 rows via head(25600) - confirm which one is intended
write_data_file(
    txt_name="bgl3_MLformat_all",
    protein_seq=protein_seq_bgl3,
    df=bgl3_df,
)

Filename: bgl3_MLformat_all.txt
