In [265]:
import os
import numpy as np
import pandas as pd

# Display limits handed to pandas for both rows and columns (None for each).
max_rows = max_cols = None
pd.set_option("display.max_rows", max_rows)
pd.set_option("display.max_columns", max_cols)

# Read the combined dataset: "all_data.csv" inside the "DBdata" directory,
# resolved relative to the current working directory.
all_data_path = os.path.join("DBdata", "all_data.csv")
all_data_df = pd.read_csv(all_data_path)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [262]:
import requests as r
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide


# Protein (UniProt accession) and dataset column this run is built around.
uniprot_id = "P00644"
parameter = "∆∆G_H2O_(kcal/mol)"

# importing fasta file from uniprot.org and getting protein sequence
cID = uniprot_id

# NOTE(review): this is UniProt's older URL scheme; the current REST API is
# believed to live on rest.uniprot.org -- confirm the redirect still works or
# update the base URL.
baseUrl = "http://www.uniprot.org/uniprot/"
currentUrl = baseUrl + cID + ".fasta"

# Fetching a record is a read-only operation, so use GET rather than POST.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = r.get(currentUrl, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as FASTA.
response.raise_for_status()

cData = response.text
Seq = StringIO(cData)
pSeq = list(SeqIO.parse(Seq, 'fasta'))
if not pSeq:
    # Without this guard an empty response surfaces as an opaque IndexError below.
    raise ValueError("No FASTA record returned for UniProt ID " + repr(cID))

# protein sequence as string (first record of the downloaded FASTA)
string_seq = str(pSeq[0].seq)

# takes 1-letter sequence and formats into spaced 3-letter protein sequence
def get_expanded_seq(seq):
    """Convert a one-letter amino-acid sequence into spaced three-letter codes.

    Parameters
    ----------
    seq : str
        Sequence of one-letter residue codes; each character is looked up
        individually and exactly as given (no case conversion is applied).

    Returns
    -------
    str
        The three-letter codes joined by single spaces, in input order.
        An empty input yields an empty string.

    Raises
    ------
    Whatever Biopython's lookup raises for a character it does not recognise.
    """
    polypeptide = Bio.PDB.Polypeptide
    # Go through one_to_index/index_to_three rather than one_to_three: the
    # latter is no longer available in recent Biopython releases, while these
    # two helpers exist in both old and new versions.
    return " ".join(
        polypeptide.index_to_three(polypeptide.one_to_index(letter))
        for letter in seq
    )
    
# Spaced three-letter form of the downloaded sequence; written out as the
# first line of the output text file further down.
protein_seq = get_expanded_seq(string_seq)

# creates mini dataframe with POSITION, MUTATED_RES, and relevant parameter
# removes rows w/ NaN data for relevant parameter
def prepare_df(df, uniprot_id, parameter):
    """Extract the rows and columns needed for one protein and one parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain "UniProt_ID", "POSITION", "MUTATED_RES"
        and the column named by ``parameter``.
    uniprot_id : str
        Value matched against the "UniProt_ID" column.
    parameter : str
        Name of the measurement column to keep.

    Returns
    -------
    pandas.DataFrame
        New frame with columns POSITION, MUTATED_RES and ``parameter``,
        without rows missing any of the three, and with ``parameter``
        rounded to one decimal place. ``df`` itself is not modified.
    """
    wanted = ['POSITION', 'MUTATED_RES', parameter]
    is_protein = df["UniProt_ID"] == uniprot_id
    return (
        df.loc[is_protein, wanted]
        .dropna(subset=wanted)
        .assign(**{parameter: lambda frame: frame[parameter].round(1)})
    )

# Rows of the full dataset for the selected protein, reduced to POSITION,
# MUTATED_RES and the chosen parameter column (NaN rows removed, values rounded).
prot_df = prepare_df(all_data_df, uniprot_id, parameter)

# gets list of all values of relevant parameter (e.g Tm_(C))
def get_param_vals_list(df, parameter):
    """Return the values of column ``parameter`` from ``df`` as a plain list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    parameter : str
        Name of the column to read.

    Returns
    -------
    list
        The column's values in row order.
    """
    return list(df[parameter])

# gets three letter mutant residue and position, adds them, formats as a list if there are more than one
# (e.g 4LYS, 30ALA)
def get_mutations_names_list(df):
    """Build one display label per row from the MUTATED_RES and POSITION columns.

    Each row's "MUTATED_RES" and "POSITION" values are comma-separated strings
    that are paired up element by element. Every pair becomes
    ``<position><THREE-LETTER CODE>`` (e.g. ``4LYS``); multiple pairs in one
    row are joined with ", " (e.g. ``4LYS, 30ALA``). A row whose first
    residue or first position contains "wild-type" (case-insensitive) is
    labelled ``WT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns "MUTATED_RES" and "POSITION".

    Returns
    -------
    list of str
        One label per row, in row order.

    Raises
    ------
    ValueError
        If a position entry cannot be parsed by ``int``.
    Whatever Biopython's lookup raises for a residue letter it does not recognise.
    """
    polypeptide = Bio.PDB.Polypeptide
    formatted_list = []
    for mutation, position in zip(df["MUTATED_RES"], df["POSITION"]):
        residues = mutation.split(",")
        sites = position.split(",")

        if "wild-type" in residues[0].lower() or "wild-type" in sites[0].lower():
            formatted_list.append("WT")
            continue

        # A fresh list per row replaces the shared scratch list that was
        # cleared after every row in the earlier version.
        labels = []
        # zip pairs residues with positions and stops at the shorter list.
        for residue, site in zip(residues, sites):
            # Strip before the lookup so entries like "K, A" work; positions
            # already tolerate surrounding whitespace through int().
            letter = residue.strip().upper()
            # one_to_index/index_to_three instead of one_to_three: the latter
            # is no longer available in recent Biopython releases.
            three_letter = polypeptide.index_to_three(polypeptide.one_to_index(letter))
            labels.append(str(int(site)) + three_letter)
        formatted_list.append(", ".join(labels))
    return formatted_list

# merges rows with duplicate mutations and averages values // removes rows w/ 0 parameter value
# rounds parameter data to 1 decimal point
def merge_dups(mutation_list, param_list):
    """Average the parameter over duplicate mutation labels and drop zero results.

    Parameters
    ----------
    mutation_list : list
        Mutation labels, one per measurement.
    param_list : list
        Parameter values paired position-by-position with ``mutation_list``.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The "Mutation" labels and their mean "Parameter" values rounded to
        one decimal place. Labels whose rounded mean equals 0 are excluded,
        and both Series are indexed 0..n-1.
    """
    pairs = list(zip(mutation_list, param_list))
    averaged = (
        pd.DataFrame(pairs, columns=["Mutation", "Parameter"])
        .groupby("Mutation", as_index=False)["Parameter"]
        .agg("mean")
        .round(1)
    )
    # The zero filter runs on the already-rounded means.
    kept = averaged.loc[averaged["Parameter"] != 0].reset_index(drop=True)
    return kept["Mutation"], kept["Parameter"]

# Collect the per-row parameter values and mutation labels for the selected
# protein, then merge duplicate mutations into a single averaged entry each.
parameters_list = get_param_vals_list(prot_df, parameter)
mutations_list = get_mutations_names_list(prot_df)
mutations, parameters = merge_dups(mutations_list, parameters_list)
print(f"Number of Rows: {len(mutations)}")

# writing to txt file

# Maps dataset column names to the short tags used in output file names.
colname_dict = {"Tm_(C)": "Tm",
            "∆Tm_(C)": "dTm",
            "∆H_(kcal/mol)": "dH",
            "∆G_(kcal/mol)": "dG",
            "∆∆G_(kcal/mol)": "ddG",
            "∆∆G_H2O_(kcal/mol)": "ddG_H2O"
}

# Substitute the longest keys first. Some keys contain shorter ones
# ("Tm_(C)" inside "∆Tm_(C)", "∆G_(kcal/mol)" inside "∆∆G_(kcal/mol)"), so
# replacing in dict order would turn "∆Tm_(C)" into "∆Tm" and
# "∆∆G_(kcal/mol)" into "∆dG" instead of the intended "dTm" / "ddG".
# `parameter` is rebound to the short tag, as in the earlier version.
for key in sorted(colname_dict, key=len, reverse=True):
    parameter = parameter.replace(key, colname_dict[key])

file_name = uniprot_id + "_" + parameter + ".txt"
print("Filename: " + file_name)

# One "<mutation>: <value>" record per line. Joining with "\n" keeps the file
# free of a trailing newline and also copes with an empty result, where
# indexing the last element used to fail.
records = [
    mutation + ": " + str(value)
    for mutation, value in zip(mutations, parameters)
]

# The context manager closes the file even if a write raises.
with open(file_name, "w", encoding="utf-8") as datafile:
    datafile.write(protein_seq + "\n")
    datafile.write("\n".join(records))

Number of Rows: 756
Filename: P00644_ddG_H2O.txt
