# Datasets for Other Proteins

This notebook formats data for the additional protein data used.

In [171]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide
import re

In [172]:
import secStrucFormatting as ssf

In [309]:
# setting jupyter notebook viewing options
max_rows = 1000
max_cols = 1000
pd.set_option("display.max_rows", max_rows, "display.max_columns", max_cols)

### Metadata from compiled dataset

In [173]:
path = "../Raw Data/" + 'all_data_clean.csv'
df = pd.read_csv(path)
# print(df.head)
print(df.columns)

print(df["PROTEIN"].value_counts().head(10))
print(df.loc[df['PROTEIN'] == 'Lysozyme'])

Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'ddG_(kcal/mol)',
       'ddG_H2O_(kcal/mol)', 'STATE', 'REVERSIBILITY', 'PUBMED_ID',
       'REFERENCE', 'MUTATED_CHAIN', 'KINGDOM', 'PBD_MUTANT', 'MEASURE',
       'METHOD', 'POSITION', 'WILD_TYPE_RES', 'MUTATED_RES', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS'],
      dtype='object')
Lysozyme                                                   2897
Immunoglobulin G-binding protein G                         1996
Thermonuclease                                             1586
Staphylococcal nuclease                                    1457
Endolysin                                                  1110
Ribonuclease                                                983
Ribonuclease HI                                             710
Guanine nucleotide-binding protein G(i) subunit alpha-1     704
Myo

In [174]:
print(df["UNIPROT_ID"].value_counts().head(10)) # 4 possible could be used?

P00644    3033
P00720    2767
P06654    2297
P61626    1146
P00648     981
P00651     904
P0A7Y4     722
P63096     698
P00044     546
P00698     491
Name: UNIPROT_ID, dtype: int64


#### P00644 (Thermonuclease)

In [199]:
# finding column with most values

# print(df.loc[df['UNIPROT_ID'] == 'P00644'])
nuclease_df = df.loc[df['UNIPROT_ID'] == 'P00644']
print(nuclease_df["PROTEIN"].value_counts())
print(len(nuclease_df))
nuclease_df.count() 
# using ddG_

Thermonuclease                              1586
Staphylococcal nuclease                     1445
Thiol:disulfide interchange protein DsbA       2
Name: PROTEIN, dtype: int64
3033


DATABASE              3033
PROTEIN               3033
UNIPROT_ID            3033
MUTATION              2901
SOURCE                2901
PBD_WILD              3031
PBD_CHAIN_MUTATION    1360
pH                    3031
T_(C)                 2577
Tm_(C)                 380
dTm_(C)                585
dH_(kcal/mol)           48
dG_(kcal/mol)           43
ddG_(kcal/mol)        1282
ddG_H2O_(kcal/mol)    1068
STATE                   12
REVERSIBILITY         1450
PUBMED_ID             3006
REFERENCE             2901
MUTATED_CHAIN         2943
KINGDOM               1451
PBD_MUTANT              23
MEASURE               1556
METHOD                1556
POSITION              2901
WILD_TYPE_RES         2901
MUTATED_RES           2901
IS_CURATED             132
CONSERVATION           131
NOTES                   24
DATASETS                29
dtype: int64

In [7]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'thermonuclease_stride.txt'
nuclease_stride_file = open(path, 'r')

In [8]:
nuclease_ss_indexes = ssf.get_sec_struc_boolean(nuclease_stride_file) # boolean list of secondary structure assignements

In [9]:
ss = nuclease_ss_indexes.count(True)
not_ss = nuclease_ss_indexes.count(False)
print(ss)
print(not_ss)

144
87


#### P00720 (Endolysin)

In [198]:
endolysin_df = df.loc[df['UNIPROT_ID'] == 'P00720']
endolysin_df = endolysin_df.protein
print(len(endolysin_df))
endolysin_df.count()
print(endolysin_df["PROTEIN"].value_counts())

2767
Lysozyme                                     1637
Endolysin                                    1101
T4 lysozyme                                    20
N,O-diacetylmuramidase                          2
Beta-galactosidase                              2
Invertase 2                                     1
Spanin, inner membrane subunit                  1
Cytosol aminopeptidase                          1
Thermophilic aminopeptidase 1 alpha chain       1
Carboxypeptidase Y                              1
Name: PROTEIN, dtype: int64


In [11]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'endolysin_stride.txt'
endolysin_stride_file = open(path, 'r')

In [12]:
endolysin_ss_indexes = ssf.get_sec_struc_boolean(endolysin_stride_file) # boolean list of secondary structure assignements

In [13]:
ss = endolysin_ss_indexes.count(True)
not_ss = endolysin_ss_indexes.count(False)
print(ss)
print(not_ss)

122
41


#### P06654 (Immunoglobulin G-binding protein G)

In [200]:
protein_G_df = df.loc[df['UNIPROT_ID'] == 'P06654']
print(protein_G_df["PROTEIN"].value_counts())
# print(protein_G_df.head)
print(len(protein_G_df))
protein_G_df.count()
# print(protein_G_df.head)
protein_G_df = protein_G_df[protein_G_df["ddG_(kcal/mol)"].notna()]
print(len(protein_G_df))

Immunoglobulin G-binding protein G    1995
Protein G                              285
Single Domain Antibody                  17
Name: PROTEIN, dtype: int64
2297
1221


In [197]:
print(protein_G_df["MUTATION"].head(20))

30744     Q32V
30558     F52H
30146     L05A
24563      I6A
30136     Y03S
30699     N37V
30490     T53D
30378    D262A
30705     D36I
30643     E42N
30315     V29F
30567     F52S
30410     G41D
30594     A48N
30868     A20H
30255     T18D
30230     T16I
31020    K230T
30461     T49Q
30118     T02N
Name: MUTATION, dtype: object


In [176]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'protein_G_stride.txt'
protein_G_stride_file = open(path, 'r')

In [177]:
protein_G_ss_indexes = ssf.get_sec_struc_boolean(protein_G_stride_file) # boolean list of secondary structure assignements

In [178]:
# getting protein sequence
string_seq = "MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRNGGELTNLLGNSETTLALRNEESATADLTAAAVADTVAAAAAENAGAAAWEAAAAADALAKAKADALKEFNKYGVSDYYKNLINNAKTVEGIKDLQAQVVESAKKARISEATDGLSDFLKSQTPAEDTVKSIELAEAKVLANRELDKYGVSDYHKNLINNAKTVEGVKELIDEILAALPKTDTYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEKPEVIDASELTPAVTTYKLVINGKTLKGETTTKAVDAETAEKAFKQYANDNGVDGVWTYDDATKTFTVTEMVTEVPGDAPTEPEKPEASIPLVPLTPATPIAKDDAKKDDTKKEDAKKPEAKKDDAKKAETLPTTGEGSNPFFTAAALAVMAGAGALAVASKRKED"

In [179]:
protein_seq_protein_G = ssf.get_expanded_seq(string_seq)
print(protein_seq_protein_G)

MET GLU LYS GLU LYS LYS VAL LYS TYR PHE LEU ARG LYS SER ALA PHE GLY LEU ALA SER VAL SER ALA ALA PHE LEU VAL GLY SER THR VAL PHE ALA VAL ASP SER PRO ILE GLU ASP THR PRO ILE ILE ARG ASN GLY GLY GLU LEU THR ASN LEU LEU GLY ASN SER GLU THR THR LEU ALA LEU ARG ASN GLU GLU SER ALA THR ALA ASP LEU THR ALA ALA ALA VAL ALA ASP THR VAL ALA ALA ALA ALA ALA GLU ASN ALA GLY ALA ALA ALA TRP GLU ALA ALA ALA ALA ALA ASP ALA LEU ALA LYS ALA LYS ALA ASP ALA LEU LYS GLU PHE ASN LYS TYR GLY VAL SER ASP TYR TYR LYS ASN LEU ILE ASN ASN ALA LYS THR VAL GLU GLY ILE LYS ASP LEU GLN ALA GLN VAL VAL GLU SER ALA LYS LYS ALA ARG ILE SER GLU ALA THR ASP GLY LEU SER ASP PHE LEU LYS SER GLN THR PRO ALA GLU ASP THR VAL LYS SER ILE GLU LEU ALA GLU ALA LYS VAL LEU ALA ASN ARG GLU LEU ASP LYS TYR GLY VAL SER ASP TYR HIS LYS ASN LEU ILE ASN ASN ALA LYS THR VAL GLU GLY VAL LYS GLU LEU ILE ASP GLU ILE LEU ALA ALA LEU PRO LYS THR ASP THR TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA 

In [194]:
protein_seq_protein_G_split = protein_seq_protein_G.split()
print(protein_seq_protein_G_split[34])

ASP


In [181]:
protein_G_df = protein_G_df[protein_G_df["POSITION"].str.contains("pga_A") == False]

protein_G_df["variant"] = ssf.get_mutations_names_list(protein_G_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# rename to score
protein_G_df.rename(columns = {'ddG_(kcal/mol)':'score'}, inplace = True)

protein_G_df = protein_G_df.drop(columns=to_drop)
protein_G_df = protein_G_df.sample(frac=1)
print(protein_G_df.columns)


Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'score', 'ddG_H2O_(kcal/mol)',
       'STATE', 'REVERSIBILITY', 'PUBMED_ID', 'REFERENCE', 'MUTATED_CHAIN',
       'KINGDOM', 'PBD_MUTANT', 'MEASURE', 'METHOD', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS', 'variant'],
      dtype='object')


KeyError: 'mutation'

In [284]:
# protein_G_df = protein_G_df.sample(n=320)

In [275]:
print(len(protein_G_df))

320


In [285]:
print(protein_G_df["score"])

30262   -0.950000
30473   -3.310000
21789    0.520000
30722    0.480000
24082    4.100000
30707    0.200000
30857   -1.350000
30832    0.440000
27182   -0.200000
31021   -0.270000
31001   -0.910000
30878    0.120000
15198   -0.860000
30515   -4.000000
30598    0.120000
27187   -3.500000
30373    0.040000
30553   -0.230000
30506   -2.670000
30575   -0.050000
30408   -1.280000
28478    1.075526
30955    0.250000
24119    1.500000
30821    0.550000
30529   -0.300000
30187    0.150000
30988   -2.070000
30543   -0.140000
24100    2.500000
30690   -0.420000
30306    0.570000
30422    0.520000
15197   -0.450000
13675   -0.300000
30465   -0.190000
10027    0.520000
30115    0.310000
22728   -2.820000
30541    0.020000
9945     2.430000
30772    0.310000
30705   -0.090000
30472   -1.100000
24091    3.140000
30250   -0.130000
30511    0.260000
30415    0.610000
30658   -4.000000
30796    0.260000
30106    0.270000
30517   -2.910000
30440   -2.490000
30612    0.900000
30531    0.340000
30406   -2

In [286]:
ssf.write_data_file("protein_G_MLformat_320", protein_seq_protein_G, protein_G_df)

Filename: protein_G_MLformat_320.txt


From Gelman et al.

**avGFP**

In [92]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'avgfp_stride.txt'
avgfp_stride_file = open(path, 'r')

In [93]:
avgfp_ss_indexes = ssf.get_sec_struc_boolean(avgfp_stride_file) # boolean list of secondary structure assignements

In [94]:
ss = avgfp_ss_indexes.count(True)
not_ss = avgfp_ss_indexes.count(False)
print(ss)
print(not_ss)

152
85


Formatting Data

In [95]:
# importing avGFP data from Gelman et al.
avgfp_df1 = pd.read_csv("../Raw Data/avgfp.tsv.txt", sep="\t")
avgfp_df = avgfp_df1.dropna()
print(len(avgfp_df))
print(avgfp_df.columns)

54024
Index(['variant', 'num_mutations', 'score', 'score_wt_norm'], dtype='object')


In [96]:
# rounding score column to 2 decimal points
avgfp_df["score"] = avgfp_df["score"].round(6)
print(len(avgfp_df))

# remove values with wildcard star thing cause idk what it means
avgfp_df = avgfp_df[avgfp_df["variant"].str.contains("\*") == False]

# pab1_df = pab1_df.head(37600)
# avgfp_df = avgfp_df.sample(n=160)
print(len(avgfp_df))

54024
51714


In [85]:
# getting dataset size to run

string_seq = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(len(string_seq)) 
protein_seq_avgfp = ssf.get_expanded_seq(string_seq)
print(protein_seq_avgfp)

237
SER LYS GLY GLU GLU LEU PHE THR GLY VAL VAL PRO ILE LEU VAL GLU LEU ASP GLY ASP VAL ASN GLY HIS LYS PHE SER VAL SER GLY GLU GLY GLU GLY ASP ALA THR TYR GLY LYS LEU THR LEU LYS PHE ILE CYS THR THR GLY LYS LEU PRO VAL PRO TRP PRO THR LEU VAL THR THR LEU SER TYR GLY VAL GLN CYS PHE SER ARG TYR PRO ASP HIS MET LYS GLN HIS ASP PHE PHE LYS SER ALA MET PRO GLU GLY TYR VAL GLN GLU ARG THR ILE PHE PHE LYS ASP ASP GLY ASN TYR LYS THR ARG ALA GLU VAL LYS PHE GLU GLY ASP THR LEU VAL ASN ARG ILE GLU LEU LYS GLY ILE ASP PHE LYS GLU ASP GLY ASN ILE LEU GLY HIS LYS LEU GLU TYR ASN TYR ASN SER HIS ASN VAL TYR ILE MET ALA ASP LYS GLN LYS ASN GLY ILE LYS VAL ASN PHE LYS ILE ARG HIS ASN ILE GLU ASP GLY SER VAL GLN LEU ALA ASP HIS TYR GLN GLN ASN THR PRO ILE GLY ASP GLY PRO VAL LEU LEU PRO ASP ASN HIS TYR LEU SER THR GLN SER ALA LEU SER LYS ASP PRO ASN GLU LYS ARG ASP HIS MET VAL LEU LEU GLU PHE VAL THR ALA ALA GLY ILE THR HIS GLY MET ASP GLU LEU TYR LYS


In [90]:
protein_seq_avgfp_split = protein_seq_avgfp.split()
print(len(protein_seq_avgfp_split))
print(protein_seq_avgfp_split[108])

237
ALA


In [97]:
# splitting variant list if there are multiple mutations
avgfp_mut = avgfp_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
avgfp_df["WILD_TYPE_RES"] = ssf.get_wild_type(avgfp_mut)

# get mutated residue and place in seperate col
avgfp_df["MUTATED_RES"] = ssf.get_mutation_type(avgfp_mut)

# get position and place in seperate col
avgfp_df["POSITION"] = ssf.get_position(avgfp_mut)

# replace variant column with reformatted variant name
avgfp_df["variant"] = ssf.get_mutations_names_list(avgfp_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# avgfp_df = avgfp_df.drop(columns=to_drop)

In [98]:

# need positionssplit
avgfp_df["positions_split"] = ssf.get_positions_split(avgfp_df)

# add in_sec_str_col
avgfp_df = add_sec_str_col(avgfp_df, avgfp_ss_indexes, 0)

In [102]:
avgfp_train_df, avgfp_new_test_df, avgfp_new_remaining_df = get_train_and_test_df(avgfp_remaining_df, 0.64, 465)

in fraction,df len31791
297
168
Train Data Fraction: 0.639
false_df len31788
true_df len3


ValueError: Cannot take a larger sample than population when 'replace=False'

In [132]:
avgfp_test_10000 = avgfp_test_df.head(10000)
avgfp_df_format = pd.concat([avgfp_train_df, avgfp_test_10000])
print(len(avgfp_df_format))
# writing data to txt file
ssf.write_data_file("avgfp_MLformat_465_train_10k_test_1", protein_seq_avgfp, avgfp_df_format)

10465
Filename: avgfp_MLformat_465_train_10k_test_1.txt


In [226]:
# writing data to txt file
ssf.write_data_file("avgfp_MLformat_160", protein_seq_avgfp, avgfp_df)

Filename: avgfp_MLformat_160.txt


**GB1**

In [115]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gb1_stride.txt'
gb1_stride_file = open(path, 'r')

In [108]:
gb1_ss_indexes = ssf.get_sec_struc_boolean(gb1_stride_file) # boolean list of secondary structure assignements

In [109]:
ss = gb1_ss_indexes.count(True)
not_ss = gb1_ss_indexes.count(False)
print(ss)
print(not_ss)

39
17


In [116]:
# importing pab1 data from Gelman et al.
gb1_df1 = pd.read_csv("../Raw Data/gb1.tsv.txt", sep="\t")
gb1_df = gb1_df1.dropna()
print(len(gb1_df))
# gb1_df = gb1_df.sample(n=480)
print(gb1_df.columns)
gb1_df = gb1_df.sample(frac=1)

536084
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [117]:
# rounding score column to 2 decimal points
gb1_df["score"] = gb1_df["score"].round(6)
print(len(gb1_df))

# remove values with wildcard star thing cause idk what it means
gb1_df = gb1_df[gb1_df["variant"].str.contains("\*") == False]

# gb1_df = gb1_df.sample(n=40)
# pab1_df = pab1_df.head(37600)
print(len(gb1_df))

536084
536084


In [183]:
# getting protein sequence
# protein_seq_gb1 = ssf.get_protein_seq("P04386")

In [112]:
# getting dataset size to run

string_seq = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
print(len(string_seq)) # <- domain length of 75
protein_seq_gb1 = ssf.get_expanded_seq(string_seq)
print(protein_seq_gb1)

56
MET GLN TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA THR ALA GLU LYS VAL PHE LYS GLN TYR ALA ASN ASP ASN GLY VAL ASP GLY GLU TRP THR TYR ASP ASP ALA THR LYS THR PHE THR VAL THR GLU


In [118]:
# splitting variant list if there are multiple mutations
gb1_mut = gb1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
gb1_df["WILD_TYPE_RES"] = ssf.get_wild_type(gb1_mut)

# get mutated residue and place in seperate col
gb1_df["MUTATED_RES"] = ssf.get_mutation_type(gb1_mut)

# get position and place in seperate col
gb1_df["POSITION"] = ssf.get_position(gb1_mut)

# replace variant column with reformatted variant name
gb1_df["variant"] = ssf.get_mutations_names_list(gb1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# gb1_df = gb1_df.drop(columns=to_drop)

In [120]:
# need positionssplit
gb1_df["positions_split"] = ssf.get_positions_split(gb1_df)

# add in_sec_str_col
gb1_df = add_sec_str_col(gb1_df, gb1_ss_indexes, 1)

In [133]:
gb1_train_df, gb1_test_df, gb1_remaining_df = get_train_and_test_df(gb1_df, 0.70, 110)
gb1_test_10000 = gb1_test_df.head(10000)
gb1_df_format = pd.concat([gb1_train_df, gb1_test_10000])
print(len(gb1_df_format))
# writing data to txt file
ssf.write_data_file("gb1_MLformat_110_train_10k_test_1", protein_seq_gb1, gb1_df_format)

in fraction,df len536084
77
33
Train Data Fraction: 0.7
false_df len268502
true_df len267582
in fraction,df len535974
Test Data Fraction: 0.7
false_df len268469
true_df len267505
Size of Test Dataset: 382146
Size of Total Dataset: 382256
10110
Filename: gb1_MLformat_110_train_10k_test_1.txt


In [219]:
# writing data to txt file
ssf.write_data_file("gb1_MLformat_40", protein_seq_gb1, gb1_df)

Filename: gb1_MLformat_40.txt


**GAL4**

In [77]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gal4_stride.txt'
gal4_stride_file = open(path, 'r')

In [78]:
gal4_ss_indexes = ssf.get_sec_struc_boolean(gal4_stride_file) # boolean list of secondary structure assignements

In [79]:
ss = gal4_ss_indexes.count(True)
not_ss = gal4_ss_indexes.count(False)
print(ss)
print(not_ss)

415
466


**Alpha-synuclein**

In [80]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'alpha-synuclein_stride.txt'
alpha_synuclein_stride_file = open(path, 'r')

In [81]:
alpha_synuclein_ss_indexes = ssf.get_sec_struc_boolean(alpha_synuclein_stride_file) # boolean list of secondary structure assignements

In [82]:
ss = alpha_synuclein_ss_indexes.count(True)
not_ss = alpha_synuclein_ss_indexes.count(False)
print(ss)
print(not_ss)

92
48


**Small ubiquitin-related modifier 1**

In [83]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path, 'r')

In [84]:
modifier_1_ss_indexes = ssf.get_sec_struc_boolean(modifier_1_stride_file) # boolean list of secondary structure assignements

In [85]:
ss = modifier_1_ss_indexes.count(True)
not_ss = modifier_1_ss_indexes.count(False)
print(ss)
print(not_ss)

47
54


**TAR DNA-binding protein 43**

In [86]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'tar_stride.txt'
tar_stride_file = open(path, 'r')

In [87]:
tar_ss_indexes = ssf.get_sec_struc_boolean(tar_stride_file) # boolean list of secondary structure assignements

In [88]:
ss = tar_ss_indexes.count(True)
not_ss = tar_ss_indexes.count(False)
print(ss)
print(not_ss)

149
265


**Human Glucokinase**

## Cleaning MaveDB Data

In [141]:
import re
# delete rows with "del" in it or *
# normalize scores? (already done in script)
def format_mavedb_variant(df, variant_col_name, offset):
    new_var_col = []
    for variant in df[variant_col_name]:
        wild_type = Bio.PDB.Polypeptide.three_to_one(variant[2:5].upper())
        position = int(re.findall("[0-9]+", variant)[0]) + offset
        mut_type = Bio.PDB.Polypeptide.three_to_one(variant[-3:].upper())
        new_var_col.append(wild_type + str(position) + mut_type)
    return new_var_col

**GAL4**

In [235]:
gal4_df1 = pd.read_csv("../Raw Data/gal4.csv.csv")

In [236]:
# take note of offset

# find columns and rename title column
gal4_df1 = pd.read_csv("../Raw Data/gal4.csv.csv")
gal4_df1.columns = gal4_df1.iloc[3]
print(gal4_df1.columns)
print(len(gal4_df1))


gal4_df = gal4_df1[(gal4_df1["hgvs_pro"].str.contains("del") == False) & (gal4_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (gal4_df1["hgvs_pro"].str.contains("Ter") == False)]
gal4_df = gal4_df.sample(frac=1)

print(len(gal4_df))

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score'], dtype='object', name=3)
1323
1196


In [244]:
# getting uniprot to compare offset
protein_seq_gal4 = ssf.get_protein_seq("P04386")

In [245]:
# comparing offset
protein_seq_gal4_list = protein_seq_gal4.split(" ")
# print(protein_seq_gal4_list)
print(protein_seq_gal4_list[41]) 

PRO


In [257]:
gal4_df["variant"] = format_mavedb_variant(gal4_df, "hgvs_pro", 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["variant"] = format_mavedb_variant(gal4_df, "hgvs_pro", 0)


In [258]:
# splitting variant list if there are multiple mutations
gal4_mut = gal4_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
gal4_df["WILD_TYPE_RES"] = ssf.get_wild_type(gal4_mut)

# get mutated residue and place in seperate col
gal4_df["MUTATED_RES"] = ssf.get_mutation_type(gal4_mut)

# get position and place in seperate col
gal4_df["POSITION"] = ssf.get_position(gal4_mut)

# replace variant column with reformatted variant name
gal4_df["variant"] = ssf.get_mutations_names_list(gal4_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["WILD_TYPE_RES"] = ssf.get_wild_type(gal4_mut)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["MUTATED_RES"] = ssf.get_mutation_type(gal4_mut)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["POSITION"] = ssf.get_position(gal4_mut)
A value is trying to be set on a copy of a s

In [None]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'gal4_stride.txt'
gal4_stride_file = open(path, 'r')

gal4_ss_indexes = ssf.get_sec_struc_boolean(gal4_stride_file) # boolean list of secondary structure assignements

# need positionssplit
gal4_df["positions_split"] = ssf.get_positions_split(gal4_df)

# add in_sec_str_col
gal4_df = add_sec_str_col(gal4_df, gal4_ss_indexes, 0)

In [None]:
gal4_train_df, gal4_test_df = get_train_and_test_df(gal4_df, 0.47, 40)
gal4_df_format = pd.concat([gal4_train_df, gal4_test_df])

# writing data to txt file
ssf.write_data_file("gal4_MLformat_40_train", protein_seq_gal4, gal4_df_format)

**Small ubiquitin-related modifier 1**

In [135]:
modifier_1_df1 = pd.read_csv("../Raw Data/modifier_1_mod.csv")

In [137]:
# modifier_1_df1.columns = modifier_1_df1.iloc[3]
print(modifier_1_df1.columns)
print(len(modifier_1_df1))

modifier_1_df = modifier_1_df1[(modifier_1_df1["hgvs_pro"].str.contains("=") == False) & (modifier_1_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (modifier_1_df1["hgvs_pro"].str.contains("Ter") == False)]
print(len(modifier_1_df))
print(modifier_1_df.head(1))

# shuffle values
modifier_1_df = modifier_1_df.sample(frac=1)

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'sd', 'se',
       'exp.score', 'exp.sd', 'df', 'pred.score'],
      dtype='object')
2020
1919
                   accession  hgvs_nt  hgvs_splice   hgvs_pro     score  \
0  urn:mavedb:00000001-b-1#1      NaN          NaN  p.Glu5Lys  1.311357   

         sd        se  exp.score    exp.sd   df  pred.score  
0  0.085569  0.042785    1.31651  0.024947  4.0    1.117086  


In [138]:
# getting uniprot to compare offset
protein_seq_modifier_1 = ssf.get_protein_seq("P63165")
# offset of 1
print(protein_seq_modifier_1)
print(len(protein_seq_modifier_1))

MET SER ASP GLN GLU ALA LYS PRO SER THR GLU ASP LEU GLY ASP LYS LYS GLU GLY GLU TYR ILE LYS LEU LYS VAL ILE GLY GLN ASP SER SER GLU ILE HIS PHE LYS VAL LYS MET THR THR HIS LEU LYS LYS LEU LYS GLU SER TYR CYS GLN ARG GLN GLY VAL PRO MET ASN SER LEU ARG PHE LEU PHE GLU GLY GLN ARG ILE ALA ASP ASN HIS THR PRO LYS GLU LEU GLY MET GLU GLU GLU ASP VAL ILE GLU VAL TYR GLN GLU GLN THR GLY GLY HIS SER THR VAL
403


In [139]:
protein_seq_modifier_1_split = protein_seq_modifier_1.split()
print(len(protein_seq_modifier_1_split))

101


In [146]:
modifier_1_df["variant"] = format_mavedb_variant(modifier_1_df, "hgvs_pro", -1)

In [147]:
# splitting variant list if there are multiple mutations
modifier_1_mut = modifier_1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
modifier_1_df["WILD_TYPE_RES"] = ssf.get_wild_type(modifier_1_mut)

# get mutated residue and place in seperate col
modifier_1_df["MUTATED_RES"] = ssf.get_mutation_type(modifier_1_mut)

# get position and place in seperate col
modifier_1_df["POSITION"] = ssf.get_position(modifier_1_mut)

# replace variant column with reformatted variant name
modifier_1_df["variant"] = ssf.get_mutations_names_list(modifier_1_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

In [148]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path, 'r')

modifier_1_ss_indexes = ssf.get_sec_struc_boolean(modifier_1_stride_file) # boolean list of secondary structure assignements

# need positionssplit
modifier_1_df["positions_split"] = ssf.get_positions_split(modifier_1_df)

# add in_sec_str_col
modifier_1_df = add_sec_str_col(modifier_1_df, modifier_1_ss_indexes, 0)

In [152]:
modifier_1_train_df, modifier_1_test_df, modifier_1_remaining_df = get_train_and_test_df(modifier_1_df, 0.46, 198)
# print("TEST LEN")
# print(len(modifier_1_test_df))
modifier_1_df_format = pd.concat([modifier_1_train_df, modifier_1_test_df])
print(len(modifier_1_df_format))
# writing data to txt file
ssf.write_data_file("modifier_1_MLformat_198_train_1700_test_1", protein_seq_modifier_1, modifier_1_df_format)

in fraction,df len1919
91
107
Train Data Fraction: 0.46
false_df len1026
true_df len893
in fraction,df len1721
Test Data Fraction: 0.46
false_df len919
true_df len802
Size of Test Dataset: 1700
Size of Total Dataset: 1898
1898
Filename: modifier_1_MLformat_198_train_1700_test_1.txt


In [None]:
# ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_remaining, 0.52, 200)
ube4b_test_10000 = ube4b_test_df.head(10000)
ube4b_df_format = pd.concat([ube4b_train_df, ube4b_test_10000])
print(len(ube4b_df_format))
# writing data to txt file
ssf.write_data_file("ube4b_MLformat_200_train_10k_test_2", protein_seq_ube4b, ube4b_df_format)

**TAR DNA-binding protein 43**

In [153]:
tar_df_pt1 = pd.read_csv("../Raw Data/tar1_mod.csv")
tar_df_pt2 = pd.read_csv("../Raw Data/tar2_mod.csv")

In [154]:
print(len(tar_df_pt1))
print(len(tar_df_pt2))
# tar_df_pt1 = tar_df_pt1.sample(frac=1)
# tar_df_pt2 = tar_df_pt2.sample(frac=1)
# print(tar_df_pt1.head(30))
# print(tar_df_pt2.head(30))
# tar_df1 = pd.concat([tar_df_pt1, tar_df_pt2])
# print(len(tar_df1))

# print(tar_df1.columns)

tar_df_pt1 = tar_df_pt1[(tar_df_pt1["hgvs_pro"].str.contains("\*") == False) & (tar_df_pt1["hgvs_pro"].str.contains("hgvs") == False)
                    & (tar_df_pt1["hgvs_pro"].str.contains("Ter") == False)]

tar_df_pt2 = tar_df_pt2[(tar_df_pt2["hgvs_pro"].str.contains("\*") == False) & (tar_df_pt2["hgvs_pro"].str.contains("hgvs") == False)
                    & (tar_df_pt2["hgvs_pro"].str.contains("Ter") == False)]

# print(len(tar_df))
# # shuffle values
# tar_df = tar_df.sample(frac=1)
# print(tar_df.head(3))

704
714


In [155]:
# getting uniprot to compare offset
protein_seq_tar = ssf.get_protein_seq("Q13148")
print(protein_seq_tar)

MET SER GLU TYR ILE ARG VAL THR GLU ASP GLU ASN ASP GLU PRO ILE GLU ILE PRO SER GLU ASP ASP GLY THR VAL LEU LEU SER THR VAL THR ALA GLN PHE PRO GLY ALA CYS GLY LEU ARG TYR ARG ASN PRO VAL SER GLN CYS MET ARG GLY VAL ARG LEU VAL GLU GLY ILE LEU HIS ALA PRO ASP ALA GLY TRP GLY ASN LEU VAL TYR VAL VAL ASN TYR PRO LYS ASP ASN LYS ARG LYS MET ASP GLU THR ASP ALA SER SER ALA VAL LYS VAL LYS ARG ALA VAL GLN LYS THR SER ASP LEU ILE VAL LEU GLY LEU PRO TRP LYS THR THR GLU GLN ASP LEU LYS GLU TYR PHE SER THR PHE GLY GLU VAL LEU MET VAL GLN VAL LYS LYS ASP LEU LYS THR GLY HIS SER LYS GLY PHE GLY PHE VAL ARG PHE THR GLU TYR GLU THR GLN VAL LYS VAL MET SER GLN ARG HIS MET ILE ASP GLY ARG TRP CYS ASP CYS LYS LEU PRO ASN SER LYS GLN SER GLN ASP GLU PRO LEU ARG SER ARG LYS VAL PHE VAL GLY ARG CYS THR GLU ASP MET THR GLU ASP GLU LEU ARG GLU PHE PHE SER GLN TYR GLY ASP VAL MET ASP VAL PHE ILE PRO LYS PRO PHE ARG ALA PHE ALA PHE VAL THR PHE ALA ASP ASP GLN ILE ALA GLN SER LEU CYS GLY GLU ASP LEU ILE ILE 

In [156]:
protein_seq_tar_split = protein_seq_tar.split()
print(protein_seq_tar_split[291]) # for the first one starts at 289 (add offset of )
# row 10, col 11, 31 

SER


In [157]:
print(protein_seq_tar_split[346]) 
# row 12, 6, 31 <- 16 starts at 346: 1 starts at 331?
# have to offset seperately and then add

SER


In [158]:
tar_df_pt1["variant"] = format_mavedb_variant(tar_df_pt1, "hgvs_pro", 288)
tar_df_pt2["variant"] = format_mavedb_variant(tar_df_pt2, "hgvs_pro", 330)

In [159]:
tar_df = pd.concat([tar_df_pt1, tar_df_pt2])
tar_df = tar_df.sample(frac=1)
print(len(tar_df))

1342


In [160]:
# splitting variant list if there are multiple mutations
tar_mut = tar_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
tar_df["WILD_TYPE_RES"] = ssf.get_wild_type(tar_mut)

# get mutated residue and place in seperate col
tar_df["MUTATED_RES"] = ssf.get_mutation_type(tar_mut)

# get position and place in seperate col
tar_df["POSITION"] = ssf.get_position(tar_mut)

# replace variant column with reformatted variant name
tar_df["variant"] = ssf.get_mutations_names_list(tar_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

In [161]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'tar_stride.txt'
tar_stride_file = open(path, 'r')

tar_ss_indexes = ssf.get_sec_struc_boolean(tar_stride_file) # boolean list of secondary structure assignements

# need positionssplit
tar_df["positions_split"] = ssf.get_positions_split(tar_df)

# add in_sec_str_col
tar_df = add_sec_str_col(tar_df, tar_ss_indexes, 0)

In [162]:
tar_train_df, tar_test_df, tar_remaining_df = get_train_and_test_df(tar_df, 0.36, 812)
print(len(tar_test_df))
tar_df_format = pd.concat([tar_train_df, tar_test_df])

# writing data to txt file
# ssf.write_data_file("tar_MLformat_812_train_10k_test_1", protein_seq_tar, tar_df_format)

in fraction,df len1342
292
520
Train Data Fraction: 0.36
false_df len1147
true_df len195


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
# ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_remaining, 0.52, 200)
ube4b_test_10000 = ube4b_test_df.head(10000)
ube4b_df_format = pd.concat([ube4b_train_df, ube4b_test_10000])
print(len(ube4b_df_format))
# writing data to txt file
ssf.write_data_file("ube4b_MLformat_200_train_10k_test_2", protein_seq_ube4b, ube4b_df_format)

**Human Glucokinase? Maybe**

## Getting Dataset Fraction

In [15]:
# adds boolean column to dataframe to indicate whether value is in secondary structure
# needs positions split column
def add_sec_str_col(df, bool_ss_list, domain_start_index):
    has_sec_str = []
    for val in df["positions_split"]:
        # list of boolean values that are true if all mutation positions in line are sec. strc.
        all_pos_sec_struc = []

        for position in val:
            if (bool_ss_list[position - domain_start_index] == False):  # line up ss_indexes w/ position
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)

        # all pos sec struc should match val list
        # if there's a value in all_pos_sec_struc that's false, append false
        # otherwise, append true
        if (all_pos_sec_struc.count(False) == 0):
            has_only_sec_str = True
        else:
            has_only_sec_str = False

        has_sec_str.append(has_only_sec_str)
        all_pos_sec_struc.clear()

    # print(len(has_sec_str)) # should match dataframe length
    df['in_sec_str'] = has_sec_str
    return df

In [46]:
def get_fractioned_dataset(df, fraction_ss, size):
    
    print("in fraction,df len" + str(len(df)))
    if (size is not None): # need training dataset size
        num_in_ss = int(size*fraction_ss) 
        print(num_in_ss)
        num_not_ss = size - num_in_ss
        print(num_not_ss)
        real_fraction = round(float(num_in_ss)/(num_in_ss+num_not_ss), 3)
        print("Train Data Fraction: " + str(real_fraction))
    else: # need test dataset and size of dataset doesn't matter
        num_in_ss, num_not_ss = get_num_ss(df, fraction_ss)
        real_fraction = round(float(num_in_ss)/(num_in_ss+num_not_ss), 3)
        print("Test Data Fraction: " + str(real_fraction))
        
    true_df = df[df["in_sec_str"] == True]
    false_df = df[df["in_sec_str"] == False]
    print("false_df len" + str(len(false_df)))
    print("true_df len" + str(len(true_df)))
    ss_df = true_df.sample(num_in_ss)

    false_df = df[df["in_sec_str"] == False]
    not_ss_df = false_df.sample(num_not_ss)
    # print("Testing Test Data Fraction:" + str(len(ss_df)/(len(not_ss_df)+len(ss_df))))
    
    fractioned_df = pd.concat([ss_df, not_ss_df]).sample(frac=1)
    
    remaining_df = pd.concat([fractioned_df, df])
    remaining_df = remaining_df[~remaining_df.index.duplicated(keep=False)]
    
    return fractioned_df, remaining_df

In [17]:
def get_num_ss(df, fraction_ss): # use when size is dependent on remaining values
    # print("enters this method")
    num_ss_vals = (df["in_sec_str"] == True).sum() # number of trues to work from 
    
    num_not_ss_vals = (df["in_sec_str"] == False).sum() # number of falses to work from 
    
    if (num_not_ss_vals < num_ss_vals): # if not in ss is limiting factor
        ideal_split_ss = int((num_not_ss_vals/(1-fraction_ss))*fraction_ss)
        ideal_split_not_ss = int(num_not_ss_vals)
        # instead do min value, max value and fraction it is
    else:
        ideal_split_ss = int(num_ss_vals)
        ideal_split_not_ss = int((num_ss_vals/fraction_ss)*(1-fraction_ss))
    total = ideal_split_ss + ideal_split_not_ss
    # print("ideal_ss: " + str(ideal_split_ss))
    # print("ideal_not_ss: " + str(ideal_split_not_ss))
    while (ideal_split_ss >= num_ss_vals or ideal_split_not_ss >= num_not_ss_vals):
        # until thei)y both fit the total number of values
        if (ideal_split_ss < ideal_split_not_ss):
            ideal_split_ss = ideal_split_ss - 1
            ideal_split_not_ss = (ideal_split_ss/fraction_ss)*(1-fraction_ss)
        else:
            ideal_split_not_ss = ideal_split_not_ss - 1
            ideal_split_ss = (ideal_split_not_ss/(1-fraction_ss))*(fraction_ss)
        ideal_split_ss = int(ideal_split_ss)
        ideal_split_not_ss = int(ideal_split_not_ss)
        # total = ideal_split_ss + ideal_split_not_ss
        # print("total: " + str(total))
        # print(total % batch_size != 0)
        # print(ideal_split_ss >= num_ss_vals)
        # print(ideal_split_not_ss >= num_not_ss_vals)
        #print((ideal_split_ss >= num_ss_vals or ideal_split_not_ss >= num_not_ss_vals))
    return int(ideal_split_ss), int(ideal_split_not_ss)

In [404]:
(8+5)%32 != 0

True

In [18]:
# gets both train and test datasets based on size
# fraction_ss is a percent (e.g. 0.56)

# need "is_sec_struc_col"
# reutrns remaining df
def get_train_and_test_df(df, fraction_ss, train_size):
    train_df, remaining_df = get_fractioned_dataset(df, fraction_ss, train_size)
    test_df, remaining_df = get_fractioned_dataset(remaining_df, fraction_ss, None)
    print("Size of Test Dataset: " + str(len(test_df)))
    print("Size of Total Dataset: " + str(len(test_df)+ len(train_df)))
    return train_df, test_df, remaining_df


In [36]:
def get_new_trainset(train_df, remaining_df, fraction_ss, size):
    if (size > len(remaining_df)):
        print("Not enough training values, combining old values")
        df = pd.concat([train_df, remaining_df])
        # print("enters wrong block")     
    else:
        # print("enters right block")
        df = remaining_df
        print("remaing df len" + str(len(df)))
    new_train_df, new_remaining_df = get_fractioned_dataset(df, fraction_ss, size)
    return new_train_df, new_remaining_df

## Cleaning Gelman et al. Data

## Ube4b

In [22]:
# importing Ube4b data from Gelman et al.
ube4b_df1 = pd.read_csv("../Raw Data/ube4b.tsv.txt", sep="\t")
ube4b_df = ube4b_df1.dropna()
print(len(ube4b_df))
print(ube4b_df.columns)

98297
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [23]:
# rounding score column to 6 decimal points
ube4b_df["score"] = ube4b_df["score"].round(6)

# remove values with wildcard star next to them
ube4b_df = ube4b_df[ube4b_df["variant"].str.contains("\*") == False]
print(len(ube4b_df))
ube4b_df = ube4b_df.sample(frac=1)
# change this value depending on amount of data needed for dataset
# ube4b_df = ube4b_df.sample(n=80)
print(len(ube4b_df))

91031
91031


In [24]:
# get protein sequence from Uniprot and split
protein_seq_ube4b = ssf.get_protein_seq("Q9ES00")
protein_seq_ube4b_split = protein_seq_ube4b.split()
print(len(protein_seq_ube4b_split)) # protein length of 1173

1173


In [25]:
# ube4b protein domain sequence from Gelman et. al
string_seq = "IEKFKLLAEKVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDPFNRQMLTESMLEPVPELKEQIQAWMREKQSSDH"
print(len(string_seq)) # <- domain length of 102
ube4b_domain = ssf.get_expanded_seq(string_seq)
ube4b_domain_split = ube4b_domain.split()

# NOTE - index in list corresponds exactly to location in domain (huh)

102


In [26]:
# index of domain inside protein
ssf.get_index_range(protein_seq_ube4b_split, ube4b_domain_split)

[(1071, 1173)]


In [27]:
# splitting variant list if there are multiple mutations
ube4b_mut = ube4b_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
ube4b_df["WILD_TYPE_RES"] = ssf.get_wild_type(ube4b_mut)

# get mutated residue and place in seperate col
ube4b_df["MUTATED_RES"] = ssf.get_mutation_type(ube4b_mut)

# get position and place in seperate col
ube4b_df["POSITION"] = ssf.get_position(ube4b_mut)

# replace variant column with reformatted variant name
ube4b_df["variant"] = ssf.get_mutations_names_list(ube4b_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

#  ube4b_df = ube4b_df.drop(columns=to_drop)

In [28]:
# get ss position indexes
path = "../PDB and STRIDE Files/" + 'ube4b_stride.txt'
ube4b_stride_file = open(path, 'r')

ube4b_ss_indexes = ssf.get_sec_struc_boolean(ube4b_stride_file)

# need positionssplit
ube4b_df["positions_split"] = ssf.get_positions_split(ube4b_df)

# add in_sec_str_col
ube4b_df = add_sec_str_col(ube4b_df, ube4b_ss_indexes, 0)

In [33]:
print(len(ube4b_df))

91031


In [78]:
ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_remaining, 0.52, 200)

in fraction,df len80831
104
96
Train Data Fraction: 0.52
false_df len71697
true_df len9134
in fraction,df len80631
Test Data Fraction: 0.52
false_df len71601
true_df len9030
Size of Test Dataset: 17362
Size of Total Dataset: 17562


In [74]:
ube4b_remaining = pd.concat([ube4b_test_df[10000:], ube4b_remaining_df])

In [75]:
print(len(ube4b_remaining))

80831


In [79]:
# ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_remaining, 0.52, 200)
ube4b_test_10000 = ube4b_test_df.head(10000)
ube4b_df_format = pd.concat([ube4b_train_df, ube4b_test_10000])
print(len(ube4b_df_format))
# writing data to txt file
ssf.write_data_file("ube4b_MLformat_200_train_10k_test_2", protein_seq_ube4b, ube4b_df_format)

10200
Filename: ube4b_MLformat_200_train_10k_test_2.txt


In [32]:
print(len(ube4b_remaining_df))

63384


In [47]:
ube_new_trainset = get_new_trainset(ube4b_train_df, ube4b_remaining_df, 0.52, 64)

enters right block
remaing df len63384
in fraction,df len63384
33
31
Train Data Fraction: 0.516
false_df len63382
true_df len2


ValueError: Cannot take a larger sample than population when 'replace=False'

In [477]:
print(len(ube4b_df_format))

27647


#### Pab1

In [122]:
# importing pab1 data from Gelman et al.
pab1_df1 = pd.read_csv("../Raw Data/pab1.tsv.txt", sep="\t")
pab1_df = pab1_df1.dropna()
print(len(pab1_df))
print(pab1_df.columns)

40852
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [123]:
# rounding score column to 6 decimal points
pab1_df["score"] = pab1_df["score"].round(6)
print(len(pab1_df))

# remove values with wildcard star next to them
pab1_df = pab1_df[pab1_df["variant"].str.contains("\*") == False]
print(len(pab1_df))
# change this value depending on amount of data needed for dataset
# pab1_df = pab1_df.sample(n=10000)
print(len(pab1_df))
pab1_df = pab1_df.sample(frac=1)

40852
37710
37710


In [124]:
# splitting variant list if there are multiple mutations
pab1_mut = pab1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pab1_df["WILD_TYPE_RES"] = ssf.get_wild_type(pab1_mut)

# get mutated residue and place in seperate col
pab1_df["MUTATED_RES"] = ssf.get_mutation_type(pab1_mut)

# get position and place in seperate col
pab1_df["POSITION"] = ssf.get_position(pab1_mut)

# replace variant column with reformatted variant name
pab1_df["variant"] = ssf.get_mutations_names_list(pab1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)
pab1_df["positions_split"] = ssf.get_positions_split(pab1_df)

In [125]:
print(len(pab1_df))

37710


In [126]:
path = "../PDB and STRIDE Files/" + 'pab1_stride.txt'
pab1_stride_file = open(path, 'r')

In [127]:
pab1_ss_indexes = ssf.get_sec_struc_boolean(pab1_stride_file) # boolean list of secondary structure assignements

In [128]:
# add column

pab1_df = add_sec_str_col(pab1_df, pab1_ss_indexes, 126)
# need positionssplit

In [90]:
print(pab1_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION', 'positions_split', 'in_sec_str'],
      dtype='object')


In [130]:
protein_seq_pab1 = ssf.get_protein_seq("P04147")

In [134]:
pab1_train_df, pab1_test_df, pab1_remaining_df = get_train_and_test_df(pab1_df, 0.69, 147)
pab1_test_10000 = pab1_test_df.head(10000)
pab1_df_format = pd.concat([pab1_train_df, pab1_test_10000])
print(len(pab1_df_format))
# writing data to txt file
ssf.write_data_file("pab1_MLformat_147_train_10k_test_1", protein_seq_pab1, pab1_df_format)

in fraction,df len37710
101
46
Train Data Fraction: 0.687
false_df len19683
true_df len18027
in fraction,df len37563
Test Data Fraction: 0.69
false_df len19637
true_df len17926
Size of Test Dataset: 25974
Size of Total Dataset: 26121
10147
Filename: pab1_MLformat_147_train_10k_test_1.txt


#### Bgl3

In [163]:
# importing Ube4b data from Gelman et al.
bgl3_df1 = pd.read_csv("../Raw Data/bgl3.tsv.txt", sep="\t")
bgl3_df = bgl3_df1.dropna()
print(len(bgl3_df))
print(bgl3_df.columns)

26653
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [164]:
# rounding score column to 6 decimal points
bgl3_df["score"] = bgl3_df["score"].round(6)
print(len(bgl3_df))

# remove values with wildcard star next to them
bgl3_df = bgl3_df[bgl3_df["variant"].str.contains("\*") == False]
print(len(bgl3_df))
# change this value depending on amount of data needed for dataset
# bgl3_df = bgl3_df.sample(n=360)
bgl3_df = bgl3_df.sample(frac=1)
print(len(bgl3_df))

26653
25737
25737


In [165]:
# NOTE - no protein domain for bgl3
# # get protein sequence from Gelman et al.
string_seq = "MVPAAQQTAMAPDAALTFPEGFLWGSATASYQIEGAAAEDGRTPSIWDTYARTPGRVRNGDTGDVATDHYHRWREDVALMAELGLGAYRFSLAWPRIQPTGRGPALQKGLDFYRRLADELLAKGIQPVATLYHWDLPQELENAGGWPERATAERFAEYAAIAADALGDRVKTWTTLNEPWCSAFLGYGSGVHAPGRTDPVAALRAAHHLNLGHGLAVQALRDRLPADAQCSVTLNIHHVRPLTDSDADADAVRRIDALANRVFTGPMLQGAYPEDLVKDTAGLTDWSFVRDGDLRLAHQKLDFLGVNYYSPTLVSEADGSGTHNSDGHGRSAHSPWPGADRVAFHQPPGETTAMGWAVDPSGLYELLRRLSSDFPALPLVITENGAAFHDYADPEGNVNDPERIAYVRDHLAAVHRAIKDGSDVRGYFLWSLLDNFEWAHGYSKRFGAVYVDYPTGTRIPKASARWYAEVARTGVLPTAGDPNSSSVDKLAAALEHHHHHH"
protein_seq_bgl3 = ssf.get_expanded_seq(string_seq)
print(protein_seq_bgl3)

MET VAL PRO ALA ALA GLN GLN THR ALA MET ALA PRO ASP ALA ALA LEU THR PHE PRO GLU GLY PHE LEU TRP GLY SER ALA THR ALA SER TYR GLN ILE GLU GLY ALA ALA ALA GLU ASP GLY ARG THR PRO SER ILE TRP ASP THR TYR ALA ARG THR PRO GLY ARG VAL ARG ASN GLY ASP THR GLY ASP VAL ALA THR ASP HIS TYR HIS ARG TRP ARG GLU ASP VAL ALA LEU MET ALA GLU LEU GLY LEU GLY ALA TYR ARG PHE SER LEU ALA TRP PRO ARG ILE GLN PRO THR GLY ARG GLY PRO ALA LEU GLN LYS GLY LEU ASP PHE TYR ARG ARG LEU ALA ASP GLU LEU LEU ALA LYS GLY ILE GLN PRO VAL ALA THR LEU TYR HIS TRP ASP LEU PRO GLN GLU LEU GLU ASN ALA GLY GLY TRP PRO GLU ARG ALA THR ALA GLU ARG PHE ALA GLU TYR ALA ALA ILE ALA ALA ASP ALA LEU GLY ASP ARG VAL LYS THR TRP THR THR LEU ASN GLU PRO TRP CYS SER ALA PHE LEU GLY TYR GLY SER GLY VAL HIS ALA PRO GLY ARG THR ASP PRO VAL ALA ALA LEU ARG ALA ALA HIS HIS LEU ASN LEU GLY HIS GLY LEU ALA VAL GLN ALA LEU ARG ASP ARG LEU PRO ALA ASP ALA GLN CYS SER VAL THR LEU ASN ILE HIS HIS VAL ARG PRO LEU THR ASP SER ASP ALA ASP ALA ASP 

In [166]:
# splitting variant list if there are multiple mutations
bgl3_mut = bgl3_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
bgl3_df["WILD_TYPE_RES"] = ssf.get_wild_type(bgl3_mut)

# get mutated residue and place in seperate col
bgl3_df["MUTATED_RES"] = ssf.get_mutation_type(bgl3_mut)

# get position and place in seperate col
bgl3_df["POSITION"] = ssf.get_position(bgl3_mut)

# replace variant column with reformatted variant name
bgl3_df["variant"] = ssf.get_mutations_names_list(bgl3_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# bgl3_df = bgl3_df.drop(columns=to_drop)

In [167]:
# get ss position indexes
path = "../PDB and STRIDE Files/" + 'bgl3_stride.txt'
bgl3_stride_file = open(path, 'r')

bgl3_ss_indexes = ssf.get_sec_struc_boolean(bgl3_stride_file)

# need positionssplit
bgl3_df["positions_split"] = ssf.get_positions_split(bgl3_df)

# add in_sec_str_col
bgl3_df = add_sec_str_col(bgl3_df, bgl3_ss_indexes, 0)

In [169]:
bgl3_train_df, bgl3_test_df, bgl3_remaining_df = get_train_and_test_df(bgl3_df, 0.54, 982)
bgl3_test_10000 = bgl3_test_df.head(10000)
print("TEST LEN")
print(len(bgl3_test_df))
bgl3_df_format = pd.concat([bgl3_train_df, bgl3_test_10000])
print(len(bgl3_df_format))
# writing data to txt file
ssf.write_data_file("bgl3_MLformat_982_train_10k_test_1", protein_seq_bgl3, bgl3_df_format)

in fraction,df len25737
530
452
Train Data Fraction: 0.54
false_df len17674
true_df len8063
in fraction,df len24755
Test Data Fraction: 0.54
false_df len17222
true_df len7533
Size of Test Dataset: 13945
Size of Total Dataset: 14927
TEST LEN
13945
10982
Filename: bgl3_MLformat_982_train_10k_test_1.txt


*avGFP and GB1 above

# Cleaning Curated Dataset Data