# Datasets for Other Proteins

This notebook formats data for the additional protein data used. It is used for part 2 of the project.

In [3]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide
import re

In [4]:
import secStrucFormatting as ssf

In [5]:
# setting jupyter notebook viewing options
max_rows = 1000
max_cols = 1000
pd.set_option("display.max_rows", max_rows, "display.max_columns", max_cols)

### Metadata from compiled dataset

In [6]:
path = "../Raw Data/" + 'all_data_clean.csv'
df = pd.read_csv(path)
# print(df.head)
print(df.columns)

print(df["PROTEIN"].value_counts().head(10))
print(df.loc[df['PROTEIN'] == 'Lysozyme'])

Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'ddG_(kcal/mol)',
       'ddG_H2O_(kcal/mol)', 'STATE', 'REVERSIBILITY', 'PUBMED_ID',
       'REFERENCE', 'MUTATED_CHAIN', 'KINGDOM', 'PBD_MUTANT', 'MEASURE',
       'METHOD', 'POSITION', 'WILD_TYPE_RES', 'MUTATED_RES', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS'],
      dtype='object')
Lysozyme                                                   2897
Immunoglobulin G-binding protein G                         1996
Thermonuclease                                             1586
Staphylococcal nuclease                                    1457
Endolysin                                                  1110
Ribonuclease                                                983
Ribonuclease HI                                             710
Guanine nucleotide-binding protein G(i) subunit alpha-1     704
Myo

In [7]:
print(df["UNIPROT_ID"].value_counts().head(10)) # 4 possible could be used?

P00644    3033
P00720    2767
P06654    2297
P61626    1146
P00648     981
P00651     904
P0A7Y4     722
P63096     698
P00044     546
P00698     491
Name: UNIPROT_ID, dtype: int64


#### P00644 (Thermonuclease)

In [131]:
# finding column with most values
# print(nuclease_df)
# print(df.loc[df['UNIPROT_ID'] == 'P00644'])
nuclease_df = df.loc[df['UNIPROT_ID'] == 'P00644']
print(nuclease_df.head(50))
nuclease_df = nuclease_df.loc[nuclease_df['PROTEIN'].str.contains('Thiol:disulfide interchange protein DsbA') == False]
# print(nuclease_df.columns)
nuclease_df = nuclease_df[nuclease_df['ddG_(kcal/mol)'].notna()]
nuclease_df = nuclease_df[nuclease_df['MUTATION'].str.contains('wild-type') == False]
print(nuclease_df['ddG_(kcal/mol)'].tail(40))

# rename ddG to score and mutation to variant
nuclease_df = nuclease_df.rename(columns={'MUTATION': 'variant', 'ddG_(kcal/mol)': 'score'})
print(nuclease_df['score'])
nuclease_df['score'] = nuclease_df['score'].round(6)
# print(nuclease_df['variant'].head(30))

nuclease_df["positions_split"] = ssf.get_positions_split(nuclease_df)

positions_split_subtracted = []
for pos_list in nuclease_df["positions_split"]:
    pos_list = [x - 1 for x in pos_list]
    positions_split_subtracted.append(pos_list)  

nuclease_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in nuclease_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    # print(pos_string)
    new_positions.append(pos_string)
    pos_string = ""
# print(len(new_positions))
# print(len(nuclease_df["POSITION"]))

nuclease_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

      DATABASE                  PROTEIN UNIPROT_ID           MUTATION  \
359   Protherm  Staphylococcal nuclease     P00644          wild-type   
360   Protherm  Staphylococcal nuclease     P00644          wild-type   
361   Protherm  Staphylococcal nuclease     P00644              L107A   
362   Protherm  Staphylococcal nuclease     P00644              V148L   
363   Protherm  Staphylococcal nuclease     P00644              G161S   
364   Protherm  Staphylococcal nuclease     P00644              G170V   
365   Protherm  Staphylococcal nuclease     P00644              A172S   
366   Protherm  Staphylococcal nuclease     P00644              H206L   
367   Protherm  Staphylococcal nuclease     P00644              L107A   
368   Protherm  Staphylococcal nuclease     P00644              V148L   
369   Protherm  Staphylococcal nuclease     P00644              G161S   
370   Protherm  Staphylococcal nuclease     P00644              G170V   
371   Protherm  Staphylococcal nuclease     P00644 

In [131]:
print(nuclease_df['score'])


10556    -2.6
10557     0.8
10558    -1.3
10559     0.5
10560    -2.5
         ... 
29815     7.2
29816     7.3
29817     9.3
29818    11.3
37335     0.7
Name: score, Length: 1280, dtype: float64


In [132]:
# get protein from uniprot
protein_seq_nuclease = ssf.get_protein_seq('P00644')
print(protein_seq_nuclease)

MET LEU VAL MET THR GLU TYR LEU LEU SER ALA GLY ILE CYS MET ALA ILE VAL SER ILE LEU LEU ILE GLY MET ALA ILE SER ASN VAL SER LYS GLY GLN TYR ALA LYS ARG PHE PHE PHE PHE ALA THR SER CYS LEU VAL LEU THR LEU VAL VAL VAL SER SER LEU SER SER SER ALA ASN ALA SER GLN THR ASP ASN GLY VAL ASN ARG SER GLY SER GLU ASP PRO THR VAL TYR SER ALA THR SER THR LYS LYS LEU HIS LYS GLU PRO ALA THR LEU ILE LYS ALA ILE ASP GLY ASP THR VAL LYS LEU MET TYR LYS GLY GLN PRO MET THR PHE ARG LEU LEU LEU VAL ASP THR PRO GLU THR LYS HIS PRO LYS LYS GLY VAL GLU LYS TYR GLY PRO GLU ALA SER ALA PHE THR LYS LYS MET VAL GLU ASN ALA LYS LYS ILE GLU VAL GLU PHE ASP LYS GLY GLN ARG THR ASP LYS TYR GLY ARG GLY LEU ALA TYR ILE TYR ALA ASP GLY LYS MET VAL ASN GLU ALA LEU VAL ARG GLN GLY LEU ALA LYS VAL ALA TYR VAL TYR LYS PRO ASN ASN THR HIS GLU GLN HIS LEU ARG LYS SER GLU ALA GLN ALA LYS LYS GLU LYS LEU ASN ILE TRP SER GLU ASP ASN ALA ASP SER GLY GLN


In [133]:
protein_seq_nuclease_split = protein_seq_nuclease.split()
print(protein_seq_nuclease_split[221:224])
print(len(protein_seq_nuclease_split))

['TRP', 'SER', 'GLU']
231


In [134]:
nuclease_df["variant"] = ssf.get_mutations_names_list(nuclease_df)
# print(nuclease_df.head(10))

In [139]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'thermonuclease_stride.txt'
nuclease_stride_file = open(path, 'r')

In [140]:
nuclease_ss_indexes = ssf.get_all_sec_struc_boolean(nuclease_stride_file) # boolean list of secondary structure assignements from uniprot
print(nuclease_ss_indexes.count(True))
print(nuclease_ss_indexes.count(False))

193
38


In [141]:
nuclease_df = add_sec_str_col(nuclease_df, nuclease_ss_indexes, 0)

In [138]:
nuclease_df = ssf.get_domain_dataset(nuclease_df, 0, 231, [])

  in_domain_df = in_domain_df.append(rows, ignore_index=True)


In [139]:
print(len(nuclease_df))

1280


In [142]:
# print(nuclease_df.head(5))
print(len(nuclease_df))
nuclease_df = nuclease_df[nuclease_df['score'] != 0.0]
print(len(nuclease_df))

1258
1258


In [146]:
nuclease_train_df, nuclease_test_df, nuclease_remaining_df = get_train_and_test_df(nuclease_df, 0.62, 453)
nuclease_df_format = pd.concat([nuclease_train_df, nuclease_test_df])
print(len(nuclease_df_format))
ssf.write_data_file("nuclease_MLformat_453_train_539_test_t3", protein_seq_nuclease, nuclease_df_format)

in fraction,df len1258
280
173
Train Data Fraction: 0.618
false_df len379
true_df len879
in fraction,df len805
Test Data Fraction: 0.62
false_df len206
true_df len599
Size of Test Dataset: 539
Size of Total Dataset: 992
992
Filename: nuclease_MLformat_453_train_539_test_t3.txt


In [74]:
ss = nuclease_ss_indexes.count(True)
not_ss = nuclease_ss_indexes.count(False)
print(ss)
print(not_ss)

144
87


#### P00720 (Endolysin)

In [24]:
endolysin_df = df.loc[df['UNIPROT_ID'] == 'P00720']
# endolysin_df = protein_G_df["PROTEIN"].value_counts()
print(len(endolysin_df))
endolysin_df.count()
print(endolysin_df["PROTEIN"].value_counts())

2767
Lysozyme                                     1637
Endolysin                                    1101
T4 lysozyme                                    20
N,O-diacetylmuramidase                          2
Beta-galactosidase                              2
Invertase 2                                     1
Spanin, inner membrane subunit                  1
Cytosol aminopeptidase                          1
Thermophilic aminopeptidase 1 alpha chain       1
Carboxypeptidase Y                              1
Name: PROTEIN, dtype: int64


In [43]:
lys_df = df.loc[df['PROTEIN'].str.contains('Lysozyme') == True]
print(lys_df["PROTEIN"].value_counts())
print(lys_df['MUTATION'].head(50))

Lysozyme                    2897
Lysozyme C                   449
Lysozyme C, milk isozyme      14
Name: PROTEIN, dtype: int64
57       wild-type
97       wild-type
98            L46A
99           L118A
100          L121A
101           L99A
102          L133A
103          F153A
104     L99A F153A
105      wild-type
106           E25Q
107           D36N
108           D67N
109           D85N
110          D120N
111          D138N
112      wild-type
113           E25Q
114           D36N
115           D67N
116           D85N
117          D120N
118          D138N
119      wild-type
120           E25Q
121           D85N
122          D120N
123          D138N
124      wild-type
125           E25Q
126           D85N
127          D120N
128          D138N
129      wild-type
130      wild-type
131           G67A
132           G85A
133           G89A
134          G120A
135          G135A
136      wild-type
137           K16E
138          R119E
139          K135E
140          K147E
141          R154E

In [44]:
endo_df = df.loc[df['PROTEIN'].str.contains('Endolysin') == True]
print(print(endo_df["PROTEIN"].value_counts()))
# print(endo_df['MUTATION'].head(50))

Endolysin    1110
Name: PROTEIN, dtype: int64
None


In [11]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'endolysin_stride.txt'
endolysin_stride_file = open(path, 'r')

In [29]:
# getting protein string
protein_seq_endolysin = ssf.get_protein_seq('P00720')
print(protein_seq_endolysin)

MET ASN ILE PHE GLU MET LEU ARG ILE ASP GLU ARG LEU ARG LEU LYS ILE TYR LYS ASP THR GLU GLY TYR TYR THR ILE GLY ILE GLY HIS LEU LEU THR LYS SER PRO SER LEU ASN ALA ALA LYS SER GLU LEU ASP LYS ALA ILE GLY ARG ASN CYS ASN GLY VAL ILE THR LYS ASP GLU ALA GLU LYS LEU PHE ASN GLN ASP VAL ASP ALA ALA VAL ARG GLY ILE LEU ARG ASN ALA LYS LEU LYS PRO VAL TYR ASP SER LEU ASP ALA VAL ARG ARG CYS ALA LEU ILE ASN MET VAL PHE GLN MET GLY GLU THR GLY VAL ALA GLY PHE THR ASN SER LEU ARG MET LEU GLN GLN LYS ARG TRP ASP GLU ALA ALA VAL ASN LEU ALA LYS SER ILE TRP TYR ASN GLN THR PRO ASN ARG ALA LYS ARG VAL ILE THR THR PHE ARG THR GLY THR TRP ASP ALA TYR LYS ASN LEU


In [55]:
protein_seq_endolysin_split = protein_seq_endolysin.split()
print(len(protein_seq_endolysin_split))

164


In [12]:
endolysin_ss_indexes = ssf.get_sec_struc_boolean(endolysin_stride_file) # boolean list of secondary structure assignements

In [13]:
ss = endolysin_ss_indexes.count(True)
not_ss = endolysin_ss_indexes.count(False)
print(ss)
print(not_ss)

122
41


#### P06654 (Immunoglobulin G-binding protein G)

In [106]:
protein_G_df = df.loc[df['UNIPROT_ID'] == 'P06654']
print(protein_G_df["PROTEIN"].value_counts())
protein_G_df = protein_G_df.loc[protein_G_df['PROTEIN'].str.contains('Single Domain Antibody') == False]
protein_G_df = protein_G_df[protein_G_df["POSITION"].str.contains("pga_A") == False]

protein_G_df = protein_G_df[protein_G_df["ddG_(kcal/mol)"].notna()]
print(len(protein_G_df))

protein_G_df = protein_G_df.rename(columns={'MUTATION': 'variant', 'ddG_(kcal/mol)': 'score'})
protein_G_df['score'] = protein_G_df['score'].round(6)
# print(protein_G_df['variant'].head(30))

protein_G_df["positions_split"] = ssf.get_positions_split(protein_G_df)

positions_split_subtracted = []
for pos_list in protein_G_df["positions_split"]:
    pos_list = [x - 1 for x in pos_list]
    positions_split_subtracted.append(pos_list)  

protein_G_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in protein_G_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    # print(pos_string)
    new_positions.append(pos_string)
    pos_string = ""
# print(len(new_positions))
# print(len(protein_G_df["POSITION"]))

protein_G_df["POSITION"] = new_positions
# print(protein_G_df["variant"].head(10))
# print(protein_G_df["POSITION"].head(10))

Immunoglobulin G-binding protein G    1995
Protein G                              285
Single Domain Antibody                  17
Name: PROTEIN, dtype: int64
1221


In [107]:
protein_G_df["variant"] = ssf.get_mutations_names_list(protein_G_df)
print(protein_G_df["variant"].head(10))
protein_G_df = add_sec_str_col(protein_G_df, protein_G_ss_indexes, 0)

9330            231ARG, 278GLU, 269ALA
9331            231ARG, 278ALA, 269ARG
9332            231ARG, 278ALA, 269ALA
9333                    278GLU, 269ARG
9334                    278GLU, 269ALA
9335                    278ALA, 269ARG
9336                    278ALA, 269ALA
9483    241ILE, 243ILE, 250GLU, 254PHE
9484    241ILE, 243ILE, 250GLU, 254TYR
9485            241ILE, 243ILE, 254TRP
Name: variant, dtype: object


In [101]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'protein_G_stride.txt'
protein_G_stride_file = open(path, 'r')

In [102]:
protein_G_ss_indexes = ssf.get_sec_struc_boolean(protein_G_stride_file) # boolean list of secondary structure assignements

In [88]:
protein_seq_protein_G = ssf.get_protein_seq('P06654')
print(protein_seq_protein_G)

MET GLU LYS GLU LYS LYS VAL LYS TYR PHE LEU ARG LYS SER ALA PHE GLY LEU ALA SER VAL SER ALA ALA PHE LEU VAL GLY SER THR VAL PHE ALA VAL ASP SER PRO ILE GLU ASP THR PRO ILE ILE ARG ASN GLY GLY GLU LEU THR ASN LEU LEU GLY ASN SER GLU THR THR LEU ALA LEU ARG ASN GLU GLU SER ALA THR ALA ASP LEU THR ALA ALA ALA VAL ALA ASP THR VAL ALA ALA ALA ALA ALA GLU ASN ALA GLY ALA ALA ALA TRP GLU ALA ALA ALA ALA ALA ASP ALA LEU ALA LYS ALA LYS ALA ASP ALA LEU LYS GLU PHE ASN LYS TYR GLY VAL SER ASP TYR TYR LYS ASN LEU ILE ASN ASN ALA LYS THR VAL GLU GLY ILE LYS ASP LEU GLN ALA GLN VAL VAL GLU SER ALA LYS LYS ALA ARG ILE SER GLU ALA THR ASP GLY LEU SER ASP PHE LEU LYS SER GLN THR PRO ALA GLU ASP THR VAL LYS SER ILE GLU LEU ALA GLU ALA LYS VAL LEU ALA ASN ARG GLU LEU ASP LYS TYR GLY VAL SER ASP TYR HIS LYS ASN LEU ILE ASN ASN ALA LYS THR VAL GLU GLY VAL LYS GLU LEU ILE ASP GLU ILE LEU ALA ALA LEU PRO LYS THR ASP THR TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA 

In [99]:
protein_seq_protein_G_split = protein_seq_protein_G.split()
print(protein_seq_protein_G_split[227:234])

['THR', 'TYR', 'LYS', 'LEU', 'ILE', 'LEU', 'ASN']


In [108]:
print(len(protein_G_df))

1221


In [111]:
# protein_G_train_df, protein_G_test_df, protein_G_remaining_df = get_train_and_test_df(protein_G_df, 0.53, 878)
# protein_G_df_format = pd.concat([protein_G_train_df, protein_G_test_df])
# print(len(protein_G_df_format))
# # ssf.write_data_file("protein_G_MLformat_453_train_639_test_t3", protein_seq_protein_G, protein_G_df_format)

# not enough for test and train set

From Gelman et al.

**avGFP**

In [31]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'avgfp_stride.txt'
avgfp_stride_file = open(path, 'r')

In [32]:
avgfp_ss_indexes = ssf.get_all_sec_struc_boolean(avgfp_stride_file) # boolean list of secondary structure assignements

In [33]:
ss = avgfp_ss_indexes.count(True)
not_ss = avgfp_ss_indexes.count(False)
print(ss)
print(not_ss)

209
28


Formatting Data

In [34]:
# importing avGFP data from Gelman et al.
avgfp_df1 = pd.read_csv("../Raw Data/avgfp.tsv.txt", sep="\t")
avgfp_df = avgfp_df1.dropna()
print(len(avgfp_df))
print(avgfp_df.columns)

54024
Index(['variant', 'num_mutations', 'score', 'score_wt_norm'], dtype='object')


In [35]:
# rounding score column to 2 decimal points
avgfp_df["score"] = avgfp_df["score"].round(6)
print(len(avgfp_df))

# remove values with wildcard star thing cause idk what it means
avgfp_df = avgfp_df[avgfp_df["variant"].str.contains("\*") == False]

# pab1_df = pab1_df.head(37600)
# avgfp_df = avgfp_df.sample(n=160)
print(len(avgfp_df))

54024
51714


In [36]:
# getting dataset size to run

string_seq = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(len(string_seq)) 
protein_seq_avgfp = ssf.get_expanded_seq(string_seq)
print(protein_seq_avgfp)

237
SER LYS GLY GLU GLU LEU PHE THR GLY VAL VAL PRO ILE LEU VAL GLU LEU ASP GLY ASP VAL ASN GLY HIS LYS PHE SER VAL SER GLY GLU GLY GLU GLY ASP ALA THR TYR GLY LYS LEU THR LEU LYS PHE ILE CYS THR THR GLY LYS LEU PRO VAL PRO TRP PRO THR LEU VAL THR THR LEU SER TYR GLY VAL GLN CYS PHE SER ARG TYR PRO ASP HIS MET LYS GLN HIS ASP PHE PHE LYS SER ALA MET PRO GLU GLY TYR VAL GLN GLU ARG THR ILE PHE PHE LYS ASP ASP GLY ASN TYR LYS THR ARG ALA GLU VAL LYS PHE GLU GLY ASP THR LEU VAL ASN ARG ILE GLU LEU LYS GLY ILE ASP PHE LYS GLU ASP GLY ASN ILE LEU GLY HIS LYS LEU GLU TYR ASN TYR ASN SER HIS ASN VAL TYR ILE MET ALA ASP LYS GLN LYS ASN GLY ILE LYS VAL ASN PHE LYS ILE ARG HIS ASN ILE GLU ASP GLY SER VAL GLN LEU ALA ASP HIS TYR GLN GLN ASN THR PRO ILE GLY ASP GLY PRO VAL LEU LEU PRO ASP ASN HIS TYR LEU SER THR GLN SER ALA LEU SER LYS ASP PRO ASN GLU LYS ARG ASP HIS MET VAL LEU LEU GLU PHE VAL THR ALA ALA GLY ILE THR HIS GLY MET ASP GLU LEU TYR LYS


In [37]:
protein_seq_avgfp_split = protein_seq_avgfp.split()
print(len(protein_seq_avgfp_split))
print(protein_seq_avgfp_split[60])

# 165VAL -> ILE

237
THR


In [38]:
# splitting variant list if there are multiple mutations
avgfp_mut = avgfp_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
avgfp_df["WILD_TYPE_RES"] = ssf.get_wild_type(avgfp_mut)

# get mutated residue and place in seperate col
avgfp_df["MUTATED_RES"] = ssf.get_mutation_type(avgfp_mut)

# get position and place in seperate col
avgfp_df["POSITION"] = ssf.get_position(avgfp_mut)

# replace variant column with reformatted variant name
avgfp_df["variant"] = ssf.get_mutations_names_list(avgfp_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# avgfp_df = avgfp_df.drop(columns=to_drop)

In [39]:

# need positionssplit
avgfp_df["positions_split"] = ssf.get_positions_split(avgfp_df)

# add in_sec_str_col
avgfp_df = add_sec_str_col(avgfp_df, avgfp_ss_indexes, 0)

In [23]:
avgfp_train_df, avgfp_test_df, avgfp_remaining_df = get_train_and_test_df(avgfp_test_df, 0.88, 465)
avgfp_df_format = pd.concat([avgfp_train_df, avgfp_test_df_2])
print(len(avgfp_df_format))
# ssf.write_data_file("avgfp_MLformat_465_train_19458_test_t3", protein_seq_avgfp, avgfp_df_format)

NameError: name 'avgfp_test_df' is not defined

In [46]:
avgfp_train_df, avgfp_test_df, avgfp_remaining_df = get_train_and_test_df(avgfp_df, 0.88, 465)
avgfp_train_dummy, avgfp_small_test_df, avgfp_remaining_df = get_train_and_test_df(avgfp_test_df, 0.88, 20000)
print("SMALL TEST")
print(len(avgfp_small_test_df))
avgfp_df_format = pd.concat([avgfp_train_df, avgfp_small_test_df])
print(len(avgfp_df_format))
ssf.write_data_file("avgfp_MLformat_465_train_15150_test_t3", protein_seq_avgfp, avgfp_df_format)

in fraction,df len51714
409
56
Train Data Fraction: 0.88
false_df len20361
true_df len31353
in fraction,df len51249
Test Data Fraction: 0.88
false_df len20305
true_df len30944
Size of Test Dataset: 35158
Size of Total Dataset: 35623
in fraction,df len35158
17600
2400
Train Data Fraction: 0.88
false_df len4219
true_df len30939
in fraction,df len15158
Test Data Fraction: 0.88
false_df len1819
true_df len13339
Size of Test Dataset: 15150
Size of Total Dataset: 35150
SMALL TEST
15150
15615
Filename: avgfp_MLformat_465_train_15150_test_t3.txt


In [103]:
avgfp_train_df_2, avgfp_test_df_2, avgfp_remaining_df_2 = get_train_and_test_df(avgfp_df, 0.64, 465)
avgfp_df_format_2 = pd.concat([avgfp_train_df_2, avgfp_test_df_2])
print(len(avgfp_df_format_2))
ssf.write_data_file("avgfp_MLformat_465_train_19458_test_t2", protein_seq_avgfp, avgfp_df_format_2)

in fraction,df len51714
297
168
Train Data Fraction: 0.639
false_df len38961
true_df len12753
in fraction,df len51249
Test Data Fraction: 0.64
false_df len38793
true_df len12456
Size of Test Dataset: 19458
Size of Total Dataset: 19923
19923
Filename: avgfp_MLformat_982_train_19458_test_t2.txt


In [104]:
avgfp_train_df_3, avgfp_test_df_3, avgfp_remaining_df_3 = get_train_and_test_df(avgfp_df, 0.64, 465)
avgfp_df_format_3 = pd.concat([avgfp_train_df_3, avgfp_test_df_3])
print(len(avgfp_df_format_3))
ssf.write_data_file("avgfp_MLformat_465_train_19458_test_t3", protein_seq_avgfp, avgfp_df_format_3)

in fraction,df len51714
297
168
Train Data Fraction: 0.639
false_df len38961
true_df len12753
in fraction,df len51249
Test Data Fraction: 0.64
false_df len38793
true_df len12456
Size of Test Dataset: 19458
Size of Total Dataset: 19923
19923
Filename: avgfp_MLformat_982_train_19458_test_t3.txt


In [105]:
print(len(avgfp_train_df))
print(len(avgfp_test_df))
print(len(avgfp_train_df_2))
print(len(avgfp_test_df_2))
print(len(avgfp_train_df_3))
print(len(avgfp_test_df_3))

465
19458
465
19458
465
19458


In [100]:
# avgfp_train_df, avgfp_new_test_df, avgfp_new_remaining_df = get_train_and_test_df(avgfp_test_df, 0.64, 465)

in fraction,df len51714
297
168
Train Data Fraction: 0.639
false_df len38961
true_df len12753
in fraction,df len51249
Test Data Fraction: 0.64
false_df len38793
true_df len12456
Size of Test Dataset: 19458
Size of Total Dataset: 19923


**GB1**

In [87]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gb1_stride.txt'
gb1_stride_file = open(path, 'r')

In [88]:
gb1_ss_indexes = ssf.get_all_sec_struc_boolean(gb1_stride_file) # boolean list of secondary structure assignements

In [89]:
ss = gb1_ss_indexes.count(True)
not_ss = gb1_ss_indexes.count(False)
print(ss)
print(not_ss)

47
9


In [90]:
# importing pab1 data from Gelman et al.
gb1_df1 = pd.read_csv("../Raw Data/gb1.tsv.txt", sep="\t")
gb1_df = gb1_df1.dropna()
print(len(gb1_df))
# gb1_df = gb1_df.sample(n=480)
print(gb1_df.columns)
gb1_df = gb1_df.sample(frac=1)

536084
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [91]:
# rounding score column to 2 decimal points
gb1_df["score"] = gb1_df["score"].round(6)
print(len(gb1_df))

# remove values with wildcard star thing cause idk what it means
gb1_df = gb1_df[gb1_df["variant"].str.contains("\*") == False]

# gb1_df = gb1_df.sample(n=40)
# pab1_df = pab1_df.head(37600)
print(len(gb1_df))

536084
536084


In [183]:
# getting protein sequence
# protein_seq_gb1 = ssf.get_protein_seq("P04386")

In [92]:
# getting dataset size to run

string_seq = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
print(len(string_seq)) # <- domain length of 75
protein_seq_gb1 = ssf.get_expanded_seq(string_seq)
print(protein_seq_gb1)

56
MET GLN TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA THR ALA GLU LYS VAL PHE LYS GLN TYR ALA ASN ASP ASN GLY VAL ASP GLY GLU TRP THR TYR ASP ASP ALA THR LYS THR PHE THR VAL THR GLU


In [93]:
# splitting variant list if there are multiple mutations
gb1_mut = gb1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
gb1_df["WILD_TYPE_RES"] = ssf.get_wild_type(gb1_mut)

# get mutated residue and place in seperate col
gb1_df["MUTATED_RES"] = ssf.get_mutation_type(gb1_mut)

# get position and place in seperate col
gb1_df["POSITION"] = ssf.get_position(gb1_mut)

# replace variant column with reformatted variant name
gb1_df["variant"] = ssf.get_mutations_names_list(gb1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# gb1_df = gb1_df.drop(columns=to_drop)

In [94]:
# need positionssplit
gb1_df["positions_split"] = ssf.get_positions_split(gb1_df)

# add in_sec_str_col
gb1_df = add_sec_str_col(gb1_df, gb1_ss_indexes, 0)

In [15]:
# ssf.write_data_file("gb1_MLformat_full_dataset", protein_seq_gb1, gb1_df)

Filename: gb1_MLformat_full_dataset.txt


In [99]:
gb1_train_df, gb1_test_df, gb1_remaining_df = get_train_and_test_df(gb1_df, 0.70, 110)
# gb1_test_10000 = gb1_test_df.head(10000)
gb1_df_format = pd.concat([gb1_train_df, gb1_test_10000])
print(len(gb1_df_format))
# writing data to txt file
# ssf.write_data_file("gb1_MLformat_110_train_382146_test_t1", protein_seq_gb1, gb1_df_format)

in fraction,df len536084
77
33
Train Data Fraction: 0.7
false_df len145781
true_df len390303
in fraction,df len535974
Test Data Fraction: 0.7
false_df len145748
true_df len390226
Size of Test Dataset: 485823
Size of Total Dataset: 485933


NameError: name 'gb1_test_10000' is not defined

In [117]:
# make a smaller test size so program doesn't crash
gb1_train_dummy, gb1_test_df, gb1_remaining_df = get_train_and_test_df(gb1_df, 0.70, 30000)

in fraction,df len536084
244999
105001
Train Data Fraction: 0.7
false_df len268502
true_df len267582
in fraction,df len186084
Test Data Fraction: 0.7
false_df len163501
true_df len22583
Size of Test Dataset: 32256
Size of Total Dataset: 382256


In [116]:
print(len(gb1_test_df))

32256


In [114]:
gb1_train_df, gb1_test_df, gb1_remaining_df = get_train_and_test_df(gb1_df, 0.84, 110)
gb1_train_dummy, gb1_small_test_df, gb1_remaining_df = get_train_and_test_df(gb1_test_df, 0.84, 450000)
print("SMALL TEST")
print(len(gb1_small_test_df))
gb1_df_format = pd.concat([gb1_train_df, gb1_small_test_df])
print(len(gb1_df_format))
ssf.write_data_file("gb1_MLformat_110_train_14524_test_t3", protein_seq_gb1, gb1_df_format)

in fraction,df len536084
92
18
Train Data Fraction: 0.836
false_df len145781
true_df len390303
in fraction,df len535974
Test Data Fraction: 0.84
false_df len145763
true_df len390211
Size of Test Dataset: 464531
Size of Total Dataset: 464641
in fraction,df len464531
378000
72000
Train Data Fraction: 0.84
false_df len74325
true_df len390206
in fraction,df len14531
Test Data Fraction: 0.84
false_df len2325
true_df len12206
Size of Test Dataset: 14524
Size of Total Dataset: 464524
SMALL TEST
14524
14634
Filename: gb1_MLformat_110_train_14524_test_t3.txt


In [37]:
gb1_train_df_2, gb1_test_df_2, gb1_remaining_df_2 = get_train_and_test_df(gb1_df, 0.70, 110)
gb1_df_format_2 = pd.concat([gb1_train_df_2, gb1_test_df_2])
print(len(gb1_df_format_2))
ssf.write_data_file("gb1_MLformat_982_train_13945_test_t2", protein_seq_gb1, gb1_df_format_2)

in fraction,df len25737
530
452
Train Data Fraction: 0.54
false_df len17674
true_df len8063
in fraction,df len24755
Test Data Fraction: 0.54
false_df len17222
true_df len7533
Size of Test Dataset: 13945
Size of Total Dataset: 14927
14927
Filename: bgl3_MLformat_982_train_13945_test_t2.txt


In [41]:
gb1_train_df_3, gb1_test_df_3, gb1_remaining_df_3 = get_train_and_test_df(gb1_df, 0.70, 110)
gb1_df_format_3 = pd.concat([gb1_train_df_3, gb1_test_df_3])
print(len(gb1_df_format_3))
ssf.write_data_file("gb1_MLformat_982_train_13945_test_t3", protein_seq_gb1, gb1_df_format_3)

in fraction,df len25737
530
452
Train Data Fraction: 0.54
false_df len17674
true_df len8063
in fraction,df len24755
Test Data Fraction: 0.54
false_df len17222
true_df len7533
Size of Test Dataset: 13945
Size of Total Dataset: 14927
14927
Filename: bgl3_MLformat_982_train_13945_test_t3.txt


In [42]:
print(len(gb1_train_df))
print(len(gb1_test_df))
print(len(gb1_train_df_2))
print(len(gb1_test_df_2))
print(len(gb1_train_df_3))
print(len(gb1_test_df_3))

982
13945
982
13945
982
13945


**GAL4**

In [77]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gal4_stride.txt'
gal4_stride_file = open(path, 'r')

In [78]:
gal4_ss_indexes = ssf.get_sec_struc_boolean(gal4_stride_file) # boolean list of secondary structure assignements

In [79]:
ss = gal4_ss_indexes.count(True)
not_ss = gal4_ss_indexes.count(False)
print(ss)
print(not_ss)

415
466


**Alpha-synuclein**

In [80]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'alpha-synuclein_stride.txt'
alpha_synuclein_stride_file = open(path, 'r')

In [81]:
alpha_synuclein_ss_indexes = ssf.get_sec_struc_boolean(alpha_synuclein_stride_file) # boolean list of secondary structure assignements

In [82]:
ss = alpha_synuclein_ss_indexes.count(True)
not_ss = alpha_synuclein_ss_indexes.count(False)
print(ss)
print(not_ss)

92
48


**Small ubiquitin-related modifier 1**

In [83]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path, 'r')

In [84]:
modifier_1_ss_indexes = ssf.get_sec_struc_boolean(modifier_1_stride_file) # boolean list of secondary structure assignements

In [85]:
ss = modifier_1_ss_indexes.count(True)
not_ss = modifier_1_ss_indexes.count(False)
print(ss)
print(not_ss)

47
54


**TAR DNA-binding protein 43**

In [86]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'tar_stride.txt'
tar_stride_file = open(path, 'r')

In [87]:
tar_ss_indexes = ssf.get_sec_struc_boolean(tar_stride_file) # boolean list of secondary structure assignements

In [88]:
ss = tar_ss_indexes.count(True)
not_ss = tar_ss_indexes.count(False)
print(ss)
print(not_ss)

149
265


**Human Glucokinase**

## Cleaning MaveDB Data

In [130]:
import re
# delete rows with "del" in it or *
# normalize scores? (already done in script)
def format_mavedb_variant(df, variant_col_name, offset):
    new_var_col = []
    for variant in df[variant_col_name]:
        wild_type = Bio.PDB.Polypeptide.three_to_one(variant[2:5].upper())
        position = int(re.findall("[0-9]+", variant)[0]) + offset
        mut_type = Bio.PDB.Polypeptide.three_to_one(variant[-3:].upper())
        new_var_col.append(wild_type + str(position) + mut_type)
    return new_var_col

**GAL4**

In [235]:
gal4_df1 = pd.read_csv("../Raw Data/gal4.csv.csv")

In [236]:
# take note of offset

# find columns and rename title column
gal4_df1 = pd.read_csv("../Raw Data/gal4.csv.csv")
gal4_df1.columns = gal4_df1.iloc[3]
print(gal4_df1.columns)
print(len(gal4_df1))


gal4_df = gal4_df1[(gal4_df1["hgvs_pro"].str.contains("del") == False) & (gal4_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (gal4_df1["hgvs_pro"].str.contains("Ter") == False)]
gal4_df = gal4_df.sample(frac=1)

print(len(gal4_df))

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score'], dtype='object', name=3)
1323
1196


In [244]:
# getting uniprot to compare offset
protein_seq_gal4 = ssf.get_protein_seq("P04386")

In [245]:
# comparing offset
protein_seq_gal4_list = protein_seq_gal4.split(" ")
# print(protein_seq_gal4_list)
print(protein_seq_gal4_list[41]) 

PRO


In [257]:
gal4_df["variant"] = format_mavedb_variant(gal4_df, "hgvs_pro", 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["variant"] = format_mavedb_variant(gal4_df, "hgvs_pro", 0)


In [258]:
# splitting variant list if there are multiple mutations
gal4_mut = gal4_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
gal4_df["WILD_TYPE_RES"] = ssf.get_wild_type(gal4_mut)

# get mutated residue and place in seperate col
gal4_df["MUTATED_RES"] = ssf.get_mutation_type(gal4_mut)

# get position and place in seperate col
gal4_df["POSITION"] = ssf.get_position(gal4_mut)

# replace variant column with reformatted variant name
gal4_df["variant"] = ssf.get_mutations_names_list(gal4_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["WILD_TYPE_RES"] = ssf.get_wild_type(gal4_mut)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["MUTATED_RES"] = ssf.get_mutation_type(gal4_mut)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gal4_df["POSITION"] = ssf.get_position(gal4_mut)
A value is trying to be set on a copy of a s

In [None]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'gal4_stride.txt'
gal4_stride_file = open(path, 'r')

gal4_ss_indexes = ssf.get_sec_struc_boolean(gal4_stride_file) # boolean list of secondary structure assignements

# need positionssplit
gal4_df["positions_split"] = ssf.get_positions_split(gal4_df)

# add in_sec_str_col
gal4_df = add_sec_str_col(gal4_df, gal4_ss_indexes, 0)

In [None]:
gal4_train_df, gal4_test_df = get_train_and_test_df(gal4_df, 0.47, 40)
gal4_df_format = pd.concat([gal4_train_df, gal4_test_df])

# writing data to txt file
ssf.write_data_file("gal4_MLformat_40_train", protein_seq_gal4, gal4_df_format)

**Small ubiquitin-related modifier 1**

In [87]:
modifier_1_df1 = pd.read_csv("../Raw Data/modifier_1_mod.csv")

In [88]:
# modifier_1_df1.columns = modifier_1_df1.iloc[3]
print(modifier_1_df1.columns)
print(len(modifier_1_df1))

modifier_1_df = modifier_1_df1[(modifier_1_df1["hgvs_pro"].str.contains("=") == False) & (modifier_1_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (modifier_1_df1["hgvs_pro"].str.contains("Ter") == False)]
print(len(modifier_1_df))
print(modifier_1_df.head(1))

# shuffle values
modifier_1_df = modifier_1_df.sample(frac=1)

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'sd', 'se',
       'exp.score', 'exp.sd', 'df', 'pred.score'],
      dtype='object')
2020
1919
                   accession  hgvs_nt  hgvs_splice   hgvs_pro     score  \
0  urn:mavedb:00000001-b-1#1      NaN          NaN  p.Glu5Lys  1.311357   

         sd        se  exp.score    exp.sd   df  pred.score  
0  0.085569  0.042785    1.31651  0.024947  4.0    1.117086  


In [89]:
# getting uniprot to compare offset
protein_seq_modifier_1 = ssf.get_protein_seq("P63165")
# offset of 1
print(protein_seq_modifier_1)
print(len(protein_seq_modifier_1))

MET SER ASP GLN GLU ALA LYS PRO SER THR GLU ASP LEU GLY ASP LYS LYS GLU GLY GLU TYR ILE LYS LEU LYS VAL ILE GLY GLN ASP SER SER GLU ILE HIS PHE LYS VAL LYS MET THR THR HIS LEU LYS LYS LEU LYS GLU SER TYR CYS GLN ARG GLN GLY VAL PRO MET ASN SER LEU ARG PHE LEU PHE GLU GLY GLN ARG ILE ALA ASP ASN HIS THR PRO LYS GLU LEU GLY MET GLU GLU GLU ASP VAL ILE GLU VAL TYR GLN GLU GLN THR GLY GLY HIS SER THR VAL
403


In [82]:
protein_seq_modifier_1_split = protein_seq_modifier_1.split()
print(protein_seq_modifier_1_split[55:57])

['GLY', 'VAL']


In [90]:
modifier_1_df["variant"] = format_mavedb_variant(modifier_1_df, "hgvs_pro", -1)

In [86]:
print(modifier_1_df.head(10))

                         accession  hgvs_nt  hgvs_splice     hgvs_pro  \
953    urn:mavedb:00000001-b-1#954      NaN          NaN   p.Ser99Asp   
1469  urn:mavedb:00000001-b-1#1470      NaN          NaN   p.Lys48Ser   
1773  urn:mavedb:00000001-b-1#1774      NaN          NaN   p.Asn60Gly   
404    urn:mavedb:00000001-b-1#405      NaN          NaN   p.Tyr91Phe   
1069  urn:mavedb:00000001-b-1#1070      NaN          NaN  p.Thr100Glu   
88      urn:mavedb:00000001-b-1#89      NaN          NaN   p.Val26Gln   
1177  urn:mavedb:00000001-b-1#1178      NaN          NaN   p.Asn60Lys   
652    urn:mavedb:00000001-b-1#653      NaN          NaN   p.Gly56Asn   
1152  urn:mavedb:00000001-b-1#1153      NaN          NaN   p.Glu20Arg   
1692  urn:mavedb:00000001-b-1#1693      NaN          NaN   p.Lys25Gln   

         score        sd        se  exp.score    exp.sd   df  pred.score  \
953   0.508321  0.470306  0.470306        NaN       NaN  NaN    0.508321   
1469 -0.506137  0.204993  0.102496  -0.51477

In [91]:
# splitting variant list if there are multiple mutations
modifier_1_mut = modifier_1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
modifier_1_df["WILD_TYPE_RES"] = ssf.get_wild_type(modifier_1_mut)

# get mutated residue and place in seperate col
modifier_1_df["MUTATED_RES"] = ssf.get_mutation_type(modifier_1_mut)

# get position and place in seperate col
modifier_1_df["POSITION"] = ssf.get_position(modifier_1_mut)

# replace variant column with reformatted variant name
modifier_1_df["variant"] = ssf.get_mutations_names_list(modifier_1_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

In [92]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path, 'r')

modifier_1_ss_indexes = ssf.get_sec_struc_boolean(modifier_1_stride_file) # boolean list of secondary structure assignements

# need positionssplit
modifier_1_df["positions_split"] = ssf.get_positions_split(modifier_1_df)

# add in_sec_str_col
modifier_1_df = add_sec_str_col(modifier_1_df, modifier_1_ss_indexes, 0)

In [94]:
modifier_1_train_df, modifier_1_test_df, modifier_1_remaining_df = get_train_and_test_df(modifier_1_df, 0.46, 198)
# print("TEST LEN")
# print(len(modifier_1_test_df))
modifier_1_df_format = pd.concat([modifier_1_train_df, modifier_1_test_df])
print(len(modifier_1_df_format))
# writing data to txt file
ssf.write_data_file("modifier_1_MLformat_198_train_1700_test_3", protein_seq_modifier_1, modifier_1_df_format)

in fraction,df len1919
91
107
Train Data Fraction: 0.46
false_df len1026
true_df len893
in fraction,df len1721
Test Data Fraction: 0.46
false_df len919
true_df len802
Size of Test Dataset: 1700
Size of Total Dataset: 1898
1898
Filename: modifier_1_MLformat_198_train_1700_test_3.txt


**TAR DNA-binding protein 43**

In [153]:
tar_df_pt1 = pd.read_csv("../Raw Data/tar1_mod.csv")
tar_df_pt2 = pd.read_csv("../Raw Data/tar2_mod.csv")

In [154]:
print(len(tar_df_pt1))
print(len(tar_df_pt2))
# tar_df_pt1 = tar_df_pt1.sample(frac=1)
# tar_df_pt2 = tar_df_pt2.sample(frac=1)
# print(tar_df_pt1.head(30))
# print(tar_df_pt2.head(30))
# tar_df1 = pd.concat([tar_df_pt1, tar_df_pt2])
# print(len(tar_df1))

# print(tar_df1.columns)

tar_df_pt1 = tar_df_pt1[(tar_df_pt1["hgvs_pro"].str.contains("\*") == False) & (tar_df_pt1["hgvs_pro"].str.contains("hgvs") == False)
                    & (tar_df_pt1["hgvs_pro"].str.contains("Ter") == False)]

tar_df_pt2 = tar_df_pt2[(tar_df_pt2["hgvs_pro"].str.contains("\*") == False) & (tar_df_pt2["hgvs_pro"].str.contains("hgvs") == False)
                    & (tar_df_pt2["hgvs_pro"].str.contains("Ter") == False)]

# print(len(tar_df))
# # shuffle values
# tar_df = tar_df.sample(frac=1)
# print(tar_df.head(3))

704
714


In [155]:
# getting uniprot to compare offset
protein_seq_tar = ssf.get_protein_seq("Q13148")
print(protein_seq_tar)

MET SER GLU TYR ILE ARG VAL THR GLU ASP GLU ASN ASP GLU PRO ILE GLU ILE PRO SER GLU ASP ASP GLY THR VAL LEU LEU SER THR VAL THR ALA GLN PHE PRO GLY ALA CYS GLY LEU ARG TYR ARG ASN PRO VAL SER GLN CYS MET ARG GLY VAL ARG LEU VAL GLU GLY ILE LEU HIS ALA PRO ASP ALA GLY TRP GLY ASN LEU VAL TYR VAL VAL ASN TYR PRO LYS ASP ASN LYS ARG LYS MET ASP GLU THR ASP ALA SER SER ALA VAL LYS VAL LYS ARG ALA VAL GLN LYS THR SER ASP LEU ILE VAL LEU GLY LEU PRO TRP LYS THR THR GLU GLN ASP LEU LYS GLU TYR PHE SER THR PHE GLY GLU VAL LEU MET VAL GLN VAL LYS LYS ASP LEU LYS THR GLY HIS SER LYS GLY PHE GLY PHE VAL ARG PHE THR GLU TYR GLU THR GLN VAL LYS VAL MET SER GLN ARG HIS MET ILE ASP GLY ARG TRP CYS ASP CYS LYS LEU PRO ASN SER LYS GLN SER GLN ASP GLU PRO LEU ARG SER ARG LYS VAL PHE VAL GLY ARG CYS THR GLU ASP MET THR GLU ASP GLU LEU ARG GLU PHE PHE SER GLN TYR GLY ASP VAL MET ASP VAL PHE ILE PRO LYS PRO PHE ARG ALA PHE ALA PHE VAL THR PHE ALA ASP ASP GLN ILE ALA GLN SER LEU CYS GLY GLU ASP LEU ILE ILE 

In [156]:
protein_seq_tar_split = protein_seq_tar.split()
print(protein_seq_tar_split[291]) # for the first one starts at 289 (add offset of )
# row 10, col 11, 31 

SER


In [157]:
print(protein_seq_tar_split[346]) 
# row 12, 6, 31 <- 16 starts at 346: 1 starts at 331?
# have to offset seperately and then add

SER


In [158]:
tar_df_pt1["variant"] = format_mavedb_variant(tar_df_pt1, "hgvs_pro", 288)
tar_df_pt2["variant"] = format_mavedb_variant(tar_df_pt2, "hgvs_pro", 330)

In [159]:
tar_df = pd.concat([tar_df_pt1, tar_df_pt2])
tar_df = tar_df.sample(frac=1)
print(len(tar_df))

1342


In [160]:
# splitting variant list if there are multiple mutations
tar_mut = tar_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
tar_df["WILD_TYPE_RES"] = ssf.get_wild_type(tar_mut)

# get mutated residue and place in seperate col
tar_df["MUTATED_RES"] = ssf.get_mutation_type(tar_mut)

# get position and place in seperate col
tar_df["POSITION"] = ssf.get_position(tar_mut)

# replace variant column with reformatted variant name
tar_df["variant"] = ssf.get_mutations_names_list(tar_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

In [161]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'tar_stride.txt'
tar_stride_file = open(path, 'r')

tar_ss_indexes = ssf.get_sec_struc_boolean(tar_stride_file) # boolean list of secondary structure assignements

# need positionssplit
tar_df["positions_split"] = ssf.get_positions_split(tar_df)

# add in_sec_str_col
tar_df = add_sec_str_col(tar_df, tar_ss_indexes, 0)

In [113]:
# tar_train_df, tar_test_df, tar_remaining_df = get_train_and_test_df(tar_df, 0.36, 812)
# print(len(tar_test_df))
# tar_df_format = pd.concat([tar_train_df, tar_test_df])

# # writing data to txt file
# # ssf.write_data_file("tar_MLformat_812_train_10k_test_1", protein_seq_tar, tar_df_format)

**Human Glucokinase? Maybe** -- yup

In [114]:
glucokinase_df1 = pd.read_csv("../Raw Data/glucokinase_scores.csv")

In [116]:
print(glucokinase_df1.columns)

Index(['# Accession: urn:mavedb:00000096-a-1', 'Unnamed: 1', 'Unnamed: 2',
       'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'],
      dtype='object')


In [123]:
# take note of offset

glucokinase_df1.columns = glucokinase_df1.iloc[3]
print(glucokinase_df1.columns)
print(len(glucokinase_df1))


glucokinase_df = glucokinase_df1[(glucokinase_df1["hgvs_pro"].str.contains("del") == False) & (glucokinase_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (glucokinase_df1["hgvs_pro"].str.contains("Ter") == False)]

glucokinase_df = glucokinase_df1[(glucokinase_df1["hgvs_pro"].str.contains("=") == False) & (glucokinase_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (glucokinase_df1["hgvs_pro"].str.contains("Ter") == False)]

glucokinase_df = glucokinase_df.sample(frac=1)

print(len(glucokinase_df))

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'sd', 'df',
       'se'],
      dtype='object', name=3)
9366
8570


In [124]:
print(glucokinase_df.head(10))

3                        accession hgvs_nt hgvs_splice     hgvs_pro  \
5058  urn:mavedb:00000096-a-1#5055     NaN         NaN  p.Gly318Ile   
1720  urn:mavedb:00000096-a-1#1717     NaN         NaN   p.Pro66Gln   
880    urn:mavedb:00000096-a-1#877     NaN         NaN  p.Val222Trp   
6348  urn:mavedb:00000096-a-1#6345     NaN         NaN  p.Glu268Tyr   
9172  urn:mavedb:00000096-a-1#9169     NaN         NaN  p.Ala188Trp   
5710  urn:mavedb:00000096-a-1#5707     NaN         NaN  p.His105Met   
2749  urn:mavedb:00000096-a-1#2746     NaN         NaN  p.Met462Val   
6772  urn:mavedb:00000096-a-1#6769     NaN         NaN  p.Glu256Ser   
768    urn:mavedb:00000096-a-1#765     NaN         NaN   p.Trp99Ser   
6407  urn:mavedb:00000096-a-1#6404     NaN         NaN  p.Glu236Ile   

3            score           sd df           se  
5058   0.597180548  0.345674096  2  0.244428497  
1720   1.125473589  0.192021693  2  0.135779841  
880    0.417811603  0.394878786  2  0.279221468  
6348   0.431714124

In [120]:
# getting uniprot to compare offset
protein_seq_glucokinase = ssf.get_protein_seq("P35557")
# offset of 1
print(protein_seq_glucokinase)
print(len(protein_seq_glucokinase))

MET LEU ASP ASP ARG ALA ARG MET GLU ALA ALA LYS LYS GLU LYS VAL GLU GLN ILE LEU ALA GLU PHE GLN LEU GLN GLU GLU ASP LEU LYS LYS VAL MET ARG ARG MET GLN LYS GLU MET ASP ARG GLY LEU ARG LEU GLU THR HIS GLU GLU ALA SER VAL LYS MET LEU PRO THR TYR VAL ARG SER THR PRO GLU GLY SER GLU VAL GLY ASP PHE LEU SER LEU ASP LEU GLY GLY THR ASN PHE ARG VAL MET LEU VAL LYS VAL GLY GLU GLY GLU GLU GLY GLN TRP SER VAL LYS THR LYS HIS GLN MET TYR SER ILE PRO GLU ASP ALA MET THR GLY THR ALA GLU MET LEU PHE ASP TYR ILE SER GLU CYS ILE SER ASP PHE LEU ASP LYS HIS GLN MET LYS HIS LYS LYS LEU PRO LEU GLY PHE THR PHE SER PHE PRO VAL ARG HIS GLU ASP ILE ASP LYS GLY ILE LEU LEU ASN TRP THR LYS GLY PHE LYS ALA SER GLY ALA GLU GLY ASN ASN VAL VAL GLY LEU LEU ARG ASP ALA ILE LYS ARG ARG GLY ASP PHE GLU MET ASP VAL VAL ALA MET VAL ASN ASP THR VAL ALA THR MET ILE SER CYS TYR TYR GLU ASP HIS GLN CYS GLU VAL GLY MET ILE VAL GLY THR GLY CYS ASN ALA CYS TYR MET GLU GLU MET GLN ASN VAL GLU LEU VAL GLU GLY ASP GLU GLY ARG 

In [128]:
protein_seq_glucokinase_split = protein_seq_glucokinase.split()
print(len(protein_seq_glucokinase_split))
print(protein_seq_glucokinase_split[186:189])

465
['ASP', 'ALA', 'ILE']


In [131]:
glucokinase_df["variant"] = format_mavedb_variant(glucokinase_df, "hgvs_pro", -1)

In [133]:
# splitting variant list if there are multiple mutations
glucokinase_mut = glucokinase_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
glucokinase_df["WILD_TYPE_RES"] = ssf.get_wild_type(glucokinase_mut)

# get mutated residue and place in seperate col
glucokinase_df["MUTATED_RES"] = ssf.get_mutation_type(glucokinase_mut)

# get position and place in seperate col
glucokinase_df["POSITION"] = ssf.get_position(glucokinase_mut)

# replace variant column with reformatted variant name
glucokinase_df["variant"] = ssf.get_mutations_names_list(glucokinase_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)

In [134]:
# getting training and test datasets

# get ss position indexes
path = "../PDB and STRIDE Files/" + 'glucokinase_stride.txt'
glucokinase_stride_file = open(path, 'r')

glucokinase_ss_indexes = ssf.get_sec_struc_boolean(glucokinase_stride_file) # boolean list of secondary structure assignements

# need positionssplit
glucokinase_df["positions_split"] = ssf.get_positions_split(glucokinase_df)

# add in_sec_str_col
glucokinase_df = add_sec_str_col(glucokinase_df, glucokinase_ss_indexes, 0)

In [135]:
print(glucokinase_ss_indexes.count(False))
print(glucokinase_ss_indexes.count(True))

145
320


In [139]:
glucokinase_train_df, glucokinase_test_df, glucokinase_remaining_df = get_train_and_test_df(glucokinase_df, 0.69, 912)
glucokinase_df_format = pd.concat([glucokinase_train_df, glucokinase_test_df])
print(len(glucokinase_df_format))
# writing data to txt file
ssf.write_data_file("glucokinase_MLformat_912_train_7612_test_3", protein_seq_glucokinase, glucokinase_df_format)

in fraction,df len8570
629
283
Train Data Fraction: 0.69
false_df len2686
true_df len5884
in fraction,df len7658
Test Data Fraction: 0.69
false_df len2403
true_df len5255
Size of Test Dataset: 7612
Size of Total Dataset: 8524
8524
Filename: glucokinase_MLformat_912_train_7612_test_3.txt


## Getting Dataset Fraction

In [17]:
# adds boolean column to dataframe to indicate whether value is in secondary structure
# needs positions split column
def add_sec_str_col(df, bool_ss_list, domain_start_index):
    has_sec_str = []
    for val in df["positions_split"]:
        # list of boolean values that are true if all mutation positions in line are sec. strc.
        all_pos_sec_struc = []

        for position in val:
            if (bool_ss_list[position - domain_start_index] == False):  # line up ss_indexes w/ position
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)

        # all pos sec struc should match val list
        # if there's a value in all_pos_sec_struc that's false, append false
        # otherwise, append true
        if (all_pos_sec_struc.count(False) == 0):
            has_only_sec_str = True
        else:
            has_only_sec_str = False

        has_sec_str.append(has_only_sec_str)
        all_pos_sec_struc.clear()

    # print(len(has_sec_str)) # should match dataframe length
    df['in_sec_str'] = has_sec_str
    return df

In [18]:
def get_fractioned_dataset(df, fraction_ss, size):
    
    print("in fraction,df len" + str(len(df)))
    if (size is not None): # need training dataset size
        num_in_ss = int(size*fraction_ss) 
        print(num_in_ss)
        num_not_ss = size - num_in_ss
        print(num_not_ss)
        real_fraction = round(float(num_in_ss)/(num_in_ss+num_not_ss), 3)
        print("Train Data Fraction: " + str(real_fraction))
    else: # need test dataset and size of dataset doesn't matter
        num_in_ss, num_not_ss = get_num_ss(df, fraction_ss)
        real_fraction = round(float(num_in_ss)/(num_in_ss+num_not_ss), 3)
        print("Test Data Fraction: " + str(real_fraction))
        
    true_df = df[df["in_sec_str"] == True]
    false_df = df[df["in_sec_str"] == False]
    print("false_df len" + str(len(false_df)))
    print("true_df len" + str(len(true_df)))
    ss_df = true_df.sample(num_in_ss)

    false_df = df[df["in_sec_str"] == False]
    not_ss_df = false_df.sample(num_not_ss)
    # print("Testing Test Data Fraction:" + str(len(ss_df)/(len(not_ss_df)+len(ss_df))))
    
    fractioned_df = pd.concat([ss_df, not_ss_df]).sample(frac=1)
    
    remaining_df = pd.concat([fractioned_df, df])
    remaining_df = remaining_df[~remaining_df.index.duplicated(keep=False)]
    
    return fractioned_df, remaining_df

In [19]:
def get_num_ss(df, fraction_ss): # use when size is dependent on remaining values
    # print("enters this method")
    num_ss_vals = (df["in_sec_str"] == True).sum() # number of trues to work from 
    
    num_not_ss_vals = (df["in_sec_str"] == False).sum() # number of falses to work from 
    
    if (num_not_ss_vals < num_ss_vals): # if not in ss is limiting factor
        ideal_split_ss = int((num_not_ss_vals/(1-fraction_ss))*fraction_ss)
        ideal_split_not_ss = int(num_not_ss_vals)
        # instead do min value, max value and fraction it is
    else:
        ideal_split_ss = int(num_ss_vals)
        ideal_split_not_ss = int((num_ss_vals/fraction_ss)*(1-fraction_ss))
    total = ideal_split_ss + ideal_split_not_ss
    # print("ideal_ss: " + str(ideal_split_ss))
    # print("ideal_not_ss: " + str(ideal_split_not_ss))
    while (ideal_split_ss >= num_ss_vals or ideal_split_not_ss >= num_not_ss_vals):
        # until thei)y both fit the total number of values
        if (ideal_split_ss < ideal_split_not_ss):
            ideal_split_ss = ideal_split_ss - 1
            ideal_split_not_ss = (ideal_split_ss/fraction_ss)*(1-fraction_ss)
        else:
            ideal_split_not_ss = ideal_split_not_ss - 1
            ideal_split_ss = (ideal_split_not_ss/(1-fraction_ss))*(fraction_ss)
        ideal_split_ss = int(ideal_split_ss)
        ideal_split_not_ss = int(ideal_split_not_ss)
        # total = ideal_split_ss + ideal_split_not_ss
        # print("total: " + str(total))
        # print(total % batch_size != 0)
        # print(ideal_split_ss >= num_ss_vals)
        # print(ideal_split_not_ss >= num_not_ss_vals)
        #print((ideal_split_ss >= num_ss_vals or ideal_split_not_ss >= num_not_ss_vals))
    return int(ideal_split_ss), int(ideal_split_not_ss)

In [404]:
(8+5)%32 != 0

True

In [20]:
# gets both train and test datasets based on size
# fraction_ss is a percent (e.g. 0.56)

# need "is_sec_struc_col"
# reutrns remaining df
def get_train_and_test_df(df, fraction_ss, train_size):
    train_df, remaining_df = get_fractioned_dataset(df, fraction_ss, train_size)
    test_df, remaining_df = get_fractioned_dataset(remaining_df, fraction_ss, None)
    print("Size of Test Dataset: " + str(len(test_df)))
    print("Size of Total Dataset: " + str(len(test_df)+ len(train_df)))
    return train_df, test_df, remaining_df


In [21]:
def get_new_trainset(train_df, remaining_df, fraction_ss, size):
    if (size > len(remaining_df)):
        print("Not enough training values, combining old values")
        df = pd.concat([train_df, remaining_df])
        # print("enters wrong block")     
    else:
        # print("enters right block")
        df = remaining_df
        print("remaing df len" + str(len(df)))
    new_train_df, new_remaining_df = get_fractioned_dataset(df, fraction_ss, size)
    return new_train_df, new_remaining_df

## Cleaning Gelman et al. Data

## Ube4b

In [144]:
# importing Ube4b data from Gelman et al.
ube4b_df1 = pd.read_csv("../Raw Data/ube4b.tsv.txt", sep="\t")
ube4b_df = ube4b_df1.dropna()
print(len(ube4b_df))
print(ube4b_df.columns)

98297
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [145]:
# rounding score column to 6 decimal points
ube4b_df["score"] = ube4b_df["score"].round(6)

# remove values with wildcard star next to them
ube4b_df = ube4b_df[ube4b_df["variant"].str.contains("\*") == False]
print(len(ube4b_df))
ube4b_df = ube4b_df.sample(frac=1)
# change this value depending on amount of data needed for dataset
# ube4b_df = ube4b_df.sample(n=80)
print(len(ube4b_df))

91031
91031


In [164]:
# # get protein sequence from Uniprot and split
# # protein_seq_ube4b = ssf.get_protein_seq("Q9ES00")
# protein_seq_ube4b_split = protein_seq_ube4b.split()
# print(len(protein_seq_ube4b_split)) # protein length of 1173

1173


In [146]:
# ube4b protein domain sequence from Gelman et. al
string_seq = "IEKFKLLAEKVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDPFNRQMLTESMLEPVPELKEQIQAWMREKQSSDH"
print(len(string_seq)) # <- domain length of 102
protein_seq_ube4b = ssf.get_expanded_seq(string_seq)
print(protein_seq_ube4b)
# protein_seq_ube4b = protein_seq_ube4b.split()

# NOTE - index in list corresponds exactly to location in domain (huh)

102
ILE GLU LYS PHE LYS LEU LEU ALA GLU LYS VAL GLU GLU ILE VAL ALA LYS ASN ALA ARG ALA GLU ILE ASP TYR SER ASP ALA PRO ASP GLU PHE ARG ASP PRO LEU MET ASP THR LEU MET THR ASP PRO VAL ARG LEU PRO SER GLY THR VAL MET ASP ARG SER ILE ILE LEU ARG HIS LEU LEU ASN SER PRO THR ASP PRO PHE ASN ARG GLN MET LEU THR GLU SER MET LEU GLU PRO VAL PRO GLU LEU LYS GLU GLN ILE GLN ALA TRP MET ARG GLU LYS GLN SER SER ASP HIS


In [163]:
# index of domain inside protein
# ssf.get_index_range(protein_seq_ube4b_split, ube4b_domain_split)

In [147]:
# splitting variant list if there are multiple mutations
ube4b_mut = ube4b_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
ube4b_df["WILD_TYPE_RES"] = ssf.get_wild_type(ube4b_mut)

# get mutated residue and place in seperate col
ube4b_df["MUTATED_RES"] = ssf.get_mutation_type(ube4b_mut)

# get position and place in seperate col
ube4b_df["POSITION"] = ssf.get_position(ube4b_mut)

# replace variant column with reformatted variant name
ube4b_df["variant"] = ssf.get_mutations_names_list(ube4b_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

#  ube4b_df = ube4b_df.drop(columns=to_drop)

In [148]:
# get ss position indexes
path = "../PDB and STRIDE Files/" + 'ube4b_stride.txt'
ube4b_stride_file = open(path, 'r')

ube4b_ss_indexes = ssf.get_all_sec_struc_boolean(ube4b_stride_file)
print(ube4b_ss_indexes.count(True))
print(ube4b_ss_indexes.count(False))

# need positionssplit
ube4b_df["positions_split"] = ssf.get_positions_split(ube4b_df)

# add in_sec_str_col
ube4b_df = add_sec_str_col(ube4b_df, ube4b_ss_indexes, 0)

93
9


In [31]:
print(len(ube4b_df))

91031


In [71]:
ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_remaining, 0.52, 200)

NameError: name 'ube4b_remaining' is not defined

In [38]:
ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_df, 0.52, 200)

in fraction,df len91031
104
96
Train Data Fraction: 0.52
false_df len76653
true_df len14378
in fraction,df len90831
Test Data Fraction: 0.52
false_df len76557
true_df len14274
Size of Test Dataset: 27447
Size of Total Dataset: 27647


In [155]:
ube4b_train_df, ube4b_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_df, 0.91, 200)
ube4b_train_dummy, ube4b_small_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_test_df, 0.91, 57000)
print("SMALL TEST")
print(len(ube4b_small_test_df))
ube4b_df_format = pd.concat([ube4b_train_df, ube4b_small_test_df])
print(len(ube4b_df_format))
ssf.write_data_file("ube4b_MLformat_200_train_17155_test_t3", protein_seq_ube4b, ube4b_df_format)

in fraction,df len91031
182
18
Train Data Fraction: 0.91
false_df len23348
true_df len67683
in fraction,df len90831
Test Data Fraction: 0.91
false_df len23330
true_df len67501
Size of Test Dataset: 74166
Size of Total Dataset: 74366
in fraction,df len74166
51870
5130
Train Data Fraction: 0.91
false_df len6675
true_df len67491
in fraction,df len17166
Test Data Fraction: 0.91
false_df len1545
true_df len15621
Size of Test Dataset: 17155
Size of Total Dataset: 74155
SMALL TEST
17155
17355
Filename: ube4b_MLformat_200_train_17155_test_t3.txt


In [97]:
# making test set smaller
ube4b_dummy_df, ube4b_small_test_df, ube4b_remaining_df = get_train_and_test_df(ube4b_test_df, 0.52, 14000)
ube4b_df_format = pd.concat([ube4b_train_df, ube4b_small_test_df])
print(len(ube4b_df_format))
ssf.write_data_file("ube4b_MLformat_200_train_13445_test_t3", protein_seq_ube4b, ube4b_df_format)

in fraction,df len27447
7280
6720
Train Data Fraction: 0.52
false_df len13175
true_df len14272
in fraction,df len13447
Test Data Fraction: 0.52
false_df len6455
true_df len6992
Size of Test Dataset: 13445
Size of Total Dataset: 27445
13645
Filename: ube4b_MLformat_200_train_13445_test_t3.txt


In [180]:
# making test set smaller
ube4b_dummy_df_2, ube4b_small_test_df_2, ube4b_remaining_df_2 = get_train_and_test_df(ube4b_test_df, 0.52, 14000)
ube4b_df_format_2 = pd.concat([ube4b_train_df_2, ube4b_small_test_df_2])
print(len(ube4b_df_format_2))
ssf.write_data_file("ube4b_MLformat_200_train_13445_test_t2", protein_seq_ube4b, ube4b_df_format_2)

in fraction,df len27447
7280
6720
Train Data Fraction: 0.52
false_df len13175
true_df len14272
in fraction,df len13447
Test Data Fraction: 0.52
false_df len6455
true_df len6992
Size of Test Dataset: 13445
Size of Total Dataset: 27445
13645
Filename: ube4b_MLformat_200_train_13445_test_t2.txt


In [181]:
# making test set smaller
ube4b_dummy_df_3, ube4b_small_test_df_3, ube4b_remaining_df_3 = get_train_and_test_df(ube4b_test_df, 0.52, 14000)
ube4b_df_format_3 = pd.concat([ube4b_train_df_3, ube4b_small_test_df_3])
print(len(ube4b_df_format_3))
ssf.write_data_file("ube4b_MLformat_200_train_13445_test_t3", protein_seq_ube4b, ube4b_df_format_3)

in fraction,df len27447
7280
6720
Train Data Fraction: 0.52
false_df len13175
true_df len14272
in fraction,df len13447
Test Data Fraction: 0.52
false_df len6455
true_df len6992
Size of Test Dataset: 13445
Size of Total Dataset: 27445
13645
Filename: ube4b_MLformat_200_train_13445_test_t3.txt


In [75]:
print(len(ube4b_train_df))
print(len(ube4b_test_df))
print(len(ube4b_train_df_2))
print(len(ube4b_test_df_2))
print(len(ube4b_train_df_3))
print(len(ube4b_test_df_3))

200
27447
200
27447
200
27447


#### Pab1

In [74]:
# importing pab1 data from Gelman et al.
pab1_df1 = pd.read_csv("../Raw Data/pab1.tsv.txt", sep="\t")
pab1_df = pab1_df1.dropna()
print(len(pab1_df))
print(pab1_df.columns)

40852
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [75]:
# rounding score column to 6 decimal points
pab1_df["score"] = pab1_df["score"].round(6)
print(len(pab1_df))

# remove values with wildcard star next to them
pab1_df = pab1_df[pab1_df["variant"].str.contains("\*") == False]
print(len(pab1_df))
# change this value depending on amount of data needed for dataset
# pab1_df = pab1_df.sample(n=10000)
print(len(pab1_df))
pab1_df = pab1_df.sample(frac=1)

40852
37710
37710


In [76]:
# splitting variant list if there are multiple mutations
pab1_mut = pab1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pab1_df["WILD_TYPE_RES"] = ssf.get_wild_type(pab1_mut)

# get mutated residue and place in seperate col
pab1_df["MUTATED_RES"] = ssf.get_mutation_type(pab1_mut)

# get position and place in seperate col
pab1_df["POSITION"] = ssf.get_position(pab1_mut)
pab1_df["positions_split"] = ssf.get_positions_split(pab1_df)

positions_split_subtracted = []
for pos_list in pab1_df["positions_split"]:
    pos_list = [x - 126 for x in pos_list]
    positions_split_subtracted.append(pos_list)  

pab1_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pab1_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    # print(pos_string)
    new_positions.append(pos_string)
    pos_string = ""
# print(len(new_positions))
# print(len(pab1_df["POSITION"]))

pab1_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name
pab1_df["variant"] = ssf.get_mutations_names_list(pab1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)


In [80]:
print(len(pab1_df))

37710


In [77]:
path = "../PDB and STRIDE Files/" + 'pab1_stride.txt'
pab1_stride_file = open(path, 'r')

In [78]:
pab1_ss_indexes = ssf.get_all_sec_struc_boolean(pab1_stride_file) # boolean list of secondary structure assignements

In [79]:
# add column

pab1_df = add_sec_str_col(pab1_df, pab1_ss_indexes, 0)
# need positionssplit

In [84]:
print(pab1_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION', 'positions_split', 'in_sec_str'],
      dtype='object')


In [80]:
string_seq_pab1 = "GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAP"
protein_seq_pab1 = ssf.get_expanded_seq(string_seq_pab1)
print(protein_seq_pab1)

GLY ASN ILE PHE ILE LYS ASN LEU HIS PRO ASP ILE ASP ASN LYS ALA LEU TYR ASP THR PHE SER VAL PHE GLY ASP ILE LEU SER SER LYS ILE ALA THR ASP GLU ASN GLY LYS SER LYS GLY PHE GLY PHE VAL HIS PHE GLU GLU GLU GLY ALA ALA LYS GLU ALA ILE ASP ALA LEU ASN GLY MET LEU LEU ASN GLY GLN GLU ILE TYR VAL ALA PRO


In [81]:
pab1_train_df, pab1_test_df, pab1_remaining_df = get_train_and_test_df(pab1_df, 0.91, 147)
pab1_df_format = pd.concat([pab1_train_df, pab1_test_df])
print(len(pab1_df_format))
# ssf.write_data_file("pab1_MLformat_147_train_20054_test_t3", protein_seq_pab1, pab1_df_format)

in fraction,df len37710
133
14
Train Data Fraction: 0.905
false_df len6264
true_df len31446
in fraction,df len37563
Test Data Fraction: 0.91
false_df len6250
true_df len31313
Size of Test Dataset: 34400
Size of Total Dataset: 34547
34547


In [86]:
pab1_train_df, pab1_test_df, pab1_remaining_df = get_train_and_test_df(pab1_df, 0.91, 147)
pab1_train_dummy, pab1_small_test_df, pab1_remaining_df = get_train_and_test_df(pab1_test_df, 0.91, 20000)
print("SMALL TEST")
print(len(pab1_small_test_df))
pab1_df_format = pd.concat([pab1_train_df, pab1_small_test_df])
print(len(pab1_df_format))
ssf.write_data_file("pab1_MLformat_200_train_14344_test_t3", protein_seq_pab1, pab1_df_format)

in fraction,df len37710
182
18
Train Data Fraction: 0.91
false_df len6264
true_df len31446
in fraction,df len37510
Test Data Fraction: 0.91
false_df len6246
true_df len31264
Size of Test Dataset: 34355
Size of Total Dataset: 34555
in fraction,df len34355
18200
1800
Train Data Fraction: 0.91
false_df len3092
true_df len31263
in fraction,df len14355
Test Data Fraction: 0.91
false_df len1292
true_df len13063
Size of Test Dataset: 14344
Size of Total Dataset: 34344
SMALL TEST
14344
14544
Filename: pab1_MLformat_200_train_14344_test_t3.txt


In [53]:
pab1_train_df_2, pab1_test_df_2, pab1_remaining_df_2 = get_train_and_test_df(pab1_df, 0.69, 147)
pab1_df_format_2 = pd.concat([pab1_train_df_2, pab1_test_df_2])
print(len(pab1_df_format_2))
ssf.write_data_file("pab1_MLformat_147_train_25974_test_t2", protein_seq_pab1, pab1_df_format_2)

in fraction,df len37710
101
46
Train Data Fraction: 0.687
false_df len19683
true_df len18027
in fraction,df len37563
Test Data Fraction: 0.69
false_df len19637
true_df len17926
Size of Test Dataset: 25974
Size of Total Dataset: 26121
26121
Filename: pab1_MLformat_147_train_25974_test_t2.txt


In [106]:
pab1_train_df_3, pab1_test_df_3, pab1_remaining_df_3 = get_train_and_test_df(pab1_df, 0.69, 147)
pab1_df_format_3 = pd.concat([pab1_train_df_3, pab1_test_df_3])
print(len(pab1_df_format_3))
ssf.write_data_file("pab1_MLformat_147_train_25974_test_t3", protein_seq_pab1, pab1_df_format_3)

in fraction,df len37710
101
46
Train Data Fraction: 0.687
false_df len19683
true_df len18027
in fraction,df len37563
Test Data Fraction: 0.69
false_df len19637
true_df len17926
Size of Test Dataset: 25974
Size of Total Dataset: 26121
26121
Filename: pab1_MLformat_147_train_25974_test_t3.txt


In [58]:
print(len(pab1_train_df))
print(len(pab1_test_df))
print(len(pab1_train_df_2))
print(len(pab1_test_df_2))
print(len(pab1_train_df_3))
print(len(pab1_test_df_3))

147
25974
147
25974
147
25974


In [56]:
# pab1_train_df, pab1_test_df, pab1_remaining_df = get_train_and_test_df(pab1_df, 0.69, 147)
# pab1_test_10000 = pab1_test_df.head(10000)
# pab1_df_format = pd.concat([pab1_train_df, pab1_test_10000])
# print(len(pab1_df_format))
# # writing data to txt file
# ssf.write_data_file("pab1_MLformat_147_train_25974_test_t1", protein_seq_pab1, pab1_df_format)

in fraction,df len37710
101
46
Train Data Fraction: 0.687
false_df len19683
true_df len18027
in fraction,df len37563
Test Data Fraction: 0.69
false_df len19637
true_df len17926
Size of Test Dataset: 25974
Size of Total Dataset: 26121
10147
Filename: pab1_MLformat_147_train_25974_test_t1.txt


#### Bgl3

In [134]:
# importing Ube4b data from Gelman et al.
bgl3_df1 = pd.read_csv("../Raw Data/bgl3.tsv.txt", sep="\t")
bgl3_df = bgl3_df1.dropna()
print(len(bgl3_df))
print(bgl3_df.columns)

26653
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [10]:
print(bgl3_df["variant"].head(20))

0                 A104E
1           A104E,A142E
2           A104E,E152V
3           A104E,K170R
4                 A104G
5                 A104P
6                 A104S
7           A104S,A116T
8           A104S,A121V
9           A104S,F111Y
10    A104S,R114H,Q137L
11                A104T
12          A104T,A116V
13          A104T,E156D
14          A104T,G108S
15          A104T,N141Y
16          A104T,Q106L
17          A104T,Q106R
18          A104T,Q137H
19                A104V
Name: variant, dtype: object


In [135]:
# rounding score column to 6 decimal points
bgl3_df["score"] = bgl3_df["score"].round(6)
print(len(bgl3_df))

# remove values with wildcard star next to them
bgl3_df = bgl3_df[bgl3_df["variant"].str.contains("\*") == False]
print(len(bgl3_df))

bgl3_df = bgl3_df.sample(frac=1)
print(len(bgl3_df))

26653
25737
25737


In [136]:
# NOTE - no protein domain for bgl3
# # get protein sequence from Gelman et al.
string_seq = "MVPAAQQTAMAPDAALTFPEGFLWGSATASYQIEGAAAEDGRTPSIWDTYARTPGRVRNGDTGDVATDHYHRWREDVALMAELGLGAYRFSLAWPRIQPTGRGPALQKGLDFYRRLADELLAKGIQPVATLYHWDLPQELENAGGWPERATAERFAEYAAIAADALGDRVKTWTTLNEPWCSAFLGYGSGVHAPGRTDPVAALRAAHHLNLGHGLAVQALRDRLPADAQCSVTLNIHHVRPLTDSDADADAVRRIDALANRVFTGPMLQGAYPEDLVKDTAGLTDWSFVRDGDLRLAHQKLDFLGVNYYSPTLVSEADGSGTHNSDGHGRSAHSPWPGADRVAFHQPPGETTAMGWAVDPSGLYELLRRLSSDFPALPLVITENGAAFHDYADPEGNVNDPERIAYVRDHLAAVHRAIKDGSDVRGYFLWSLLDNFEWAHGYSKRFGAVYVDYPTGTRIPKASARWYAEVARTGVLPTAGDPNSSSVDKLAAALEHHHHHH"
protein_seq_bgl3 = ssf.get_expanded_seq(string_seq)
print(protein_seq_bgl3)

MET VAL PRO ALA ALA GLN GLN THR ALA MET ALA PRO ASP ALA ALA LEU THR PHE PRO GLU GLY PHE LEU TRP GLY SER ALA THR ALA SER TYR GLN ILE GLU GLY ALA ALA ALA GLU ASP GLY ARG THR PRO SER ILE TRP ASP THR TYR ALA ARG THR PRO GLY ARG VAL ARG ASN GLY ASP THR GLY ASP VAL ALA THR ASP HIS TYR HIS ARG TRP ARG GLU ASP VAL ALA LEU MET ALA GLU LEU GLY LEU GLY ALA TYR ARG PHE SER LEU ALA TRP PRO ARG ILE GLN PRO THR GLY ARG GLY PRO ALA LEU GLN LYS GLY LEU ASP PHE TYR ARG ARG LEU ALA ASP GLU LEU LEU ALA LYS GLY ILE GLN PRO VAL ALA THR LEU TYR HIS TRP ASP LEU PRO GLN GLU LEU GLU ASN ALA GLY GLY TRP PRO GLU ARG ALA THR ALA GLU ARG PHE ALA GLU TYR ALA ALA ILE ALA ALA ASP ALA LEU GLY ASP ARG VAL LYS THR TRP THR THR LEU ASN GLU PRO TRP CYS SER ALA PHE LEU GLY TYR GLY SER GLY VAL HIS ALA PRO GLY ARG THR ASP PRO VAL ALA ALA LEU ARG ALA ALA HIS HIS LEU ASN LEU GLY HIS GLY LEU ALA VAL GLN ALA LEU ARG ASP ARG LEU PRO ALA ASP ALA GLN CYS SER VAL THR LEU ASN ILE HIS HIS VAL ARG PRO LEU THR ASP SER ASP ALA ASP ALA ASP 

In [137]:
protein_seq_bgl3_split = protein_seq_bgl3.split()
protein_seq_bgl3_split[500]

'HIS'

In [138]:
# splitting variant list if there are multiple mutations
bgl3_mut = bgl3_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
bgl3_df["WILD_TYPE_RES"] = ssf.get_wild_type(bgl3_mut)

# get mutated residue and place in seperate col
bgl3_df["MUTATED_RES"] = ssf.get_mutation_type(bgl3_mut)

# get position and place in seperate col
bgl3_df["POSITION"] = ssf.get_position(bgl3_mut)

# replace variant column with reformatted variant name
bgl3_df["variant"] = ssf.get_mutations_names_list(bgl3_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# bgl3_df = bgl3_df.drop(columns=to_drop)

In [139]:
# get ss position indexes
path = "../PDB and STRIDE Files/" + 'bgl3_stride.txt'
bgl3_stride_file = open(path, 'r')

bgl3_ss_indexes = ssf.get_all_sec_struc_boolean(bgl3_stride_file)

# need positionssplit
bgl3_df["positions_split"] = ssf.get_positions_split(bgl3_df)

# add in_sec_str_col
bgl3_df = add_sec_str_col(bgl3_df, bgl3_ss_indexes, 0)

In [15]:
# print(bgl3_ss_indexes)
for val in bgl3_ss_indexes:
    print(val)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
True
True
True
True
False
False
True
True
True
True
True
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
False
False
False
False
False
True
True
True
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
True
True
True
True
True
False
False
True
True
True
True
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
True
True
True
True
True
True
True
False
False
False
False
True
True
True
True
True
True
True
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
True
True
True
True
True
True
False
True
True
True
True
True
True
Tr

Trial 1:

In [152]:
bgl3_train_df, bgl3_test_df, bgl3_remaining_df = get_train_and_test_df(bgl3_df, 0.54, 982)
bgl3_df_format = pd.concat([bgl3_train_df, bgl3_test_df])
print(len(bgl3_df_format))
# ssf.write_data_file("bgl3_MLformat_982_train_13945_test_t3", protein_seq_bgl3, bgl3_df_format)

in fraction,df len25737
530
452
Train Data Fraction: 0.54
false_df len11891
true_df len13846
in fraction,df len24755
Test Data Fraction: 0.54
false_df len11439
true_df len13316
Size of Test Dataset: 24658
Size of Total Dataset: 25640
25640


In [143]:
bgl3_train_df, bgl3_test_df, bgl3_remaining_df = get_train_and_test_df(bgl3_df, 0.74, 982)
bgl3_train_dummy, bgl3_small_test_df, bgl3_remaining_df = get_train_and_test_df(bgl3_test_df, 0.74, 1000)
print("SMALL TEST")
print(len(bgl3_small_test_df))
bgl3_df_format = pd.concat([bgl3_train_df, bgl3_small_test_df])
print(len(bgl3_df_format))
ssf.write_data_file("bgl3_MLformat_982_train_16723_test_t3", protein_seq_bgl3, bgl3_df_format)

in fraction,df len25737
726
256
Train Data Fraction: 0.739
false_df len11891
true_df len13846
in fraction,df len24755
Test Data Fraction: 0.74
false_df len11635
true_df len13120
Size of Test Dataset: 17726
Size of Total Dataset: 18708
in fraction,df len17726
740
260
Train Data Fraction: 0.74
false_df len4609
true_df len13117
in fraction,df len16726
Test Data Fraction: 0.74
false_df len4349
true_df len12377
Size of Test Dataset: 16723
Size of Total Dataset: 17723
SMALL TEST
16723
17705
Filename: bgl3_MLformat_982_train_16723_test_t3.txt


In [37]:
bgl3_train_df_2, bgl3_test_df_2, bgl3_remaining_df_2 = get_train_and_test_df(bgl3_df, 0.54, 982)
bgl3_df_format_2 = pd.concat([bgl3_train_df_2, bgl3_test_df_2])
print(len(bgl3_df_format_2))
ssf.write_data_file("bgl3_MLformat_982_train_13945_test_t2", protein_seq_bgl3, bgl3_df_format_2)

in fraction,df len25737
530
452
Train Data Fraction: 0.54
false_df len17674
true_df len8063
in fraction,df len24755
Test Data Fraction: 0.54
false_df len17222
true_df len7533
Size of Test Dataset: 13945
Size of Total Dataset: 14927
14927
Filename: bgl3_MLformat_982_train_13945_test_t2.txt


In [41]:
bgl3_train_df_3, bgl3_test_df_3, bgl3_remaining_df_3 = get_train_and_test_df(bgl3_df, 0.54, 982)
bgl3_df_format_3 = pd.concat([bgl3_train_df_3, bgl3_test_df_3])
print(len(bgl3_df_format_3))
ssf.write_data_file("bgl3_MLformat_982_train_13945_test_t3", protein_seq_bgl3, bgl3_df_format_3)

in fraction,df len25737
530
452
Train Data Fraction: 0.54
false_df len17674
true_df len8063
in fraction,df len24755
Test Data Fraction: 0.54
false_df len17222
true_df len7533
Size of Test Dataset: 13945
Size of Total Dataset: 14927
14927
Filename: bgl3_MLformat_982_train_13945_test_t3.txt


In [42]:
print(len(bgl3_train_df))
print(len(bgl3_test_df))
print(len(bgl3_train_df_2))
print(len(bgl3_test_df_2))
print(len(bgl3_train_df_3))
print(len(bgl3_test_df_3))

982
13945
982
13945
982
13945


# Data from Perturbation Networks

https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0261829#sec014

### 1be9

In [115]:
# loading data
pro_1be9_df1 = pd.read_csv("../Raw Data/functional_1be9.csv", sep=",")

# renaming mutated residue column
pro_1be9_df1 = pro_1be9_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [116]:
# reorganizing data to other form
col_list_1be9 = list(pro_1be9_df1)
col_list_1be9 = col_list_1be9[1:]

# mutations going down from the leftmost column
mutations_1be9 = []

for column in col_list_1be9:
    for mutation in pro_1be9_df1["mutated_res"]:
        mutations_1be9.append(column + mutation)

# getting scores
scores_1be9 = []

for column in pro_1be9_df1.drop('mutated_res', axis=1):
    for val in pro_1be9_df1[column]:
        scores_1be9.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_1be9_df = pd.DataFrame(list(zip(mutations_1be9, scores_1be9)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [117]:
# rounding score column to 6 decimal points
pro_1be9_df["score"] = pro_1be9_df["score"].round(6)

# shuffling
pro_1be9_df = pro_1be9_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_1be9_mut = pro_1be9_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_1be9_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_1be9_mut)

# get mutated residue and place in seperate col
pro_1be9_df["MUTATED_RES"] = ssf.get_mutation_type(pro_1be9_mut)

# get position and place in seperate col
pro_1be9_df["POSITION"] = ssf.get_position(pro_1be9_mut)

# need positionssplit
pro_1be9_df["positions_split"] = ssf.get_positions_split(pro_1be9_df)

positions_split_subtracted = []
for pos_list in pro_1be9_df["positions_split"]:
    pos_list = [x - 301 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_1be9_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_1be9_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_1be9_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_1be9_df["variant"] = ssf.get_mutations_names_list(pro_1be9_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [118]:
string_seq_1be9 = "FLGEEDIPREPRRIVIHRGSTGLGFNIIGGEDGEGIFISFILAGGPADLSGELRKGDQILSVNGVDLRNASHEQAAIALKNAGQTVTIIAQYKPEEYSRFEANSRVNSSGRIVTNKQTSV"
protein_seq_1be9 = ssf.get_expanded_seq(string_seq_1be9)
protein_seq_1be9_split = protein_seq_1be9.split()

In [119]:
path = "../PDB and STRIDE Files/" + '1be9_stride.txt'
pro_1be9_stride_file = open(path, 'r')

pro_1be9_ss_indexes = ssf.get_all_sec_struc_boolean(pro_1be9_stride_file)
print(pro_1be9_ss_indexes.count(True))
print(pro_1be9_ss_indexes.count(False))

# add in_sec_str_col
pro_1be9_df = add_sec_str_col(pro_1be9_df, pro_1be9_ss_indexes, 0)

97
23


In [123]:
pro_1be9_train_df, pro_1be9_test_df, pro_1be9_remaining_df = get_train_and_test_df(pro_1be9_df, 0.81, 235)
pro_1be9_df_format = pd.concat([pro_1be9_train_df, pro_1be9_test_df])
print(len(pro_1be9_df_format))
ssf.write_data_file("pro_1be9_MLformat_235_train_915_test_t3", protein_seq_1be9, pro_1be9_df_format)

in fraction,df len1660
190
45
Train Data Fraction: 0.809
false_df len220
true_df len1440
in fraction,df len1425
Test Data Fraction: 0.81
false_df len175
true_df len1250
Size of Test Dataset: 915
Size of Total Dataset: 1150
1150
Filename: pro_1be9_MLformat_235_train_915_test_t3.txt


### 1d5r

In [103]:
# loading data
pro_1d5r_df1 = pd.read_csv("../Raw Data/1d5r.csv", sep=",")

# renaming mutated residue column
pro_1d5r_df1 = pro_1d5r_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [104]:
# reorganizing data to other form
col_list_1d5r = list(pro_1d5r_df1)
col_list_1d5r = col_list_1d5r[1:]

# mutations going down from the leftmost column
mutations_1d5r = []

for column in col_list_1d5r:
    for mutation in pro_1d5r_df1["mutated_res"]:
        mutations_1d5r.append(column + mutation)

# getting scores
scores_1d5r = []

for column in pro_1d5r_df1.drop('mutated_res', axis=1):
    for val in pro_1d5r_df1[column]:
        scores_1d5r.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_1d5r_df = pd.DataFrame(list(zip(mutations_1d5r, scores_1d5r)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [105]:
# rounding score column to 6 decimal points
pro_1d5r_df["score"] = pro_1d5r_df["score"].round(6)

# shuffling
pro_1d5r_df = pro_1d5r_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_1d5r_mut = pro_1d5r_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_1d5r_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_1d5r_mut)

# get mutated residue and place in seperate col
pro_1d5r_df["MUTATED_RES"] = ssf.get_mutation_type(pro_1d5r_mut)

# get position and place in seperate col
pro_1d5r_df["POSITION"] = ssf.get_position(pro_1d5r_mut)

# need positionssplit
pro_1d5r_df["positions_split"] = ssf.get_positions_split(pro_1d5r_df)

positions_split_subtracted = []
for pos_list in pro_1d5r_df["positions_split"]:
    pos_list = [x - 14 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_1d5r_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_1d5r_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_1d5r_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_1d5r_df["variant"] = ssf.get_mutations_names_list(pro_1d5r_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [117]:
print(len(pro_1d5r_df))
pro_1d5r_df = pro_1d5r_df.dropna(axis=0)
print(len(pro_1d5r_df))
print(pro_1d5r_df.head(20))

6140
5833
     variant     score WILD_TYPE_RES MUTATED_RES POSITION positions_split
542    27ASP -0.353747             R           D       27            [27]
4319  215TYR  0.017479             S           Y      215           [215]
4836  241THR -1.469905             V           T      241           [241]
5638  312TRP -2.612767             D           W      312           [312]
5057  252VAL -0.179405             K           V      252           [252]
3636  181THR -2.370733             F           T      181           [181]
904    45PHE  1.481437             S           F       45            [45]
269    13LEU -4.008635             Y           L       13            [13]
5849  323LEU -0.327921             F           L      323           [323]
970    48MET -0.361043             K           M       48            [48]
2752  137PRO -2.677493             A           P      137           [137]
228    11LYS -3.105071             L           K       11            [11]
3906  195HIS  1.142440      

In [54]:
def get_seq_from_stride(stride_file):    
    aa_str = ""

    for line in stride_file:
        if line.startswith('ASG'):
            split_line = line.split()
            aa_str =  aa_str + split_line[1] + " "

    return aa_str.rstrip()

In [110]:
protein_seq_1d5r = get_seq_from_stride(pro_1d5r_stride_file)
protein_seq_1d5r_split = protein_seq_1d5r.split()

In [124]:
print(len(protein_seq_1d5r_split))
# print(protein_seq_1d5r_split[319])
# print(len(protein_seq_1d5r_split))
print(protein_seq_1d5r_split[312])

307


IndexError: list index out of range

In [113]:
path = "../PDB and STRIDE Files/" + '1d5r_stride.txt'
pro_1d5r_stride_file = open(path, 'r')

In [114]:
pro_1d5r_ss_indexes = ssf.get_sec_struc_boolean(pro_1d5r_stride_file)
print(pro_1d5r_ss_indexes.count(True))
print(pro_1d5r_ss_indexes.count(False))

# add in_sec_str_col
# pro_1d5r_df = add_sec_str_col(pro_1d5r_df, pro_1d5r_ss_indexes, 0)

182
125


In [138]:
pro_1d5r_train_df, pro_1d5r_test_df, pro_1d5r_remaining_df = get_train_and_test_df(pro_1d5r_df, 0.55, 235)
pro_1d5r_df_format = pd.concat([pro_1d5r_train_df, pro_1d5r_test_df])
print(len(pro_1d5r_df_format))
ssf.write_data_file("pro_1d5r_MLformat_235_train_1317_test_t3", protein_seq_1d5r, pro_1d5r_df_format)

in fraction,df len1660
129
106
Train Data Fraction: 0.549
false_df len700
true_df len960
in fraction,df len1425
Test Data Fraction: 0.55
false_df len594
true_df len831
Size of Test Dataset: 1317
Size of Total Dataset: 1552
1552
Filename: pro_1be9_MLformat_235_train_1317_test_t3.txt


In [None]:
## figure out protein sequence that is not working

### 1nd4

In [124]:
# loading data
pro_1nd4_df1 = pd.read_csv("../Raw Data/1nd4.txt", sep=",")

# renaming mutated residue column
pro_1nd4_df1 = pro_1nd4_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [125]:
# reorganizing data to other form
col_list_1nd4 = list(pro_1nd4_df1)
col_list_1nd4 = col_list_1nd4[1:]

# mutations going down from the leftmost column
mutations_1nd4 = []

for column in col_list_1nd4:
    for mutation in pro_1nd4_df1["mutated_res"]:
        mutations_1nd4.append(column + mutation)

# getting scores
scores_1nd4 = []

for column in pro_1nd4_df1.drop('mutated_res', axis=1):
    for val in pro_1nd4_df1[column]:
        scores_1nd4.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_1nd4_df = pd.DataFrame(list(zip(mutations_1nd4, scores_1nd4)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [126]:
# drop nans
print(len(pro_1nd4_df))
pro_1nd4_df = pro_1nd4_df.dropna(axis=0)
pro_1nd4_df = pro_1nd4_df[pro_1nd4_df['score'] != 0.0]
print(len(pro_1nd4_df))
# print(pro_1nd4_df.tail(30))

5100
5095


In [127]:
# rounding score column to 6 decimal points
pro_1nd4_df["score"] = pro_1nd4_df["score"].round(6)

# shuffling
pro_1nd4_df = pro_1nd4_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_1nd4_mut = pro_1nd4_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_1nd4_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_1nd4_mut)

# get mutated residue and place in seperate col
pro_1nd4_df["MUTATED_RES"] = ssf.get_mutation_type(pro_1nd4_mut)

# get position and place in seperate col
pro_1nd4_df["POSITION"] = ssf.get_position(pro_1nd4_mut)

# need positionssplit
pro_1nd4_df["positions_split"] = ssf.get_positions_split(pro_1nd4_df)

positions_split_subtracted = []
for pos_list in pro_1nd4_df["positions_split"]:
    pos_list = [x - 10 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_1nd4_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_1nd4_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_1nd4_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_1nd4_df["variant"] = ssf.get_mutations_names_list(pro_1nd4_df)

# drop unneccesary columns
# to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [179]:
print(len(pro_1nd4_df))

5095


In [128]:
string_seq_1nd4 = "GSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNELQDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAMRRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPDGEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRFLVLYGIAAPDSQRIAFYRLLDEFFGSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNELQDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAMRRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPDGEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRFLVLYGIAAPDSQRIAFYRLLDEFF"
protein_seq_1nd4 = ssf.get_expanded_seq(string_seq_1nd4)
protein_seq_1nd4_split = protein_seq_1nd4.split()
print(len(protein_seq_1nd4_split))
print(protein_seq_1nd4_split[38])

510
PHE


In [129]:
path = "../PDB and STRIDE Files/" + '1nd4_stride.txt'
pro_1nd4_stride_file = open(path, 'r')

pro_1nd4_ss_indexes = ssf.get_all_sec_struc_boolean(pro_1nd4_stride_file)
print(len(pro_1nd4_ss_indexes))
print(pro_1nd4_ss_indexes.count(True))
print(pro_1nd4_ss_indexes.count(False))

# add in_sec_str_col
pro_1nd4_df = add_sec_str_col(pro_1nd4_df, pro_1nd4_ss_indexes, 0)

510
415
95


In [133]:
pro_1nd4_train_df, pro_1nd4_test_df, pro_1nd4_remaining_df = get_train_and_test_df(pro_1nd4_df, 0.81, 1000)
pro_1nd4_df_format = pd.concat([pro_1nd4_train_df, pro_1nd4_test_df])
print(len(pro_1nd4_df_format))
ssf.write_data_file("pro_1nd4_MLformat_1000_train_3626_test_t3", protein_seq_1nd4, pro_1nd4_df_format)

in fraction,df len5095
810
190
Train Data Fraction: 0.81
false_df len880
true_df len4215
in fraction,df len4095
Test Data Fraction: 0.81
false_df len690
true_df len3405
Size of Test Dataset: 3626
Size of Total Dataset: 4626
4626
Filename: pro_1nd4_MLformat_1000_train_3626_test_t3.txt


### 3dqw

In [62]:
# loading data
pro_3dqw_df1 = pd.read_csv("../Raw Data/3dqw.txt", sep=",")

# renaming mutated residue column
pro_3dqw_df1 = pro_3dqw_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [63]:
# reorganizing data to other form
col_list_3dqw = list(pro_3dqw_df1)
col_list_3dqw = col_list_3dqw[1:]

# mutations going down from the leftmost column
mutations_3dqw = []

for column in col_list_3dqw:
    for mutation in pro_3dqw_df1["mutated_res"]:
        mutations_3dqw.append(column + mutation)

# getting scores
scores_3dqw = []

for column in pro_3dqw_df1.drop('mutated_res', axis=1):
    for val in pro_3dqw_df1[column]:
        scores_3dqw.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_3dqw_df = pd.DataFrame(list(zip(mutations_3dqw, scores_3dqw)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [64]:
pro_3dqw_df = pro_3dqw_df.dropna(axis=0)
print(len(pro_3dqw_df))

3315


In [65]:
# rounding score column to 6 decimal points
pro_3dqw_df["score"] = pro_3dqw_df["score"].round(6)

# shuffling
pro_3dqw_df = pro_3dqw_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_3dqw_mut = pro_3dqw_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_3dqw_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_3dqw_mut)

# get mutated residue and place in seperate col
pro_3dqw_df["MUTATED_RES"] = ssf.get_mutation_type(pro_3dqw_mut)

# get position and place in seperate col
pro_3dqw_df["POSITION"] = ssf.get_position(pro_3dqw_mut)

# need positionssplit
pro_3dqw_df["positions_split"] = ssf.get_positions_split(pro_3dqw_df)

positions_split_subtracted = []
for pos_list in pro_3dqw_df["positions_split"]:
    pos_list = [x - 255 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_3dqw_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_3dqw_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_3dqw_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_3dqw_df["variant"] = ssf.get_mutations_names_list(pro_3dqw_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [66]:
print(len(pro_3dqw_df))

3315


In [67]:
path = "../PDB and STRIDE Files/" + '3dqw_stride.txt'
pro_3dqw_stride_file = open(path, 'r')
protein_seq_3dqw = get_seq_from_stride(pro_3dqw_stride_file)

In [68]:
protein_seq_3dqw_split = protein_seq_3dqw.split()
print(len(protein_seq_3dqw_split))
print(protein_seq_3dqw_split[128])

1107
VAL


In [69]:
path = "../PDB and STRIDE Files/" + '3dqw_stride.txt'
pro_3dqw_stride_file = open(path, 'r')

pro_3dqw_ss_indexes = ssf.get_all_sec_struc_boolean(pro_3dqw_stride_file)
print(len(pro_3dqw_ss_indexes))
print(pro_3dqw_ss_indexes.count(True))
print(pro_3dqw_ss_indexes.count(False))

# add in_sec_str_col
pro_3dqw_df = add_sec_str_col(pro_3dqw_df, pro_3dqw_ss_indexes, 0)

1107
926
181


In [73]:
pro_3dqw_train_df, pro_3dqw_test_df, pro_3dqw_remaining_df = get_train_and_test_df(pro_3dqw_df, 0.84, 2170)
pro_3dqw_df_format = pd.concat([pro_3dqw_train_df, pro_3dqw_test_df])
print(len(pro_3dqw_df_format))
ssf.write_data_file("pro_3dqw_MLformat_2170_train_1087_test_t3", protein_seq_3dqw, pro_3dqw_df_format)

in fraction,df len3315
1822
348
Train Data Fraction: 0.84
false_df len579
true_df len2736
in fraction,df len1145
Test Data Fraction: 0.84
false_df len231
true_df len914
Size of Test Dataset: 1087
Size of Total Dataset: 3257
3257
Filename: pro_3dqw_MLformat_2170_train_1087_test_t3.txt


### 4bz3

In [47]:
# loading data
pro_4bz3_df1 = pd.read_csv("../Raw Data/4bz3.txt", sep=",")

# renaming mutated residue column
pro_4bz3_df1 = pro_4bz3_df1.rename(columns={"Unnamed: 0": "mutated_res"})

In [48]:
# reorganizing data to other form
col_list_4bz3 = list(pro_4bz3_df1)
col_list_4bz3 = col_list_4bz3[1:]

# mutations going down from the leftmost column
mutations_4bz3 = []

for column in col_list_4bz3:
    for mutation in pro_4bz3_df1["mutated_res"]:
        mutations_4bz3.append(column + mutation)

# getting scores
scores_4bz3 = []

for column in pro_4bz3_df1.drop('mutated_res', axis=1):
    for val in pro_4bz3_df1[column]:
        scores_4bz3.append(val)
    
# adding to df and renaming variant and score to match formatting for other proteins
pro_4bz3_df = pd.DataFrame(list(zip(mutations_4bz3, scores_4bz3)),
               columns =['variant', 'score'])

# i have the data, next is formatting it so i can add the indexes

In [49]:
print(len(pro_4bz3_df))
pro_4bz3_df = pro_4bz3_df.dropna(axis=0)
print(len(pro_4bz3_df))

4620
4554


In [50]:
# rounding score column to 6 decimal points
pro_4bz3_df["score"] = pro_4bz3_df["score"].round(6)

# shuffling
pro_4bz3_df = pro_4bz3_df.sample(frac=1)

# splitting variant list if there are multiple mutations
pro_4bz3_mut = pro_4bz3_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pro_4bz3_df["WILD_TYPE_RES"] = ssf.get_wild_type(pro_4bz3_mut)

# get mutated residue and place in seperate col
pro_4bz3_df["MUTATED_RES"] = ssf.get_mutation_type(pro_4bz3_mut)

# get position and place in seperate col
pro_4bz3_df["POSITION"] = ssf.get_position(pro_4bz3_mut)

# need positionssplit
pro_4bz3_df["positions_split"] = ssf.get_positions_split(pro_4bz3_df)

positions_split_subtracted = []
for pos_list in pro_4bz3_df["positions_split"]:
    pos_list = [x - 32 for x in pos_list] # reset index at 301
    positions_split_subtracted.append(pos_list)  

pro_4bz3_df["positions_split"] = positions_split_subtracted    
    
new_positions = []
pos_string = ""
for pos_list in pro_4bz3_df["positions_split"]:
    pos_string = ",".join(map(str, pos_list))
    new_positions.append(pos_string)
    pos_string = ""

pro_4bz3_df["POSITION"] = new_positions # changes positions into new adjusted values (0 index)
# replace variant column with reformatted variant name

# replace variant column with reformatted variant name
pro_4bz3_df["variant"] = ssf.get_mutations_names_list(pro_4bz3_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

In [51]:
print(len(pro_4bz3_df))

4554


In [52]:
print(pro_4bz3_df.head(10))

     variant     score WILD_TYPE_RES MUTATED_RES POSITION positions_split
3586  180HIS -0.099294             A           H      180           [180]
4181  210CYS -1.664693             L           C      210           [210]
3596  180THR -0.182043             A           T      180           [180]
2623  131GLU -0.395186             D           E      131           [131]
487    24ILE -7.565904             H           I       24            [24]
1171   58ASN  0.503021             K           N       58            [58]
3765  189GLY  0.076992             T           G      189           [189]
1904   95PHE -1.452111             R           F       95            [95]
4434  222ARG  0.030239             N           R      222           [222]
4034  202ARG -0.102308             Q           R      202           [202]


In [55]:
path = "../PDB and STRIDE Files/" + '4bz3_stride.txt'
pro_4bz3_stride_file = open(path, 'r')
protein_seq_4bz3 = get_seq_from_stride(pro_4bz3_stride_file)

In [56]:
protein_seq_4bz3_split = protein_seq_4bz3.split()
print(len(protein_seq_4bz3_split))
print(protein_seq_4bz3_split[51])

463
ILE


In [57]:
path = "../PDB and STRIDE Files/" + '4bz3_stride.txt'
pro_4bz3_stride_file = open(path, 'r')

pro_4bz3_ss_indexes = ssf.get_all_sec_struc_boolean(pro_4bz3_stride_file)
print(len(pro_4bz3_ss_indexes))
print(pro_4bz3_ss_indexes.count(True))
print(pro_4bz3_ss_indexes.count(False))

# add in_sec_str_col
pro_4bz3_df = add_sec_str_col(pro_4bz3_df, pro_4bz3_ss_indexes, 0)

463
380
83


In [61]:
pro_4bz3_train_df, pro_4bz3_test_df, pro_4bz3_remaining_df = get_train_and_test_df(pro_4bz3_df, 0.82, 908)
pro_4bz3_df_format = pd.concat([pro_4bz3_train_df, pro_4bz3_test_df])
print(len(pro_4bz3_df_format))
ssf.write_data_file("pro_4bz3_MLformat_908_train_3588_test_t3", protein_seq_4bz3, pro_4bz3_df_format)

in fraction,df len4554
744
164
Train Data Fraction: 0.819
false_df len811
true_df len3743
in fraction,df len3646
Test Data Fraction: 0.82
false_df len647
true_df len2999
Size of Test Dataset: 3588
Size of Total Dataset: 4496
4496
Filename: pro_4bz3_MLformat_908_train_3588_test_t3.txt
