# Datasets for Other Proteins

This notebook formats the datasets for the additional proteins used in the analysis.

In [2]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide

In [54]:
import secStrucFormatting as ssf

In [3]:
# Notebook display limits: pandas may print up to 1000 rows and 1000 columns
# before it starts truncating a frame.
max_rows = 1000
max_cols = 1000
pd.set_option("display.max_rows", max_rows)
pd.set_option("display.max_columns", max_cols)

### Metadata from compiled dataset

In [56]:
# Load the compiled dataset and take a first look at its contents.
raw_data_dir = "../Raw Data/"
path = raw_data_dir + 'all_data_clean.csv'
df = pd.read_csv(path)
print(df.columns)

# Ten most frequent protein names, then every row labelled 'Lysozyme'.
protein_counts = df["PROTEIN"].value_counts()
print(protein_counts.head(10))
is_lysozyme = df['PROTEIN'] == 'Lysozyme'
print(df.loc[is_lysozyme])

Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'ddG_(kcal/mol)',
       'ddG_H2O_(kcal/mol)', 'STATE', 'REVERSIBILITY', 'PUBMED_ID',
       'REFERENCE', 'MUTATED_CHAIN', 'KINGDOM', 'PBD_MUTANT', 'MEASURE',
       'METHOD', 'POSITION', 'WILD_TYPE_RES', 'MUTATED_RES', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS'],
      dtype='object')
Lysozyme                                                   2897
Immunoglobulin G-binding protein G                         1996
Thermonuclease                                             1586
Staphylococcal nuclease                                    1457
Endolysin                                                  1110
Ribonuclease                                                983
Ribonuclease HI                                             710
Guanine nucleotide-binding protein G(i) subunit alpha-1     704
Myo

In [57]:
# Ten most frequent UniProt accessions in the compiled dataset; used to pick
# the proteins examined in the sections below.
print(df["UNIPROT_ID"].value_counts().head(10)) # 4 possible could be used?

P00644    3033
P00720    2767
P06654    2297
P61626    1146
P00648     981
P00651     904
P0A7Y4     722
P63096     698
P00044     546
P00698     491
Name: UNIPROT_ID, dtype: int64


#### P00644 (Thermonuclease)

In [58]:
# Subset the compiled dataset to UniProt P00644 and count the non-null entries
# per column to see which measurement column is best populated.
is_nuclease = df['UNIPROT_ID'] == 'P00644'
nuclease_df = df.loc[is_nuclease]

print(len(nuclease_df))
# Author's note from the original cell: using ddG_
# The bare expression below is the cell's displayed output.
nuclease_df.count()

3033


DATABASE              3033
PROTEIN               3033
UNIPROT_ID            3033
MUTATION              2901
SOURCE                2901
PBD_WILD              3031
PBD_CHAIN_MUTATION    1360
pH                    3031
T_(C)                 2577
Tm_(C)                 380
dTm_(C)                585
dH_(kcal/mol)           48
dG_(kcal/mol)           43
ddG_(kcal/mol)        1282
ddG_H2O_(kcal/mol)    1068
STATE                   12
REVERSIBILITY         1450
PUBMED_ID             3006
REFERENCE             2901
MUTATED_CHAIN         2943
KINGDOM               1451
PBD_MUTANT              23
MEASURE               1556
METHOD                1556
POSITION              2901
WILD_TYPE_RES         2901
MUTATED_RES           2901
IS_CURATED             132
CONSERVATION           131
NOTES                   24
DATASETS                29
dtype: int64

In [59]:
# Open the thermonuclease STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'thermonuclease_stride.txt'
nuclease_stride_file = open(path)

In [60]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    nuclease_ss_indexes = ssf.get_sec_struc_boolean(nuclease_stride_file)
finally:
    nuclease_stride_file.close()

In [61]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the thermonuclease list, printing one count per line.
ss, not_ss = (nuclease_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

144
87


#### P00720 (Endolysin)

In [62]:
# Subset the compiled dataset to UniProt P00720 and report how many rows and
# how many non-null values per column it has.
is_endolysin = df['UNIPROT_ID'] == 'P00720'
endolysin_df = df.loc[is_endolysin]
print(len(endolysin_df))
# The bare expression below is the cell's displayed output.
endolysin_df.count()

2767


DATABASE              2767
PROTEIN               2767
UNIPROT_ID            2767
MUTATION              2482
SOURCE                2482
PBD_WILD              2767
PBD_CHAIN_MUTATION    1512
pH                    2766
T_(C)                 1520
Tm_(C)                1117
dTm_(C)               1633
dH_(kcal/mol)           56
dG_(kcal/mol)            4
ddG_(kcal/mol)        1376
ddG_H2O_(kcal/mol)     158
STATE                   89
REVERSIBILITY         1656
PUBMED_ID             2711
REFERENCE             2482
MUTATED_CHAIN         2623
KINGDOM                825
PBD_MUTANT             378
MEASURE               1055
METHOD                1055
POSITION              2482
WILD_TYPE_RES         2482
MUTATED_RES           2482
IS_CURATED             285
CONSERVATION           285
NOTES                  116
DATASETS                81
dtype: int64

In [63]:
# Open the endolysin STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'endolysin_stride.txt'
endolysin_stride_file = open(path)

In [64]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    endolysin_ss_indexes = ssf.get_sec_struc_boolean(endolysin_stride_file)
finally:
    endolysin_stride_file.close()

In [66]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the endolysin list, printing one count per line.
ss, not_ss = (endolysin_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

122
41


#### P06654 (Immunoglobulin G-binding protein G)

In [67]:
# Subset the compiled dataset to UniProt P06654 and report how many rows and
# how many non-null values per column it has.
is_protein_g = df['UNIPROT_ID'] == 'P06654'
protein_G_df = df.loc[is_protein_g]
print(len(protein_G_df))
# The bare expression below is the cell's displayed output.
protein_G_df.count()

2297


DATABASE              2297
PROTEIN               2297
UNIPROT_ID            2297
MUTATION              2274
SOURCE                2274
PBD_WILD              2297
PBD_CHAIN_MUTATION    1078
pH                    2295
T_(C)                 2130
Tm_(C)                 155
dTm_(C)                185
dH_(kcal/mol)           13
dG_(kcal/mol)           54
ddG_(kcal/mol)        1221
ddG_H2O_(kcal/mol)     862
STATE                  834
REVERSIBILITY         1121
PUBMED_ID             2289
REFERENCE             2274
MUTATED_CHAIN         2254
KINGDOM               1136
PBD_MUTANT               8
MEASURE               1166
METHOD                1168
POSITION              2274
WILD_TYPE_RES         2274
MUTATED_RES           2274
IS_CURATED              23
CONSERVATION            20
NOTES                    6
DATASETS                 2
dtype: int64

In [68]:
# Open the protein G STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'protein_G_stride.txt'
protein_G_stride_file = open(path)

In [69]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    protein_G_ss_indexes = ssf.get_sec_struc_boolean(protein_G_stride_file)
finally:
    protein_G_stride_file.close()

In [70]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the protein G list, printing one count per line.
ss, not_ss = (protein_G_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

237
211


From Gelman et al.

**avGFP**

In [71]:
# Open the avGFP STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'avgfp_stride.txt'
avgfp_stride_file = open(path)

In [72]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    avgfp_ss_indexes = ssf.get_sec_struc_boolean(avgfp_stride_file)
finally:
    avgfp_stride_file.close()

In [73]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the avGFP list, printing one count per line.
ss, not_ss = (avgfp_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

152
85


Formatting Data

In [89]:
# Load the avGFP variant table (tab-separated, from Gelman et al.) and keep
# only the rows without missing values; the unfiltered frame stays in avgfp_df1.
avgfp_df1 = pd.read_csv("../Raw Data/avgfp.tsv.txt", sep="\t")
avgfp_df = avgfp_df1.dropna()
# Row count first, then the column index, one per line.
print(len(avgfp_df), avgfp_df.columns, sep="\n")

54024
Index(['variant', 'num_mutations', 'score', 'score_wt_norm'], dtype='object')


In [90]:
# Round the score column to 6 decimal places. `assign` returns a new frame, so
# the column is never written into a frame derived from another one.
avgfp_df = avgfp_df.assign(score=avgfp_df["score"].round(6))
print(len(avgfp_df))

# Drop every variant whose name contains "*".
# NOTE(review): "*" presumably marks a stop codon (nonsense mutation); confirm
# against the dataset documentation.
#   - regex=False matches a literal "*"; the previous pattern "\*" relied on an
#     invalid string escape, which newer Python versions warn about.
#   - na=True keeps the previous handling of missing names: they are dropped.
#   - .copy() gives an independent frame, so the column assignments made in
#     later cells do not trigger SettingWithCopyWarning.
has_star = avgfp_df["variant"].str.contains("*", regex=False, na=True)
avgfp_df = avgfp_df[~has_star].copy()

print(len(avgfp_df))

54024
51714


In [92]:
# Break each comma-separated variant string into its individual mutations and
# derive one column per component (wild-type residue, mutated residue, position).
avgfp_mut = avgfp_df["variant"].str.split(",")
for column, extract in (
    ("WILD_TYPE_RES", ssf.get_wild_type),
    ("MUTATED_RES", ssf.get_mutation_type),
    ("POSITION", ssf.get_position),
):
    avgfp_df[column] = extract(avgfp_mut)
# Runs last, after the three columns above exist: the helper receives the whole
# frame and its result overwrites the original "variant" column.
avgfp_df["variant"] = ssf.get_mutations_names_list(avgfp_df)

In [93]:
# Add a "positions_split" column computed by ssf.get_positions_split from the
# whole frame. NOTE(review): presumably the per-variant mutated positions as
# separate values; confirm against secStrucFormatting.
avgfp_df["positions_split"] = ssf.get_positions_split(avgfp_df)
# Debug leftover from the pab1 notebook (pab1_df does not exist here):
# print(pab1_df["positions_split"].tail(40))

In [None]:
# NOTE(review): this cell has no execution count and looks adapted from a pab1
# notebook; as written it cannot run from a fresh kernel:
#   - get_domain_dataset is called unqualified, while the executed cells reach
#     their helpers through `ssf`; presumably ssf.get_domain_dataset. Confirm.
#   - not_included_pab1 is not defined in the cells visible here.
#   - 126 and 201 are presumably the pab1 domain bounds; confirm the residue
#     range that applies to avGFP.
avgfp_in_domain_df = get_domain_dataset(avgfp_df, 126, 201, not_included_pab1)

In [None]:
# NOTE(review): unexecuted leftover from a pab1 notebook. get_ss_dataset,
# pab_in_domain_df and pab1_ss_indexes are not defined in the cells visible
# here (the preceding cell produces avgfp_in_domain_df, and the STRIDE list for
# this protein is avgfp_ss_indexes). The offset 126 is presumably the pab1
# domain start; confirm before adapting this cell to avGFP.
pab1_ss_df = get_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 126)
print(len(pab1_ss_df))
# 5828 values (count recorded for pab1, not for avGFP)

In [None]:
# Mini-dataset of 800 randomly sampled rows, used to compare mutations inside
# secondary structure against those outside it. The variable keeps its
# historical "_3000" suffix because later cells refer to it by that name.
# random_state pins the draw so Restart & Run All reproduces the same subset.
pab1_ss_df_3000 = pab1_ss_df.sample(n=800, random_state=0)
print(len(pab1_ss_df_3000))

In [None]:
# NOTE(review): unexecuted leftover from a pab1 notebook. get_not_ss_dataset,
# pab_in_domain_df and pab1_ss_indexes are not defined in the cells visible
# here; 126 is presumably the pab1 domain start offset. Confirm before reuse.
pab1_not_ss_df = get_not_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 126)
print(len(pab1_not_ss_df))

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim. The final cell
# expects a pab1_not_ss_df_3000 that no visible cell creates, so this was
# possibly meant to be the sampling step for the non-secondary-structure set.
# Confirm and replace or delete.
pab1_not_ss_df = get_not_ss_dataset(pab_in_domain_df, pab1_ss_indexes, 126)
print(len(pab1_not_ss_df))

In [None]:
# NOTE(review): get_protein_seq is not defined in the cells visible here.
# "P04147" is presumably the UniProt accession of pab1 rather than avGFP;
# confirm which accession this section should fetch.
protein_seq_pab1 = get_protein_seq("P04147")

In [None]:
# Split the sequence string on whitespace and print how many tokens it yields.
# NOTE(review): presumably one token per residue; confirm the format returned
# by get_protein_seq.
split_pab1 = protein_seq_pab1.split()
print(len(split_pab1))

In [None]:
# write data to formatted txt file
# NOTE(review): write_data_file is not defined in the cells visible here, and
# pab1_not_ss_df_3000 is never assigned in them (only pab1_ss_df_3000 is), so
# the second call would raise NameError. The output names also still say
# "pab1" although this section is about avGFP.

write_data_file("pab1_MLformat_ss_800_v3_lim", protein_seq_pab1, pab1_ss_df_3000)
write_data_file("pab1_MLformat_not_ss_800_v3_lim", protein_seq_pab1, pab1_not_ss_df_3000)

**GB1**

In [74]:
# Open the GB1 STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'gb1_stride.txt'
gb1_stride_file = open(path)

In [75]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    gb1_ss_indexes = ssf.get_sec_struc_boolean(gb1_stride_file)
finally:
    gb1_stride_file.close()

In [76]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the GB1 list, printing one count per line.
ss, not_ss = (gb1_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

39
17


**GAL4**

In [77]:
# Open the GAL4 STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'gal4_stride.txt'
gal4_stride_file = open(path)

In [78]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    gal4_ss_indexes = ssf.get_sec_struc_boolean(gal4_stride_file)
finally:
    gal4_stride_file.close()

In [79]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the GAL4 list, printing one count per line.
ss, not_ss = (gal4_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

415
466


**Alpha-synuclein**

In [80]:
# Open the alpha-synuclein STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'alpha-synuclein_stride.txt'
alpha_synuclein_stride_file = open(path)

In [81]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    alpha_synuclein_ss_indexes = ssf.get_sec_struc_boolean(alpha_synuclein_stride_file)
finally:
    alpha_synuclein_stride_file.close()

In [82]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the alpha-synuclein list, printing one count per line.
ss, not_ss = (alpha_synuclein_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

92
48


**Small ubiquitin-related modifier 1**

In [83]:
# Open the SUMO-1 (modifier 1) STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path)

In [84]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    modifier_1_ss_indexes = ssf.get_sec_struc_boolean(modifier_1_stride_file)
finally:
    modifier_1_stride_file.close()

In [85]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the modifier 1 list, printing one count per line.
ss, not_ss = (modifier_1_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

47
54


**TAR DNA-binding protein 43**

In [86]:
# Open the TAR DNA-binding protein 43 STRIDE file for reading (text mode is the default).
# The handle is consumed by the cell that calls ssf.get_sec_struc_boolean.
stride_dir = "../PDB and STRIDE Files/"
path = stride_dir + 'tar_stride.txt'
tar_stride_file = open(path)

In [87]:
# Parse the STRIDE file into a boolean list of secondary structure assignments.
# The handle was opened without a context manager, so close it here once the
# parse has finished (or failed) instead of leaking it for the kernel's lifetime.
# Re-running this cell alone now needs the file-opening cell to be re-run first.
try:
    tar_ss_indexes = ssf.get_sec_struc_boolean(tar_stride_file)
finally:
    tar_stride_file.close()

In [88]:
# Tally the True entries (flagged as secondary structure) and the False
# entries of the TAR DNA-binding protein 43 list, printing one count per line.
ss, not_ss = (tar_ss_indexes.count(flag) for flag in (True, False))
print(ss, not_ss, sep="\n")

149
265


**Human Glucokinase**