# Datasets for Other Proteins

This notebook formats data for the additional protein data used.

In [3]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide
import re

In [4]:
import secStrucFormatting as ssf

In [3]:
# setting jupyter notebook viewing options
max_rows = 1000
max_cols = 1000
pd.set_option("display.max_rows", max_rows, "display.max_columns", max_cols)

### Metadata from compiled dataset

In [4]:
path = "../Raw Data/" + 'all_data_clean.csv'
df = pd.read_csv(path)
# print(df.head)
print(df.columns)

print(df["PROTEIN"].value_counts().head(10))
print(df.loc[df['PROTEIN'] == 'Lysozyme'])

Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'ddG_(kcal/mol)',
       'ddG_H2O_(kcal/mol)', 'STATE', 'REVERSIBILITY', 'PUBMED_ID',
       'REFERENCE', 'MUTATED_CHAIN', 'KINGDOM', 'PBD_MUTANT', 'MEASURE',
       'METHOD', 'POSITION', 'WILD_TYPE_RES', 'MUTATED_RES', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS'],
      dtype='object')
Lysozyme                                                   2897
Immunoglobulin G-binding protein G                         1996
Thermonuclease                                             1586
Staphylococcal nuclease                                    1457
Endolysin                                                  1110
Ribonuclease                                                983
Ribonuclease HI                                             710
Guanine nucleotide-binding protein G(i) subunit alpha-1     704
Myo

In [5]:
print(df["UNIPROT_ID"].value_counts().head(10)) # 4 possible could be used?

P00644    3033
P00720    2767
P06654    2297
P61626    1146
P00648     981
P00651     904
P0A7Y4     722
P63096     698
P00044     546
P00698     491
Name: UNIPROT_ID, dtype: int64


#### P00644 (Thermonuclease)

In [6]:
# finding column with most values

# print(df.loc[df['UNIPROT_ID'] == 'P00644'])
nuclease_df = df.loc[df['UNIPROT_ID'] == 'P00644']

print(len(nuclease_df))
nuclease_df.count() 
# using ddG_

3033


DATABASE              3033
PROTEIN               3033
UNIPROT_ID            3033
MUTATION              2901
SOURCE                2901
PBD_WILD              3031
PBD_CHAIN_MUTATION    1360
pH                    3031
T_(C)                 2577
Tm_(C)                 380
dTm_(C)                585
dH_(kcal/mol)           48
dG_(kcal/mol)           43
ddG_(kcal/mol)        1282
ddG_H2O_(kcal/mol)    1068
STATE                   12
REVERSIBILITY         1450
PUBMED_ID             3006
REFERENCE             2901
MUTATED_CHAIN         2943
KINGDOM               1451
PBD_MUTANT              23
MEASURE               1556
METHOD                1556
POSITION              2901
WILD_TYPE_RES         2901
MUTATED_RES           2901
IS_CURATED             132
CONSERVATION           131
NOTES                   24
DATASETS                29
dtype: int64

In [7]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'thermonuclease_stride.txt'
nuclease_stride_file = open(path, 'r')

In [8]:
nuclease_ss_indexes = ssf.get_sec_struc_boolean(nuclease_stride_file) # boolean list of secondary structure assignements

In [9]:
ss = nuclease_ss_indexes.count(True)
not_ss = nuclease_ss_indexes.count(False)
print(ss)
print(not_ss)

144
87


#### P00720 (Endolysin)

In [10]:
endolysin_df = df.loc[df['UNIPROT_ID'] == 'P00720']
print(len(endolysin_df))
endolysin_df.count()

2767


DATABASE              2767
PROTEIN               2767
UNIPROT_ID            2767
MUTATION              2482
SOURCE                2482
PBD_WILD              2767
PBD_CHAIN_MUTATION    1512
pH                    2766
T_(C)                 1520
Tm_(C)                1117
dTm_(C)               1633
dH_(kcal/mol)           56
dG_(kcal/mol)            4
ddG_(kcal/mol)        1376
ddG_H2O_(kcal/mol)     158
STATE                   89
REVERSIBILITY         1656
PUBMED_ID             2711
REFERENCE             2482
MUTATED_CHAIN         2623
KINGDOM                825
PBD_MUTANT             378
MEASURE               1055
METHOD                1055
POSITION              2482
WILD_TYPE_RES         2482
MUTATED_RES           2482
IS_CURATED             285
CONSERVATION           285
NOTES                  116
DATASETS                81
dtype: int64

In [11]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'endolysin_stride.txt'
endolysin_stride_file = open(path, 'r')

In [12]:
endolysin_ss_indexes = ssf.get_sec_struc_boolean(endolysin_stride_file) # boolean list of secondary structure assignements

In [13]:
ss = endolysin_ss_indexes.count(True)
not_ss = endolysin_ss_indexes.count(False)
print(ss)
print(not_ss)

122
41


#### P06654 (Immunoglobulin G-binding protein G)

In [281]:
protein_G_df = df.loc[df['UNIPROT_ID'] == 'P06654']
# print(protein_G_df.head)
print(len(protein_G_df))
protein_G_df.count()
# print(protein_G_df.head)
protein_G_df = protein_G_df[protein_G_df["ddG_(kcal/mol)"].notna()]
print(len(protein_G_df))

2297
1221


In [282]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'protein_G_stride.txt'
protein_G_stride_file = open(path, 'r')

In [16]:
protein_G_ss_indexes = ssf.get_sec_struc_boolean(protein_G_stride_file) # boolean list of secondary structure assignements

In [239]:
# getting protein sequence
string_seq = "MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRNGGELTNLLGNSETTLALRNEESATADLTAAAVADTVAAAAAENAGAAAWEAAAAADALAKAKADALKEFNKYGVSDYYKNLINNAKTVEGIKDLQAQVVESAKKARISEATDGLSDFLKSQTPAEDTVKSIELAEAKVLANRELDKYGVSDYHKNLINNAKTVEGVKELIDEILAALPKTDTYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEKPEVIDASELTPAVTTYKLVINGKTLKGETTTKAVDAETAEKAFKQYANDNGVDGVWTYDDATKTFTVTEMVTEVPGDAPTEPEKPEASIPLVPLTPATPIAKDDAKKDDTKKEDAKKPEAKKDDAKKAETLPTTGEGSNPFFTAAALAVMAGAGALAVASKRKED"

In [272]:
protein_seq_protein_G = ssf.get_expanded_seq(string_seq)
print(protein_seq_protein_G)

MET GLU LYS GLU LYS LYS VAL LYS TYR PHE LEU ARG LYS SER ALA PHE GLY LEU ALA SER VAL SER ALA ALA PHE LEU VAL GLY SER THR VAL PHE ALA VAL ASP SER PRO ILE GLU ASP THR PRO ILE ILE ARG ASN GLY GLY GLU LEU THR ASN LEU LEU GLY ASN SER GLU THR THR LEU ALA LEU ARG ASN GLU GLU SER ALA THR ALA ASP LEU THR ALA ALA ALA VAL ALA ASP THR VAL ALA ALA ALA ALA ALA GLU ASN ALA GLY ALA ALA ALA TRP GLU ALA ALA ALA ALA ALA ASP ALA LEU ALA LYS ALA LYS ALA ASP ALA LEU LYS GLU PHE ASN LYS TYR GLY VAL SER ASP TYR TYR LYS ASN LEU ILE ASN ASN ALA LYS THR VAL GLU GLY ILE LYS ASP LEU GLN ALA GLN VAL VAL GLU SER ALA LYS LYS ALA ARG ILE SER GLU ALA THR ASP GLY LEU SER ASP PHE LEU LYS SER GLN THR PRO ALA GLU ASP THR VAL LYS SER ILE GLU LEU ALA GLU ALA LYS VAL LEU ALA ASN ARG GLU LEU ASP LYS TYR GLY VAL SER ASP TYR HIS LYS ASN LEU ILE ASN ASN ALA LYS THR VAL GLU GLY VAL LYS GLU LEU ILE ASP GLU ILE LEU ALA ALA LEU PRO LYS THR ASP THR TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA 

In [248]:
protein_seq_protein_G_split = protein_seq_protein_G.split()

In [283]:
protein_G_df = protein_G_df[protein_G_df["POSITION"].str.contains("pga_A") == False]

protein_G_df["variant"] = ssf.get_mutations_names_list(protein_G_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# rename to score
protein_G_df.rename(columns = {'ddG_(kcal/mol)':'score'}, inplace = True)

protein_G_df = protein_G_df.drop(columns=to_drop)
print(protein_G_df.columns)


Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'score', 'ddG_H2O_(kcal/mol)',
       'STATE', 'REVERSIBILITY', 'PUBMED_ID', 'REFERENCE', 'MUTATED_CHAIN',
       'KINGDOM', 'PBD_MUTANT', 'MEASURE', 'METHOD', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS', 'variant'],
      dtype='object')


In [284]:
protein_G_df = protein_G_df.sample(n=320)

In [275]:
print(len(protein_G_df))

320


In [285]:
print(protein_G_df["score"])

30262   -0.950000
30473   -3.310000
21789    0.520000
30722    0.480000
24082    4.100000
30707    0.200000
30857   -1.350000
30832    0.440000
27182   -0.200000
31021   -0.270000
31001   -0.910000
30878    0.120000
15198   -0.860000
30515   -4.000000
30598    0.120000
27187   -3.500000
30373    0.040000
30553   -0.230000
30506   -2.670000
30575   -0.050000
30408   -1.280000
28478    1.075526
30955    0.250000
24119    1.500000
30821    0.550000
30529   -0.300000
30187    0.150000
30988   -2.070000
30543   -0.140000
24100    2.500000
30690   -0.420000
30306    0.570000
30422    0.520000
15197   -0.450000
13675   -0.300000
30465   -0.190000
10027    0.520000
30115    0.310000
22728   -2.820000
30541    0.020000
9945     2.430000
30772    0.310000
30705   -0.090000
30472   -1.100000
24091    3.140000
30250   -0.130000
30511    0.260000
30415    0.610000
30658   -4.000000
30796    0.260000
30106    0.270000
30517   -2.910000
30440   -2.490000
30612    0.900000
30531    0.340000
30406   -2

In [286]:
ssf.write_data_file("protein_G_MLformat_320", protein_seq_protein_G, protein_G_df)

Filename: protein_G_MLformat_320.txt


From Gelman et al.

**avGFP**

In [220]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'avgfp_stride.txt'
avgfp_stride_file = open(path, 'r')

In [221]:
avgfp_ss_indexes = ssf.get_sec_struc_boolean(avgfp_stride_file) # boolean list of secondary structure assignements

In [20]:
ss = avgfp_ss_indexes.count(True)
not_ss = avgfp_ss_indexes.count(False)
print(ss)
print(not_ss)

152
85


Formatting Data

In [222]:
# importing avGFP data from Gelman et al.
avgfp_df1 = pd.read_csv("../Raw Data/avgfp.tsv.txt", sep="\t")
avgfp_df = avgfp_df1.dropna()
print(len(avgfp_df))
print(avgfp_df.columns)

54024
Index(['variant', 'num_mutations', 'score', 'score_wt_norm'], dtype='object')


In [223]:
# rounding score column to 2 decimal points
avgfp_df["score"] = avgfp_df["score"].round(6)
print(len(avgfp_df))

# remove values with wildcard star thing cause idk what it means
avgfp_df = avgfp_df[avgfp_df["variant"].str.contains("\*") == False]

# pab1_df = pab1_df.head(37600)
avgfp_df = avgfp_df.sample(n=160)
print(len(avgfp_df))

54024
160


In [224]:
# getting dataset size to run

string_seq = "SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(len(string_seq)) # <- domain length of 75
protein_seq_avgfp = ssf.get_expanded_seq(string_seq)
print(protein_seq_avgfp)

237
SER LYS GLY GLU GLU LEU PHE THR GLY VAL VAL PRO ILE LEU VAL GLU LEU ASP GLY ASP VAL ASN GLY HIS LYS PHE SER VAL SER GLY GLU GLY GLU GLY ASP ALA THR TYR GLY LYS LEU THR LEU LYS PHE ILE CYS THR THR GLY LYS LEU PRO VAL PRO TRP PRO THR LEU VAL THR THR LEU SER TYR GLY VAL GLN CYS PHE SER ARG TYR PRO ASP HIS MET LYS GLN HIS ASP PHE PHE LYS SER ALA MET PRO GLU GLY TYR VAL GLN GLU ARG THR ILE PHE PHE LYS ASP ASP GLY ASN TYR LYS THR ARG ALA GLU VAL LYS PHE GLU GLY ASP THR LEU VAL ASN ARG ILE GLU LEU LYS GLY ILE ASP PHE LYS GLU ASP GLY ASN ILE LEU GLY HIS LYS LEU GLU TYR ASN TYR ASN SER HIS ASN VAL TYR ILE MET ALA ASP LYS GLN LYS ASN GLY ILE LYS VAL ASN PHE LYS ILE ARG HIS ASN ILE GLU ASP GLY SER VAL GLN LEU ALA ASP HIS TYR GLN GLN ASN THR PRO ILE GLY ASP GLY PRO VAL LEU LEU PRO ASP ASN HIS TYR LEU SER THR GLN SER ALA LEU SER LYS ASP PRO ASN GLU LYS ARG ASP HIS MET VAL LEU LEU GLU PHE VAL THR ALA ALA GLY ILE THR HIS GLY MET ASP GLU LEU TYR LYS


In [225]:
# splitting variant list if there are multiple mutations
avgfp_mut = avgfp_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
avgfp_df["WILD_TYPE_RES"] = ssf.get_wild_type(avgfp_mut)

# get mutated residue and place in seperate col
avgfp_df["MUTATED_RES"] = ssf.get_mutation_type(avgfp_mut)

# get position and place in seperate col
avgfp_df["POSITION"] = ssf.get_position(avgfp_mut)

# replace variant column with reformatted variant name
avgfp_df["variant"] = ssf.get_mutations_names_list(avgfp_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

avgfp_df = avgfp_df.drop(columns=to_drop)

In [226]:
# writing data to txt file
ssf.write_data_file("avgfp_MLformat_160", protein_seq_avgfp, avgfp_df)

Filename: avgfp_MLformat_160.txt


**GB1**

In [204]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gb1_stride.txt'
gb1_stride_file = open(path, 'r')

In [75]:
gb1_ss_indexes = ssf.get_sec_struc_boolean(gb1_stride_file) # boolean list of secondary structure assignements

In [76]:
ss = gb1_ss_indexes.count(True)
not_ss = gb1_ss_indexes.count(False)
print(ss)
print(not_ss)

39
17


In [215]:
# importing pab1 data from Gelman et al.
gb1_df1 = pd.read_csv("../Raw Data/gb1.tsv.txt", sep="\t")
gb1_df = gb1_df1.dropna()
print(len(gb1_df))
# gb1_df = gb1_df.sample(n=480)
print(gb1_df.columns)

536084
Index(['variant', 'num_mutations', 'inp', 'sel', 'score'], dtype='object')


In [216]:
# rounding score column to 2 decimal points
gb1_df["score"] = gb1_df["score"].round(6)
print(len(gb1_df))

# remove values with wildcard star thing cause idk what it means
gb1_df = gb1_df[gb1_df["variant"].str.contains("\*") == False]

gb1_df = gb1_df.sample(n=40)
# pab1_df = pab1_df.head(37600)
print(len(gb1_df))

536084
40


In [183]:
# getting protein sequence
# protein_seq_gb1 = ssf.get_protein_seq("P04386")

In [217]:
# getting dataset size to run

string_seq = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
print(len(string_seq)) # <- domain length of 75
protein_seq_gb1 = ssf.get_expanded_seq(string_seq)
print(protein_seq_gb1)

56
MET GLN TYR LYS LEU ILE LEU ASN GLY LYS THR LEU LYS GLY GLU THR THR THR GLU ALA VAL ASP ALA ALA THR ALA GLU LYS VAL PHE LYS GLN TYR ALA ASN ASP ASN GLY VAL ASP GLY GLU TRP THR TYR ASP ASP ALA THR LYS THR PHE THR VAL THR GLU


In [218]:
# splitting variant list if there are multiple mutations
gb1_mut = gb1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
gb1_df["WILD_TYPE_RES"] = ssf.get_wild_type(gb1_mut)

# get mutated residue and place in seperate col
gb1_df["MUTATED_RES"] = ssf.get_mutation_type(gb1_mut)

# get position and place in seperate col
gb1_df["POSITION"] = ssf.get_position(gb1_mut)

# replace variant column with reformatted variant name
gb1_df["variant"] = ssf.get_mutations_names_list(gb1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

gb1_df = gb1_df.drop(columns=to_drop)

In [219]:
# writing data to txt file
ssf.write_data_file("gb1_MLformat_40", protein_seq_gb1, gb1_df)

Filename: gb1_MLformat_40.txt


**GAL4**

In [77]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'gal4_stride.txt'
gal4_stride_file = open(path, 'r')

In [78]:
gal4_ss_indexes = ssf.get_sec_struc_boolean(gal4_stride_file) # boolean list of secondary structure assignements

In [79]:
ss = gal4_ss_indexes.count(True)
not_ss = gal4_ss_indexes.count(False)
print(ss)
print(not_ss)

415
466


**Alpha-synuclein**

In [80]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'alpha-synuclein_stride.txt'
alpha_synuclein_stride_file = open(path, 'r')

In [81]:
alpha_synuclein_ss_indexes = ssf.get_sec_struc_boolean(alpha_synuclein_stride_file) # boolean list of secondary structure assignements

In [82]:
ss = alpha_synuclein_ss_indexes.count(True)
not_ss = alpha_synuclein_ss_indexes.count(False)
print(ss)
print(not_ss)

92
48


**Small ubiquitin-related modifier 1**

In [83]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
modifier_1_stride_file = open(path, 'r')

In [84]:
modifier_1_ss_indexes = ssf.get_sec_struc_boolean(modifier_1_stride_file) # boolean list of secondary structure assignements

In [85]:
ss = modifier_1_ss_indexes.count(True)
not_ss = modifier_1_ss_indexes.count(False)
print(ss)
print(not_ss)

47
54


**TAR DNA-binding protein 43**

In [86]:
# importing STRIDE file
path = "../PDB and STRIDE Files/" + 'tar_stride.txt'
tar_stride_file = open(path, 'r')

In [87]:
tar_ss_indexes = ssf.get_sec_struc_boolean(tar_stride_file) # boolean list of secondary structure assignements

In [88]:
ss = tar_ss_indexes.count(True)
not_ss = tar_ss_indexes.count(False)
print(ss)
print(not_ss)

149
265


**Human Glucokinase**

## Cleaning MaveDB Data

**GAL4**

In [146]:
gal4_df1 = pd.read_csv("../Raw Data/gal4.csv.csv")

In [147]:
# take note of offset
# format:

gal4_df1 = pd.read_csv("../Raw Data/gal4.csv.csv")
gal4_df1.columns = gal4_df1.iloc[3]
print(gal4_df1.columns)
# print(gal4_df1.head)
print(len(gal4_df1))

# first name the columns variant and score
# gal4_df1.rename(columns = {'Unnamed: 3':'variant', 'Unnamed: 4':'scores'}, inplace = True)
# print(gal4_df1.columns)
gal4_df = gal4_df1[(gal4_df1["hgvs_pro"].str.contains("del") == False) & (gal4_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (gal4_df1["hgvs_pro"].str.contains("Ter") == False)]
print(len(gal4_df))
# gal4_df.drop([0], axis=0, inplace=True)
# print(len(gal4_df))
#gal4_df = gal4_df2[gal4_df2["variant"].str.contains("hgvs") == True]
# print(len(gal4_df))

# print(gal4_df["variant"].iloc[0])
# print(gal4_df.iloc[0])

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score'], dtype='object', name=3)
1323
1196


In [61]:
# getting uniprot to compare offset
# protein_seq_gal4 = ssf.get_protein_seq("P04386")

In [37]:
protein_seq_gal4_list = protein_seq_gal4.split(" ")
# print(protein_seq_gal4_list)
print(protein_seq_gal4_list[41]) 

PRO


In [148]:
import re
# delete columns with "del" in it or *
# normalize scores?
def format_mavedb_variant(df, variant_col_name, offset):
    new_var_col = []
    for variant in df[variant_col_name]:
        wild_type = Bio.PDB.Polypeptide.three_to_one(variant[2:5].upper())
        position = int(re.findall("[0-9]+", variant)[0]) + offset
        mut_type = Bio.PDB.Polypeptide.three_to_one(variant[-3:].upper())
        new_var_col.append(wild_type + str(position) + mut_type)
    return new_var_col

In [152]:
col = format_mavedb_variant(gal4_df, "hgvs_pro", 0)
print(len(gal4_df))

1196


**Small ubiquitin-related modifier 1**

In [155]:
modifier_1_df1 = pd.read_csv("../Raw Data/modifier_1_mod.csv")

In [160]:
# print(modifier_1_df1.head)
print(modifier_1_df1.columns)
print(len(modifier_1_df1))

modifier_1_df = modifier_1_df1[(modifier_1_df1["hgvs_pro"].str.contains("=") == False) & (modifier_1_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (modifier_1_df1["hgvs_pro"].str.contains("Ter") == False)]
print(len(modifier_1_df))

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'sd', 'se',
       'exp.score', 'exp.sd', 'df', 'pred.score'],
      dtype='object')
2020
1919


In [None]:
# offset


**TAR DNA-binding protein 43**

In [162]:
tar_df_pt1 = pd.read_csv("../Raw Data/tar1_mod.csv")
tar_df_pt2 = pd.read_csv("../Raw Data/tar2_mod.csv")

In [167]:
print(len(tar_df_pt1))
print(len(tar_df_pt2))
tar_df1 = pd.concat([tar_df_pt1, tar_df_pt2])
print(len(tar_df1))

704
714
1418


In [168]:
print(tar_df1.columns)

tar_df = tar_df1[(tar_df1["hgvs_pro"].str.contains("\*") == False) & (tar_df1["hgvs_pro"].str.contains("hgvs") == False)
                   & (tar_df1["hgvs_pro"].str.contains("Ter") == False)]
print(len(tar_df))

Index(['accession', 'hgvs_nt', 'hgvs_splice', 'hgvs_pro', 'score', 'se'], dtype='object')
1342


## Getting Dataset Fraction

In [82]:
# adds boolean column to dataframe to indicate whether value is in secondary structure
# needs positions split column
def add_sec_str_col(df, bool_ss_list, domain_start_index):
    has_sec_str = []
    for val in df["positions_split"]:
        # list of boolean values that are true if all mutation positions in line are sec. strc.
        all_pos_sec_struc = []

        for position in val:
            if (bool_ss_list[position - domain_start_index] == False):  # line up ss_indexes w/ position
                all_pos_sec_struc.append(False)
            else:
                all_pos_sec_struc.append(True)

        # all pos sec struc should match val list
        # if there's a value in all_pos_sec_struc that's false, append false
        # otherwise, append true
        if (all_pos_sec_struc.count(False) == 0):
            has_only_sec_str = True
        else:
            has_only_sec_str = False

        has_sec_str.append(has_only_sec_str)
        all_pos_sec_struc.clear()

    # print(len(has_sec_str)) # should match dataframe length
    df['in_sec_str'] = has_sec_str
    return df

In [188]:
def get_fractioned_dataset(df, fraction_ss, size):
    if (size is not None): # need training dataset size
        num_in_ss = int(size*fraction_ss) 
        num_not_ss = size - num_in_ss
        
        real_fraction = round(float(num_in_ss)/(num_in_ss+num_not_ss), 3)
        print("Train Data Fraction: " + str(real_fraction))
    else: # need test dataset and size of dataset doesn't matter
        num_in_ss, num_not_ss = get_num_ss(df, fraction_ss)
        real_fraction = round(float(num_in_ss)/(num_in_ss+num_not_ss), 3)
        print("Test Data Fraction: " + str(real_fraction))
        
    true_df = df[df["in_sec_str"] == True]
    ss_df = true_df.sample(num_in_ss)

    false_df = df[df["in_sec_str"] == False]
    not_ss_df = false_df.sample(num_not_ss)
    
    fractioned_df = pd.concat([ss_df, not_ss_df]).sample(frac=1)
    
    remaining_df = pd.concat([fractioned_df, df])
    remaining_df = remaining_df[~remaining_df.index.duplicated(keep=False)]
    
    return fractioned_df, remaining_df

In [160]:
def get_num_ss(df, fraction_ss): # use when size is dependent on remaining values
    num_ss_vals = (df["in_sec_str"] == True).sum()
    print(num_ss_vals)
    num_not_ss_vals = (df["in_sec_str"] == False).sum()
    print(num_not_ss_vals)
    
    if (num_not_ss_vals < num_ss_vals):
        ideal_split_ss = (num_not_ss_vals/(1-fraction_ss))*fraction_ss
        ideal_split_not_ss = num_not_ss_vals
        # instead do min value, max value and fraction it is
    else:
        ideal_split_ss = num_ss_vals
        ideal_split_not_ss = (num_ss_vals/fraction_ss)*(1-fraction_ss)
    while not (ideal_split_ss <= num_ss_vals and ideal_split_not_ss <= num_not_ss_vals):
        
        if (ideal_split_ss < ideal_split_not_ss):
            ideal_split_ss = ideal_split_ss - 1
            ideal_split_not_ss = (ideal_split_ss/fraction_ss)*(1-fraction_ss)
        else:
            ideal_split_not_ss = ideal_split_not_ss - 1
            ideal_split_ss = (ideal_split_not_ss/(1-fraction_ss))*(fraction_ss)
    return int(ideal_split_ss), int(ideal_split_not_ss)

In [164]:
# gets both train and test datasets based on size
# fraction_ss is a percent (e.g. 0.56)

# need "is_sec_struc_col"

def get_train_and_test_df(df, fraction_ss, train_size):
    train_df, remaining_df = get_fractioned_dataset(df, fraction_ss, train_size)
    test_df, remaining_df = get_fractioned_dataset(remaining_df, fraction_ss, None)
    print("Size of Test Dataset: " + str(len(remaining_df)))
    return train_df, test_df


#### Train and Test Pab1

In [179]:
# importing pab1 data from Gelman et al.
pab1_df1 = pd.read_csv("../Raw Data/pab1.tsv.txt", sep="\t")
pab1_df = pab1_df1.dropna()
print(len(pab1_df))
print(pab1_df.columns)

40852
Index(['variant', 'num_mutations', 'score'], dtype='object')


In [180]:
# rounding score column to 6 decimal points
pab1_df["score"] = pab1_df["score"].round(6)
print(len(pab1_df))

# remove values with wildcard star next to them
pab1_df = pab1_df[pab1_df["variant"].str.contains("\*") == False]
print(len(pab1_df))
# change this value depending on amount of data needed for dataset
# pab1_df = pab1_df.sample(n=10000)
print(len(pab1_df))

40852
37710
37710


In [181]:
# splitting variant list if there are multiple mutations
pab1_mut = pab1_df["variant"].str.split(",")

# get wild type of residue and place in seperate col
pab1_df["WILD_TYPE_RES"] = ssf.get_wild_type(pab1_mut)

# get mutated residue and place in seperate col
pab1_df["MUTATED_RES"] = ssf.get_mutation_type(pab1_mut)

# get position and place in seperate col
pab1_df["POSITION"] = ssf.get_position(pab1_mut)

# replace variant column with reformatted variant name
pab1_df["variant"] = ssf.get_mutations_names_list(pab1_df)

# drop unneccesary columns
to_drop = ["WILD_TYPE_RES", "MUTATED_RES", "POSITION"]

# pab1_df = pab1_df.drop(columns=to_drop)
pab1_df["positions_split"] = ssf.get_positions_split(pab1_df)

In [182]:
print(len(pab1_df))

37710


In [84]:
path = "../PDB and STRIDE Files/" + 'pab1_stride.txt'
pab1_stride_file = open(path, 'r')

In [85]:
pab1_ss_indexes = ssf.get_sec_struc_boolean(pab1_stride_file) # boolean list of secondary structure assignements

In [183]:
# add column

pab1_df = add_sec_str_col(pab1_df, pab1_ss_indexes, 126)
# need positionssplit

In [90]:
print(pab1_df.columns)

Index(['variant', 'num_mutations', 'score', 'WILD_TYPE_RES', 'MUTATED_RES',
       'POSITION', 'positions_split', 'in_sec_str'],
      dtype='object')


In [189]:
test_train, test_test = get_train_and_test_df(pab1_df, 0.70, 100)

Train Data Fraction: 0.7
works
100 - train_df
37610 - remaining
17957
19653
Test Data Fraction: 0.7
doesn't work
Size of Test Dataset: 11958


In [186]:
print(test_train["in_sec_str"].value_counts())

True     70
False    30
Name: in_sec_str, dtype: int64


In [187]:
print(test_test["in_sec_str"].value_counts())

True     17957
False     7695
Name: in_sec_str, dtype: int64
