# Datasets for Other Proteins

This notebook formats the data for the additional proteins used in the analysis.

In [2]:
# import statements
import os
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from io import StringIO
import Bio.PDB.Polypeptide

In [3]:
# notebook display options: let pandas show up to 1000 rows / columns before truncating
max_rows = max_cols = 1000
pd.set_option("display.max_rows", max_rows)
pd.set_option("display.max_columns", max_cols)

### Methods Used to Format Data

Formatting a protein sequence into the form used for machine learning:

In [11]:
def get_protein_seq(uniprot_id):
    """Fetch a protein's sequence from UniProt and return it in expanded form.

    Parameters:
        uniprot_id - string representing the UniProt accession of the desired
                     protein (e.g. "P00644").

    Returns:
        The value of get_expanded_seq() applied to the single-letter sequence
        string (e.g. "MADIT..."). get_expanded_seq is defined outside this
        cell; presumably it produces the three-letter residue convention --
        verify against its definition.

    Raises:
        requests.HTTPError - UniProt answered with an error status
                             (e.g. unknown accession).
        ValueError         - the response did not contain a FASTA record.
    """
    # Download the FASTA record. Adapted from StackOverflow:
    # https://stackoverflow.com/questions/52569622/protein-sequence-from-uniprot-protein-id-python
    # Retrieval is a plain GET on the UniProt REST endpoint; the timeout keeps
    # the notebook from hanging forever if the service is unreachable.
    fasta_url = "https://rest.uniprot.org/uniprotkb/" + uniprot_id + ".fasta"
    response = requests.get(fasta_url, timeout=30)
    response.raise_for_status()

    # Only the first record is used; an empty body yields no record at all.
    record = next(SeqIO.parse(StringIO(response.text), 'fasta'), None)
    if record is None:
        raise ValueError("No FASTA record returned by UniProt for id: " + str(uniprot_id))

    # protein sequence as string (single-letter amino acids)
    string_seq = str(record.seq)

    # hand the single-letter string to the expansion helper
    return get_expanded_seq(string_seq)

Getting secondary structure assignments from a STRIDE file:

In [12]:
# Parameters:
#      "stride_file" - iterable over the lines of a protein's STRIDE file
#                      (e.g. an open file handle)
# Returns:
#      list of boolean values, one per line starting with 'ASG', in file order:
#      False when the assignment code (6th whitespace-separated field) is
#      'C' or 'T', True for every other code
def get_sec_struc_boolean(stride_file):
    unstructured_codes = ('C', 'T')

    # collect the one-letter assignment code of every ASG record
    assigned_codes = [
        record.split()[5]
        for record in stride_file
        if record.startswith('ASG')
    ]

    # a position counts as secondary structure unless its code is 'C' or 'T'
    return [code not in unstructured_codes for code in assigned_codes]

### Metadata from compiled dataset

In [4]:
# load the compiled dataset and list its columns
path = "../Raw Data/all_data_clean.csv"
df = pd.read_csv(path)
print(df.columns)

# ten most frequent proteins, then every row recorded for Lysozyme
print(df["PROTEIN"].value_counts().iloc[:10])
print(df[df["PROTEIN"].eq("Lysozyme")])

Index(['DATABASE', 'PROTEIN', 'UNIPROT_ID', 'MUTATION', 'SOURCE', 'PBD_WILD',
       'PBD_CHAIN_MUTATION', 'pH', 'T_(C)', 'Tm_(C)', 'dTm_(C)',
       'dH_(kcal/mol)', 'dG_(kcal/mol)', 'ddG_(kcal/mol)',
       'ddG_H2O_(kcal/mol)', 'STATE', 'REVERSIBILITY', 'PUBMED_ID',
       'REFERENCE', 'MUTATED_CHAIN', 'KINGDOM', 'PBD_MUTANT', 'MEASURE',
       'METHOD', 'POSITION', 'WILD_TYPE_RES', 'MUTATED_RES', 'IS_CURATED',
       'CONSERVATION', 'NOTES', 'DATASETS'],
      dtype='object')
Lysozyme                                                   2897
Immunoglobulin G-binding protein G                         1996
Thermonuclease                                             1586
Staphylococcal nuclease                                    1457
Endolysin                                                  1110
Ribonuclease                                                983
Ribonuclease HI                                             710
Guanine nucleotide-binding protein G(i) subunit alpha-1     704
Myo

In [5]:
# ten most frequent UniProt IDs -- 4 possible could be used?
print(df["UNIPROT_ID"].value_counts().iloc[:10])

P00644    3033
P00720    2767
P06654    2297
P61626    1146
P00648     981
P00651     904
P0A7Y4     722
P63096     698
P00044     546
P00698     491
Name: UNIPROT_ID, dtype: int64


#### P00644 (Thermonuclease)

In [6]:
# rows for P00644; the per-column non-null counts show which column has the most values
nuclease_df = df[df["UNIPROT_ID"].eq("P00644")]

print(nuclease_df.shape[0])
nuclease_df.count()  # using ddG_

3033


DATABASE              3033
PROTEIN               3033
UNIPROT_ID            3033
MUTATION              2901
SOURCE                2901
PBD_WILD              3031
PBD_CHAIN_MUTATION    1360
pH                    3031
T_(C)                 2577
Tm_(C)                 380
dTm_(C)                585
dH_(kcal/mol)           48
dG_(kcal/mol)           43
ddG_(kcal/mol)        1282
ddG_H2O_(kcal/mol)    1068
STATE                   12
REVERSIBILITY         1450
PUBMED_ID             3006
REFERENCE             2901
MUTATED_CHAIN         2943
KINGDOM               1451
PBD_MUTANT              23
MEASURE               1556
METHOD                1556
POSITION              2901
WILD_TYPE_RES         2901
MUTATED_RES           2901
IS_CURATED             132
CONSERVATION           131
NOTES                   24
DATASETS                29
dtype: int64

In [14]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'thermonuclease_stride.txt'
with open(path, 'r') as stride_handle:
    nuclease_stride_file = stride_handle.readlines()

In [15]:
# boolean list of secondary structure assignments
nuclease_ss_indexes = get_sec_struc_boolean(stride_file=nuclease_stride_file)

In [17]:
# positions flagged as secondary structure vs. not
ss = sum(nuclease_ss_indexes)
not_ss = len(nuclease_ss_indexes) - ss
print(ss, not_ss, sep="\n")

144
87


#### P00720 (Endolysin)

In [9]:
# rows for P00720, followed by per-column non-null counts
endolysin_df = df[df["UNIPROT_ID"].eq("P00720")]
print(endolysin_df.shape[0])
endolysin_df.count()

2767


DATABASE              2767
PROTEIN               2767
UNIPROT_ID            2767
MUTATION              2482
SOURCE                2482
PBD_WILD              2767
PBD_CHAIN_MUTATION    1512
pH                    2766
T_(C)                 1520
Tm_(C)                1117
dTm_(C)               1633
dH_(kcal/mol)           56
dG_(kcal/mol)            4
ddG_(kcal/mol)        1376
ddG_H2O_(kcal/mol)     158
STATE                   89
REVERSIBILITY         1656
PUBMED_ID             2711
REFERENCE             2482
MUTATED_CHAIN         2623
KINGDOM                825
PBD_MUTANT             378
MEASURE               1055
METHOD                1055
POSITION              2482
WILD_TYPE_RES         2482
MUTATED_RES           2482
IS_CURATED             285
CONSERVATION           285
NOTES                  116
DATASETS                81
dtype: int64

In [20]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'endolysin_stride.txt'
with open(path, 'r') as stride_handle:
    endolysin_stride_file = stride_handle.readlines()

In [21]:
# boolean list of secondary structure assignments
endolysin_ss_indexes = get_sec_struc_boolean(stride_file=endolysin_stride_file)

In [22]:
# positions flagged as secondary structure vs. not
ss = sum(endolysin_ss_indexes)
not_ss = len(endolysin_ss_indexes) - ss
print(ss, not_ss, sep="\n")

122
41


#### P06654 (Immunoglobulin G-binding protein G)

In [10]:
# rows for P06654, followed by per-column non-null counts
protein_G_df = df[df["UNIPROT_ID"].eq("P06654")]
print(protein_G_df.shape[0])
protein_G_df.count()

2297


DATABASE              2297
PROTEIN               2297
UNIPROT_ID            2297
MUTATION              2274
SOURCE                2274
PBD_WILD              2297
PBD_CHAIN_MUTATION    1078
pH                    2295
T_(C)                 2130
Tm_(C)                 155
dTm_(C)                185
dH_(kcal/mol)           13
dG_(kcal/mol)           54
ddG_(kcal/mol)        1221
ddG_H2O_(kcal/mol)     862
STATE                  834
REVERSIBILITY         1121
PUBMED_ID             2289
REFERENCE             2274
MUTATED_CHAIN         2254
KINGDOM               1136
PBD_MUTANT               8
MEASURE               1166
METHOD                1168
POSITION              2274
WILD_TYPE_RES         2274
MUTATED_RES           2274
IS_CURATED              23
CONSERVATION            20
NOTES                    6
DATASETS                 2
dtype: int64

In [26]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'protein_G_stride.txt'
with open(path, 'r') as stride_handle:
    protein_G_stride_file = stride_handle.readlines()

In [27]:
# boolean list of secondary structure assignments
protein_G_ss_indexes = get_sec_struc_boolean(stride_file=protein_G_stride_file)

In [28]:
# positions flagged as secondary structure vs. not
ss = sum(protein_G_ss_indexes)
not_ss = len(protein_G_ss_indexes) - ss
print(ss, not_ss, sep="\n")

237
211


From Gelman et al.

**avGFP**

In [30]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'avgfp_stride.txt'
with open(path, 'r') as stride_handle:
    avgfp_stride_file = stride_handle.readlines()

In [31]:
# boolean list of secondary structure assignments
avgfp_ss_indexes = get_sec_struc_boolean(stride_file=avgfp_stride_file)

In [32]:
# positions flagged as secondary structure vs. not
ss = sum(avgfp_ss_indexes)
not_ss = len(avgfp_ss_indexes) - ss
print(ss, not_ss, sep="\n")

152
85


**GB1**

In [35]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'gb1_stride.txt'
with open(path, 'r') as stride_handle:
    gb1_stride_file = stride_handle.readlines()

In [36]:
# boolean list of secondary structure assignments
gb1_ss_indexes = get_sec_struc_boolean(stride_file=gb1_stride_file)

In [37]:
# positions flagged as secondary structure vs. not
ss = sum(gb1_ss_indexes)
not_ss = len(gb1_ss_indexes) - ss
print(ss, not_ss, sep="\n")

39
17


**GAL4**

In [38]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'gal4_stride.txt'
with open(path, 'r') as stride_handle:
    gal4_stride_file = stride_handle.readlines()

In [39]:
# boolean list of secondary structure assignments
gal4_ss_indexes = get_sec_struc_boolean(stride_file=gal4_stride_file)

In [40]:
# positions flagged as secondary structure vs. not
ss = sum(gal4_ss_indexes)
not_ss = len(gal4_ss_indexes) - ss
print(ss, not_ss, sep="\n")

415
466


**Alpha-synuclein**

In [41]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'alpha-synuclein_stride.txt'
with open(path, 'r') as stride_handle:
    alpha_synuclein_stride_file = stride_handle.readlines()

In [42]:
# boolean list of secondary structure assignments
alpha_synuclein_ss_indexes = get_sec_struc_boolean(stride_file=alpha_synuclein_stride_file)

In [44]:
# positions flagged as secondary structure vs. not
ss = sum(alpha_synuclein_ss_indexes)
not_ss = len(alpha_synuclein_ss_indexes) - ss
print(ss, not_ss, sep="\n")

92
48


**Small ubiquitin-related modifier 1**

In [45]:
# importing STRIDE file
# The lines are read eagerly inside a `with` block so the handle is closed
# right away; a list of lines (unlike an open file object) is not exhausted
# after one pass, so the parsing cell can safely be re-run.
path = "../PDB and STRIDE Files/" + 'modifier_1_stride.txt'
with open(path, 'r') as stride_handle:
    modifier_1_stride_file = stride_handle.readlines()

In [46]:
# boolean list of secondary structure assignments
modifier_1_ss_indexes = get_sec_struc_boolean(stride_file=modifier_1_stride_file)

In [47]:
# positions flagged as secondary structure vs. not
ss = sum(modifier_1_ss_indexes)
not_ss = len(modifier_1_ss_indexes) - ss
print(ss, not_ss, sep="\n")

47
54
