In [None]:
# 1). ----- Import Libraries and Datasets ------

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load the sequence table and the 'no_dups' table from /work
# (same read order as before: sequences first).
df_seq, df_char = (
    pd.read_csv(csv_path)
    for csv_path in ('/work/pdb_data_seq.csv', '/work/pdb_data_no_dups.csv')
)

print('Datasets have been loaded...')

Datasets have been loaded...


In [None]:
# 2). ----- Filter and Process Dataset ------

# Keep only the rows whose macromoleculeType is 'Protein' and, in the same
# selection, only the columns needed for the join on structureId.
protein_char = df_char.loc[
    df_char['macromoleculeType'] == 'Protein', ['structureId', 'classification']
]
protein_seq = df_seq.loc[
    df_seq['macromoleculeType'] == 'Protein', ['structureId', 'sequence']
]
protein_seq.head()

Unnamed: 0,structureId,sequence
4,101M,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
7,102L,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...
8,102M,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
11,103L,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...
12,103M,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [None]:
# Preview the first rows of the filtered classification table
# (columns: structureId, classification).
protein_char.head()

Unnamed: 0,structureId,classification
2,101M,OXYGEN TRANSPORT
4,102L,HYDROLASE(O-GLYCOSYL)
5,102M,OXYGEN TRANSPORT
7,103L,HYDROLASE(O-GLYCOSYL)
8,103M,OXYGEN TRANSPORT


In [None]:
# Index both tables by structureId, then left-join the sequences onto the
# classifications (DataFrame.join aligns on the index and defaults to how='left').
char_by_id = protein_char.set_index('structureId')
seq_by_id = protein_seq.set_index('structureId')
model_f = char_by_id.join(seq_by_id)
model_f.head()


Unnamed: 0_level_0,classification,sequence
structureId,Unnamed: 1_level_1,Unnamed: 2_level_1
101M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
102L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...
102M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
103L,HYDROLASE(O-GLYCOSYL),MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...
103M,OXYGEN TRANSPORT,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [None]:
# Keep only the rows in which every column is present (no missing
# classification and no missing sequence).
model_f = model_f.loc[model_f.notna().all(axis=1)]


In [None]:
# Number of records per classification label, most frequent first
counts = model_f['classification'].value_counts()


In [None]:
# Only classification types with more than this many records are kept
MIN_CLASS_COUNT = 1000

# Labels that clear the threshold, in the same (descending-count) order as `counts`
is_frequent = counts > MIN_CLASS_COUNT
types = counts.loc[is_frequent].index.to_numpy()

# Restrict the joined dataset to records carrying one of those labels
data = model_f.loc[model_f['classification'].isin(types)]

print(types)
print('%d is the number of records in the final filtered dataset' % len(data))


['HYDROLASE' 'TRANSFERASE' 'OXIDOREDUCTASE' 'IMMUNE SYSTEM' 'LYASE'
 'HYDROLASE/HYDROLASE INHIBITOR' 'TRANSCRIPTION' 'VIRAL PROTEIN'
 'TRANSPORT PROTEIN' 'VIRUS' 'SIGNALING PROTEIN' 'ISOMERASE' 'LIGASE'
 'MEMBRANE PROTEIN' 'PROTEIN BINDING' 'STRUCTURAL PROTEIN' 'CHAPERONE'
 'STRUCTURAL GENOMICS, UNKNOWN FUNCTION' 'SUGAR BINDING PROTEIN'
 'DNA BINDING PROTEIN' 'PHOTOSYNTHESIS' 'ELECTRON TRANSPORT'
 'TRANSFERASE/TRANSFERASE INHIBITOR' 'METAL BINDING PROTEIN'
 'CELL ADHESION' 'UNKNOWN FUNCTION' 'PROTEIN TRANSPORT' 'TOXIN'
 'CELL CYCLE' 'RNA BINDING PROTEIN' 'DE NOVO PROTEIN' 'HORMONE'
 'GENE REGULATION' 'OXIDOREDUCTASE/OXIDOREDUCTASE INHIBITOR' 'APOPTOSIS'
 'MOTOR PROTEIN' 'PROTEIN FIBRIL' 'METAL TRANSPORT'
 'VIRAL PROTEIN/IMMUNE SYSTEM' 'CONTRACTILE PROTEIN' 'FLUORESCENT PROTEIN'
 'TRANSLATION' 'BIOSYNTHETIC PROTEIN']
278866 is the number of records in the final filtered dataset


In [None]:
# 3). ----- Train Test Split -----

# Split Data: 80% train / 20% test; random_state is fixed so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['sequence'], data['classification'], test_size=0.2, random_state=1
)

# Create a Count Vectorizer that counts character 4-grams in each sequence and
# keeps only the 165 most frequent ones as features.
vect = CountVectorizer(analyzer='char_wb', ngram_range=(4, 4), max_features=165)

# Learn the vocabulary from the training split only, then encode both splits
# with that same vocabulary so the feature columns line up.
X_train_df = vect.fit_transform(X_train)
X_test_df = vect.transform(X_test)

# Print a few of the features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names[-20:])

['tilk', 'tlee', 'tllv', 'tlqe', 'tlra', 'tltl', 'tsgg', 'tstl', 'tvgg', 'vaaa', 'vaag', 'vaal', 'vdlg', 'veal', 'veel', 'vlaa', 'vprg', 'vtvs', 'xxxx', 'yfqs']


In [None]:
# 4). ------ Machine Learning Models ------

# Accuracy of every fitted model, keyed by model name
prediction = {}

# Naive Bayes Model: fit on the n-gram counts, then score on the held-out split
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train_df, y_train)
NB_pred = model.predict(X_test_df)

# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric in the two
nb_accuracy = accuracy_score(y_test, NB_pred)
prediction["MultinomialNB"] = nb_accuracy
print(nb_accuracy)


0.19634596765517984


In [None]:
# sample input
input12 = ["MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAAELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"]

# Encode the sample with the vocabulary that was learned from X_train.
# Do NOT call vect.fit() here: refitting on a single sequence replaces the
# training vocabulary, so the feature columns would no longer correspond to the
# columns the model was trained on and the prediction would be meaningless.
testingforme = vect.transform(input12)

# put out the model prediction
print(model.predict(testingforme)[0])

HYDROLASE


![Picture title](image-20211207-032528.png)

![Picture title](image-20211207-025852.png)

In [None]:
# Adaboost: same fit / predict / score sequence as the Naive Bayes cell.
# Note that this rebinds `model`, so later cells that use `model` get AdaBoost.
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier().fit(X_train_df, y_train)
ADA_pred = model.predict(X_test_df)

ada_accuracy = accuracy_score(y_test, ADA_pred)
prediction["Adaboost"] = ada_accuracy
print(ada_accuracy)


0.1982106357801126


In [None]:
# Predicted class of the sample sequence according to the current `model`
sample_label = model.predict(testingforme)[0]
print(sample_label)

TRANSFERASE


In [None]:
# Show the held-out sequences (a Series indexed by structureId).
print(X_test)

structureId
3J7I    MRECISIHVGQAGVQIGNACWELYCLEHGIQPDGQMPSDKTIGGGD...
5U58    MDVRTLAVGKAHLEALLATRKMTLEHLQDVRHDATQVYFDGLEHLQ...
5T4P    MENLNMDLLYMAAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLI...
4XQ3    GAMDMQAKVENPLKSLRTAINRIVLVKLKDGSEYIGKLEQTDGTMN...
2WBI    SMSKRTFSTVLPQIDTTGQLFVQTRKGQEVLIKVKHFMKQHILPAE...
                              ...                        
1QUQ    MVDMMDLPRSRINAGMLAQFIDKPVCFVGRLEKIHPTGKMFILSDG...
2CLZ    IQKTPQIQVYSRHPPENGKPNILNCYVTQFHPPHIEIQMLKNGKKI...
5SXU    APADNAADARPVDVSVSIFINKIYGVNTLEQTYKVDGYIVAQWTGK...
2Q5R    MGSSHHHHHHSSGLVPRGSHMILTLTLNPSVDISYPLTALKLDDVN...
3FOW    MALDNLLRHLKISKEQITPVVLVVGDPGRVDKIKVVCDSYVDLAYN...
Name: sequence, Length: 55774, dtype: object


In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
# target_names is deliberately NOT passed: classification_report lists labels in
# sorted order, whereas `types` is ordered by descending class count, so passing
# it as target_names attached the wrong class name to every row. Without
# target_names the report uses the actual label values as row names.
print(classification_report(y_test, NB_pred))

                                         precision    recall  f1-score   support

                              HYDROLASE       0.03      0.03      0.03       250
                            TRANSFERASE       0.23      0.06      0.09       211
                         OXIDOREDUCTASE       0.05      0.06      0.06       589
                          IMMUNE SYSTEM       0.19      0.07      0.10       509
                                  LYASE       0.34      0.21      0.26       859
          HYDROLASE/HYDROLASE INHIBITOR       0.33      0.43      0.38       224
                          TRANSCRIPTION       0.16      0.21      0.18       326
                          VIRAL PROTEIN       0.02      0.01      0.02       622
                      TRANSPORT PROTEIN       0.00      0.00      0.00       601
                                  VIRUS       0.09      0.27      0.13       209
                      SIGNALING PROTEIN       0.12      0.42      0.19       309
                           

"![Picture title](image-20211207-024057.png)

![Picture title](image-20211207-023201.png)

![Picture title](image-20211207-023134.png)

In [None]:
# Install the Anvil Uplink client package (shell escape; runs pip in the
# notebook's environment). Presumably this provides the `anvil.server` module
# used in the cells below - confirm against the package docs.

!pip install anvil-uplink

Collecting anvil-uplink
  Downloading anvil_uplink-0.3.41-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 5.9 MB/s 
[?25hCollecting ws4py
  Downloading ws4py-0.5.1.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 438 kB/s 
[?25hCollecting argparse
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25ldone
[?25h  Created wheel for ws4py: filename=ws4py-0.5.1-py3-none-any.whl size=45217 sha256=f1f3105a6616bce1a70302c50dcdea90c1b23e1904a8e1e3996de3a688554197
  Stored in directory: /root/.cache/pip/wheels/29/ea/7d/3410aa0aa0e4402ead9a7a97ab2214804887e0f5c2b76f0c96
Successfully built ws4py
Installing collected packages: ws4py, argparse, anvil-uplink
Successfully installed anvil-uplink-0.3.41 argparse-1.4.0 ws4py-0.5.1
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# connect to anvil

import os
from getpass import getpass

import anvil.server

# The uplink key is a credential and must not be committed with the notebook.
# Read it from the ANVIL_UPLINK_KEY environment variable, or prompt for it when
# the variable is not set. Any key that was previously stored in this notebook
# should be treated as leaked and rotated in the Anvil app settings.
anvil_uplink_key = os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: ")

anvil.server.connect(anvil_uplink_key)


Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


In [None]:
# take in the input from Anvil and run it through the model

import anvil.media

@anvil.server.callable
def classify_protein(sequence_input):
    """Predict the classification label for one protein sequence.

    Registered with Anvil via ``anvil.server.callable`` so it can be invoked
    from the connected Anvil app.

    Args:
        sequence_input: The amino-acid sequence as a single string.

    Returns:
        The label predicted by the module-level ``model`` (whichever
        classifier was most recently assigned to that name), as a plain ``str``.

    Raises:
        TypeError: If ``sequence_input`` is not a string. Previously this case
            fell through to an unbound local variable error.
    """
    if not isinstance(sequence_input, str):
        raise TypeError(
            "sequence_input must be a str, got %s" % type(sequence_input).__name__
        )

    # Only transform: the vectorizer must keep the vocabulary it learned from
    # the training data. Refitting it on the incoming sequence would replace
    # that vocabulary (for every later call as well) and feed the model
    # feature columns that do not match the ones it was trained on.
    features = vect.transform([sequence_input])

    # predict() returns one label per input row; there is exactly one row here.
    return str(model.predict(features)[0])

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d08c32ac-6fc1-4441-b12d-cc77c2d43474' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>