In [5]:
import pandas as pd

## Data Preparation

In [6]:
# Load the two PDB tables: one carrying the sequences, one carrying the
# structure-level properties.
df_seq = pd.read_csv('pdb_data_seq.csv')
df_properties = pd.read_csv('pdb_data_no_dups.csv')

# Inner join on the shared key. Columns that exist in both tables come back
# with pandas' default `_x` (sequence table) / `_y` (property table) suffixes.
df_total = pd.merge(df_seq, df_properties, on='structureId')


In [7]:
# Inspect the merged frame's columns; names present in both CSVs appear twice,
# with `_x` / `_y` suffixes.
df_total.columns

Index(['structureId', 'chainId', 'sequence', 'residueCount_x',
       'macromoleculeType_x', 'classification', 'experimentalTechnique',
       'macromoleculeType_y', 'residueCount_y', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear'],
      dtype='object')

#### Select only proteins, then keep the top N classifications

In [8]:
# Keep only the rows whose `macromoleculeType_x` (the value coming from the
# sequence table) is exactly 'Protein'.
df_total = df_total.loc[df_total['macromoleculeType_x'].eq('Protein')]

# The ten most frequent classification labels; missing labels are counted as
# their own category because of dropna=False.
count = df_total['classification'].value_counts(dropna=False).iloc[:10]

# Restrict the data to rows belonging to one of those ten labels.
df_selected = df_total.loc[df_total['classification'].isin(count.index)]
# Alternative kept for reference (ignores missing labels, counts after selection):
#count = df_selected['classification'].value_counts(dropna=True)[:10]

In [46]:
# Display the ten most frequent classification labels with their row counts.
count

HYDROLASE                        46336
TRANSFERASE                      36424
OXIDOREDUCTASE                   34322
IMMUNE SYSTEM                    15615
LYASE                            11682
HYDROLASE/HYDROLASE INHIBITOR    11218
TRANSCRIPTION                     8919
VIRAL PROTEIN                     8495
TRANSPORT PROTEIN                 8371
VIRUS                             6972
Name: classification, dtype: int64

#### Select proteins with only one chain

In [9]:
# TODO: optionally restrict the data set to structures that have exactly one chain.
# Open question: how do we justify this restriction?
#df_onechain = df_selected[df_selected.groupby('structureId').structureId.transform(len) == 1]

In [9]:
# Keep just the structure id, the target label and the raw sequence.
# (Use `df_onechain` instead of `df_selected` if the single-chain filter is enabled.)
test_df = df_selected.loc[:, ['structureId', 'classification', 'sequence']]

Things to be done

Further filter the data to simplify the problem, e.g. keep only proteins with a single chain?


Figure out how to convert the sequence data into arrays and then train a model on them. (22 - n -n)


More models and discussion (*LSTM)

Models built on features other than the sequence.



## Data balancing

In [10]:
from sklearn.utils import resample

# Balance the selected classes: keep the largest class as it is, up-sample
# every other class (with replacement) to the same number of rows, then draw a
# fixed-size working subset from the balanced frame.
#
# NOTE(review): the up-sampling happens before the train/test split done later
# in the notebook, so copies of the same row can end up in both splits and
# inflate the test scores. Consider splitting first and balancing only the
# training part.

RANDOM_STATE = 123   # seed shared by the resampling and the final draw
SAMPLE_SIZE = 50000  # number of rows kept for model training

keylist = list(count.index)    # labels ordered from most to least frequent
majority_label = keylist[0]

df_majority = test_df[test_df['classification'] == majority_label]
n_majority = len(df_majority)  # target size per class, derived instead of hard-coded

# One up-sampled frame per remaining class, in the order of `keylist`.
datalist = [
    resample(
        test_df[test_df['classification'] == label],
        replace=True,                # sample with replacement
        n_samples=n_majority,        # match the majority class
        random_state=RANDOM_STATE,   # reproducible results
    )
    for label in keylist[1:]
]

# Concatenate once instead of growing the frame inside a loop.
df_balanced = pd.concat([df_majority] + datalist)

# Seeded so that the subset (and everything trained on it) is reproducible;
# never ask for more rows than the balanced frame holds.
data = df_balanced.sample(n=min(SAMPLE_SIZE, len(df_balanced)),
                          random_state=RANDOM_STATE)
    

## Model Training

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier

In [19]:
# For quicker experiments `data` can instead be taken straight from `test_df`,
# optionally sub-sampled:
#data = test_df#.sample(50000)

# Rows with a missing value in any column are discarded before splitting.
data = data.dropna(axis=0, how='any')

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['sequence'],
    data['classification'],
    test_size=0.1,
    random_state=1,
)

In [20]:
# Per-class row counts in the sampled data (first ten entries, most frequent first).
test_count = data['classification'].value_counts(dropna=False).iloc[:10]

In [31]:
# Class labels present in the sampled data, as a NumPy object array.
# The pandas accessor is used because `numpy` is only imported in a later cell,
# so `np.asarray(...)` here would fail when the notebook is run top to bottom.
test_count.index.values

array(['HYDROLASE', 'VIRUS', 'HYDROLASE/HYDROLASE INHIBITOR',
       'IMMUNE SYSTEM', 'LYASE', 'OXIDOREDUCTASE', 'TRANSPORT PROTEIN',
       'TRANSCRIPTION', 'TRANSFERASE', 'VIRAL PROTEIN'], dtype=object)

#### Feature Extraction From Sequence Data

In [None]:
"""
#vectorize data, prepare for building models
#Convert a collection of text documents to a matrix of token counts
#this seems to capture only character frequencies, not the order of residues in the sequence

#ngram is a parameter we need to focus on, 

vect = CountVectorizer(analyzer = 'char_wb', ngram_range = (1,1))
#vect = CountVectorizer(analyzer = 'char_wb')

# Fit and Transform CountVectorizer
#occasionally may meet np.nan error
vect.fit(X_train)
X_train_df = vect.transform(X_train)
X_test_df = vect.transform(X_test)

#to store the results for different methods
prediction = dict()
"""

Iterate through models

In [25]:
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,recall_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.neural_network import MLPClassifier
import time

In [23]:
# Classifiers to compare, keyed by the short name used when reporting results.
# Every estimator is created with its default hyper-parameters.
extra_models = {
    'NB': MultinomialNB(),
    'adaboost': AdaBoostClassifier(),
    'randomForest': RandomForestClassifier(),
    'Kneighbors': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'ExtraTree': ExtraTreesClassifier(),
    'SVM': SVC(),
}

# Parallel lists, in the same order as the dictionary above.
model_names = list(extra_models.keys())
models = list(extra_models.values())

In [29]:
# Show the top-10 class counts of the full (pre-balancing) data again for reference.
count

HYDROLASE                        46336
TRANSFERASE                      36424
OXIDOREDUCTASE                   34322
IMMUNE SYSTEM                    15615
LYASE                            11682
HYDROLASE/HYDROLASE INHIBITOR    11218
TRANSCRIPTION                     8919
VIRAL PROTEIN                     8495
TRANSPORT PROTEIN                 8371
VIRUS                             6972
Name: classification, dtype: int64

In [None]:
from tqdm import tqdm

# Evaluate every classifier in `extra_models` on character n-gram count
# features for n = 1..5. Each result dictionary maps a model name to a list
# with one entry per n, in increasing order of n.
precisions = {}          # accuracy scores (variable name kept for compatibility)
recalls = {}             # reserved; recall is currently not computed
confusion_matrixs = {}   # confusion matrices: rows = true label, columns = predicted
score_table = {}         # classification_report text blocks
types = np.asarray(test_count.index)  # label order used for matrices and reports

init_time = time.time()
for i in tqdm(range(1, 6)):
    # Feature extraction: count vectorisation over character i-grams.
    # The vocabulary is learned from the training split only.
    vect = CountVectorizer(analyzer='char_wb', ngram_range=(i, i))
    X_train_df = vect.fit_transform(X_train)
    X_test_df = vect.transform(X_test)

    for key, model in extra_models.items():
        model.fit(X_train_df, y_train)
        y_pred = model.predict(X_test_df)

        # scikit-learn metrics expect (y_true, y_pred). Passing them the other
        # way round transposes the confusion matrix and swaps precision with
        # recall in the report (accuracy itself is symmetric).
        accuracy = accuracy_score(y_test, y_pred)
        precisions.setdefault(key, []).append(accuracy)
        #recalls.setdefault(key, []).append(recall_score(y_test, y_pred))
        confusion_matrixs.setdefault(key, []).append(
            confusion_matrix(y_test, y_pred, labels=types))
        score_table.setdefault(key, []).append(
            classification_report(y_test, y_pred, labels=types))

        print('time elapsed:', time.time() - init_time)
        print(i, key, accuracy)






  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

time elapsed: 6.282205581665039
1 NB 0.3702
time elapsed: 11.75955581665039
1 adaboost 0.3824
time elapsed: 16.93671202659607
1 randomForest 0.8884
time elapsed: 28.89974856376648
1 Kneighbors 0.8074
time elapsed: 31.874820232391357
1 DecisionTree 0.8656
time elapsed: 38.63072609901428
1 ExtraTree 0.883




time elapsed: 164.06582617759705
1 SVM 0.8572







 20%|████████████████▌                                                                  | 1/5 [02:44<10:56, 164.07s/it]

time elapsed: 174.71338176727295
2 NB 0.4758
time elapsed: 191.8634898662567
2 adaboost 0.3672
time elapsed: 215.21606874465942
2 randomForest 0.8748
time elapsed: 252.87243175506592
2 Kneighbors 0.805
time elapsed: 290.78999495506287
2 DecisionTree 0.8586
time elapsed: 321.80706000328064
2 ExtraTree 0.8768




time elapsed: 1618.5239880084991
2 SVM 0.8294







 40%|█████████████████████████████████▏                                                 | 2/5 [26:58<27:33, 551.19s/it]

time elapsed: 1631.9959318637848
3 NB 0.6548
time elapsed: 1659.830552816391
3 adaboost 0.3342
time elapsed: 1710.1370558738708
3 randomForest 0.8766
time elapsed: 1729.1023654937744
3 Kneighbors 0.742
time elapsed: 1836.4708368778229
3 DecisionTree 0.8656
time elapsed: 1899.9940252304077
3 ExtraTree 0.8814


