In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Data Preparation

In [2]:
# Read the two input tables from CSV files in the working directory.
df_seq = pd.read_csv('pdb_data_seq.csv')
df_properties = pd.read_csv('pdb_data_no_dups.csv')

# Join the tables on their shared 'structureId' key (default inner join).
df_total = pd.merge(df_seq, df_properties, on='structureId')


In [3]:
# Inspect the merged frame's columns. Columns that exist in both input tables
# show up twice, distinguished by the _x (left) / _y (right) merge suffixes.
df_total.columns

Index(['structureId', 'chainId', 'sequence', 'residueCount_x',
       'macromoleculeType_x', 'classification', 'experimentalTechnique',
       'macromoleculeType_y', 'residueCount_y', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear'],
      dtype='object')

#### Select only proteins, filtered to the top N classifications

In [4]:
# Frequency of each classification label (NaN counted as its own label);
# keep the 10 most frequent ones.
count = df_total['classification'].value_counts(dropna=False).head(10)

# Row masks: classification is one of the top labels, and the chain is a protein.
in_top_classes = df_total['classification'].isin(set(count.index))
is_protein = df_total['macromoleculeType_x'].isin({'Protein'})

# Apply both conditions at once.
df_selected = df_total[in_top_classes & is_protein]

#### Select proteins with only one chain

In [5]:
# Optional: keep only proteins with a single chain, i.e. structureIds that occur exactly once.
# TODO: justify this filter before enabling it.
#df_onechain = df_selected[df_selected.groupby('structureId').structureId.transform(len) == 1]

In [6]:
# Keep only the identifier, the target label and the raw sequence.
test_df = df_selected.loc[:, ['structureId', 'classification', 'sequence']]
# Alternative source if the single-chain filter is enabled:
#test_df = df_onechain[['structureId','classification','sequence']]

**Things to be done**

- Further narrow the data to simplify the problem, e.g. keep only proteins with a single chain?
- Figure out how to convert the sequence data into arrays, then train models on them.
- Try more models (e.g. LSTM) and discuss the results.
- Build models on features other than the sequence.



## Model Training: Focus on `ngram_range`

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.neural_network import MLPClassifier

In [8]:
# Optional: work on a random subset of the data for faster verification.
SAMPLE_SIZE = 100000  # upper bound on the number of rows used for modelling

# Remove rows with missing values *before* sampling, so the subset consists
# only of usable rows instead of shrinking by an unpredictable amount afterwards.
data = test_df.dropna()

# Cap n at the number of available rows (sampling more rows than exist without
# replacement raises) and seed the draw so re-running the cell gives the same subset.
data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=1)

# Hold out 10% of the rows as the test set; sequences are the features,
# classification labels are the targets.
X_train, X_test, y_train, y_test = train_test_split(
    data['sequence'], data['classification'], test_size=0.1, random_state=1)

#### Feature Extraction From Sequence Data

In [None]:
# Candidate classifiers, all with default hyper-parameters.
model_type = [KNeighborsClassifier(), MultinomialNB(), RandomForestClassifier(),
              AdaBoostClassifier(), GradientBoostingClassifier(),
              DecisionTreeClassifier(), ExtraTreesClassifier()]

# Maps n-gram size -> list of test accuracies, in the same order as model_type.
model_score = {}


def model_select(classifier, X_fit=None, X_eval=None):
    """Fit `classifier` on the training features and return its test accuracy.

    Parameters
    ----------
    classifier : estimator exposing fit/predict
        Fitted in place against the module-level `y_train`.
    X_fit, X_eval : feature matrices, optional
        Training and test features. When omitted, the module-level
        `X_train_df` / `X_test_df` are used, which keeps the original
        one-argument call working.

    Returns
    -------
    Accuracy of the predictions on `X_eval`, scored against the module-level `y_test`.
    """
    if X_fit is None:
        X_fit = X_train_df
    if X_eval is None:
        X_eval = X_test_df
    classifier.fit(X_fit, y_train)
    predictions = classifier.predict(X_eval)
    # accuracy_score expects (y_true, y_pred)
    return accuracy_score(y_test, predictions)


# ngram_range is the parameter under study: evaluate one n-gram size at a time.
# Start at 1 -- an n-gram of size 0 is empty and cannot carry sequence information.
for i in range(1, 10):
    # CountVectorizer turns each sequence into counts of its character n-grams;
    # only n-gram frequencies are kept, not where they occur in the sequence.
    vect = CountVectorizer(analyzer='char_wb', ngram_range=(i, i))
    # Learn the vocabulary on the training sequences only, then encode both splits.
    # Missing (NaN) sequences must have been removed beforehand, otherwise this can fail.
    X_train_df = vect.fit_transform(X_train)
    X_test_df = vect.transform(X_test)
    model_score[i] = [model_select(j, X_train_df, X_test_df) for j in model_type]



In [None]:
def model_select(classifier, X=None, y=None):
    """Return the mean 5-fold stratified cross-validation accuracy of `classifier`.

    NOTE: this definition replaces the train/test `model_select` defined in the
    n-gram sweep cell earlier in the notebook.

    Parameters
    ----------
    classifier : estimator accepted by cross_val_score
    X : feature matrix, optional
        Defaults to the module-level `X_train_df`, i.e. the vectorised training
        sequences left behind by the n-gram sweep. The raw `X_train` strings
        cannot be used here because the classifiers need numeric features.
    y : labels, optional
        Defaults to the module-level `y_train`.

    Returns
    -------
    A one-element list `[mean_accuracy]`; the list wrapper is kept because the
    results are later collected into a DataFrame.
    """
    if X is None:
        X = X_train_df
    if y is None:
        y = y_train
    # Cross validate model with Kfold stratified cross val
    kfold = StratifiedKFold(n_splits=5)
    fold_scores = cross_val_score(classifier, X, y=y, scoring="accuracy", cv=kfold, n_jobs=4)
    return [np.mean(fold_scores)]
# Fitting all the models
# MultinomialNB is the naive Bayes variant already imported by this notebook
# and it works on the sparse count features (GaussianNB was never imported).
model_type = [KNeighborsClassifier(),MultinomialNB(),RandomForestClassifier(),
              AdaBoostClassifier(),GradientBoostingClassifier(),DecisionTreeClassifier(),ExtraTreesClassifier()]
model_score = [model_select(i) for i in model_type]

In [None]:
# Display names, in the same order as the entries of model_score.
classifier = ['KNeighbors','Naive Bayes','Random Forest','AdaBoost','Gradient Boosting','Decision Tree','Extra Trees']

# Place the results in a DataFrame: one row per model, best accuracy first.
# Built without inplace mutation so re-running the cell is idempotent.
ml_model = pd.DataFrame(model_score, classifier).reset_index()
ml_model.columns = ['Model', 'acc_score']
ml_model = ml_model.sort_values('acc_score', ascending=False).reset_index(drop=True)

f, ax = plt.subplots(figsize=(10, 8))
# x / y must be passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.barplot(x='acc_score', y='Model', data=ml_model, ax=ax, palette='RdBu_r', edgecolor=".2")

# Write each bar's value just past its right-hand end.
for bar in ax.patches:
    # get_width pulls left or right; get_y pushes up or down
    ax.text(bar.get_width() + .01, bar.get_y() + .55,
            str(round(bar.get_width(), 2)), fontsize=12, color='black')

kwargs = {'length': 3, 'width': 1, 'colors': 'black', 'labelsize': 'large'}
ax.tick_params(**kwargs)
# The x axis is hidden because the values are printed next to the bars.
ax.axes.get_xaxis().set_visible(False)
ax.set_title('Model & Accuracy Score', fontsize=16)
sns.despine(bottom=True)
plt.show()