<a href="https://colab.research.google.com/github/gitakartika/related-projects/blob/master/Protein%20Antioxidant%20Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /content/drive (Colab only); later cells read their
# input files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
pip install biopython

In [0]:
pip install discere

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import discere.discere as di
import itertools as it
import os,pickle
import shutil, pkg_resources
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.decomposition import PCA
from itertools import cycle
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, accuracy_score

In [0]:
# Run discere's feature extraction on the positive and negative training FASTA files.
# The third argument is a directory; the next cell reads 'output/tain_DL.csv' from
# under it, so the feature CSV is presumably written there by this call — TODO confirm.
di.extract_feature('/content/drive/My Drive/PemodelanMatematika/positive_training.fasta', 
                     '/content/drive/My Drive/PemodelanMatematika/negative_training.fasta', 
                     '/content/drive/My Drive/PemodelanMatematika')

In [0]:
# Load the feature table. header=None means columns are addressed by integer
# position (later cells use df[27] as the label column).
df = pd.read_csv('/content/drive/My Drive/PemodelanMatematika/output/tain_DL.csv',header=None)

In [0]:
# Preview the first rows.
df.head()

In [0]:
# (rows, columns) of the loaded table.
df.shape

In [0]:
# Count of missing values per column.
df.isnull().sum()

In [0]:
# Report the class balance; column 27 is compared against 0 and 1 to count each class.
print("Total number of labels: {}".format(len(df.index)))
print("Number of Non-Antioxidant Protein: {}".format((df[27] == 0).sum()))
print("Number of Antioxidant Protein: {}".format((df[27] == 1).sum()))

In [0]:
# Pairwise correlation between every pair of columns of df (label column included).
df.corr()

In [0]:
# Draw the correlation matrix as an annotated heatmap on a wide canvas.
plt.figure(figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
sns.heatmap(data=df.corr(), annot=True)

In [0]:
# Feature matrix: every column except the last one.
X=df.iloc[:, :-1]
X.head()

In [0]:
# Encode the last column of df as integer class labels and display the result.
def_encoder = LabelEncoder()
y = def_encoder.fit_transform(df.iloc[:, -1])
y

In [0]:
# Correlation between feature columns only (label column 27 removed).
correlation_matrix = df.drop(27, axis=1).corr()

# A feature is flagged when its absolute correlation with any *earlier* feature
# exceeds 0.8, i.e. when its row has a hit in the strict lower triangle (j < i).
_strong_lower = np.tril(correlation_matrix.abs().to_numpy() > 0.8, k=-1)
correlated_features = set(correlation_matrix.columns[_strong_lower.any(axis=1)])

In [0]:
# Show the column labels flagged as highly correlated with an earlier column.
list(correlated_features)

In [0]:
# Rebuild the feature matrix from df (all columns but the last) instead of
# dropping from the current X: dropping from X raises KeyError when this cell is
# re-run, because the correlated columns are already gone.
X = df.iloc[:, :-1].drop(columns=list(correlated_features))
target = y

# Recursive feature elimination with 10-fold stratified CV, removing one feature
# per step and scoring each subset by accuracy with a seeded random forest.
# NOTE(review): selection is fitted on all rows, before the train/test split made
# later in the notebook — confirm this is intended.
rfc = RandomForestClassifier(random_state=101)
rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(10), scoring='accuracy')
rfecv.fit(X, target)

In [0]:
# Number of features kept by the fitted RFECV.
print('Optimal number of features: {}'.format(rfecv.n_features_))

In [0]:
# Mean cross-validated score for each candidate number of features.
# RFECV.grid_scores_ was deprecated and later removed from scikit-learn; newer
# versions expose the same values as cv_results_['mean_test_score'].
# Fall back to grid_scores_ only when cv_results_ is not available.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination dengan Cross-Validation', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Banyaknya Fitur yang Dipilih', fontsize=14, labelpad=20)
plt.ylabel('% Akurasi', fontsize=14, labelpad=20)
# x starts at 1 because RFECV was run with step=1 down to a single feature.
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores, color='#303F9F', linewidth=3)

plt.show()

In [0]:
# Positional indices of the features RFECV did not select.
rejected_positions = np.flatnonzero(~rfecv.support_)
print(rejected_positions)

# Remove those columns from X in place.
X.drop(columns=X.columns[rejected_positions], inplace=True)

In [0]:
# Feature importances of the estimator RFECV fitted on the selected features.
rfecv.estimator_.feature_importances_

In [0]:
# Pair every remaining column (as a string label) with its importance from the
# final RFECV estimator, sorted by descending importance.
dset = pd.DataFrame({
    'attr': X.columns.astype(str),
    'importance': rfecv.estimator_.feature_importances_,
}).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the importances.
fig, ax = plt.subplots(figsize=(16, 14))
ax.barh(y=dset['attr'], width=dset['importance'], color='#1976D2')
ax.set_title('RFECV - Kepentingan Fitur', fontsize=20, fontweight='bold', pad=20)
ax.set_xlabel('Nilai Kepentingan Fitur', fontsize=14, labelpad=20)
plt.show()

In [0]:
# Column labels that remain in X at this point.
X.columns

In [0]:
# Standardise every feature; X becomes a NumPy array from here on.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [0]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [0]:
def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision boundary and margins of a fitted 2D SVC.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``decision_function`` (called with an
        ``(n_points, 2)`` array) and, when ``plot_support`` is true,
        ``support_vectors_`` with at least two columns.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
        Its current x/y limits define the evaluated region and are restored
        after drawing.
    plot_support : bool, default True
        Whether to circle the model's support vectors.
    """
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Evaluate the decision function on a 30x30 grid spanning the visible area.
    # Local names avoid shadowing the notebook-level X / y.
    grid_x = np.linspace(xlim[0], xlim[1], 30)
    grid_y = np.linspace(ylim[0], ylim[1], 30)
    mesh_y, mesh_x = np.meshgrid(grid_y, grid_x)
    grid_points = np.vstack([mesh_x.ravel(), mesh_y.ravel()]).T
    decision = model.decision_function(grid_points).reshape(mesh_x.shape)

    # Decision boundary (level 0, solid) and the two margins (levels -1/+1, dashed).
    ax.contour(mesh_x, mesh_y, decision, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

    # Circle the support vectors. An explicit edge colour is required: with
    # facecolors='none' alone the default edge colour follows the (absent) face
    # colour, so the markers would not be drawn at all.
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0],
                   model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none', edgecolors='k')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

In [0]:
# Project the feature matrix onto its first two whitened principal components.
pca = PCA(n_components=2, whiten=True)
pca.fit(X)
X_pca = pca.transform(X)
print("explained variance ratio:", pca.explained_variance_ratio_)
print("Preserved Variance:", sum(pca.explained_variance_ratio_))

# One scatter series per class label, coloured from a repeating r/g/b palette.
colors = cycle('rgb')
target_names = [0, 1]
target_list = np.array(y).flatten()

plt.figure()
for t_name, c in zip(target_names, colors):
    class_mask = target_list == t_name
    plt.scatter(X_pca[class_mask, 0], X_pca[class_mask, 1], c=c, label=t_name)

plt.legend()
plt.show()

In [0]:
# Same 80/20 split as before, applied to the 2-D PCA projection.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=1,
)

# Default SVC trained and scored on the projected data.
svc = SVC().fit(X_train_pca, y_train_pca)
y_pred = svc.predict(X_test_pca)
print('Accuracy Score:', metrics.accuracy_score(y_test_pca, y_pred), sep='\n')

In [0]:
# Scatter all samples in PCA space coloured by label, then overlay the decision
# boundary, margins and support vectors of the current `svc`.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, s=50, cmap='autumn');
plot_svc_decision_function(svc)

In [0]:
svc=SVC()
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [0]:
svc=SVC(kernel='linear')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [0]:
svc=SVC(kernel='rbf')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [0]:
svc=SVC(kernel='poly')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

# Cross Validation

In [0]:
svc=SVC(kernel='linear')
scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())

In [0]:
svc=SVC(kernel='rbf')
scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy') 
print(scores)
print(scores.mean())

In [0]:
svc=SVC()
scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy') 
print(scores)
print(scores.mean())

In [0]:
svc=SVC(kernel='poly')
scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy') 
print(scores)
print(scores.mean())

In [0]:
C_range=list(np.arange(1,10,0.5))
acc_score=[]
for c in C_range:
    svc = SVC(kernel='linear', C=c)
    scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
    acc_score.append(scores.mean())

C_values=list(np.arange(1,10,0.5))
figure(num=None, figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(C_values,acc_score)
plt.xticks(np.arange(1,10,1))
plt.xlabel('Value of C for SVC')
plt.ylabel('Cross-Validated Accuracy')

In [0]:
C_range=list(np.arange(3,5,0.1))
acc_score=[]
for c in C_range:
    svc = SVC(kernel='linear', C=c)
    scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
    acc_score.append(scores.mean())

C_values=list(np.arange(3,5,0.1))
figure(num=None, figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(C_values,acc_score)
plt.xticks(np.arange(3,5,0.1))
plt.xlabel('Value of C for SVC ')
plt.ylabel('Cross-Validated Accuracy')

In [0]:
gamma_range=list(np.arange(0.1,5,0.1))
acc_score=[]
for g in gamma_range:
    svc = SVC(kernel='rbf', gamma=g)
    scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
    acc_score.append(scores.mean())

figure(num=None, figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(gamma_range,acc_score)
plt.xlabel('Value of gamma for SVC ')
plt.xticks(np.arange(0.1,5,0.1))
plt.ylabel('Cross-Validated Accuracy')

In [0]:
gamma_range=list(np.arange(0.3,0.45,0.005))
acc_score=[]
for g in gamma_range:
    svc = SVC(kernel='rbf', gamma=g)
    scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
    acc_score.append(scores.mean())

figure(num=None, figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(gamma_range,acc_score)
plt.xlabel('Value of gamma for SVC ')
plt.ylabel('Cross-Validated Accuracy')

In [0]:
# Sweep of the polynomial-kernel degree from 1 to 10: mean 5-fold accuracy per degree.
degree = list(range(1, 11))
acc_score = [
    cross_val_score(SVC(kernel='poly', degree=d), X, y, cv=5, scoring='accuracy').mean()
    for d in degree
]

plt.figure(figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')
plt.plot(degree, acc_score, color='r')
plt.xlabel('degrees for SVC ')
plt.ylabel('Cross-Validated Accuracy')

In [0]:
svc= SVC(kernel='linear',C=3.5)
svc.fit(X_train,y_train)
y_predict=svc.predict(X_test)
accuracy_score= metrics.accuracy_score(y_test,y_predict)
print('Akurasi:',accuracy_score)

scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
print('Skor CV:',scores)
print('Mean Skor CV:',scores.mean())

In [0]:
svc= SVC(kernel='rbf',gamma=0.31)
svc.fit(X_train,y_train)
y_predict=svc.predict(X_test)
accuracy_score=metrics.accuracy_score(y_test,y_predict)
print('Akurasi:',accuracy_score)

scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
print('Skor CV:',scores)
print('Mean Skor CV:',scores.mean())

In [0]:
svc=SVC(kernel='linear',gamma=0.31)
svc.fit(X_train,y_train)
y_predict=svc.predict(X_test)
accuracy_score=metrics.accuracy_score(y_test,y_predict)
print('Akurasi:',accuracy_score)

scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
print('Skor CV:',scores)
print('Mean Skor CV:',scores.mean())

In [0]:
svc= SVC(kernel='poly',degree=7)
svc.fit(X_train,y_train)
y_predict=svc.predict(X_test)
accuracy_score= metrics.accuracy_score(y_test,y_predict)
print('Akurasi:',accuracy_score)

scores = cross_val_score(svc, X, y, cv=5, scoring='accuracy')
print('Skor CV:',scores)
print('Mean Skor CV:',scores.mean())

In [0]:
svm_model = SVC()

# Candidate values shared by the per-kernel sub-grids below.
_grid_C = np.arange(3,3.7,0.1)
_grid_gamma = np.arange(0.3,0.45,0.005)

# One sub-grid per kernel, each listing only the hyper-parameters that kernel uses.
# A single flat grid would refit the linear kernel for every gamma/degree pair and
# the rbf kernel for every degree, although those settings do not affect them.
tuned_parameters = [
    {'kernel': ['linear'], 'C': _grid_C},
    {'kernel': ['rbf'], 'C': _grid_C, 'gamma': _grid_gamma},
    {'kernel': ['poly'], 'C': _grid_C, 'gamma': _grid_gamma, 'degree': [6,7,8]},
]
model_svm = GridSearchCV(svm_model, tuned_parameters,cv=5,scoring='accuracy',return_train_score=True)

In [0]:
# Run the grid search on the training split and print the best mean
# cross-validated score found.
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)

In [0]:
# Hyper-parameter combination that produced best_score_.
print(model_svm.best_params_)

In [0]:
# Test-set accuracy of the grid-searched model. Arguments follow the documented
# (y_true, y_pred) order used everywhere else in this notebook.
y_pred = model_svm.predict(X_test)
print('Akurasi:', metrics.accuracy_score(y_test, y_pred))

In [0]:
# Overfitting check: accuracy on the data the model was trained on.
# Arguments follow the documented (y_true, y_pred) order.
y_of = model_svm.predict(X_train)
print('Akurasi:', metrics.accuracy_score(y_train, y_of))

In [0]:
# If the validation accuracy is much lower than the training accuracy, the model
# is overfitting (author's note: that is not the case here).
# Passing the GridSearchCV object to cross_val_score repeats the whole grid search
# inside each of the 5 outer folds, so this cell is expensive.
scores=cross_val_score(model_svm,X_train,y_train,cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### DI BAWAH INI YANG NYOBA PAKAI FITUR DATASET PROTEIN BENERAN

In [0]:
pip install discere



In [0]:
# Feature extraction for the second dataset (note the different folder name,
# 'Pemodelan Matematika' with a space). The output saved with this cell shows
# the call ending in a FileNotFoundError.
import discere.discere as di
di.extract_feature('/content/drive/My Drive/Pemodelan Matematika/train_data_positive.fasta', 
                  '/content/drive/My Drive/Pemodelan Matematika/train_data_negative.fasta',
                   '/content/drive/My Drive/Pemodelan Matematika/')

Processing fasta files....
Cleaning existing data...


FileNotFoundError: ignored

In [0]:
# Load the feature table of the second dataset (no header row: integer column labels).
df = pd.read_csv('/content/drive/My Drive/Pemodelan Matematika/output/tain_DL.csv',header=None)
# Only the last expression of a cell is rendered, so a bare df.head() here would be
# discarded; display() (provided by the IPython kernel) renders it explicitly.
display(df.head())
df[26]

In [0]:
X=df.iloc[:, :-1]
X.head()

In [0]:
y=df.iloc[:,-1]

def_encoder = LabelEncoder()
y = def_encoder.fit_transform(y)


In [0]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [0]:
# Project the feature matrix onto its first two whitened principal components.
pca = PCA(n_components=2, whiten=True)
pca.fit(X)
X_pca = pca.transform(X)
print("explained variance ratio:", pca.explained_variance_ratio_)
print("Preserved Variance:", sum(pca.explained_variance_ratio_))

# One scatter series per class label, coloured from a repeating r/g/b palette.
colors = cycle('rgb')
target_names = [0, 1]
target_list = np.array(y).flatten()

plt.figure()
for t_name, c in zip(target_names, colors):
    class_mask = target_list == t_name
    plt.scatter(X_pca[class_mask, 0], X_pca[class_mask, 1], c=c, label=t_name)

plt.legend()
plt.show()

In [0]:
# 80/20 split of the 2-D PCA projection with a fixed shuffle.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=1,
)

# Default SVC trained and scored on the projected data.
svc = SVC().fit(X_train_pca, y_train_pca)
y_pred_pca = svc.predict(X_test_pca)
print('Accuracy Score:', metrics.accuracy_score(y_test_pca, y_pred_pca), sep='\n')

In [0]:
# Scatter all samples in PCA space coloured by label, then overlay the decision
# boundary, margins and support vectors of the current `svc`.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, s=50, cmap='autumn');
plot_svc_decision_function(svc)

In [0]:
svc=SVC()
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [0]:
svc=SVC(kernel='linear')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [0]:
svc=SVC(kernel='rbf')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [0]:
SVM_model = SVC()

# Candidate values shared by the per-kernel sub-grids below.
_grid_C = np.arange(3,4,0.1)
_grid_gamma = [0.01,0.013,0.015,0.017,0.02]

# One sub-grid per kernel, each listing only the hyper-parameters that kernel uses.
# A single flat grid would refit the linear kernel for every gamma/degree pair and
# the rbf kernel for every degree, although those settings do not affect them.
tuned_parameters = [
    {'kernel': ['linear'], 'C': _grid_C},
    {'kernel': ['rbf'], 'C': _grid_C, 'gamma': _grid_gamma},
    {'kernel': ['poly'], 'C': _grid_C, 'gamma': _grid_gamma, 'degree': [0,1,2,3]},
]

In [0]:
# 5-fold grid search over tuned_parameters, scored by accuracy and fitted on the
# training split; prints the best mean CV score and the parameters behind it.
model_svm = GridSearchCV(SVM_model, tuned_parameters,cv=5,scoring='accuracy')
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)
print(model_svm.best_params_)