In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
%cd /content/drive/My\ Drive/Projet-Proteins/notebooks
!ls

/content/drive/My Drive/Projet-Proteins/notebooks
clfswitcher.py	mylib.py  preprocessing.py  protein_main.ipynb	__pycache__


In [65]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

import pandas as pd

from importlib import reload

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid, train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE

import preprocessing
from preprocessing import PreprocessingTransformer
from preprocessing import load_datasets
from preprocessing import prepare_target

import clfswitcher
from clfswitcher import ClfSwitcher

from sklearn.metrics import confusion_matrix, classification_report

import joblib

pd.options.mode.chained_assignment = None

reload(preprocessing);
reload(clfswitcher);

In [24]:
df = load_datasets()
df.shape

(471149, 16)

In [54]:
df = prepare_target(df)

[1mFinal DataFrame has 288179 lines et 16 columns after removing all classes with less than 5000 items


In [55]:
#Get  train / test data 
data = df.drop('classification', axis = 1)
target = df.classification
X_train, X_test, y_train,  y_test = train_test_split(data, target, test_size=0.3, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((201725, 15), (86454, 15), (201725,), (86454,))

In [56]:
#Feature selection
rfe = RFE(ExtraTreesClassifier(), n_features_to_select=8)

In [60]:
#Pipeline
steps = [('data_prep', PreprocessingTransformer()),
         ('feat_select', rfe),
         ('clf', ExtraTreesClassifier())
         #('clf', ClfSwitcher())
         ]

grid_params = [
    {
        'clf__estimator': [ExtraTreesClassifier()], 
        'clf__estimator__max_features': range(1,8),
        'clf__estimator__min_samples_split': range(4,10),
        'clf__estimator__min_samples_leaf': range(1,6)
    },
    {
        #'feat_select__estimator': [RandomForestClassifier()],
        #'feat_select__n_features':[4,6,8],
        'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__max_depth': [10,20,40], 
        'clf__estimator__n_estimators':[100,200,300],
        'clf__estimator__min_samples_leaf': [1,2,5]
    },
]

pipeline = Pipeline(steps, verbose=True)

#grid = GridSearchCV(pipeline, grid_params, cv=5, n_jobs=-1, return_train_score=False, verbose=3)
#grid.fit(X_train, y_train)
pipeline.fit(X_train, y_train)

1.Drop useless columns
2.Replace missing values in X
3.Reduce modalities
4.Correct skewness
5.scale and encode categ values
-- Preprocessing done -- 
[Pipeline] ......... (step 1 of 3) Processing data_prep, total=   2.0s
[Pipeline] ....... (step 2 of 3) Processing feat_select, total= 3.7min
[Pipeline] ............... (step 3 of 3) Processing clf, total=  15.2s


Pipeline(steps=[('data_prep', PreprocessingTransformer()),
                ('feat_select',
                 RFE(estimator=ExtraTreesClassifier(), n_features_to_select=8)),
                ('clf', ExtraTreesClassifier())],
         verbose=True)

In [61]:
y_pred = pipeline.predict(X_test)

1.Drop useless columns
2.Replace missing values in X
3.Reduce modalities
4.Correct skewness
5.scale and encode categ values
-- Preprocessing done -- 


In [62]:
print("score on Train set:", pipeline.score(X_train, y_train))

1.Drop useless columns
2.Replace missing values in X
3.Reduce modalities
4.Correct skewness
5.scale and encode categ values
-- Preprocessing done -- 
score on Train set: 0.9996083777419754


In [63]:
print("score on Test set:", pipeline.score(X_test, y_test))

1.Drop useless columns
2.Replace missing values in X
3.Reduce modalities
4.Correct skewness
5.scale and encode categ values
-- Preprocessing done -- 
score on Test set: 0.8937700973928332


In [64]:
print(classification_report(y_test, y_pred))

                               precision    recall  f1-score   support

                    HYDROLASE       0.79      0.89      0.84     14269
HYDROLASE/HYDROLASE INHIBITOR       0.91      0.87      0.89      3365
                IMMUNE SYSTEM       0.91      0.92      0.92      4760
                    ISOMERASE       0.94      0.79      0.86      2056
                       LIGASE       0.91      0.75      0.82      1515
                        LYASE       0.93      0.86      0.90      3526
               OXIDOREDUCTASE       0.91      0.88      0.90     10577
              PROTEIN BINDING       0.84      0.76      0.80      1505
                     RIBOSOME       0.98      0.99      0.98     18349
          RIBOSOME/ANTIBIOTIC       0.90      0.77      0.83      1583
            SIGNALING PROTEIN       0.83      0.75      0.79      1961
                TRANSCRIPTION       0.89      0.84      0.86      3136
                  TRANSFERASE       0.83      0.87      0.85     11242
     

In [66]:
#save the pipeline
joblib.dump(pipeline, 'protein_pipeline.joblib')

PicklingError: ignored