# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [1]:
import pandas as pd

bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [2]:
print(bioresponce.shape)
bioresponce.head(5)

(3751, 1777)


Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [2]:
y = bioresponce.Activity.values

In [3]:
X = bioresponce.iloc[:, 1:]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Строим модель и оцениваем качество

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [10]:
type(preds)

numpy.ndarray

In [11]:
10 // 9

1

In [12]:
print(sum(preds == y_test) / len(preds))

0.75605815832


In [13]:
print(sum(preds == y_test) / float(len(preds)))

0.75605815832


In [14]:
from sklearn.metrics import accuracy_score

print(accuracy_score(preds, y_test))

0.75605815832


### Качество на кросс-валидации

In [None]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(model, X_train, y_train, cv=5))

In [16]:
print(cross_val_score(model, X_train, y_train, cv=5).mean())

0.743335594477


### Пробуем другие классификаторы

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [18]:
%%time

models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(accuracy_score(preds, y_test), model)

0.718901453958 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.715670436187 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.741518578352 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.781906300485 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
 

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке

### Random forest + Grid search

In [7]:
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

In [46]:
rf = RandomForestClassifier(random_state=1)

grid_params = {'n_estimators': np.arange(90, 150, 10),
               'max_depth': np.arange(10, 20),
              }
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid_cv = GridSearchCV(rf, grid_params, cv=cv, n_jobs=3)

In [47]:
%%time
grid_cv.fit(X_train, y_train)

CPU times: user 37.8 s, sys: 371 ms, total: 38.2 s
Wall time: 4min 51s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'n_estimators': array([ 90, 100, 110, 120, 130, 140]), 'max_depth': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
rf = grid_cv.best_estimator_
print(rf)
cross_val_score(rf, X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)


array([ 0.78042959,  0.79116945,  0.78853047])

### xgboost + Randomized search

In [14]:
import xgboost as xgb
from scipy.stats import poisson, randint

In [72]:
gb = xgb.XGBClassifier(seed=1)

params_dist = {'n_estimators': poisson(mu=80), 
               'max_depth': poisson(mu=4)}

grid_cv = RandomizedSearchCV(gb, params_dist, cv=cv, n_iter=50)

In [73]:
%%time
grid_cv.fit(X_train, y_train)

CPU times: user 1h 34min 52s, sys: 28.1 s, total: 1h 35min 20s
Wall time: 26min 2s


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3abb7e90b8>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3abb946b70>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [74]:
gb = grid_cv.best_estimator_
print(gb)
cross_val_score(gb, X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=74, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1, silent=True, subsample=1)


array([ 0.77684964,  0.78400955,  0.79928315])

-  RandomizedSearch работает гораздо медленнее, чем GridSearch
-  В случае, когда среди множества настраиваемых параметров, есть те, важность которых больше, лучше использовать RandomizedSearch (см. картинку) 

<img src="example.jpg"/>