In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifierfrom sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

In [2]:
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [25]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    if type(cv).__name__ == 'KFold':
        splits = cv.split(X_train)
    elif type(cv).__name__ == 'StratifiedKFold':
        splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [26]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [5]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

Задание 6.6.2

In [6]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', multi_class='ovr', max_iter=2000, random_state=42),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', multi_class='multinomial', max_iter=2000, random_state=42),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:16<00:00, 79.21s/it]


In [7]:
stacked_features_train.shape

(1437, 40)

In [8]:
stacked_features_test.shape

(360, 40)

In [9]:
clf = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

In [10]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.979699

Задание 6.6.3

In [11]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=200, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:25<00:00, 12.75s/it]


In [12]:
clf = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

In [13]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.982421

Задание 6.6.4

In [15]:
stacked_features_train, stacked_features_test = generate_meta_features([
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.24s/it]


In [16]:
clf = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

In [17]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.989904

Задание 6.6.5

In [18]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', multi_class='ovr', max_iter=2000, random_state=42),
    KNeighborsClassifier(),  
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    AdaBoostClassifier(random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:55<00:00, 28.77s/it]


In [19]:
clf = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

In [20]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.987404

Задание 6.6.6

In [21]:
cvs = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [24]:
type(cvs).__name__, type(cv).__name__

('StratifiedKFold', 'KFold')

In [27]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cvs)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:16<00:16, 16.10s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:29<00:00, 14.83s/it][A


In [28]:
clf = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

In [29]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.983918

Задание 6.6.7

In [30]:
cvs_20 = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

In [31]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cvs_20)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:30<00:30, 30.78s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:56<00:00, 28.01s/it][A


In [32]:
clf = LogisticRegression(penalty='none', solver='lbfgs', random_state=42)

In [33]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.984228

Задание 6.6.8

In [43]:
cvs_5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [44]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cvs_20)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:29<00:29, 29.74s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:55<00:00, 27.75s/it][A


In [45]:
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [46]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.98115

Задание 6.6.9

In [47]:
clf = KNeighborsClassifier()

In [48]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.98417

Задание 6.6.10

In [49]:
clf = GradientBoostingClassifier(random_state=42)

In [50]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.974924

Задание 6.6.11

In [51]:
cvs_3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [52]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cvs_20)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:30<00:30, 30.42s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:55<00:00, 27.63s/it][A


In [53]:
clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [54]:
compute_metric(clf, stacked_features_train, y_train, stacked_features_test)

0.981669

Задание 6.6.12

In [55]:
clf1 = RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=24, n_jobs=-1, random_state=42)
clf2 = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, random_state=42)
clf3 = LogisticRegression(random_state=42)

In [56]:
clf1.fit(X_train, y_train)
prediction1 = clf1.predict(X_test)

In [58]:
clf2.fit(X_train, y_train)
prediction2 = clf2.predict(X_test)

In [59]:
clf3.fit(X_train, y_train)
prediction3 = clf3.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [65]:
prediction1

array([6, 9, 3, 7, 2, 1, 5, 2, 5, 2, 1, 9, 4, 0, 4, 2, 3, 7, 8, 8, 4, 3,
       9, 7, 5, 6, 3, 5, 6, 3, 4, 9, 1, 4, 4, 6, 9, 4, 7, 6, 6, 9, 1, 3,
       6, 1, 3, 0, 6, 5, 5, 1, 9, 5, 6, 0, 9, 0, 0, 1, 0, 4, 5, 2, 4, 5,
       7, 0, 7, 5, 9, 5, 5, 4, 7, 0, 4, 5, 5, 9, 9, 0, 2, 3, 8, 0, 6, 4,
       4, 9, 1, 2, 8, 3, 5, 2, 9, 0, 4, 4, 4, 3, 5, 3, 1, 3, 5, 9, 4, 2,
       7, 7, 4, 4, 1, 9, 2, 7, 8, 7, 2, 6, 9, 4, 0, 7, 2, 7, 5, 8, 7, 5,
       7, 9, 0, 6, 6, 4, 2, 8, 0, 9, 4, 6, 9, 9, 6, 9, 0, 5, 5, 6, 6, 0,
       6, 4, 3, 9, 3, 7, 7, 2, 9, 0, 4, 5, 8, 6, 5, 9, 9, 8, 4, 2, 1, 3,
       7, 7, 2, 2, 3, 9, 8, 0, 3, 2, 2, 5, 6, 9, 9, 4, 1, 5, 4, 2, 3, 6,
       4, 8, 5, 9, 5, 7, 1, 9, 4, 8, 1, 5, 4, 4, 9, 6, 1, 8, 6, 0, 4, 5,
       2, 7, 4, 6, 4, 5, 6, 0, 3, 2, 3, 6, 7, 1, 5, 1, 4, 7, 6, 8, 1, 5,
       5, 1, 5, 2, 8, 8, 9, 5, 7, 6, 2, 2, 2, 3, 4, 8, 8, 3, 6, 0, 9, 7,
       7, 0, 1, 0, 4, 5, 1, 5, 3, 6, 0, 4, 1, 0, 0, 3, 6, 5, 9, 7, 3, 5,
       5, 9, 9, 8, 5, 3, 3, 2, 0, 5, 8, 3, 4, 0, 2,

In [66]:
answer = np.round((prediction1 + prediction2 + prediction3)/3)
answer

array([6., 9., 3., 7., 2., 1., 5., 2., 5., 2., 1., 9., 4., 0., 4., 2., 3.,
       7., 8., 8., 4., 3., 9., 7., 5., 6., 3., 5., 6., 3., 4., 9., 1., 4.,
       4., 6., 9., 4., 7., 6., 6., 9., 1., 3., 6., 1., 3., 0., 6., 5., 5.,
       1., 7., 5., 6., 0., 9., 0., 0., 1., 0., 4., 5., 2., 4., 5., 7., 0.,
       7., 5., 9., 5., 5., 4., 7., 0., 4., 5., 5., 9., 9., 0., 2., 3., 8.,
       0., 6., 4., 4., 9., 1., 2., 8., 3., 5., 2., 9., 0., 4., 4., 4., 3.,
       5., 3., 1., 3., 5., 9., 4., 2., 7., 7., 4., 4., 1., 9., 2., 7., 8.,
       7., 2., 6., 9., 4., 0., 7., 2., 7., 5., 8., 7., 5., 7., 8., 0., 6.,
       6., 4., 2., 8., 0., 9., 4., 6., 9., 9., 6., 9., 0., 5., 5., 6., 6.,
       0., 6., 4., 3., 9., 3., 7., 7., 2., 9., 0., 5., 5., 5., 6., 5., 9.,
       9., 8., 4., 2., 1., 3., 7., 7., 2., 2., 3., 9., 8., 0., 3., 2., 2.,
       5., 6., 9., 9., 4., 1., 4., 4., 2., 3., 6., 4., 8., 5., 9., 5., 7.,
       3., 9., 4., 8., 1., 5., 4., 4., 9., 6., 1., 8., 6., 0., 4., 5., 2.,
       7., 3., 6., 4., 5.

In [67]:
np.round(f1_score(y_test, answer, average='macro'), 6)

0.960096