Сделаем ансамбль из 100 моделей, каждая из которых будет говорить является ли очередной коллаж 
лидером по конкретной характеристике, или нет. Лидер характеристики - тот коллаж, у которого 
ранжирование определенной характеристики равно 5.

In [99]:
import catboost

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from src.utils import get_root_path


In [229]:
rankings = pd.read_csv(get_root_path() / 'data/interim/attributes.csv',
                       index_col=0).set_index('fileName').filter(regex='_no', axis=1).sort_index()
embs = pd.read_csv(get_root_path() / 'data/interim/inception_v3_whole_collages.csv',
                   index_col=0).set_index('fileName').sort_index()


In [230]:
class Model:
    def __init__(self, attribute, target, embs, test_size=None,
                 random_state=None):

        self._attribute = attribute
        self._target = target
        self._embs = embs
        self._test_size = test_size
        self._random_state = random_state
        self._model = catboost.CatBoostClassifier(iterations=200,
                                                  depth=3,
                                                  learning_rate=0.5,
                                                  loss_function='MultiClass',
                                                  eval_metric='Accuracy')
        self._test = None
        self._train = None

    def train(self):

        X_train, X_test, y_train, y_test = train_test_split(self._embs,
                                                            self._target,
                                                            test_size=self._test_size,
                                                            random_state=self._random_state,
                                                            stratify=self._target)

        self._model.fit(
            X_train, y_train,
            eval_set=(X_test, y_test),
            verbose=False
        )

    def predict(self, new_data):
        return self._model.predict_proba(new_data)


class Ensemble:
    def __init__(self, test_size=0.1, random_state=43):
        self.models = {}
        self.trains = None
        self.tests = None
        self.embs = None
        self.test_size = test_size
        self.random_state = random_state

    def append(self, attribute, target):
        self.models.update(
            {attribute: Model(attribute,
                              target,
                              self.embs,
                              self.test_size,
                              self.random_state)})

    def train(self, X_train, y_train):

        self.embs = X_train

        for fileName, col in y_train.iteritems():
            self.append(fileName, col.values)

        for key, model in tqdm(self.models.items()):
            model.train()

    def predict(self, new_data):
        probas = {}
        for key, model in tqdm(self.models.items()):
            proba = model.predict(new_data)
            probas.update({key: proba})
        return probas


X_train, X_test, y_train, y_test = train_test_split(embs,
                                                    rankings,
                                                    test_size=0.33,
                                                    random_state=70)


model = Ensemble()
model.train(X_train.values, y_train)
results = model.predict(X_test)



  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

In [231]:
votes = {}
f_score = {}
accuracy = {}

for m, proba in results.items():
    votes.update({m: proba.argmax(axis=1)})
    f_score.update({m: f1_score(votes[m], y_test[m], average='macro')})
    accuracy.update({m: (votes[m] == y_test[m]).mean()})


In [233]:
sorted(f_score.items(), key=lambda x: x[-1])


[('technical_no', 0.020325203252032516),
 ('rugged_no', 0.03677592011740669),
 ('masculine_no', 0.05637275783081073),
 ('glamorous_no', 0.08203984513412393),
 ('daring_no', 0.08439764521422108),
 ('smallTown_no', 0.09691323253885076),
 ('upperClass_no', 0.1083145096866195),
 ('feminine_no', 0.11874258061821276),
 ('reliable_no', 0.1196299345463131),
 ('independent_no', 0.12054991689178),
 ('tough_no', 0.12165153818710633),
 ('dynamic_no', 0.1222797452867419),
 ('western_no', 0.1234096426863157),
 ('contemporary_no', 0.12489213636972589),
 ('cool_no', 0.12689270893520507),
 ('spirited_no', 0.12896073417142404),
 ('charming_no', 0.12942328038687348),
 ('original_no', 0.1311129082810746),
 ('wholesome_no', 0.13176102148724364),
 ('outdoorsy_no', 0.13179663666063848),
 ('young_no', 0.1327794621801258),
 ('trendy_no', 0.13298279550688538),
 ('innovative_no', 0.13397032184103355),
 ('leader_no', 0.13477438311139925),
 ('downToEarth_no', 0.13591493017520173),
 ('familyOriented_no', 0.13672664