## Решение контеста Dota 2: Win Probability Prediction 
### Курс "Машинное обучение" ММП ВМК МГУ
### Каюмов Эмиль

Задача: предсказать по первым пяти минутам матча команду-победителя.

Метрика: AUC.

#### Подготовительные работы

In [1]:
# Widen the notebook's `.container` element to 95% of the page via an injected CSS rule.
from IPython.core.display import HTML
HTML("<style>.container { width:95% !important; }</style>")

In [None]:
import os

import numpy as np
import pandas as pd
from lasagne.layers import DenseLayer, InputLayer, DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import adagrad
from nolearn.lasagne import NeuralNet, TrainSplit
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

Откроем базовый датасет и выделим target.

In [None]:
# Base feature tables, indexed by match id.
X_train = pd.read_csv('input/features.csv', index_col='match_id')
X_test  = pd.read_csv('input/features_test.csv', index_col='match_id')

# Columns present only in train: the target and the other match-outcome columns.
del_columns = list(set(X_train.columns).difference(X_test.columns))
y_train_df = X_train['radiant_win']
y_train = np.array(y_train_df)
X_train.drop(del_columns, axis=1, inplace=True)

train_size, test_size = X_train.shape[0], X_test.shape[0]

# Train rows first, test rows after them, so both can be processed together.
data = pd.concat((X_train, X_test))

Откроем файлы с дополнительными данными, включая таблицу с характеристиками героев.

In [None]:
# Id -> description dictionaries for heroes, abilities and items.
heroes = pd.read_csv('input/dictionaries/heroes.csv', index_col='id')
abilities = pd.read_csv('input/dictionaries/abilities.csv', index_col='id')
items = pd.read_csv('input/dictionaries/items.csv', index_col='id')

# Per-hero characteristics table, indexed by hero id.
heroes_database = pd.read_csv('input/dictionaries/heroes_db.csv', index_col='id')

#### Извлечение мешков слов для предметов и умений из JSON.

Используем исправленный скрипт для извлечения признаков основного датасета. Получим таблицы, в которых строки соответствуют матчам, а столбцы — предметам и умениям (первая половина столбцов относится к одной команде, вторая половина — к другой).

In [None]:
# Run the external extraction script on the train and test match logs; results go to the CSV files named last.
!python2 extract_items.py input/matches.jsonlines.bz2      input/items_train.csv
!python2 extract_items.py input/matches_test.jsonlines.bz2 input/items_test.csv

In [None]:
# Same as above for abilities: one CSV for the train matches and one for the test matches.
!python2 extract_abilities.py input/matches.jsonlines.bz2      input/abilities_train.csv
!python2 extract_abilities.py input/matches_test.jsonlines.bz2 input/abilities_test.csv

#### Приготовление метапризнаков по мешкам слов из героев, предметов и умений.

Обучим модели отдельно на каждом из мешков и предскажем вероятность победы radiant.

Начнём с героев. Используем 4 модели для создания метапризнаков.

In [None]:
# Signed bag of heroes: +1 in a hero's column when radiant picked it, -1 when dire did.
# Column j holds hero id j + 1.
heroes_match = np.zeros((data.shape[0], heroes.shape[0]))

# Fill one player slot at a time for all matches at once. This replaces the
# per-cell `data.ix[match_id, col]` lookups: `.ix` no longer exists in current
# pandas and a Python-level loop over every match is needlessly slow.
# The write order per row (r1, d1, r2, d2, ...) is the same as before.
row_idx = np.arange(data.shape[0])
for p in range(1, 6):
    heroes_match[row_idx, data['r%d_hero' % p].values - 1] = +1
    heroes_match[row_idx, data['d%d_hero' % p].values - 1] = -1

heroes_match_train = heroes_match[:train_size]
heroes_match_test  = heroes_match[train_size:]

In [None]:
# Out-of-fold stacking on the bag of heroes: each classifier is fitted on the
# training part of a fold, predicts the held-out part (train meta-feature) and
# the whole test set (test meta-feature, averaged over the folds).

# `skf` was referenced here before any cell had defined it, which raises a
# NameError on a clean top-to-bottom run. Build the folds explicitly, the same
# way the later stacking cells do.
# NOTE(review): the seed used in the original run is not recorded in this
# notebook; 2016 is an arbitrary choice - replace it if the original is known.
skf = list(KFold(y_train.shape[0], 5, shuffle=True, random_state=2016))

clfs = [XGBClassifier(max_depth=4, n_estimators=1400, learning_rate=0.04, min_child_weight=7, colsample_bytree=0.4,
                      subsample=0.7, reg_alpha=1, reg_lambda=30, seed=789),
        KNeighborsClassifier(n_neighbors=10, p=2, weights='distance', n_jobs=-1),
        LogisticRegression(C=0.025, penalty='l2'),
        RandomForestClassifier(criterion='gini', n_estimators=1000, max_depth=12, max_features='auto', min_samples_split=2,
                               min_samples_leaf=7 ,random_state=101, n_jobs=-1)]

heroes_meta_train = np.zeros((train_size, len(clfs)))
heroes_meta_test  = np.zeros((test_size,  len(clfs)))

for j, clf in enumerate(clfs):
    print('Clf', j)
    # One column of test predictions per fold; averaged after the fold loop.
    dataset_meta_test_j = np.zeros((test_size, len(skf)))
    for i, (train, test) in enumerate(skf):
        print('Fold', i)
        X_tr = heroes_match_train[train]
        X_ts = heroes_match_train[test]
        y_tr = y_train[train]
        clf.fit(X_tr, y_tr)
        heroes_meta_train[test, j] = clf.predict_proba(X_ts)[:, 1]
        dataset_meta_test_j[:, i] = clf.predict_proba(heroes_match_test)[:, 1]
    heroes_meta_test[:, j] = dataset_meta_test_j.mean(1)

# np.save('input/heroes_meta_train.npy', heroes_meta_train)
# np.save('input/heroes_meta_test.npy',  heroes_meta_test)

In [None]:
# heroes_meta_train = np.load('input/heroes_meta_train.npy')
# heroes_meta_test  = np.load('input/heroes_meta_test.npy')

# Wrap the raw prediction matrices in frames keyed by match id, one named column per classifier.
heroes_meta_train = pd.DataFrame(data=heroes_meta_train,
                                 index=X_train.index,
                                 columns=['heroes_xgb', 'heroes_knn', 'heroes_lr', 'heroes_rf'])
heroes_meta_test = pd.DataFrame(data=heroes_meta_test,
                                index=X_test.index,
                                columns=['heroes_xgb', 'heroes_knn', 'heroes_lr', 'heroes_rf'])

Перейдём к предметам. Используем созданный ранее мешок слов.

In [None]:
# Item bags written by the extraction cell above, indexed by match id.
items_train = pd.read_csv('input/items_train.csv', index_col='match_id')
items_test  = pd.read_csv('input/items_test.csv',  index_col='match_id')

Удалим те предметы, которые ни разу не встречаются в обучающей выборке (среди героев таковых всего несколько штук, а вот предметов заметно больше).

In [None]:
train_sum = items_train.sum(axis=0)

# Collect every column pair (position pos and position pos + 254) that is
# all-zero on train, then remove the pairs from both tables in one call.
unused_cols = []
for pos, radiant_col in enumerate(items_train.columns[:254]):
    if train_sum[pos] != 0 or train_sum[pos + 254] != 0:
        continue
    unused_cols.extend([radiant_col, radiant_col.replace('radiant', 'dire')])

items_train.drop(unused_cols, axis=1, inplace=True)
items_test.drop(unused_cols, axis=1, inplace=True)

Отмасштабируем и используем 4 модели для изготовления метапризнаков.

In [None]:
# Standardise the item bag; from here on items_train / items_test are plain arrays.
scaler = StandardScaler()
items_train = scaler.fit_transform(items_train)
items_test  = scaler.transform(items_test)

skf = list(KFold(y_train.shape[0], 5, shuffle=True, random_state=909))

clfs = [XGBClassifier(max_depth=4, n_estimators=1500, learning_rate=0.02, min_child_weight=5, colsample_bytree=0.3,
                      subsample=0.9, reg_alpha=5, reg_lambda=1, seed=4),
        KNeighborsClassifier(n_neighbors=10, p=2, weights='distance', n_jobs=-1),
        LogisticRegression(C=0.005, penalty='l2'),
        RandomForestClassifier(criterion='entropy', n_estimators=1000, max_depth=8, max_features='auto', min_samples_split=3,
                               min_samples_leaf=4, random_state=51, n_jobs=-1)]

items_meta_train = np.zeros((train_size, len(clfs)))
items_meta_test  = np.zeros((test_size,  len(clfs)))

# Out-of-fold predictions fill the train meta-features; test meta-features are
# the mean of the per-fold test predictions.
for clf_idx, model in enumerate(clfs):
    print('Clf', clf_idx)
    fold_test_preds = np.zeros((test_size, len(skf)))
    for fold_idx, (fit_rows, holdout_rows) in enumerate(skf):
        print('Fold', fold_idx)
        model.fit(items_train[fit_rows], y_train[fit_rows])
        items_meta_train[holdout_rows, clf_idx] = model.predict_proba(items_train[holdout_rows])[:, 1]
        fold_test_preds[:, fold_idx] = model.predict_proba(items_test)[:, 1]
    items_meta_test[:, clf_idx] = fold_test_preds.mean(1)

# np.save('input/items_meta_train.npy', items_meta_train)
# np.save('input/items_meta_test.npy',  items_meta_test)

In [None]:
# items_meta_train = np.load('input/items_meta_train.npy')
# items_meta_test  = np.load('input/items_meta_test.npy')

# Wrap the raw prediction matrices in frames keyed by match id, one named column per classifier.
items_meta_train = pd.DataFrame(data=items_meta_train,
                                index=X_train.index,
                                columns=['items_xgb', 'items_knn', 'items_lr', 'items_rf'])
items_meta_test = pd.DataFrame(data=items_meta_test,
                               index=X_test.index,
                               columns=['items_xgb', 'items_knn', 'items_lr', 'items_rf'])

А теперь умения.

In [None]:
# Ability bags written by the extraction cell above, indexed by match id.
abilities_train = pd.read_csv('input/abilities_train.csv', index_col='match_id')
abilities_test  = pd.read_csv('input/abilities_test.csv',  index_col='match_id')

Снова удалим нулевые столбцы.

In [None]:
train_sum = abilities_train.sum(axis=0)

# Collect every column pair (position pos and position pos + 568) that is
# all-zero on train, then remove the pairs from both tables in one call.
unused_cols = []
for pos, radiant_col in enumerate(abilities_train.columns[:568]):
    if train_sum[pos] != 0 or train_sum[pos + 568] != 0:
        continue
    unused_cols.extend([radiant_col, radiant_col.replace('radiant', 'dire')])

abilities_train.drop(unused_cols, axis=1, inplace=True)
abilities_test.drop(unused_cols, axis=1, inplace=True)

Отмасштабируем и используем 5 моделей для метапризнаков (ранее не использовал ET, потому что забыл о нём). Это будет очень долго работать.

In [None]:
# Standardise the ability bag; from here on abilities_train / abilities_test are plain arrays.
scaler = StandardScaler()
abilities_train = scaler.fit_transform(abilities_train)
abilities_test  = scaler.transform(abilities_test)

skf = list(KFold(y_train.shape[0], 5, shuffle=True, random_state=65))

# Order matters: column k of the meta-feature matrices belongs to clfs[k].
clfs = [XGBClassifier(max_depth=6, n_estimators=700, learning_rate=0.02, seed=178),
        LogisticRegression(C=1.0, penalty='l2'),
        KNeighborsClassifier(n_neighbors=10, p=2, weights='distance', n_jobs=1),
        RandomForestClassifier(criterion='entropy', n_estimators=500, max_depth=10, max_features='sqrt', min_samples_split=2,
                               min_samples_leaf=1 ,random_state=90, n_jobs=1),
        ExtraTreesClassifier(criterion='gini', max_depth=9, n_estimators=500,
                             max_features='sqrt',random_state=45)]

abilities_meta_train = np.zeros((train_size, len(clfs)))
abilities_meta_test  = np.zeros((test_size,  len(clfs)))

# Out-of-fold predictions fill the train meta-features; test meta-features are
# the mean of the per-fold test predictions.
for clf_idx, model in enumerate(clfs):
    print('Clf', clf_idx)
    fold_test_preds = np.zeros((test_size, len(skf)))
    for fold_idx, (fit_rows, holdout_rows) in enumerate(skf):
        print('Fold', fold_idx)
        model.fit(abilities_train[fit_rows], y_train[fit_rows])
        abilities_meta_train[holdout_rows, clf_idx] = model.predict_proba(abilities_train[holdout_rows])[:, 1]
        fold_test_preds[:, fold_idx] = model.predict_proba(abilities_test)[:, 1]
    abilities_meta_test[:, clf_idx] = fold_test_preds.mean(1)

# np.save('input/abilities_meta_train.npy', abilities_meta_train)
# np.save('input/abilities_meta_test.npy',  abilities_meta_test)

In [None]:
# abilities_meta_train = np.load('input/abilities_meta_train.npy')
# abilities_meta_test  = np.load('input/abilities_meta_test.npy')

# Column k of the matrices holds the predictions of the k-th classifier of the
# abilities stacking cell, whose order is XGB, LR, KNN, RF, ET. The labels
# previously listed 'knn' before 'lr', so the two columns were mislabelled.
abilities_meta_columns = ['abilities_xgb', 'abilities_lr', 'abilities_knn', 'abilities_rf', 'abilities_et']

abilities_meta_train = pd.DataFrame(abilities_meta_train, columns=abilities_meta_columns, index=X_train.index)
abilities_meta_test  = pd.DataFrame(abilities_meta_test,  columns=abilities_meta_columns, index=X_test.index)

#### Первая модель - Vowpal Wabbit

Обучим первую модель (на самом деле лучший из двух результатов на отдельных моделях).

Не будем использовать метапризнак, основанный на умениях (не был готов к тому моменту). Также не будем использовать характеристики героев (тоже не было готово).

Необходимые для создания новых и удаления старых признаков функции:

In [None]:
def add_differences(data, cols):
    """Add `<col>_dif`: radiant team total minus dire team total, for every col in cols.

    The frame is modified in place and also returned.
    """
    for col in cols:
        radiant_cols = ['r%d_%s' % (player, col) for player in range(1, 6)]
        dire_cols = ['d%d_%s' % (player, col) for player in range(1, 6)]
        radiant_total = data[radiant_cols].values.sum(axis=1)
        dire_total = data[dire_cols].values.sum(axis=1)
        data[col + '_dif'] = radiant_total - dire_total
    return data


def add_ratios(data, cols):
    """Add `<col>_rat`: radiant team total divided by dire team total, for every col in cols.

    The frame is modified in place and also returned.
    """
    for col in cols:
        radiant_cols = ['r%d_%s' % (player, col) for player in range(1, 6)]
        dire_cols = ['d%d_%s' % (player, col) for player in range(1, 6)]
        radiant_total = data[radiant_cols].values.sum(axis=1)
        dire_total = data[dire_cols].values.sum(axis=1)
        data[col + '_rat'] = radiant_total / dire_total
    return data


def add_max_differences(data, cols):
    """Add `<col>_maxdif`: best radiant player value minus best dire player value.

    The frame is modified in place and also returned.
    """
    for col in cols:
        radiant_cols = ['r%d_%s' % (player, col) for player in range(1, 6)]
        dire_cols = ['d%d_%s' % (player, col) for player in range(1, 6)]
        radiant_best = data[radiant_cols].values.max(axis=1)
        dire_best = data[dire_cols].values.max(axis=1)
        data[col + '_maxdif'] = radiant_best - dire_best
    return data


def add_min_differences(data, cols):
    """Add `<col>_mindif`: lowest radiant player value minus lowest dire player value.

    The frame is modified in place and also returned.
    """
    for col in cols:
        radiant_cols = ['r%d_%s' % (player, col) for player in range(1, 6)]
        dire_cols = ['d%d_%s' % (player, col) for player in range(1, 6)]
        radiant_worst = data[radiant_cols].values.min(axis=1)
        dire_worst = data[dire_cols].values.min(axis=1)
        data[col + '_mindif'] = radiant_worst - dire_worst
    return data


def add_std_differences(data, cols):
    """Add `<col>_stddif`: spread (population std) within radiant minus spread within dire.

    The frame is modified in place and also returned.
    """
    for col in cols:
        radiant_cols = ['r%d_%s' % (player, col) for player in range(1, 6)]
        dire_cols = ['d%d_%s' % (player, col) for player in range(1, 6)]
        radiant_spread = data[radiant_cols].values.std(axis=1)
        dire_spread = data[dire_cols].values.std(axis=1)
        data[col + '_stddif'] = radiant_spread - dire_spread
    return data


def del_individs(data, cols):
    """Drop the ten per-player columns (r1..r5, d1..d5) of every feature named in cols.

    The frame is modified in place and also returned.
    """
    for col in cols:
        per_player = []
        for player in range(1, 6):
            for team in ('r', 'd'):
                per_player.append('%s%d_%s' % (team, player, col))
        data.drop(per_player, axis=1, inplace=True)
    return data

Добавим новые признаки, основанные на разностях и отношениях сумм характеристик героев команд, разностях между максимальными и между минимальными значениями характеристик, а также разностях среднеквадратичных отклонений признаков команд. Отдельно добавим максимумы опыта для каждой из команд.

In [None]:
cols_differences     = ['level', 'xp', 'gold', 'lh', 'items']
cols_ratios          = ['xp', 'gold']
cols_max_differences = ['level', 'xp', 'gold', 'lh', 'items']
cols_min_differences = ['xp', 'gold', 'lh']
cols_std_differences = ['xp', 'gold', 'lh']

# Apply the team-level aggregations in a fixed order (it defines the order of the new columns).
for aggregate, aggregate_cols in [(add_differences, cols_differences),
                                  (add_ratios, cols_ratios),
                                  (add_max_differences, cols_max_differences),
                                  (add_min_differences, cols_min_differences),
                                  (add_std_differences, cols_std_differences)]:
    data = aggregate(data, aggregate_cols)

# Highest individual experience inside each team.
for team in ('r', 'd'):
    data[team + '_xp_max'] = np.max([data['%s%d_xp' % (team, player)] for player in range(1, 6)], axis=0)

data['bottle_time_dif'] = data['radiant_bottle_time'] - data['dire_bottle_time']

# Remaining gaps are replaced with zeros.
data.fillna(0, inplace=True)

Удалим почти все индивидуальные признаки и некоторые из остальных.

In [None]:
# Per-player features to remove for all ten players.
cols_del_individs = ['hero', 'level', 'kills', 'deaths', 'items', 'lh', 'gold']

# Match-level columns to remove.
cols_del = [
    'first_blood_team', 'radiant_ward_sentry_count', 'dire_ward_sentry_count',
    'first_blood_player1', 'first_blood_player2', 'radiant_ward_observer_count',
    'dire_ward_observer_count', 'lobby_type',
]

data = del_individs(data, cols_del_individs)
data = data.drop(cols_del, axis=1)

Обратно разделим на обучающую и тестовую выборки.

In [None]:
# The first train_size rows are the training matches, the rest are the test matches.
X_train, X_test = data.iloc[:train_size], data.iloc[train_size:]

Создадим функцию для генерации датасета для Vowpal Wabbit по базовым признакам, мешкам из героев, предметов и умений.

In [None]:
def make_features(data, heroes, items, abilities, name, is_train, y_train=None):
    """Write a Vowpal Wabbit dataset with one example per row of `data`.

    Every line has the form::

        <label> |features <col>:<val> ... |heroes r_<col> d_<col> ... |items <col>:<val> ... |abilities <col>:<val> ...

    Parameters
    ----------
    data : DataFrame indexed by match id; every column is written to `features`.
    heroes : DataFrame with the same match ids; a cell equal to 1 is written as
        ``r_<column>``, any other non-zero cell as ``d_<column>``.
    items, abilities : DataFrames with the same match ids; only non-zero cells
        are written, as ``<column>:<value>``.
    name : path of the output file (overwritten).
    is_train : when true, the label is 1 where `y_train` equals 1 and -1
        otherwise; when false, a dummy label of 1 is written.
    y_train : Series indexed by match id; required when `is_train` is true.

    Raises
    ------
    ValueError
        If `is_train` is true and `y_train` is None.
    """
    if is_train and y_train is None:
        raise ValueError('y_train is required when is_train is True')

    feature_names = list(data.columns)
    hero_names = list(heroes.columns)
    item_names = list(items.columns)
    ability_names = list(abilities.columns)

    # Align the bags (and the target) with `data` by match id once, instead of
    # one label lookup per row and table through the removed `.ix` indexer.
    match_ids = data.index
    hero_rows = heroes.loc[match_ids].values
    item_rows = items.loc[match_ids].values
    ability_rows = abilities.loc[match_ids].values
    labels = y_train.loc[match_ids].values if is_train else None

    with open(name, 'w') as fout:
        for pos, feature_row in enumerate(data.values):
            if is_train:
                target = 1 if labels[pos] == 1 else -1
            else:
                target = 1

            features_part = ' '.join('{0}:{1}'.format(col, val)
                                     for col, val in zip(feature_names, feature_row.tolist()))
            heroes_part = ' '.join('{0}_{1}'.format('r' if val == 1 else 'd', col)
                                   for col, val in zip(hero_names, hero_rows[pos].tolist()) if val != 0)
            items_part = ' '.join('{0}:{1}'.format(col, val)
                                  for col, val in zip(item_names, item_rows[pos].tolist()) if val != 0)
            abilities_part = ' '.join('{0}:{1}'.format(col, val)
                                      for col, val in zip(ability_names, ability_rows[pos].tolist()) if val != 0)

            fout.write(str(target) +
                       ' |features ' + features_part +
                       ' |heroes ' + heroes_part +
                       ' |items ' + items_part +
                       ' |abilities ' + abilities_part +
                       '\n')

Отмасштабируем базовые признаки и разделим мешок слов по героям на две части.

In [None]:
# Standardise the base features, keeping them as frames indexed by match id.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test  = pd.DataFrame(scaler.transform(X_test),      columns=X_test.columns,  index=X_test.index)

# The bag of heroes was built as a plain ndarray, which has no `.iloc` and no
# column names, while make_features needs both. Wrap it in a frame keyed by
# match id; column j of the array is hero id j + 1, so label the columns 1..N.
if not isinstance(heroes_match, pd.DataFrame):
    heroes_match = pd.DataFrame(heroes_match, index=data.index,
                                columns=np.arange(1, heroes_match.shape[1] + 1))

heroes_match_train = heroes_match.iloc[:train_size]
heroes_match_test  = heroes_match.iloc[train_size:]

Создадим признаки для VW.

In [None]:
# Use the frames scaled in the previous cell: X_train_vw / X_test_vw are only
# created much later in the notebook, and make_features looks the target up by
# match id, so it needs the Series (y_train_df) rather than the bare array.
# NOTE(review): make_features also reads `.columns` of the item and ability
# bags, but the scaling steps above turned items_train / abilities_train into
# ndarrays - reload them from CSV (or keep DataFrame copies) before this cell.
make_features(X_train, heroes_match_train, items_train, abilities_train, 'input/train.vw', True, y_train_df)
make_features(X_test,  heroes_match_test,  items_test,  abilities_test,  'input/test.vw',  False)

Обучимся и сделаем предсказание (параметры оказались не самыми оптимальными: при уменьшении темпа обучения качество повышается).

In [None]:
# Fit VW on the training file, then score the test file into a temporary predictions file.
os.system('vw -d input/train.vw -c -k -f model.vw --passes 200 -l 0.09 --power_t 0.36 --initial_t 0.1 -b 26 --loss_function logistic --quiet')
os.system('vw -d input/test.vw -i model.vw -t -p output/predictions_tmp.txt --quiet')

# First column of the predictions file, re-indexed by test match id.
preds1 = pd.DataFrame(
    pd.read_csv('output/predictions_tmp.txt', header=None).iloc[:, 0].values,
    columns=['radiant_win'],
    index=X_test.index,
)

##### Private LB: 0.76413 .

#### Извлечение информации из характеристик героев.

Откроем заново базовые признаки.

In [None]:
# Base feature tables, indexed by match id.
X_train = pd.read_csv('input/features.csv', index_col='match_id')
X_test  = pd.read_csv('input/features_test.csv', index_col='match_id')

# Columns present only in train: the target and the other match-outcome columns.
del_columns = list(set(X_train.columns).difference(X_test.columns))
y_train_df = X_train['radiant_win']
y_train = np.array(y_train_df)
X_train.drop(del_columns, axis=1, inplace=True)

train_size, test_size = X_train.shape[0], X_test.shape[0]

# Train rows first, test rows after them, so both can be processed together.
data = pd.concat((X_train, X_test))

Извлечём характеристики каждого из героев из базы данных характеристик.

In [None]:
# (feature suffix, per-hero values) pairs, in the order the columns are added.
# Each value is a Series indexed by hero id taken from heroes_database.
hero_stat_sources = [
    ('strength',      heroes_database['STR']),
    ('strength+',     heroes_database['STR+']),
    ('intelligence',  heroes_database['INT']),
    ('intelligence+', heroes_database['INT+']),
    ('agility',       heroes_database['AGI']),
    ('agility+',      heroes_database['AGI+']),
    ('attrs',         heroes_database['T']),
    ('attrs+',        heroes_database['T+']),
    ('speed',         heroes_database['MOV']),
    ('damage',        heroes_database['(MAX)'] + heroes_database['(MIN)']),
    ('ranged',        heroes_database['RNG']),
    ('attacktime',    heroes_database['BAT']),
]

# Look the stats up for a whole column of hero ids at once. This replaces one
# `apply(lambda ...)` with an `.ix` lookup per cell: `.ix` no longer exists in
# current pandas and the per-element lookups were very slow.
for c in ['r', 'd']:
    for i in range(1, 6):
        prefix = '%c%d_' % (c, i)
        hero_ids = data[prefix + 'hero'].values
        for suffix, stat in hero_stat_sources:
            data[prefix + suffix] = stat.loc[hero_ids].values

# Add the per-level gain ('+' columns) multiplied by the player's level.
for c in ['r', 'd']:
    for i in range(1, 6):
        level = data['%c%d_level' % (c, i)]
        for attr in ('strength', 'intelligence', 'agility', 'attrs'):
            data['%c%d_%s' % (c, i, attr)] += data['%c%d_%s+' % (c, i, attr)] * level

Снова агрегируем характеристики команд.

In [None]:
cols_differences     = ['level', 'xp', 'gold', 'lh', 'items', 'strength', 'intelligence', 'agility', 'attrs', 'speed', 'damage', 'ranged', 'attacktime']
cols_ratios          = ['xp', 'gold', 'strength', 'intelligence', 'agility']
cols_max_differences = ['level', 'xp', 'gold', 'lh', 'items', 'strength', 'intelligence', 'agility', 'attrs', 'speed', 'damage', 'ranged', 'attacktime']
cols_min_differences = ['xp', 'gold', 'lh']
cols_std_differences = ['xp', 'gold', 'lh', 'strength', 'intelligence', 'agility']

# Apply the team-level aggregations in a fixed order (it defines the order of the new columns).
for aggregate, aggregate_cols in [(add_differences, cols_differences),
                                  (add_ratios, cols_ratios),
                                  (add_max_differences, cols_max_differences),
                                  (add_min_differences, cols_min_differences),
                                  (add_std_differences, cols_std_differences)]:
    data = aggregate(data, aggregate_cols)

# Highest individual experience inside each team.
for team in ('r', 'd'):
    data[team + '_xp_max'] = np.max([data['%s%d_xp' % (team, player)] for player in range(1, 6)], axis=0)

data['bottle_time_dif'] = data['radiant_bottle_time'] - data['dire_bottle_time']

# Remaining gaps are replaced with zeros.
data.fillna(0, inplace=True)

Посчитаем, сколько героев каждой из ролей встречается в командах. Возьмём разность этих показателей между командами.

In [None]:
roles = ['Carry', 'Disabler', 'Lane_support', 'Initiator', 'Jungler', 'Support', 'Durable', 'Pusher', 'Nuker', 'Escape']

# Per match and role: number of radiant heroes with the role minus the number
# of dire heroes with it. The role rows are fetched for a whole player slot at
# once instead of through nested per-match `.ix` lookups (`.ix` no longer
# exists in current pandas).
hero_roles = heroes_database[roles]
match_roles_table = np.zeros((data.shape[0], len(roles)))

for i in range(1, 6):
    match_roles_table += hero_roles.loc[data['r%d_hero' % (i)].values].values.astype(int)

for i in range(1, 6):
    match_roles_table -= hero_roles.loc[data['d%d_hero' % (i)].values].values.astype(int)

match_roles_table = pd.DataFrame(match_roles_table, columns=roles, index=data.index)
match_roles_table.drop('Lane_support', axis=1, inplace=True) # this role never occurs
data = pd.concat((data, match_roles_table), axis=1)

Удалим ненужное (столько, пока у логистической регрессии из scikit-learn повышается результат).

In [None]:
# Per-player features to remove for all ten players.
cols_del_individs = [
    'damage', 'items', 'level', 'kills', 'deaths',
    'intelligence+', 'strength+', 'agility+',
    'attacktime', 'speed', 'ranged',
]

# Match-level columns to remove.
cols_del = [
    'first_blood_team', 'radiant_ward_sentry_count', 'dire_ward_sentry_count',
    'radiant_ward_observer_count', 'dire_ward_observer_count',
]

data = del_individs(data, cols_del_individs)
data = data.drop(cols_del, axis=1)

Разделим и отмасштабируем (float32 для theano).

In [None]:
# The first train_size rows are the training matches, the rest are the test matches.
X_train, X_test = data.iloc[:train_size], data.iloc[train_size:]

# Standardise with statistics of the training part only; the result is stored as float32 arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_test  = scaler.transform(X_test).astype(np.float32)

Пригодится для нейронной сети.

In [None]:
# (name, layer class) pairs passed as `layers=` to the NeuralNet below:
# input -> dense -> dropout -> dense output.
layers = [('input', InputLayer),
          ('dense0', DenseLayer),
          ('dropout0', DropoutLayer),
          ('output', DenseLayer)]

Используем 5 моделей для генерации метапризнаков.

In [None]:
skf = list(KFold(y_train.shape[0], 5, shuffle=True, random_state=8))

# Order matters: column k of the meta-feature matrices belongs to clfs[k].
clfs = [NeuralNet(layers=layers, input_shape=(None, X_train.shape[1]),
                  dense0_num_units=80, dropout0_p=0.5,
                  output_num_units=2, output_nonlinearity=softmax,
                  update=adagrad, update_learning_rate=0.003,
                  train_split=TrainSplit(eval_size=0.2),
                  verbose=0, max_epochs=100),
        LogisticRegression(C=0.01, penalty='l2'),
        RandomForestClassifier(criterion='entropy', n_estimators=700, max_depth=7,
                               random_state=500, n_jobs=-1),
        ExtraTreesClassifier(criterion='gini', max_depth=12, n_estimators=700,
                             random_state=8010, n_jobs=-1),
        XGBClassifier(max_depth=4, n_estimators=700, learning_rate=0.015,
                      min_child_weight=5, seed=114)]

# The target is cast to int32 before fitting.
y_train = np.array(y_train, dtype=np.int32)

meta_train = np.zeros((train_size, len(clfs)))
meta_test  = np.zeros((test_size,  len(clfs)))

# Out-of-fold predictions fill the train meta-features; test meta-features are
# the mean of the per-fold test predictions.
for clf_idx, model in enumerate(clfs):
    print('Clf', clf_idx)
    fold_test_preds = np.zeros((test_size, len(skf)))
    for fold_idx, (fit_rows, holdout_rows) in enumerate(skf):
        print('Fold', fold_idx)
        model.fit(X_train[fit_rows], y_train[fit_rows])
        meta_train[holdout_rows, clf_idx] = model.predict_proba(X_train[holdout_rows])[:, 1]
        fold_test_preds[:, fold_idx] = model.predict_proba(X_test)[:, 1]
    meta_test[:, clf_idx] = fold_test_preds.mean(1)

Сгенерируем метапризнаки с помощью Vowpal Wabbit с мешками.

In [None]:
# Same split as before, but kept as frames (with column names and match ids) for the VW writer.
X_train_vw, X_test_vw = data.iloc[:train_size], data.iloc[train_size:]

scaler = StandardScaler()
X_train_vw = pd.DataFrame(data=scaler.fit_transform(X_train_vw),
                          index=X_train_vw.index,
                          columns=X_train_vw.columns)
X_test_vw = pd.DataFrame(data=scaler.transform(X_test_vw),
                         index=X_test_vw.index,
                         columns=X_test_vw.columns)

Обучим 2 VW: на всех признаках и мешках и только на мешках.

In [None]:
def _run_vw(command):
    """Run a VW shell command and fail loudly on a non-zero exit status.

    All four VW runs of a fold write to the same predictions file, so a
    silently failed run would otherwise make the next read pick up stale
    predictions from the previous run.
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError('Vowpal Wabbit command failed (exit status %d): %s' % (status, command))


def _read_vw_predictions():
    """Return the first column of the temporary VW predictions file as an array."""
    return pd.read_csv('output/predictions_tmp.txt', header=None).iloc[:, 0].values


meta_vw_train = np.zeros((train_size, 2))
meta_vw_test  = np.zeros((test_size,  2))

dataset_meta_test_j1 = np.zeros((test_size, len(skf)))
dataset_meta_test_j2 = np.zeros((test_size, len(skf)))

# The full-train and test files do not depend on the fold, so they are written
# once instead of being regenerated inside every fold.
# make_features looks the target up by match id, hence the Series y_train_df
# rather than the int32 array y_train.
# NOTE(review): make_features and the `.iloc` calls below need items_train /
# abilities_train (and the test bags) to be DataFrames, but the scaling cells
# above turned them into ndarrays - reload them from CSV before running this cell.
make_features(X_train_vw, heroes_match_train, items_train, abilities_train, 'input/train.vw', True, y_train_df)
make_features(X_test_vw,  heroes_match_test,  items_test,  abilities_test, 'input/test.vw',  False)

for i, (train, test) in enumerate(skf):
    print('Fold', i)

    make_features(X_train_vw.iloc[train], heroes_match_train.iloc[train], items_train.iloc[train], abilities_train.iloc[train], 'input/train_cv.vw', True, y_train_df.iloc[train])
    make_features(X_train_vw.iloc[test],  heroes_match_train.iloc[test],  items_train.iloc[test],  abilities_train.iloc[test],  'input/test_cv.vw',  False)

    # first cv
    _run_vw('vw -d input/train_cv.vw -c -k -f model.vw --passes 200 -l 0.05 --power_t 0.5 --initial_t 0.0 -b 26 --loss_function logistic --quiet')
    _run_vw('vw -d input/test_cv.vw -i model.vw -t -p output/predictions_tmp.txt --quiet')

    meta_vw_train[test, 0] = _read_vw_predictions()

    # first test
    _run_vw('vw -d input/train.vw -c -k -f model.vw --passes 200 -l 0.05 --power_t 0.5 --initial_t 0.0 -b 26 --loss_function logistic --quiet')
    _run_vw('vw -d input/test.vw -i model.vw -t -p output/predictions_tmp.txt --quiet')

    dataset_meta_test_j1[:, i] = _read_vw_predictions()

    # second cv (the namespace starting with "f", i.e. `features`, is ignored: bags only)
    _run_vw('vw -d input/train_cv.vw -c -k -f model.vw --passes 200 -l 0.2 --power_t 0.43 --initial_t 0.0 -b 26 --loss_function logistic --quiet --ignore f')
    _run_vw('vw -d input/test_cv.vw -i model.vw -t -p output/predictions_tmp.txt --quiet')

    meta_vw_train[test, 1] = _read_vw_predictions()

    # second test
    _run_vw('vw -d input/train.vw -c -k -f model.vw --passes 200 -l 0.2 --power_t 0.43 --initial_t 0.0 -b 26 --loss_function logistic --quiet --ignore f')
    _run_vw('vw -d input/test.vw -i model.vw -t -p output/predictions_tmp.txt --quiet')

    dataset_meta_test_j2[:, i] = _read_vw_predictions()


meta_vw_test[:, 0] = dataset_meta_test_j1.mean(1)
meta_vw_test[:, 1] = dataset_meta_test_j2.mean(1)

# Append the two VW meta-features to those of the other models.
meta_train = np.hstack((meta_train, meta_vw_train))
meta_test  = np.hstack((meta_test,  meta_vw_test))

# np.save('input/X_meta_train.npy', meta_train)
# np.save('input/X_meta_test.npy',  meta_test)

In [None]:
# meta_train = np.load('input/X_meta_train.npy')
# meta_test  = np.load('input/X_meta_test.npy')

# Wrap the raw prediction matrices in frames keyed by match id, one named column per model.
meta_train = pd.DataFrame(data=meta_train,
                          index=X_train_vw.index,
                          columns=['nn_meta', 'lr_meta', 'rf_meta', 'et_meta', 'xgb_meta', 'vw1_meta', 'vw2_meta'])
meta_test = pd.DataFrame(data=meta_test,
                         index=X_test_vw.index,
                         columns=['nn_meta', 'lr_meta', 'rf_meta', 'et_meta', 'xgb_meta', 'vw1_meta', 'vw2_meta'])

#### Метапризнак по синергии и антисинергии.

In [None]:
# Base feature tables, indexed by match id.
X_train = pd.read_csv('input/features.csv', index_col='match_id')
X_test  = pd.read_csv('input/features_test.csv', index_col='match_id')

# Columns present only in train: the target and the other match-outcome columns.
del_columns = list(set(X_train.columns).difference(X_test.columns))
y_train_df = X_train['radiant_win']
y_train = np.array(y_train_df)
X_train.drop(del_columns, axis=1, inplace=True)

train_size, test_size = X_train.shape[0], X_test.shape[0]

# Train rows first, test rows after them, so both can be processed together.
data = pd.concat((X_train, X_test))

Функция для подсчёта синергии: статистика собирается по тренировочной выборке, признаки вычисляются для тестовой.

In [None]:
def add_synergy(train, test, y_train):
    """Compute hero synergy / anti-synergy features for `test` from statistics of `train`.

    Parameters
    ----------
    train, test : DataFrames indexed by match id with the hero id columns
        ``r1_hero``..``r5_hero`` and ``d1_hero``..``d5_hero`` (ids start at 1).
    y_train : Series indexed by match id; a value of 1 means radiant won.

    Returns
    -------
    (syn, antisyn) : two float arrays with one value per row of `test`.
        ``syn`` is the sum of pair win rates over radiant hero pairs minus the
        same sum over dire hero pairs. ``antisyn`` is the sum, over every
        (radiant hero, dire hero) pair, of the rate at which the first hero
        beat the second. Pairs never seen in `train` get a rate of 0.5.
    """
    heroes_count = 113
    radiant_cols = ['r%d_hero' % (i + 1) for i in range(5)]
    dire_cols    = ['d%d_hero' % (i + 1) for i in range(5)]

    synergy     = np.zeros((heroes_count, heroes_count))
    antisynergy = np.zeros((heroes_count, heroes_count))
    matchcounts = np.zeros((heroes_count, heroes_count))
    matchcounta = np.zeros((heroes_count, heroes_count))

    # Pull the zero-based hero indices and the outcomes out of pandas once,
    # instead of one label lookup per cell through the removed `.ix` indexer.
    train_radiant = train[radiant_cols].values - 1
    train_dire    = train[dire_cols].values - 1
    radiant_won   = y_train.loc[train.index].values == 1

    # collect the statistics
    for radiant, dire, won in zip(train_radiant, train_dire, radiant_won):
        winners, losers = (radiant, dire) if won else (dire, radiant)

        # wins of each unordered pair of team-mates (kept symmetric)
        for i in range(5):
            for j in range(i + 1, 5):
                synergy[winners[i], winners[j]] += 1
                synergy[winners[j], winners[i]] += 1

        for i in range(5):
            for j in range(5):
                # games played together, for both teams
                matchcounts[winners[i], winners[j]] += 1
                matchcounts[losers[i], losers[j]] += 1
                # wins of the row hero over the column hero, and games played against each other
                antisynergy[winners[i], losers[j]] += 1
                matchcounta[winners[i], losers[j]] += 1
                matchcounta[losers[j], winners[i]] += 1

    # normalise: counts -> rates, with 0.5 for pairs that never occurred
    synergyrate     = np.full((heroes_count, heroes_count), 0.5)
    antisynergyrate = np.full((heroes_count, heroes_count), 0.5)
    seen_together = matchcounts != 0
    seen_against  = matchcounta != 0
    synergyrate[seen_together]    = synergy[seen_together] / matchcounts[seen_together]
    antisynergyrate[seen_against] = antisynergy[seen_against] / matchcounta[seen_against]

    syn     = np.zeros(len(test))
    antisyn = np.zeros(len(test))

    test_radiant = test[radiant_cols].values - 1
    test_dire    = test[dire_cols].values - 1

    # evaluate on the target sample
    for match_counter, (rind, dind) in enumerate(zip(test_radiant, test_dire)):
        for i in range(5):
            for j in range(i + 1, 5):
                syn[match_counter] += synergyrate[rind[i], rind[j]]

        for i in range(5):
            for j in range(i + 1, 5):
                syn[match_counter] -= synergyrate[dind[i], dind[j]]

        for i in range(5):
            for j in range(5):
                antisyn[match_counter] += antisynergyrate[rind[i], dind[j]]

    return syn, antisyn

In [None]:
skf = list(KFold(y_train.shape[0], 10, random_state=7))

synergy_train      = np.zeros(train_size)
synergy_test       = np.zeros(test_size)
antisynergy_train  = np.zeros(train_size)
antisynergy_test   = np.zeros(test_size)

# Train features are computed out-of-fold, so a match never contributes to its
# own statistics. The target is taken from y_train_df: y_train is a bare
# ndarray (no `.iloc`), and add_synergy looks outcomes up by match id.
for i, (train, test) in enumerate(skf):
    print('Fold', i)
    X_tr  = X_train.iloc[train]
    y_tr  = y_train_df.iloc[train]
    X_ts  = X_train.iloc[test]

    synergy_train[test], antisynergy_train[test] = add_synergy(X_tr, X_ts, y_tr)

# Test features use the statistics of the whole training set.
synergy_test, antisynergy_test = add_synergy(X_train, X_test, y_train_df)

syn_train = np.hstack((synergy_train.reshape((-1, 1)), antisynergy_train.reshape((-1, 1))))
syn_test  = np.hstack((synergy_test.reshape((-1, 1)),  antisynergy_test.reshape((-1, 1))))

# np.save('input/meta_synergy_train.npy', syn_train)
# np.save('input/meta_synergy_test.npy',  syn_test)

In [None]:
# syn_train = np.load('input/meta_synergy_train.npy')
# syn_test  = np.load('input/meta_synergy_test.npy')

# Wrap the two-column matrices in frames keyed by match id.
syn_train = pd.DataFrame(data=syn_train,
                         index=X_train.index,
                         columns=['synergy', 'antisynergy'])
syn_test = pd.DataFrame(data=syn_test,
                        index=X_test.index,
                        columns=['synergy', 'antisynergy'])

#### Вторая модель - Vowpal Wabbit

Будем использовать все метапризнаки: по мешкам, по другим алгоритмам на обычных признаках, по синергии.

In [None]:
# Put the hero, item and ability meta-features side by side (aligned on match id).
bag_meta_train = pd.concat(
    [heroes_meta_train, items_meta_train, abilities_meta_train],
    axis=1,
)
bag_meta_test = pd.concat(
    [heroes_meta_test, items_meta_test, abilities_meta_test],
    axis=1,
)

Снова загрузим и обработаем признаки.

In [None]:
# Reload the raw per-match feature tables, indexed by match_id.
X_train = pd.read_csv('input/features.csv', index_col='match_id')
X_test = pd.read_csv('input/features_test.csv', index_col='match_id')

# Columns present only in the training file: the target and the other
# match-outcome columns.
del_columns = list(set(X_train.columns) - set(X_test.columns))

# Keep the target both as a Series (indexed by match_id) and as a plain array.
y_train_df = X_train['radiant_win']
y_train = np.array(y_train_df)
X_train.drop(del_columns, axis=1, inplace=True)

train_size, test_size = X_train.shape[0], X_test.shape[0]

# Stack train on top of test so feature engineering is applied to both at once.
data = pd.concat((X_train, X_test))

In [None]:
# Attach hero attributes from heroes_database to every player slot
# (r1..r5, d1..d5) as columns named '<slot>_<suffix>'.
#
# Each entry is (feature suffix, per-hero values indexed by hero id). The list
# order is the order in which the columns are added to `data`.
hero_stat_lookup = [
    ('strength',      heroes_database['STR']),
    ('strength+',     heroes_database['STR+']),
    ('intelligence',  heroes_database['INT']),
    ('intelligence+', heroes_database['INT+']),
    ('agility',       heroes_database['AGI']),
    ('agility+',      heroes_database['AGI+']),
    ('attrs',         heroes_database['T']),
    ('attrs+',        heroes_database['T+']),
    ('speed',         heroes_database['MOV']),
    # 'damage' is the sum of the '(MAX)' and '(MIN)' columns.
    ('damage',        heroes_database['(MAX)'] + heroes_database['(MIN)']),
    ('ranged',        heroes_database['RNG']),
    ('attacktime',    heroes_database['BAT']),
]

# Attributes that additionally receive '<suffix>+' multiplied by the player's level.
level_scaled_stats = ['strength', 'intelligence', 'agility', 'attrs']

for c in ['r', 'd']:
    for i in range(1, 6):
        slot = '%c%d' % (c, i)
        hero_ids = data[slot + '_hero'].values

        # One vectorised label lookup per attribute instead of a Python-level
        # `.ix` call for every row (`.ix` is deprecated and removed from
        # current pandas). `.values` assigns positionally, row for row.
        for stat_name, stat_by_hero in hero_stat_lookup:
            data['%s_%s' % (slot, stat_name)] = stat_by_hero.loc[hero_ids].values

        # base value += per-level column * current level of the player
        level = data[slot + '_level']
        for stat_name in level_scaled_stats:
            base_col = '%s_%s' % (slot, stat_name)
            data[base_col] += data[base_col + '+'] * level

Будем использовать несколько другие признаки.

In [None]:
# Per-player column suffixes handed to each of the team-level aggregate helpers.
cols_differences = [
    'level', 'xp', 'gold', 'lh', 'items',
    'strength', 'intelligence', 'agility', 'attrs',
    'speed', 'damage', 'ranged', 'attacktime',
]
cols_ratios = ['xp', 'gold', 'strength', 'intelligence', 'agility']
cols_max_differences = list(cols_differences)  # same suffixes as the plain differences
cols_min_differences = ['xp', 'gold', 'lh']
cols_std_differences = ['xp', 'gold', 'lh', 'strength', 'intelligence', 'agility']

# Apply the helpers one after another; each receives the frame returned by the previous one.
feature_steps = [
    (add_differences, cols_differences),
    (add_ratios, cols_ratios),
    (add_max_differences, cols_max_differences),
    (add_min_differences, cols_min_differences),
    (add_std_differences, cols_std_differences),
]
for add_feature_group, group_cols in feature_steps:
    data = add_feature_group(data, group_cols)

# Highest experience among the five players of each team.
for side in ('r', 'd'):
    data['%s_xp_max' % side] = np.max([data['%s%d_xp' % (side, slot)] for slot in range(1, 6)], axis=0)

data['bottle_time_dif'] = data['radiant_bottle_time'] - data['dire_bottle_time']

# Replace every remaining missing value with 0.
data.fillna(0, inplace=True)

In [None]:
# Per-player column suffixes passed to del_individs.
cols_del_individs = ['ranged', 'kills', 'deaths', 'attacktime', 'speed', 'level', 'items', 'damage', 'strength+', 'hero', 'lh', 'xp',
                     'gold', 'strength', 'attrs', 'intelligence']

# Match-level columns to drop. 'first_blood_team' used to be listed twice;
# each label now appears exactly once.
cols_del = ['first_blood_team', 'items_maxdif', 'lobby_type', 'radiant_ward_sentry_count',
            'dire_ward_sentry_count', 'first_blood_player1', 'level_maxdif', 'lh_mindif']

data = del_individs(data, cols_del_individs)
data.drop(cols_del, axis=1, inplace=True)

# Split the stacked frame back into its training and test rows.
X_train = data.iloc[:train_size]
X_test  = data.iloc[train_size:]

In [None]:
# Append the bag-of-words meta-features as extra columns of the feature frames.
X_train = pd.concat([X_train, bag_meta_train], axis=1)
X_test = pd.concat([X_test, bag_meta_test], axis=1)

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# then applied to the test rows; column names and indices are kept.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

In [None]:
# Standardise the meta-features (fit on train, apply to test).
# NOTE(review): the row index is taken from X_train_vw / X_test_vw, which are
# defined elsewhere in the notebook -- confirm they match these rows.
scaler = StandardScaler()
meta_train_scaled = scaler.fit_transform(meta_train)
meta_test_scaled = scaler.transform(meta_test)

meta_train = pd.DataFrame(meta_train_scaled, index=X_train_vw.index, columns=meta_train.columns)
meta_test = pd.DataFrame(meta_test_scaled, index=X_test_vw.index, columns=meta_test.columns)

In [None]:
# Standardise the synergy features (fit on train, apply to test).
# NOTE(review): the row index is taken from X_train_vw / X_test_vw, which are
# defined elsewhere in the notebook -- confirm they match these rows.
scaler = StandardScaler()
syn_train_scaled = scaler.fit_transform(syn_train)
syn_test_scaled = scaler.transform(syn_test)

syn_train = pd.DataFrame(syn_train_scaled, index=X_train_vw.index, columns=syn_train.columns)
syn_test = pd.DataFrame(syn_test_scaled, index=X_test_vw.index, columns=syn_test.columns)

Используем изменённую функцию для генерации признаков для Vowpal Wabbit.

In [None]:
def _vw_nonzero_pairs(frame, row):
    """Format the non-zero cells of `row` as space-separated 'column:value' tokens."""
    return ' '.join('{0}:{1}'.format(col, val) for col, val in zip(frame.columns, row) if val != 0)


def make_features(data, heroes, items, abilities, meta, synergy, name, is_train, y_train=None):
    """Write one Vowpal Wabbit example per row of `data` to the file `name`.

    Each line has the form
    ``<label> |features ... |heroes ... |items ... |abilities ... |meta ... |synergy ...``.

    * ``features``  -- every column of `data` as ``column:value``.
    * ``heroes``    -- one token per non-zero cell of `heroes`: ``r_<column>``
      when the cell equals 1, ``d_<column>`` for any other non-zero value.
    * ``items``, ``abilities``, ``meta``, ``synergy`` -- the non-zero cells of
      the corresponding frame as ``column:value``.

    Parameters:
        data, heroes, items, abilities, meta, synergy: DataFrames whose index
            contains every label of ``data.index``; rows are looked up by label.
        name: path of the output file (overwritten).
        is_train: when true the label is 1 / -1 taken from `y_train`;
            otherwise a constant placeholder label 1 is written.
        y_train: targets, required when `is_train` is true. Either a Series
            indexed like `data`, or any array-like with one value per row of
            `data` in the same order.

    Raises:
        ValueError: if `is_train` is true and `y_train` is None.
    """
    if is_train:
        if y_train is None:
            raise ValueError('y_train is required when is_train is True')
        if not isinstance(y_train, pd.Series):
            # Positional targets (e.g. a NumPy array): align them with the rows
            # of `data` so they can be looked up by label below.
            y_train = pd.Series(np.asarray(y_train), index=data.index)

    with open(name, 'w') as fout:

        for match_id in log_progress(data.index, every=100):

            # Label-based row lookups (`.loc` replaces the removed `.ix`).
            row           = data.loc[match_id]
            row_heroes    = heroes.loc[match_id]
            row_items     = items.loc[match_id]
            row_abilities = abilities.loc[match_id]
            row_meta      = meta.loc[match_id]
            row_synergy   = synergy.loc[match_id]

            if is_train:
                target = 1 if y_train.loc[match_id] == 1 else -1
            else:
                # Placeholder label for rows without a known target.
                target = 1

            dense_features = ' '.join('{0}:{1}'.format(col, val) for col, val in zip(data.columns, row))
            hero_tokens = ' '.join('{0}_{1}'.format('r' if val == 1 else 'd', col)
                                   for col, val in zip(heroes.columns, row_heroes) if val != 0)

            fout.write(str(target) +
                       ' |features ' + dense_features +
                       ' |heroes ' + hero_tokens +
                       ' |items ' + _vw_nonzero_pairs(items, row_items) +
                       ' |abilities ' + _vw_nonzero_pairs(abilities, row_abilities) +
                       ' |meta ' + _vw_nonzero_pairs(meta, row_meta) +
                       ' |synergy ' + _vw_nonzero_pairs(synergy, row_synergy) +
                       '\n')

In [None]:
# Write the Vowpal Wabbit input files.
# make_features looks each target up by match label, so it is given the
# match_id-indexed Series y_train_df rather than the plain array y_train.
# NOTE(review): X_train_vw / X_test_vw are defined elsewhere in the notebook.
make_features(X_train_vw, heroes_match_train, items_train, abilities_train, meta_train, syn_train,
              'input/train.vw', True, y_train_df)
make_features(X_test_vw, heroes_match_test, items_test, abilities_test, meta_test, syn_test,
              'input/test.vw', False)

Обучим и предскажем.

In [None]:
# Train Vowpal Wabbit on the training file, then write test predictions.
vw_commands = [
    'vw -d input/train.vw -c -k -f model.vw --passes 200 -l 0.5 --sgd -b 26 --loss_function logistic --quiet',
    'vw -d input/test.vw -i model.vw -t -p output/predictions_tmp.txt --quiet',
]
for vw_command in vw_commands:
    # os.system reports failure only through its return value; stop here rather
    # than go on to read a stale or missing predictions file.
    if os.system(vw_command) != 0:
        raise RuntimeError('Vowpal Wabbit command failed: ' + vw_command)

# One prediction per line; take the first column of the output file.
preds2 = pd.read_csv('output/predictions_tmp.txt', header=None).iloc[:, 0].values
# NOTE(review): the index comes from X_test while the VW file was written from
# X_test_vw -- presumably the same matches in the same order; confirm.
preds2 = pd.DataFrame(preds2, columns=['radiant_win'], index=X_test.index)

##### Private LB: 0.76266

Хуже предыдущего (вероятно, переобучение, но было трудно кросс-валидироваться с метапризнаками).

#### Смешаем решения.

Коэффициенты были выбраны наугад за час до конца контеста, когда оставался лишь один сабмит.

In [None]:
def _min_max_scale(scores):
    """Shift and scale `scores` so its minimum maps to 0 and its maximum to 1."""
    return (scores - scores.min()) / (scores.max() - scores.min())


# Put both models' predictions on the same [0, 1] scale, then blend 80 / 20.
preds1 = _min_max_scale(preds1)
preds2 = _min_max_scale(preds2)
preds = preds1 * 0.8 + preds2 * 0.2
preds.to_csv('output/ensemble.csv')

##### Private LB: 0.76458