In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier
import os

In [0]:
# Short codes for the tower slots; combined with a 'p'/'e' prefix further down to
# build the `{p}_tower_{n}` categorical column names.
# NOTE(review): presumably t/m/b = top/mid/bottom lane (tiers 1-3) and at/ab = the
# two base towers — confirm against the preprocessing step.
tower_map = ['t1', 't2', 't3', 'm1', 'm2', 'm3', 'b1', 'b2', 'b3', 'at', 'ab']

In [0]:
def get_data(name):
    """Return the public S3 URL of the competition file with suffix ``name``.

    Despite the function's name, nothing is downloaded here: only the URL
    string is built, so the caller decides how to fetch it.
    """
    url = f'https://s3.eu-central-1.amazonaws.com/ai-academy-2019/public/final/academy2019_final_{name}'
    return url

In [0]:
# Load the preprocessed train and test tables, using the `id` column as the index.
# NOTE(review): these CSV files are not created by any visible cell — presumably
# they come from a separate preprocessing step; confirm they exist before running.
df = pd.read_csv('processed_train.csv', index_col='id')
test_df = pd.read_csv('processed_test.csv', index_col='id')

In [10]:
# (rows, columns) of the training frame, label column included.
df.shape

(40395, 819)

In [0]:
# Columns that are handed to CatBoost as categorical features.
cat_features = [
    'player_team',
    'winner_team',
    'pre_game_duration',
    'first_blood_claimed',
    'hero_id',
    'hero_pick_order',
    'leaver_status',
    'is_winner',
    'party_players',
    'level',
    'tower_kills',
    'roshan_kills',
    'radiant_tower_status',
    'dire_tower_status',
    'dire_barracks_status',
    'radiant_barracks_status',
]
# One column per tower slot and side prefix, ordered slot-major
# (p_tower_t1, e_tower_t1, p_tower_t2, ...).
# NOTE(review): 'p' / 'e' presumably stand for player / enemy — confirm.
cat_features.extend(
    f'{side}_tower_{slot}'
    for slot in tower_map
    for side in ('p', 'e')
)

In [0]:
# Name of the label column in `df`.
target = 'skilled'
# Every remaining column (neither the label nor a categorical feature) is
# treated as numeric; the order follows `df.columns`.
numeric_features = [
    col
    for col in df.columns
    if not (col == target or col in cat_features)
]

In [0]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=target),
    df[target],
    test_size=0.3,
    random_state=6741,
)

In [0]:
def cols_to_id(d, cc):
    """Translate column names into their positional indices within ``d``.

    Args:
        d: DataFrame whose column order defines the positions.
        cc: iterable of column names to look up.

    Returns:
        List of integer positions, in the same order as ``cc``. For a
        duplicated column name the first occurrence is reported.

    Raises:
        ValueError: if a name in ``cc`` is not a column of ``d``.
    """
    column_names = d.columns.tolist()
    positions = []
    for wanted in cc:
        positions.append(column_names.index(wanted))
    return positions


# Positional indices of the categorical columns within X_train; passed as
# `cat_features` when the CatBoost model is fitted.
cat_id = cols_to_id(X_train, cat_features)

In [0]:
def get_score_data(name):
    """Load the score history stored in ``{name}_score.csv``.

    When the file does not exist yet, a history holding a single baseline row
    (id 0, acc 0) is created, written to disk via ``save_score_data`` and
    returned.

    Args:
        name: prefix of the history file (``{name}_score.csv``).

    Returns:
        DataFrame indexed by ``id`` with an ``acc`` column.
    """
    history_path = f'{name}_score.csv'
    if not os.path.exists(history_path):
        # First run for this name: seed the history so later "diff from last"
        # and "diff from max" computations have a row to compare against.
        baseline = pd.DataFrame({'id': [0], 'acc': [0]}).set_index('id')
        save_score_data(name, baseline)
        return baseline
    history = pd.read_csv(history_path)
    return history.set_index('id')


def save_score_data(name, d):
    """Write the score history ``d`` to ``{name}_score.csv``.

    The index is written as a column named ``id`` so that the file can be
    read back with ``id`` as the index.

    Args:
        name: prefix of the history file (``{name}_score.csv``).
        d: DataFrame holding the score history; it is not modified.
    """
    f = f'{name}_score.csv'
    # rename_axis returns a renamed copy, so the caller's frame keeps its own
    # index name instead of being silently relabelled as a side effect.
    d.rename_axis('id').to_csv(f, index=True, header=True)


def score_model(pred_func, name):
    """Score a predictor on the hold-out split and record the result.

    Computes ``roc_auc_score(y_test, pred_func(X_test))`` on the module-level
    ``X_test`` / ``y_test``, prints the value together with its difference
    from the last and the best recorded value, and appends it to
    ``{name}_score.csv`` when it differs from the last recorded value by more
    than 1e-5.

    Note: the metric is ROC AUC even though the printed label says
    "Accuracy" and the stored column is called ``acc``; the column name is
    kept so existing history files remain readable.

    Args:
        pred_func: callable that receives the hold-out feature frame and
            returns the scores passed to ``roc_auc_score``.
        name: prefix of the history file (``{name}_score.csv``).
    """
    s_data = get_score_data(name)
    acc_score = roc_auc_score(y_test, pred_func(X_test))
    # Positional access to the most recent row: gives a scalar regardless of
    # what the index labels look like.
    last_acc = s_data['acc'].iloc[-1]
    max_acc = s_data['acc'].max()
    print(f"Scoring {name}")
    print("Accuracy:", acc_score)
    print("Diff from last:", round(acc_score - last_acc, 4))
    print("Diff from max:", round(acc_score - max_acc, 4))

    # Only record runs that actually changed the score.
    if abs(acc_score - last_acc) > 1e-5:
        # DataFrame.append was removed in pandas 2.0; pd.concat with
        # ignore_index=True produces the same renumbered history.
        new_row = pd.DataFrame({'acc': [acc_score]})
        s_data = pd.concat([s_data, new_row], ignore_index=True)
        save_score_data(name, s_data)

In [16]:
# NOTE(review): `model` is only assigned in the training cell further down, so on a
# fresh kernel this cell raises NameError (as its recorded output shows).
# Consider deleting this cell or moving it below the training cell.
model

NameError: ignored

In [18]:
# Gradient-boosted classifier, evaluated with AUC on the eval set each iteration.
model = CatBoostClassifier(
    iterations=6500,
    learning_rate=None,  # None: CatBoost picks the rate itself (see "Learning rate set to ..." in the log)
    depth=4,
    eval_metric='AUC',
    random_seed=6741,
    use_best_model=True,  # keep the iteration that scored best on eval_set
    verbose=True
)

# NOTE(review): the eval set used for best-iteration selection is the same
# hold-out split that score_model evaluates on, so the reported score is
# likely optimistic — consider a separate validation split.
model.fit(X_train, y_train, cat_features=cat_id, eval_set=(X_test, y_test))

# ROC AUC needs a continuous score: rank by the predicted probability of class 1
# rather than by the hard 0/1 labels from model.predict, which collapse the ROC
# curve to a single operating point and disagree with the AUC shown in the log.
# Values already stored in catboost_score.csv were computed from hard labels and
# are not directly comparable with the ones recorded from now on.
score_model(lambda X: model.predict_proba(X)[:, 1], 'catboost')

Learning rate set to 0.049742
0:	test: 0.6744963	best: 0.6744963 (0)	total: 374ms	remaining: 40m 31s
1:	test: 0.6935098	best: 0.6935098 (1)	total: 757ms	remaining: 41m
2:	test: 0.7004161	best: 0.7004161 (2)	total: 1.14s	remaining: 41m 8s
3:	test: 0.7036674	best: 0.7036674 (3)	total: 1.51s	remaining: 40m 53s
4:	test: 0.7081656	best: 0.7081656 (4)	total: 1.9s	remaining: 41m 8s
5:	test: 0.7152129	best: 0.7152129 (5)	total: 2.3s	remaining: 41m 26s
6:	test: 0.7165878	best: 0.7165878 (6)	total: 2.67s	remaining: 41m 19s
7:	test: 0.7235663	best: 0.7235663 (7)	total: 3.04s	remaining: 41m 5s
8:	test: 0.7306797	best: 0.7306797 (8)	total: 3.43s	remaining: 41m 16s
9:	test: 0.7336560	best: 0.7336560 (9)	total: 3.81s	remaining: 41m 13s
10:	test: 0.7386702	best: 0.7386702 (10)	total: 4.17s	remaining: 41m 1s
11:	test: 0.7406676	best: 0.7406676 (11)	total: 4.58s	remaining: 41m 14s
12:	test: 0.7397669	best: 0.7406676 (11)	total: 4.97s	remaining: 41m 18s
13:	test: 0.7462947	best: 0.7462947 (13)	total: 5.3

In [0]:
# Read the sample submission straight from its public URL, indexed by `id`;
# later cells compare its ids with those of the test frame and the written file.
subm = pd.read_csv(get_data('sample_submission.csv'), index_col='id')

In [0]:
# Peek at the last rows of the sample submission.
subm.tail()

Unnamed: 0_level_0,skilled_prob
id,Unnamed: 1_level_1
56226,0.5
56230,0.5
56231,0.5
56232,0.5
56234,0.5


In [0]:
# Compare row counts: test frame vs. sample submission.
test_df.shape[0], subm.shape[0]

(15835, 15836)

In [0]:
# Rows of the sample submission whose id has no matching row in test_df.
subm[~(subm.index.isin(test_df.index))]

Unnamed: 0_level_0,skilled_prob
id,Unnamed: 1_level_1


In [0]:
# The sample submission lists the ids that must be scored, but test_df can lack
# some of them (the original cell hard-coded id 13862). Add an all-zero feature
# row for each missing id so that a prediction is produced for it.
# Deriving the ids from `subm` removes the magic number and makes the cell a
# no-op when nothing is missing, so it is safe to re-run.
# NOTE(review): the added rows are appended at the end of test_df, so the row
# order may differ from the sample submission's order.
for missing_id in subm.index.difference(test_df.index):
    test_df.loc[missing_id] = 0

In [0]:
def _write_submission(predict_func, proba_column, path, data=None):
    """Predict on ``data`` and write one probability column as a submission CSV."""
    # Fall back to the notebook-level test frame when no data is supplied.
    features = test_df if data is None else data
    pred = predict_func(features)
    subm = pd.DataFrame({'id': features.index, 'skilled_prob': pred[:, proba_column]})
    subm.to_csv(path, header=True, index=False)


def make_submission(predict_func, data=None, path='submission.csv'):
    """Write a submission holding the second probability column (index 1).

    Args:
        predict_func: callable returning a 2-D array with one row per input
            row and at least two columns (e.g. ``model.predict_proba``).
        data: feature frame to predict on; defaults to the module-level
            ``test_df``.
        path: destination CSV file.
    """
    _write_submission(predict_func, 1, path, data)


def make_reversed_submission(predict_func, data=None, path='reversed-submission.csv'):
    """Write a submission holding the first probability column (index 0).

    Same as ``make_submission`` but with the other column, i.e. the
    complementary probability when ``predict_func`` returns two-class
    probabilities.

    Args:
        predict_func: callable returning a 2-D array with one row per input
            row and at least two columns (e.g. ``model.predict_proba``).
        data: feature frame to predict on; defaults to the module-level
            ``test_df``.
        path: destination CSV file.
    """
    _write_submission(predict_func, 0, path, data)


# Write both variants from the trained model's class probabilities:
# 'submission.csv' (probability column 1) and 'reversed-submission.csv' (column 0).
make_submission(model.predict_proba)
make_reversed_submission(model.predict_proba)

In [0]:
# Read the reversed submission back from disk to sanity-check what was written.
rs = pd.read_csv('reversed-submission.csv', index_col='id')

In [0]:
# First rows of the written file.
rs.head()

Unnamed: 0_level_0,skilled_prob
id,Unnamed: 1_level_1
4,0.119157
5,0.957819
6,0.031218
8,0.974809
9,0.862113


In [0]:
# Row count of the written file, to compare with the sample submission.
rs.shape

(15836, 1)

In [0]:
# Smallest written probability (range sanity check).
rs.min()

skilled_prob    0.004886
dtype: float64

In [0]:
# Check that the written ids equal the sample submission's ids, position by position.
(rs.index == subm.index).all()

True

In [0]:
# Colab-only convenience: hand the reversed submission to the browser as a download.
# NOTE(review): `google.colab` is normally importable only inside Google Colab, so
# this cell is expected to fail elsewhere; the import also sits mid-notebook rather
# than in the imports cell at the top.
from google.colab import files
files.download('reversed-submission.csv')

In [0]:
# Show the first lines of the CSV exactly as written on disk (shell command).
!head -5 reversed-submission.csv

id,skilled_prob
4,0.7759515881254013
5,0.22238141142151346
6,0.7601565837945379
8,0.2411486520047148
