In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import Pool
import numpy as np

In [66]:
raw_data_actions = pd.read_csv('users_hands_all_states_df.csv')

In [67]:
def colummns_to_drop(df):
    for col in df.columns:
        if len(set(df[col])) == 1:
            df.drop(col, axis=1, inplace=True)
            #print(col)

    columns_to_drop = ['hand_id', 'user_action_sum', 'user_result']
    df.drop(columns_to_drop, axis=1, inplace=True)

In [68]:
colummns_to_drop(raw_data_actions)

In [69]:
raw_data_actions = raw_data_actions.loc[raw_data_actions['user_action_type'].isin(['raises', 'bets', 'folds', 'checks', 'calls'])].copy()

In [70]:
raw_data_actions['user_action_type'].unique()

array(['raises', 'bets', 'folds', 'checks', 'calls'], dtype=object)

In [71]:
def label_func(x):
    if x in ['raises', 'bets']:
        x = 1
    elif x == 'calls':
        x = 0.75
    elif x == 'checks':
        x = 0.5
    elif x == 'folds':
        x = 0
    return x
raw_data_actions['target'] = raw_data_actions['user_action_type'].apply(label_func)

In [72]:
raw_data_actions.drop('user_action_type', axis=1, inplace=True)

In [73]:
raw_data_actions.shape

(15019, 166)

In [74]:
def impute_with_unknown(df):
    for col in df.columns:
        a = df[col].isna().sum()
        if a > 0:
            print(col, a)
            df[col].fillna('unknown', inplace=True)

In [75]:
impute_with_unknown(raw_data_actions)

flop11 9697
flop12 9697
flop21 9697
flop22 9697
flop31 9697
flop32 9697
turn11 12491
turn12 12491
river11 14044
river12 14044
pre-flop_player_3_action_type_1 4
pre-flop_player_4_action_type_1 12
pre-flop_player_4_action_type_2 3
flop_player_4_action_type_1 2
turn_player_4_action_type_1 1


In [76]:
number_of_missing = raw_data_actions.isna().sum()

In [77]:
number_of_missing.sum()

0

In [78]:
def func(x):
    if float(x) == 0.01:
        return 'one'
    elif float(x) == 0.02:
        return 'two'
    elif float(x) == 0.05:
        return 'five'
    elif float(x) == 0.25:
        return 'twentyfive'
    elif float(x) == 0.5:
        return 'fivety'
    elif float(x) == 1.0:
        return 'hundred'
    elif float(x) == 3.0:
        return '3hundred'
    elif float(x) == 0.1:
        return 'zeroone'
    elif float(x) == 2.0:
        return 'twozero'
    elif float(x) == 6.0:
        return 'sixzero'

In [79]:
raw_data_actions['type2'] = raw_data_actions['type2'].apply(func)
raw_data_actions['type3'] = raw_data_actions['type3'].apply(func)

In [80]:
raw_data_actions.shape

(15019, 166)

In [102]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from catboost import CatBoostClassifier, sum_models


def cv_catboost(X, Y, validation_df, cat_columns, target_column_name, number_of_runs):
    # CROSS VALIDATION
    cv = KFold(n_splits=5, shuffle=True, random_state=7)

    # RUN
    c = 0
    oof_preds = np.zeros((len(X), 5))
    models = []

    for train, valid in cv.split(X, Y):
        print("VAL fold %s" % c)
        X_train = X.iloc[train]
        Y_train = Y.iloc[train]
        X_valid = X.iloc[valid]
        Y_valid = Y.iloc[valid]

        model = CatBoostClassifier(iterations=1500, 
                                   verbose=200,
                                   random_seed=43,
                                   od_type='Iter',
                                   od_wait=500,
                                   loss_function='MultiClass'
                                   )
        
        model.fit(X_train, Y_train,
                eval_set=(X_valid, Y_valid), 
                use_best_model=True,
                cat_features=cat_columns,
                )
        
        
        print(model.predict_proba(X_valid))
        
        oof_preds[valid, 0] = [r[0] for r in model.predict_proba(X_valid)]
        oof_preds[valid, 1] = [r[1] for r in model.predict_proba(X_valid)]
        oof_preds[valid, 2] = [r[2] for r in model.predict_proba(X_valid)]
        oof_preds[valid, 3] = [r[3] for r in model.predict_proba(X_valid)]
        oof_preds[valid, 4] = Y_valid
        models.append(model)

        accuracy = accuracy_score([int(i*100) for i in Y_valid.tolist()], [int(i*100) for i in model.predict(X_valid)])
        print(f'Accuracy for fold {c} is {accuracy}')
        
        print(classification_report([int(i*100) for i in Y_valid.tolist()], [int(i*100) for i in model.predict(X_valid)]))
        
        c += 1
        
        if c == number_of_runs:
            break
        
    final_model = sum_models(models, ctr_merge_policy='LeaveMostDiversifiedTable')

    # SAVE OOF PREDS
    oof_pred_df = pd.DataFrame(columns=['ID_code', 'target'])
    oof_pred_df['ID_code'] = pd.Series(X.index.tolist())
    oof_pred_df['target'] = pd.Series([r[0] for r in oof_preds])
    oof_pred_df['real_target'] = pd.Series([r[1] for r in oof_preds])

    return final_model, oof_pred_df, models

In [82]:
raw_data_actions.sort_values(by=['user_id'], inplace=True)

In [83]:
users = list(set(raw_data_actions['user_id']))
users_train = users[:167]
users_valid = users[167:]

In [84]:
train_df = raw_data_actions.loc[raw_data_actions['user_id'].isin(users_train)]
validation_df = raw_data_actions.loc[raw_data_actions['user_id'].isin(users_valid)]

In [85]:
train_df.index = train_df['unique_id']
validation_df.index = validation_df['unique_id']
train_df.drop(['unique_id', 'user_id'], axis=1, inplace=True)
validation_df.drop(['unique_id', 'user_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [86]:
X, Y = train_df.drop(['target'], axis=1), train_df['target']

In [87]:
def get_cat_cols(X, other_method=False):
    categorical_cols = []
    for col in X.columns:
        if other_method is False:
            s = X[col].unique().shape[0]
            if s < 38 and 'sum' not in col and 'money_had' not in col:
                samp = set(X[col].sample(100))
                #print(f'For column {col} the number of unique valriables is {s}, sample {samp}')
                categorical_cols.append(col)
        else:
            if X[col].dtype == object:
                categorical_cols.append(col)
    return categorical_cols

In [88]:
categorical_cols = get_cat_cols(X)

In [89]:
categorical_indices = [X.columns.get_loc(col) for col in categorical_cols]

In [61]:
import time
start = time.time()
final_model, oof_pred_df, models = cv_catboost(X, Y, validation_df, categorical_indices, 'target', 1)
print(time.time() - start)

VAL fold 0
Learning rate set to 0.097035
0:	learn: 1.3273186	test: 1.3288941	best: 1.3288941 (0)	total: 360ms	remaining: 8m 59s
200:	learn: 0.8059814	test: 0.8659053	best: 0.8659053 (200)	total: 1m 56s	remaining: 12m 34s
400:	learn: 0.7271153	test: 0.8330444	best: 0.8330363 (399)	total: 5m 4s	remaining: 13m 53s
600:	learn: 0.6677347	test: 0.8142273	best: 0.8141703 (599)	total: 7m 36s	remaining: 11m 22s
800:	learn: 0.6208504	test: 0.8064176	best: 0.8064176 (800)	total: 10m 18s	remaining: 8m 59s
1000:	learn: 0.5832920	test: 0.8005979	best: 0.8004926 (988)	total: 12m 34s	remaining: 6m 16s
1200:	learn: 0.5479617	test: 0.7979607	best: 0.7976729 (1153)	total: 14m 48s	remaining: 3m 41s
1400:	learn: 0.5130975	test: 0.7970093	best: 0.7966879 (1385)	total: 17m 27s	remaining: 1m 14s
1499:	learn: 0.4977297	test: 0.7964613	best: 0.7962129 (1491)	total: 18m 59s	remaining: 0us

bestTest = 0.7962128838
bestIteration = 1491

Shrink model to first 1492 iterations.
[[3.99818055e-01 2.25801661e-04 5.22810

In [90]:
manual_embeddings_df = pd.read_csv('manual_player_embeddings_no_data_leakege.csv')

In [94]:
train_df_with_embeddings_df = train_df.merge(manual_embeddings_df, left_on=train_df.index, right_on='unique_id', how='inner')

In [95]:
validation_df_with_embeddings_df = validation_df.merge(manual_embeddings_df, left_on=validation_df.index, right_on='unique_id', how='inner')

In [98]:
train_df_with_embeddings_df.drop(['unique_id', 'user_id'], axis=1, inplace=True)
validation_df_with_embeddings_df.drop(['unique_id', 'user_id'], axis=1, inplace=True)

In [99]:
X, Y = train_df_with_embeddings_df.drop(['target'], axis=1), train_df_with_embeddings_df['target']

In [100]:
categorical_cols = get_cat_cols(X, True)
categorical_indices = [X.columns.get_loc(col) for col in categorical_cols]

In [103]:
start = time.time()
final_model, oof_pred_df, models = cv_catboost(X, Y, validation_df_with_embeddings_df, categorical_indices, 'target', 1)
print(time.time() - start)

VAL fold 0
Learning rate set to 0.074462
0:	learn: 1.3378291	test: 1.3369499	best: 1.3369499 (0)	total: 536ms	remaining: 26m 46s
200:	learn: 0.8136117	test: 0.8671182	best: 0.8671182 (200)	total: 3m 37s	remaining: 50m 29s
400:	learn: 0.7298008	test: 0.8305068	best: 0.8305068 (400)	total: 6m 53s	remaining: 44m 39s
600:	learn: 0.6774187	test: 0.8179279	best: 0.8179279 (600)	total: 10m 5s	remaining: 40m 17s
800:	learn: 0.6336743	test: 0.8088532	best: 0.8088320 (798)	total: 13m 50s	remaining: 38m
1000:	learn: 0.5965985	test: 0.8043433	best: 0.8040587 (995)	total: 17m 42s	remaining: 35m 22s
1200:	learn: 0.5658582	test: 0.8006693	best: 0.8005422 (1193)	total: 21m 38s	remaining: 32m 24s
1400:	learn: 0.5367390	test: 0.7988317	best: 0.7988317 (1400)	total: 25m 54s	remaining: 29m 34s
1600:	learn: 0.5110998	test: 0.7969978	best: 0.7969074 (1598)	total: 30m 17s	remaining: 26m 27s
1800:	learn: 0.4862127	test: 0.7961990	best: 0.7961990 (1800)	total: 34m 53s	remaining: 23m 13s
2000:	learn: 0.4649230	

In [113]:
important_features_df = models[0].get_feature_importance(Pool(validation_df_with_embeddings_df.drop('target', axis=1), validation_df_with_embeddings_df['target'], categorical_indices), prettified=True)

In [116]:
important_features_df

Unnamed: 0,Feature Id,Importances
0,stage,12.504272
1,bank_before,6.481073
2,user_step_num,4.802975
3,turn12,2.950595
4,pre-flop_player_3_action_type_1,2.523839
...,...,...
2483,24_river_checks_user_action_sum_std,0.000000
2484,24_river_calls_user_action_sum_count,0.000000
2485,24_river_calls_user_action_sum_mean,0.000000
2486,24_river_calls_user_action_sum_sum,0.000000


In [121]:
important_features_df = important_features_df[important_features_df['Importances']>0.01].copy()

In [122]:
important_features_df

Unnamed: 0,Feature Id,Importances
0,stage,12.504272
1,bank_before,6.481073
2,user_step_num,4.802975
3,turn12,2.950595
4,pre-flop_player_3_action_type_1,2.523839
...,...,...
483,turn_calls_div_action_sum_money_have_sum,0.010361
484,6_pre-flop_checks_user_action_sum_count,0.010319
485,12_river_bets_user_action_sum_count,0.010130
486,12_turn_calls_user_action_sum_count,0.010130


In [124]:
useful_features = important_features_df['Feature Id'].tolist()

In [125]:
new_X = X[useful_features].copy()
new_validation_df = validation_df_with_embeddings_df[useful_features + ['target']].copy()
categorical_cols = get_cat_cols(new_X, True)
categorical_indices = [new_X.columns.get_loc(col) for col in categorical_cols]

In [126]:
start = time.time()
final_model, oof_pred_df, models = cv_catboost(new_X, Y, new_validation_df, categorical_indices, 'target', 1)
print(time.time() - start)

VAL fold 0
Learning rate set to 0.074462
0:	learn: 1.3381361	test: 1.3362325	best: 1.3362325 (0)	total: 657ms	remaining: 32m 50s
200:	learn: 0.8119027	test: 0.8689470	best: 0.8689470 (200)	total: 2m 36s	remaining: 36m 18s
400:	learn: 0.7238180	test: 0.8328667	best: 0.8328667 (400)	total: 5m 5s	remaining: 33m 2s
600:	learn: 0.6691093	test: 0.8153767	best: 0.8153767 (600)	total: 7m 40s	remaining: 30m 39s
800:	learn: 0.6227599	test: 0.8064909	best: 0.8064909 (800)	total: 10m 11s	remaining: 27m 58s
1000:	learn: 0.5863778	test: 0.8014357	best: 0.8010967 (979)	total: 12m 51s	remaining: 25m 40s
1200:	learn: 0.5515581	test: 0.7969478	best: 0.7969478 (1200)	total: 15m 37s	remaining: 23m 24s
1400:	learn: 0.5237036	test: 0.7954406	best: 0.7954386 (1399)	total: 18m 36s	remaining: 21m 14s
1600:	learn: 0.4958653	test: 0.7935531	best: 0.7933106 (1574)	total: 21m 29s	remaining: 18m 46s
1800:	learn: 0.4727337	test: 0.7932602	best: 0.7929081 (1774)	total: 24m 38s	remaining: 16m 24s
2000:	learn: 0.449923

In [127]:
important_features_df.to_csv('important_features.csv', index=False)