# Data preparation

In [None]:
! pip install -q kaggle

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Download the competition files; files that already exist locally and are newer
# are skipped unless --force is passed.
! kaggle competitions download -c python-and-analyze-data-final-project

transactions.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
gender_test_kaggle_sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)
tr_types.csv: Skipping, found more recently modified local copy (use --force to force download)
tr_mcc_codes.csv: Skipping, found more recently modified local copy (use --force to force download)
gender_train.csv: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import re
from warnings import filterwarnings

import catboost
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm_notebook
# Silence library warnings so they do not clutter the cell outputs.
filterwarnings('ignore')

# Always use fully-qualified option names: the bare 'max_columns' pattern is ambiguous in
# pandas >= 1.4 (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 300)

In [None]:
# Reference tables (semicolon-separated), keyed by their code columns.
tr_mcc_codes = pd.read_csv('/content/tr_mcc_codes.csv', index_col='mcc_code', sep=';')
tr_types = pd.read_csv('/content/tr_types.csv', index_col='tr_type', sep=';')

# Transactions and the two customer lists all share the customer_id index.
transactions = pd.read_csv('/content/transactions.csv.zip', index_col='customer_id')
gender_train = pd.read_csv('/content/gender_train.csv', index_col='customer_id')
gender_test = pd.read_csv('/content/gender_test_kaggle_sample_submission.csv', index_col='customer_id')

# An inner join keeps only the transactions of customers listed in the respective file.
transactions_train, transactions_test = (
    transactions.join(customers, how='inner')
    for customers in (gender_train, gender_test)
)

In [None]:
# Derive time features from `tr_datetime`, which is parsed as "<day number> <HH>...".
for df in [transactions_train, transactions_test]:
    # One vectorised pass: column 0 is the day number, column 1 the hour.
    # (A raw string also avoids the invalid "\d" escape of a plain literal.)
    day_hour = df['tr_datetime'].str.extract(r'^\s*(\d+)\s+(\d+)').astype(int)
    df['day'] = day_hour[0] % 7      # position within a 7-day cycle
    df['hour'] = day_hour[1]
    # Negate the boolean mask BEFORE casting: `~` on an int Series is a bitwise NOT
    # and would yield -1/-2 instead of the intended 1 (night) / 0 (06:00-22:59).
    df['night'] = (~df['hour'].between(6, 22)).astype(int)

In [None]:
# Register tqdm with pandas so that `.progress_apply(...)` is available and shows a
# progress bar labelled "Progress:".
tqdm_notebook.pandas(desc="Progress:")

def features_creation_advanced(x):
    """Build the feature vector for one customer's transactions.

    Parameters
    ----------
    x : pandas.DataFrame
        Transactions of a single customer. Must contain the columns
        'day', 'hour', 'night', 'amount', 'mcc_code' and 'tr_type'.

    Returns
    -------
    pandas.Series
        Concatenation, in this order, of:
        * the share of transactions per day / hour / night flag
          (labels 'day_<v>', 'hour_<v>', 'night_<v>');
        * min, max, mean, median, std, count and sum of the positive amounts
          ('positive_transactions_<stat>') and of the negative amounts
          ('negative_transactions_<stat>');
        * the share of transactions per MCC code ('mcc_<code>') and per
          transaction type ('tr_type_<type>').
    """
    amount_stats = ['min', 'max', 'mean', 'median', 'std', 'count', 'sum']
    amount = x['amount']

    def share(column, prefix):
        # Relative frequency of every distinct value in `column`, with labelled index.
        return x[column].value_counts(normalize=True).add_prefix(prefix)

    parts = [
        share(column, prefix)
        for column, prefix in (('day', 'day_'), ('hour', 'hour_'), ('night', 'night_'))
    ]

    # Incoming (positive) and outgoing (negative) amounts are summarised separately.
    parts.append(amount[amount > 0].agg(amount_stats).add_prefix('positive_transactions_'))
    parts.append(amount[amount < 0].agg(amount_stats).add_prefix('negative_transactions_'))

    parts.append(share('mcc_code', 'mcc_'))
    parts.append(share('tr_type', 'tr_type_'))

    return pd.concat(parts)


In [None]:
# One group per customer (the index holds customer_id).
train_groups = transactions_train.groupby(transactions_train.index)
test_groups = transactions_test.groupby(transactions_test.index)

# Each customer yields a labelled Series; unstacking the innermost index level turns
# those labels into columns, giving one row per customer.
data_train = train_groups.progress_apply(features_creation_advanced).unstack(-1)
data_test = test_groups.progress_apply(features_creation_advanced).unstack(-1)

# Pull the label / submission column for the customers present in each matrix.
target = data_train.join(gender_train, how='inner')['gender']
test_target = data_test.join(gender_test, how='inner')['probability']

# Class balance of the training labels.
target.value_counts()

Progress::   0%|          | 0/8400 [00:00<?, ?it/s]

Progress::   0%|          | 0/3600 [00:00<?, ?it/s]

0    4687
1    3713
Name: gender, dtype: int64

The classes are reasonably balanced (4687 vs. 3713 customers).

In [None]:
# Preview the engineered test-set feature matrix (pandas truncates the rendered table).
data_test

Unnamed: 0_level_0,day_0,day_1,day_2,day_3,day_4,day_5,day_6,hour_0,hour_1,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_2,hour_20,hour_21,hour_22,hour_23,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,mcc_1711,mcc_1731,mcc_1799,mcc_2741,mcc_3000,mcc_3351,mcc_3501,mcc_4111,mcc_4112,mcc_4121,mcc_4131,mcc_4214,mcc_4215,mcc_4411,mcc_4511,mcc_4722,mcc_4784,mcc_4789,mcc_4812,...,tr_type_2371,tr_type_2440,tr_type_2446,tr_type_2456,tr_type_2460,tr_type_4010,tr_type_4011,tr_type_4020,tr_type_4021,tr_type_4031,tr_type_4041,tr_type_4045,tr_type_4051,tr_type_4061,tr_type_4071,tr_type_4090,tr_type_4096,tr_type_4097,tr_type_4100,tr_type_4110,tr_type_4200,tr_type_4210,tr_type_4500,tr_type_6000,tr_type_6010,tr_type_6100,tr_type_6110,tr_type_6200,tr_type_6210,tr_type_7010,tr_type_7011,tr_type_7014,tr_type_7015,tr_type_7020,tr_type_7021,tr_type_7024,tr_type_7025,tr_type_7030,tr_type_7031,tr_type_7034,tr_type_7035,tr_type_7040,tr_type_7041,tr_type_7044,tr_type_7070,tr_type_7071,tr_type_7074,tr_type_7075,tr_type_8100,tr_type_8146
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
6815,0.119469,0.132743,0.119469,0.163717,0.181416,0.146018,0.137168,,0.004425,0.181416,0.132743,0.061947,0.079646,0.159292,0.079646,0.039823,0.039823,0.030973,0.026549,0.004425,0.030973,0.008850,,,,,,0.004425,0.008850,0.017699,0.088496,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.022124,,,,,,,,,0.004425,,,,,,,,,,,
27914,0.135135,0.261261,0.180180,0.090090,0.135135,0.027027,0.171171,0.018018,,0.126126,0.090090,0.027027,0.081081,0.126126,0.018018,0.117117,0.054054,0.027027,0.018018,,,,,,0.009009,0.009009,0.018018,0.018018,0.054054,0.144144,0.045045,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.297297,,,,0.018018,,,,,,,,,,,,,,,,
31385,0.153425,0.120548,0.145205,0.123288,0.145205,0.167123,0.145205,0.030137,,0.060274,0.106849,0.052055,0.098630,0.038356,0.043836,0.052055,0.043836,0.063014,0.049315,,0.049315,0.109589,0.071233,0.008219,,,,,0.043836,0.043836,0.035616,,,,,,,,,,,,,,,,,,,0.002740,...,,,,,,,0.00274,,,0.005479,0.005479,,0.002740,,,,,,,,,,,,,,0.021918,,,0.013699,0.005479,,,,0.010959,,,0.005479,0.002740,,,,,,0.002740,,,,,
38084,0.142857,0.153361,0.110294,0.130252,0.159664,0.137605,0.165966,0.143908,0.006303,0.053571,0.054622,0.055672,0.073529,0.059874,0.047269,0.118697,0.096639,0.046218,0.044118,0.004202,0.039916,0.013655,0.015756,0.008403,0.004202,0.002101,0.005252,0.008403,0.007353,0.028361,0.061975,,,,,,,,,0.003151,,,,,,,0.001050,,,0.005252,...,0.004202,,,,,0.007353,0.00105,,,,,,0.002101,,0.004202,,,,,,,,,,,,,,,0.087185,,,,,,,,0.014706,,,,,,,0.017857,,,,,
78659,0.159875,0.119122,0.134796,0.147335,0.134796,0.147335,0.156740,0.291536,,0.009404,0.018809,0.021944,0.012539,0.047022,0.043887,0.021944,0.040752,0.025078,0.040752,,0.141066,0.137931,0.062696,0.025078,0.009404,0.034483,0.003135,,,0.003135,0.009404,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.003135,,0.097179,,,,,,,,,,,,,,,0.043887,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99918216,0.132979,0.175532,0.101064,0.143617,0.138298,0.117021,0.191489,0.058511,,0.042553,0.047872,0.090426,0.042553,0.037234,0.010638,0.021277,0.026596,0.010638,0.010638,0.053191,0.015957,,0.015957,,0.015957,0.074468,0.101064,0.079787,0.117021,0.069149,0.058511,,,,,,,,,,,,,,,,0.005319,,,0.010638,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.026596,,,,,,,,,,,,,,,,,,,,
99938948,0.148265,0.162461,0.127760,0.124606,0.145110,0.162461,0.129338,0.160883,0.001577,0.037855,0.042587,0.056782,0.083596,0.077287,0.059937,0.069401,0.096215,0.055205,0.089905,,0.083596,0.014196,0.004732,0.014196,,,,,0.004732,0.011041,0.036278,,,,,,,,,,,,,,,,,,,,...,0.001577,,,,,,,,,,,,0.001577,,0.001577,,,,,,,,,,,,,,,0.001577,,,,,,,,0.039432,,,,,,,0.023659,0.003155,,,,
99953525,0.139535,0.162791,0.162791,0.133721,0.104651,0.133721,0.162791,0.069767,,0.069767,0.093023,0.110465,0.069767,0.063953,0.034884,,0.023256,0.040698,0.040698,,0.087209,0.017442,,,,,0.116279,0.040698,0.040698,,0.081395,,,,,,,,,,,,,,,,,,,,...,,0.005814,,,,0.052326,,,,,,,,,,,,,,0.005814,,,,,,,,,,0.011628,,,,0.005814,,,,0.069767,,,,,,,0.005814,,,,,
99985917,0.200000,0.050000,,0.300000,0.050000,0.200000,0.200000,,,,0.800000,0.100000,0.050000,,,0.050000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.150000,,,,0.050000,,,,,,,,,,,,,,,,


In [None]:
target = data_train.join(gender_train, how='inner')['gender']
test_target = data_test.join(gender_test, how='inner')['probability']
test_target = test_target.to_frame()

# Some features occur in only one of the two sets, so keep just the shared ones.
test_features = data_test.columns.to_list()
train_features = data_train.columns.to_list()

# Set membership keeps this linear; iterating over the training columns fixes ONE column
# order that is then imposed on both frames. That matters because the frames are later
# converted to bare arrays, where columns are matched purely by position.
test_feature_set = set(test_features)
shared_features = [feature for feature in train_features if feature in test_feature_set]

data_test = data_test[shared_features]
data_train = data_train[shared_features]

In [None]:
# NaN marks a feature that is absent for a customer (introduced by the unstack);
# encode it as -1 so that it is distinguishable from real values.
X = data_train.fillna(-1)
y = target
# Select the test columns in the training order so that both arrays line up by position.
test_df = data_test[data_train.columns].fillna(-1)

# Fit the scaler on the training data ONLY and reuse its mean/variance for the test data.
# Re-fitting on the test set would scale the two sets differently, so the model would
# see shifted feature values at prediction time.
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_df = scaler.transform(test_df)

X.shape, test_df.shape

((8400, 304), (3600, 304))

# Model

In [None]:
# Baseline: CatBoost with 300 trees and otherwise default settings, trained on all features.
model = catboost.CatBoostClassifier(n_estimators=300)
model.fit(X=X, y=y)

Learning rate set to 0.077105
0:	learn: 0.6708862	total: 85ms	remaining: 25.4s
1:	learn: 0.6514069	total: 153ms	remaining: 22.8s
2:	learn: 0.6372674	total: 220ms	remaining: 21.8s
3:	learn: 0.6210645	total: 283ms	remaining: 20.9s
4:	learn: 0.6084875	total: 348ms	remaining: 20.5s
5:	learn: 0.5954136	total: 414ms	remaining: 20.3s
6:	learn: 0.5845983	total: 481ms	remaining: 20.1s
7:	learn: 0.5744677	total: 548ms	remaining: 20s
8:	learn: 0.5654904	total: 617ms	remaining: 20s
9:	learn: 0.5592246	total: 683ms	remaining: 19.8s
10:	learn: 0.5515547	total: 755ms	remaining: 19.8s
11:	learn: 0.5446227	total: 823ms	remaining: 19.8s
12:	learn: 0.5387081	total: 893ms	remaining: 19.7s
13:	learn: 0.5338950	total: 957ms	remaining: 19.6s
14:	learn: 0.5294960	total: 1.04s	remaining: 19.7s
15:	learn: 0.5257418	total: 1.11s	remaining: 19.6s
16:	learn: 0.5210425	total: 1.17s	remaining: 19.5s
17:	learn: 0.5175217	total: 1.23s	remaining: 19.3s
18:	learn: 0.5142404	total: 1.3s	remaining: 19.2s
19:	learn: 0.5107

<catboost.core.CatBoostClassifier at 0x7fd294d36e50>

In [None]:
# Importance score of every feature, as reported by the fitted model.
fimp = model.get_feature_importance()

# Pair each training column with its importance score.
feat_imp = dict(feature=data_train.columns, feature_importance=fimp)

df_feat_imp = pd.DataFrame(feat_imp)
df_feat_imp

In [None]:
# Some of the features have zero importance: rank all features from most to least
# important, then keep only those with a strictly positive score.
df_feat_imp = (
    df_feat_imp
    .sort_values(by='feature_importance', ascending=False)
    .loc[lambda ranked: ranked['feature_importance'] > 0]
)

In [None]:
# Five most important features.
df_feat_imp.head(n=5)

Unnamed: 0,feature,feature_importance
146,mcc_5977,6.162479
94,mcc_5533,5.172112
95,mcc_5541,4.285753
105,mcc_5661,3.231115
124,mcc_5912,3.150768


It appears that Cosmetics (5977), Car accessories (5533), Car services (5541), Shoe stores (5661) and Pharmacies (5912) are the most significant categories.

In [None]:
# Let's see how the model performs without the zero-like feature importances.

# Number of top-ranked features to keep (df_feat_imp is sorted by descending importance).
N_TOP_FEATURES = 181
selected_features = df_feat_imp.feature.iloc[:N_TOP_FEATURES].to_list()

data_train_upd = data_train[selected_features]
data_test_upd = data_test[selected_features]

# Missing features are encoded as -1, as for the full feature set.
X_upd = data_train_upd.fillna(-1)
y = target
test_df_upd = data_test_upd.fillna(-1)

# Fit the scaler on the training data ONLY and apply the same transformation to the
# test data; re-fitting on the test set would scale the two sets inconsistently.
scaler = StandardScaler()
X_upd = scaler.fit_transform(X_upd)
test_df_upd = scaler.transform(test_df_upd)

X_upd.shape, test_df_upd.shape

((8400, 181), (3600, 181))

In [None]:
# Retrain on the reduced feature set, this time with 400 trees.
model = catboost.CatBoostClassifier(n_estimators=400)
model.fit(X=X_upd, y=y)

In [None]:
# Round the `probability` column to whole numbers and store it as integers.
# NOTE(review): test_target is read from the sample-submission file (gender_test), so these
# are presumably placeholder values rather than true labels — confirm before scoring on them.
test_target = test_target.round(0).astype(int)

Let's fine-tune the model's hyperparameters with Optuna.

In [None]:
def objective(trial):
    """Optuna objective: hold-out ROC AUC of a CatBoost model for one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters.

    Returns
    -------
    float
        ROC AUC on a stratified 25% hold-out split of the training data (to be maximised).

    Notes
    -----
    The score is computed on labelled training data, not on `test_target`: that frame is
    read from the sample-submission file, so it is not a reliable ground truth, and tuning
    against the test set would leak it into model selection anyway.
    """
    param = {
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }
    # These two parameters are only sampled for the bootstrap type they are paired with.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # A fixed random_state gives every trial the same split, so scores are comparable.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_upd, y, test_size=0.25, stratify=y, random_state=42
    )

    mod = catboost.CatBoostClassifier(**param)
    mod.fit(X_fit, y_fit)

    # ROC AUC is a ranking metric: score the positive-class probabilities, not the
    # hard 0/1 predictions (which collapse the curve to a single operating point).
    val_proba = mod.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

if __name__ == "__main__":
    # Maximise the objective; stop after 100 trials or a 600-second timeout, whichever comes first.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=600, n_trials=100)

    finished = len(study.trials)
    print("Number of finished trials: {}".format(finished))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    best_params = trial.params
    for key in best_params:
        print("    {}: {}".format(key, best_params[key]))


In [None]:
# Final model with hard-coded hyperparameters (presumably the best trial of the study above).
model_optuned = catboost.CatBoostClassifier(
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    bagging_temperature=2.621899464840254,
    colsample_bylevel=0.030140661107958613,
    depth=10,
)
model_optuned.fit(X=X_upd, y=y)

In [None]:
# Probability of the last class (the last column of predict_proba) for train and test.
y_train_predicted, y_test_predicted = (
    model_optuned.predict_proba(features)[:, -1]
    for features in (X_upd, test_df_upd)
)

I got a score of 0.85024 on Kaggle: ![%D0%B8%D0%B7%D0%BE%D0%B1%D1%80%D0%B0%D0%B6%D0%B5%D0%BD%D0%B8%D0%B5.png](attachment:%D0%B8%D0%B7%D0%BE%D0%B1%D1%80%D0%B0%D0%B6%D0%B5%D0%BD%D0%B8%D0%B5.png)

In [None]:
# One row per test customer (index taken from data_test) with the predicted probability.
submission = pd.DataFrame({'probability': y_test_predicted}, index=data_test.index)
submission.to_csv('submission_catboost_optuned_3.csv')
submission

Unnamed: 0_level_0,probability
customer_id,Unnamed: 1_level_1
6815,0.324550
27914,0.653456
31385,0.017056
38084,0.003934
78659,0.599998
...,...
99918216,0.357100
99938948,0.003945
99953525,0.109387
99985917,0.602454


In [None]:
# Colab-only helper: hand the submission CSV written earlier to the browser for download.
from google.colab import files

files.download('submission_catboost_optuned_3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>