### Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error
import gc
import os

### Read data

In [2]:
sample_submission = pd.read_csv('/kaggle/input/turkiye-is-bankas-machine-learning-challenge-3/sample_submission.csv')
monthly_expenditures = pd.read_csv('/kaggle/input/turkiye-is-bankas-machine-learning-challenge-3/monthly_expenditures.csv')
raw_train = pd.read_csv('/kaggle/input/turkiye-is-bankas-machine-learning-challenge-3/train.csv')
raw_test = pd.read_csv('/kaggle/input/turkiye-is-bankas-machine-learning-challenge-3/test.csv')

### Data cleaning

In [3]:
raw_train.pop('tarih')
raw_test.pop('tarih')

raw_train.set_index('musteri', inplace=True)
raw_test.set_index('musteri', inplace=True)

In [4]:
raw_train.fillna('na', inplace=True)
raw_test.fillna('na', inplace=True)

In [5]:
raw_train.head()

Unnamed: 0_level_0,yas,kidem_suresi,egitim,is_durumu,meslek_grubu,target
musteri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23b2476c8a,44.0,46.0,5f8ca0f83b,915be3a7a4,44af6626d6,0
5a7e3a7122,39.0,194.0,01a4f2c96c,915be3a7a4,44af6626d6,0
2ec95c7499,38.0,182.0,7e6640bfe0,d36d84f51a,44af6626d6,0
54399dac33,34.0,101.0,5f8ca0f83b,991c4998fb,44af6626d6,0
97a74b2e58,41.0,125.0,7bb291e291,242927d0f5,91ed9c1e18,0


### Feature engineering

In [6]:
train = raw_train.copy()
test = raw_test.copy()

#### Convert categorical features to integers

In [7]:
cat_features = ['egitim', 'is_durumu', 'meslek_grubu']

In [8]:
enc = OrdinalEncoder(dtype=np.int)

enc.fit(train[cat_features])

enc.categories_

[array(['01a4f2c96c', '2eb5ddd72c', '5f8ca0f83b', '7bb291e291',
        '7e6640bfe0', 'na'], dtype=object),
 array(['242927d0f5', '289777e76d', '3773727d6e', '51be29729b',
        '83a26fc2fd', '915be3a7a4', '991c4998fb', 'a996720382',
        'b026b8ee68', 'ba7b390fc4', 'd36d84f51a', 'eb35a5eb6b',
        'f1fcd26d00', 'na'], dtype=object),
 array(['070e3be3ae', '314826bda8', '419188ba43', '44af6626d6',
        '4e1d23cb0a', '51017102ac', '51155d7cc3', '527516f307',
        '5aaf0d2c89', '62260c15fb', '6da8606d6e', '7330d6bdb3',
        '8a8492b947', '8bbaa7ab87', '9034d4c83c', '91ed9c1e18',
        'b18c21407b', 'ba91c4a92b', 'bfd9cb6270', 'eea19ef68d', 'na'],
       dtype=object)]

In [9]:
train[cat_features] = enc.transform(train[cat_features])
test[cat_features] = enc.transform(test[cat_features])

#### Create features from monthly expenditures

In [10]:
#monthly_expenditures['avg_aylik_tutar'] = monthly_expenditures['aylik_toplam_tutar'] / monthly_expenditures['islem_adedi']

In [11]:
agg_funcs = [
    np.mean,
    np.median,
    min,
    max,
    np.std
]

In [12]:
values_to_use = [
    'islem_adedi',
    'aylik_toplam_tutar',
    #'avg_aylik_tutar'
]

In [13]:
agg_expenditures = monthly_expenditures.pivot_table(index='musteri',
                                                    columns=['sektor'],
                                                    values=values_to_use,
                                                    aggfunc= agg_funcs,
                                                    fill_value=0)
agg_expenditures.columns = ['_'.join(col).strip() for col in agg_expenditures.columns.values]
agg_expenditures.fillna(0, inplace=True)

In [14]:
all_train = train.join(agg_expenditures)
all_test = test.join(agg_expenditures)

In [15]:
columns = all_test.columns

In [16]:
all_train.shape

(60000, 136)

In [17]:
all_test.shape

(40000, 135)

In [18]:
target = all_train['target'].values

In [19]:
train_oof = np.zeros((len(all_train),))
test_preds = 0
train_oof.shape

(60000,)

In [20]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'min_data_in_leaf': 3000,
    'feature_fraction': 0.2,
    'max_depth':8,
    'num_leaves':70,
}

In [21]:
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(all_train, target))):
        #print(f'Fold {f}')
        train_df, val_df = all_train.iloc[train_ind][columns], all_train.iloc[val_ind][columns]
        train_target, val_target = target[train_ind], target[val_ind]
        
        model = LGBMRegressor(**params)
        model.fit(train_df, train_target, eval_set=[(val_df,val_target)],early_stopping_rounds=1600,verbose=False)
        temp_oof = model.predict(val_df)
        temp_test = model.predict(all_test[columns])

        train_oof[val_ind] = temp_oof
        test_preds += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))

|          | 0/? [00:00<?, ?it/s]

0.7671336337829031
0.771915324440274
0.764652435037717
0.7686567164179106
0.7374414618696654
0.7811952717535786
0.7966726555240593
0.796752737726388
0.7891409761822922
0.7935746343794263


In [22]:
X_train, X_val, y_train, y_val = train_test_split(all_train, target, test_size=0.33, random_state=42)

In [23]:
print('Train auc: ', roc_auc_score(y_train, model.predict(X_train[columns])))
print('Val auc: ', roc_auc_score(y_val, model.predict(X_val[columns])))

Train auc:  0.7975856194025526
Val auc:  0.8084065620125619


In [24]:
np.save('test_preds', test_preds)

In [25]:
submission = sample_submission.copy()

In [26]:
submission['target'] = test_preds

In [27]:
submission.head()

Unnamed: 0,musteri,target
0,ee523cbb79,0.011211
1,d0b45299ba,0.039562
2,45564f1c9f,0.029808
3,0e91950c32,0.033717
4,8d02e2c86d,0.018836


In [28]:
submission.to_csv('submission.csv', index=False) 