In [59]:
#import library
import pandas as pd
import numpy as np
import scipy as stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb

In [3]:
# load data
df_transaction=pd.read_csv("../data/train_transaction.csv.zip", engine="python")
df_identity=pd.read_csv("../data/train_identity.csv.zip", engine="python")

In [4]:
print(df_transaction.shape, df_identity.shape)

(590540, 394) (144233, 41)


In [5]:
df = df_transaction.merge(df_identity, on="TransactionID", how="left")

In [6]:
df.shape

(590540, 434)

## OPTIMISATION MEMOIRE

In [7]:
# delete les dataframes plus utilisés
del df_identity, df_transaction

In [8]:
# suppression des colonnes ne contenant que des NA
df=df.dropna(axis=1, how='all')

In [9]:
# downcast des int et des float
int_columns = df.select_dtypes(include=['int']).columns.tolist()
float_columns = df.select_dtypes(include=['float']).columns.tolist()
df[int_columns] = df[int_columns].apply(pd.to_numeric, downcast='integer')
df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast='float')

In [10]:
# downcoast des object en category
object_columns=df.select_dtypes(include=['object']).columns.tolist()
df[object_columns]=df[object_columns].apply(lambda x: x.astype('category'))

## ETUDE BIVARIEE RAPIDE

In [11]:
# lib.py

def missing_values_analysis(df, lst_vars=None):
    if lst_vars is not None:
        missing=df[lst_vars].isnull().sum()
        percent_missing=df[lst_vars].isnull().sum()*100 / len(df)
        missing_value_df = pd.DataFrame({'column_name': df[lst_vars].columns,
                                 'missing': missing,
                                 'percent_missing': percent_missing})
    else:
        missing=df.isnull().sum()
        percent_missing=df.isnull().sum()*100 / len(df)
        missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'missing': missing,
                                 'percent_missing': percent_missing})
    return missing_value_df

def corr_with_Y(df, col):
    vars_quanti=df.select_dtypes(exclude=['object']).columns
    return df[vars_quanti].corrwith(df[col]).reset_index().rename({'index':"Column", 0:'Value'}, axis=1).sort_values("Value", ascending=False)

In [13]:
missing_values_analysis(df)

Unnamed: 0,column_name,missing,percent_missing
TransactionID,TransactionID,0,0.000000
isFraud,isFraud,0,0.000000
TransactionDT,TransactionDT,0,0.000000
TransactionAmt,TransactionAmt,0,0.000000
ProductCD,ProductCD,0,0.000000
card1,card1,0,0.000000
card2,card2,8933,1.512683
card3,card3,1565,0.265012
card4,card4,1577,0.267044
card5,card5,4259,0.721204


In [14]:
corr_with_Y(df, "isFraud")

Unnamed: 0,Column,Value
1,isFraud,1.000000
297,V257,0.383060
286,V246,0.366878
284,V244,0.364129
282,V242,0.360590
241,V201,0.328005
240,V200,0.318783
229,V189,0.308219
228,V188,0.303582
298,V258,0.297151


## PREPROCESSING

In [35]:
# import specific preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

In [16]:
# drop useless column 
df=df.drop("TransactionID", axis=1)

In [23]:
# diff numeric & category columns
df_quali=df.select_dtypes(include=['category'])
df_quanti=df.select_dtypes(exclude=['category'])

In [36]:
# impute quali column - lightgbm handle missing value for numeric columns
#imp = IterativeImputer(max_iter=10, random_state=42)
imp=SimpleImputer(strategy='constant', fill_value='Missing')
df_quali_imputed=imp.fit_transform(df_quali)

In [None]:
# label encode categoric colimns
encoder=OneHotEncoder()
df_quali_encoded=encoder.fit_transform(df_quali_imputed)

In [56]:
# label encode categorical columns
encoder=LabelEncoder()
df_quali_encoded=pd.DataFrame(df_quali_imputed, columns=df_quali.columns).apply(encoder.fit_transform)

In [72]:
df_model=pd.concat([df_quali_encoded, df_quanti], axis=1)

In [102]:
df_model.shape

(590540, 433)

In [101]:
train_set, eval_set = train_test_split(df_model, test_size=0.2, random_state=42)

In [104]:
labels_train=train_set["isFraud"]
train=train_set.drop("isFraud", axis=1)

In [105]:
labels_eval=eval_set["isFraud"]
eval=eval_set.drop("isFraud", axis=1)

In [106]:
indexes_of_categories=[train.columns.get_loc(col) for col in df_quali_encoded.columns]

## MODELISATION

In [94]:
params = {
    'metric': 'auc',
    'device_type': 'cpu',
    'objective': 'binary',
    'is_unbalance': True,
    'learning_rate': 0.1,
    'num_leaves': 255,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
    'verbose': 3
    }

In [107]:
param_distribs = {
        'max_depth': np.arange(7, 10, 1).tolist()
    }

In [108]:
lgb_classifier = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', learning_rate=0.01, metric='auc', is_unbalance=True, categorical_feature=indexes_of_categories)
grid_search = RandomizedSearchCV(lgb_classifier, param_distribs, cv=5, n_iter = 10, verbose=3, n_jobs=-1)
grid_search.fit(train, labels_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  5.7min remaining: 15.6min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  5.9min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  7.6min finished
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            categorical_feature=[0, 1, 2, 3, 4,
                                                                 5, 6, 7, 8, 9,
                                                                 10, 11, 12, 13,
                                                                 14, 15, 16, 17,
                                                                 18, 19, 20, 21,
                                                                 22, 23, 24, 25,
                                                                 26, 27, 28, 29, ...],
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            is_unbalance=True,
                                            

In [109]:
print(grid_search.best_score_, grid_search.best_params_)

0.959674619839469 {'max_depth': 7}


In [110]:
eval_true, eval_pred = labels_eval, grid_search.predict(eval)

In [111]:
classification_report(eval_true, eval_pred)

'              precision    recall  f1-score   support\n\n           0       0.98      0.98      0.98    113866\n           1       0.45      0.53      0.49      4242\n\n    accuracy                           0.96    118108\n   macro avg       0.72      0.76      0.73    118108\nweighted avg       0.96      0.96      0.96    118108\n'

In [112]:
confusion_matrix(eval_true, eval_pred)

array([[111092,   2774],
       [  1975,   2267]])

In [130]:
grid_search.predict_proba(eval)

array([[0.76181649, 0.23818351],
       [0.85166437, 0.14833563],
       [0.93552612, 0.06447388],
       ...,
       [0.64254567, 0.35745433],
       [0.86079487, 0.13920513],
       [0.86115646, 0.13884354]])

In [135]:
# proba of fraud
pd.DataFrame(grid_search.predict_proba(eval))[1]

0         0.238184
1         0.148336
2         0.064474
3         0.459209
4         0.071390
5         0.343053
6         0.094122
7         0.364635
8         0.492562
9         0.344519
10        0.167948
11        0.071861
12        0.245590
13        0.117111
14        0.396127
15        0.275076
16        0.108043
17        0.392469
18        0.137760
19        0.074752
20        0.132523
21        0.129080
22        0.166727
23        0.203192
24        0.202553
25        0.099747
26        0.140150
27        0.326069
28        0.237415
29        0.070620
            ...   
118078    0.395208
118079    0.224262
118080    0.110923
118081    0.182751
118082    0.060195
118083    0.101059
118084    0.248723
118085    0.169330
118086    0.098593
118087    0.113322
118088    0.143750
118089    0.138110
118090    0.403234
118091    0.138607
118092    0.093104
118093    0.196095
118094    0.461894
118095    0.195753
118096    0.133804
118097    0.133699
118098    0.327816
118099    0.

In [129]:
grid_search.predict_proba(eval).shape

(118108, 2)

In [114]:
eval_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

## SUBMISSION KAGGLE

In [115]:
ls ../data/

sample_submission.csv.zip  train_identity.csv.zip
test_identity.csv.zip      train_transaction.csv.zip
test_transaction.csv.zip


In [116]:
test_transaction=pd.read_csv("../data/test_transaction.csv.zip", engine="python")
test_identity=pd.read_csv("../data/test_identity.csv.zip", engine="python")

In [117]:
print(test_transaction.shape, test_identity.shape)

(506691, 393) (141907, 41)


In [118]:
test = test_transaction.merge(test_identity, on="TransactionID", how="left")

In [119]:
# delete les dataframes plus utilisés
del test_transaction, test_identity

In [121]:
# downcoast des object en category
object_test_columns=test.select_dtypes(include=['object']).columns.tolist()
test[object_test_columns]=test[object_test_columns].apply(lambda x: x.astype('category'))

In [122]:
# keep id for submission
id=test["TransactionID"]
# drop useless column 
test=test.drop("TransactionID", axis=1)

In [123]:
# diff numeric & category columns
test_quali=test.select_dtypes(include=['category'])
test_quanti=test.select_dtypes(exclude=['category'])

In [136]:
# impute quali column - lightgbm handle missing value for numeric columns
test_quali_imputed=imp.fit_transform(test_quali)

# label encode categorical columns
test_quali_encoded=pd.DataFrame(test_quali_imputed, columns=test_quali.columns).apply(encoder.fit_transform)

# concat
test_model=pd.concat([test_quali_encoded, test_quanti], axis=1)

In [137]:
test_model.shape

(506691, 432)

In [139]:
# predict proba with test sample
fraud_proba=pd.DataFrame(grid_search.predict_proba(test_model))[1]
fraud_proba

0         0.112445
1         0.170335
2         0.168545
3         0.104544
4         0.148942
5         0.089870
6         0.401291
7         0.351367
8         0.093306
9         0.154254
10        0.152545
11        0.084668
12        0.269443
13        0.111693
14        0.116770
15        0.092383
16        0.143665
17        0.160055
18        0.331829
19        0.233996
20        0.244049
21        0.143638
22        0.242663
23        0.167391
24        0.357074
25        0.204346
26        0.209566
27        0.161599
28        0.181776
29        0.166237
            ...   
506661    0.127133
506662    0.213350
506663    0.142613
506664    0.291928
506665    0.133005
506666    0.194973
506667    0.271465
506668    0.159922
506669    0.168024
506670    0.131432
506671    0.121141
506672    0.126459
506673    0.124947
506674    0.175896
506675    0.185760
506676    0.223035
506677    0.295042
506678    0.061084
506679    0.471372
506680    0.173759
506681    0.175896
506682    0.

In [142]:
submission_file=pd.concat([id, fraud_proba], axis=1).rename({1:'isFraud'}, axis=1)
submission_file

Unnamed: 0,TransactionID,isFraud
0,3663549,0.112445
1,3663550,0.170335
2,3663551,0.168545
3,3663552,0.104544
4,3663553,0.148942
5,3663554,0.089870
6,3663555,0.401291
7,3663556,0.351367
8,3663557,0.093306
9,3663558,0.154254


In [144]:
submission_file.to_csv('../submission/submission_1.csv', index=False)