In [172]:
#import library
import pandas as pd
import numpy as np
from scipy import stats
import gc
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgb

In [3]:
# load data
df_transaction=pd.read_csv("../data/train_transaction.csv.zip", engine="python")
df_identity=pd.read_csv("../data/train_identity.csv.zip", engine="python")

In [4]:
print(df_transaction.shape, df_identity.shape)

(590540, 394) (144233, 41)


In [5]:
df = df_transaction.merge(df_identity, on="TransactionID", how="left")

In [6]:
df.shape

(590540, 434)

## OPTIMISATION MEMOIRE

In [7]:
# delete les dataframes plus utilisés
del df_identity, df_transaction
gc.collect()

In [8]:
# suppression des colonnes ne contenant que des NA
df=df.dropna(axis=1, how='all')

In [9]:
# downcast des int et des float
int_columns = df.select_dtypes(include=['int']).columns.tolist()
float_columns = df.select_dtypes(include=['float']).columns.tolist()
df[int_columns] = df[int_columns].apply(pd.to_numeric, downcast='integer')
df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast='float')

In [10]:
# downcoast des object en category
object_columns=df.select_dtypes(include=['object']).columns.tolist()
df[object_columns]=df[object_columns].apply(lambda x: x.astype('category'))

## ETUDE BIVARIEE RAPIDE

In [11]:
# lib.py

def missing_values_analysis(df, lst_vars=None):
    if lst_vars is not None:
        missing=df[lst_vars].isnull().sum()
        percent_missing=df[lst_vars].isnull().sum()*100 / len(df)
        missing_value_df = pd.DataFrame({'column_name': df[lst_vars].columns,
                                 'missing': missing,
                                 'percent_missing': percent_missing})
    else:
        missing=df.isnull().sum()
        percent_missing=df.isnull().sum()*100 / len(df)
        missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'missing': missing,
                                 'percent_missing': percent_missing})
    return missing_value_df

def corr_with_Y(df, col):
    vars_quanti=df.select_dtypes(exclude=['object']).columns
    return df[vars_quanti].corrwith(df[col]).reset_index().rename({'index':"Column", 0:'Value'}, axis=1).sort_values("Value", ascending=False)

In [13]:
missing_values_analysis(df)

Unnamed: 0,column_name,missing,percent_missing
TransactionID,TransactionID,0,0.000000
isFraud,isFraud,0,0.000000
TransactionDT,TransactionDT,0,0.000000
TransactionAmt,TransactionAmt,0,0.000000
ProductCD,ProductCD,0,0.000000
card1,card1,0,0.000000
card2,card2,8933,1.512683
card3,card3,1565,0.265012
card4,card4,1577,0.267044
card5,card5,4259,0.721204


In [14]:
corr_with_Y(df, "isFraud")

Unnamed: 0,Column,Value
1,isFraud,1.000000
297,V257,0.383060
286,V246,0.366878
284,V244,0.364129
282,V242,0.360590
241,V201,0.328005
240,V200,0.318783
229,V189,0.308219
228,V188,0.303582
298,V258,0.297151


## PREPROCESSING

In [163]:
# import specific preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

In [16]:
# drop useless column 
df=df.drop("TransactionID", axis=1)

In [226]:
# diff numeric & category columns
df_quali=df.select_dtypes(include=['category'])
df_quanti=df.select_dtypes(exclude=['category'])

### QUANTITATIVE FEATURE ENGINEERING

In [227]:
labels=df_quanti["isFraud"]
df_quanti=df_quanti.drop("isFraud", axis=1)

In [202]:
def transfo_num(df_quanti, vars_quanti):
    for var in vars_quanti:
        if df_quanti[var].min() <= 0:
            df_quanti[var]=np.log10(df_quanti[var] + 1)
        else:
            df_quanti[var]=stats.boxcox(df_quanti[var])[0]
    return df_quanti

In [228]:
df_quanti.describe()

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
count,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,238269.0,37627.0,...,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
mean,7372311.0,135.027161,9898.734658,362.555511,153.194946,199.2789,290.733826,86.800652,118.502197,231.855423,...,189.45137,14.237337,353.128174,403.882568,368.269806,16.002708,12.800927,329.608917,149.070312,26.508596
std,4617224.0,239.162521,4901.170153,157.793243,11.336444,41.244453,101.741074,2.690624,371.872009,529.053467,...,30.37536,1.561301,141.095352,152.160324,198.847031,6.897665,2.372447,97.46109,32.101994,3.737502
min,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,0.0,0.0,...,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
25%,3027058.0,43.320999,6019.0,214.0,150.0,166.0,204.0,87.0,3.0,7.0,...,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
50%,7306528.0,68.769001,9678.0,361.0,150.0,226.0,299.0,87.0,8.0,37.0,...,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
75%,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,24.0,206.0,...,225.0,15.0,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0
max,15811130.0,31937.390625,18396.0,600.0,231.0,237.0,540.0,102.0,10286.0,11623.0,...,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [229]:
# columns to drop / valeurs manquantes > 95%
df_quanti_missing=missing_values_analysis(df_quanti)
vars_quanti_to_drop=df_quanti_missing[df_quanti_missing["percent_missing"] > 95]["column_name"].reset_index(drop=True)
vars_quanti_to_drop
df_quanti=df_quanti.drop(vars_quanti_to_drop, axis=1)

In [265]:
vars_quanti_to_drop

0    id_07
1    id_08
2    id_21
3    id_22
4    id_24
5    id_25
6    id_26
Name: column_name, dtype: object

In [230]:
# list of quantitative columns
vars_quanti=df_quanti.columns

In [231]:
# list of vars to not skew
vars_not_skew=['card2', 'addr1', 'D4','D15','V150','V291','V292','id_02','id_03','id_04','id_05','id_06','id_09',
               'id_10','id_19','id_20']

In [232]:
vars_quanti_skew=vars_quanti.drop(vars_not_skew)
vars_quanti_skew

Index(['TransactionDT', 'TransactionAmt', 'card1', 'card3', 'card5', 'addr2',
       'dist1', 'dist2', 'C1', 'C2',
       ...
       'V337', 'V338', 'V339', 'id_01', 'id_11', 'id_13', 'id_14', 'id_17',
       'id_18', 'id_32'],
      dtype='object', length=378)

In [233]:
transfo_num(df_quanti, vars_quanti_skew)

  x = um.multiply(x, x, out=x)
  tmp2 = (x - v) * (fx - fw)
  if any(x <= 0):
  after removing the cwd from sys.path.


Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_09,id_10,id_11,id_13,id_14,id_17,id_18,id_19,id_20,id_32
0,1606.559946,3.384828,2414.180647,,3.222161e+17,2.025283e+17,315.0,3.190635e+15,1.301030,,...,,,,,,,,,,
1,1606.571214,2.817077,668.158629,404.0,3.222161e+17,1.227835e+16,325.0,3.190635e+15,,,...,,,,,,,,,,
2,1607.337306,3.290002,1014.270830,490.0,3.222161e+17,7.604379e+17,330.0,3.190635e+15,2.459393,,...,,,,,,,,,,
3,1607.675212,3.183050,2975.672280,567.0,3.222161e+17,3.926070e+16,476.0,3.190635e+15,,,...,,,,,,,,,,
4,1607.754050,3.183050,985.532459,514.0,3.222161e+17,1.227835e+16,420.0,3.190635e+15,,,...,,,1.038193e+16,,,7.604379e+17,,542.0,144.0,1.518514
5,1607.799100,3.169863,1228.357143,555.0,3.222161e+17,1.038324e+19,272.0,3.190635e+15,1.568202,,...,,,,,,,,,,
6,1607.934242,3.891635,2189.108174,360.0,3.222161e+17,7.604379e+17,126.0,3.190635e+15,0.000000,,...,,,,,,,,,,
7,1608.013072,4.424211,2243.474809,490.0,3.222161e+17,1.038324e+19,325.0,3.190635e+15,,,...,,,,,,,,,,
8,1608.080639,2.344005,677.378678,100.0,3.222161e+17,1.038324e+19,337.0,3.190635e+15,,,...,,,1.038193e+16,2.463641e+13,,7.604379e+17,,621.0,500.0,1.518514
9,1608.091899,3.712405,2879.957973,111.0,3.222161e+17,9.630013e+18,204.0,3.190635e+15,1.301030,,...,,,,,,,,,,


In [234]:
df_quanti.describe()

Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_09,id_10,id_11,id_13,id_14,id_17,id_18,id_19,id_20,id_32
count,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,238269.0,37627.0,...,74926.0,74926.0,140978.0,127320.0,931.0,139369.0,45113.0,139318.0,139261.0,77586.0
mean,22320.3576,3.439972,1798.951696,362.555511,4.85387e+17,7.060091e+18,290.733826,3174744000000000.0,1.115867,1.579057,...,0.091023,-0.301124,1.019671e+16,54755350000000.0,1.726833,4.54114e+18,1416302000.0,353.128174,403.882568,1.435619
std,9774.237256,0.585705,744.313247,157.793243,5.499318e+17,4.496921e+18,101.741074,298901500000000.0,0.798231,0.935073,...,0.983842,2.789446,801944400000000.0,70602440000000.0,0.958216,4.548627e+18,8157667000.0,141.095352,152.160324,0.057776
min,1606.559946,-1.492206,298.754599,100.0,1.038193e+16,1.038193e+16,100.0,35006010.0,0.0,0.0,...,-36.0,-100.0,4252208000000000.0,35006010.0,0.0,1.038193e+16,35006010.0,100.0,100.0,0.0
25%,13844.27321,3.08882,1241.787673,214.0,3.222161e+17,7.604379e+17,204.0,3190635000000000.0,0.60206,0.90309,...,0.0,0.0,1.038193e+16,24636410000000.0,1.78533,7.604379e+17,323211000.0,266.0,256.0,1.39794
50%,23602.390874,3.387296,1809.397048,361.0,3.222161e+17,1.038324e+19,299.0,3190635000000000.0,0.954243,1.579784,...,0.0,0.0,1.038193e+16,40758670000000.0,1.78533,7.604379e+17,1086455000.0,341.0,472.0,1.39794
75%,30644.53823,3.751562,2449.552272,512.0,3.222161e+17,1.038324e+19,330.0,3190635000000000.0,1.39794,2.31597,...,0.0,0.0,1.038193e+16,40758670000000.0,2.624282,1.000037e+19,1086455000.0,427.0,533.0,1.518514
max,37662.970632,6.203885,3009.946656,600.0,1.249822e+19,1.553091e+19,540.0,1.227835e+16,4.012289,4.065356,...,25.0,0.0,1.038193e+16,236705700000000.0,2.857935,1.161056e+19,289495100000.0,671.0,661.0,1.518514


In [235]:
df_quanti.dtypes

TransactionDT     float64
TransactionAmt    float32
card1             float64
card2             float32
card3             float32
card5             float32
addr1             float32
addr2             float32
dist1             float32
dist2             float32
C1                float32
C2                float32
C3                float32
C4                float32
C5                float32
C6                float32
C7                float32
C8                float32
C9                float32
C10               float32
C11               float32
C12               float32
C13               float32
C14               float32
D1                float32
D2                float32
D3                float32
D4                float32
D5                float32
D6                float32
                   ...   
V326              float32
V327              float32
V328              float32
V329              float32
V330              float32
V331              float32
V332              float32
V333        

In [236]:
cols_infinite_variation=[]
for var in df_quanti.columns:
    if np.isfinite(df_quanti[var].std()) == False:
        cols_infinite_variation.append(var)
        
cols_infinite_variation

In [245]:
# centrage réduction
scaler=StandardScaler()
df_quanti_scaled=scaler.fit_transform(df_quanti)

In [247]:
df_quanti_scaled=pd.DataFrame(df_quanti_scaled, columns=df_quanti.columns)

In [248]:
df_quanti_scaled.shape

(590540, 394)

### QUALITATIVE FEATURE ENGINEERING

In [36]:
# impute quali column - lightgbm handle missing value for numeric columns
#imp = IterativeImputer(max_iter=10, random_state=42)
imp=SimpleImputer(strategy='constant', fill_value='Missing')
df_quali_imputed=imp.fit_transform(df_quali)

In [None]:
# label encode categoric colimns
encoder=OneHotEncoder()
df_quali_encoded=encoder.fit_transform(df_quali_imputed)

In [56]:
# label encode categorical columns
encoder=LabelEncoder()
df_quali_encoded=pd.DataFrame(df_quali_imputed, columns=df_quali.columns).apply(encoder.fit_transform)

In [252]:
df_model=pd.concat([df_quali_encoded, df_quanti_scaled, labels], axis=1)

In [253]:
df_model.shape

(590540, 426)

In [254]:
train_set, eval_set = train_test_split(df_model, test_size=0.2, random_state=42)

In [255]:
labels_train=train_set["isFraud"]
train=train_set.drop("isFraud", axis=1)

In [256]:
labels_eval=eval_set["isFraud"]
eval=eval_set.drop("isFraud", axis=1)

In [257]:
indexes_of_categories=[train.columns.get_loc(col) for col in df_quali_encoded.columns]

## MODELISATION

In [94]:
params = {
    'metric': 'auc',
    'device_type': 'cpu',
    'objective': 'binary',
    'is_unbalance': True,
    'learning_rate': 0.1,
    'num_leaves': 255,  
    'min_child_samples': 100,  
    'max_bin': 100,  
    'subsample': 0.7,  
    'subsample_freq': 1,  
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'subsample_for_bin': 200000,  
    'min_split_gain': 0,  
    'reg_alpha': 0,  
    'reg_lambda': 0,  
    'verbose': 3
    }

In [107]:
param_distribs = {
        'max_depth': np.arange(7, 10, 1).tolist()
    }

In [258]:
lgb_classifier = lgb.LGBMClassifier(boosting_type='gbdt',  objective='binary', learning_rate=0.01, metric='auc', is_unbalance=True, categorical_feature=indexes_of_categories)
grid_search = RandomizedSearchCV(lgb_classifier, param_distribs, cv=5, n_iter = 10, verbose=3, n_jobs=-1)
grid_search.fit(train, labels_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  7.5min remaining: 20.6min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  7.9min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  9.8min finished
Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            categorical_feature=[0, 1, 2, 3, 4,
                                                                 5, 6, 7, 8, 9,
                                                                 10, 11, 12, 13,
                                                                 14, 15, 16, 17,
                                                                 18, 19, 20, 21,
                                                                 22, 23, 24, 25,
                                                                 26, 27, 28, 29, ...],
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            is_unbalance=True,
                                            

In [259]:
print(grid_search.best_score_, grid_search.best_params_)

0.959892640633996 {'max_depth': 7}


In [260]:
eval_true, eval_pred = labels_eval, grid_search.predict(eval)

In [261]:
classification_report(eval_true, eval_pred)

'              precision    recall  f1-score   support\n\n           0       0.98      0.98      0.98    113866\n           1       0.45      0.54      0.49      4242\n\n    accuracy                           0.96    118108\n   macro avg       0.72      0.76      0.73    118108\nweighted avg       0.96      0.96      0.96    118108\n'

In [262]:
confusion_matrix(eval_true, eval_pred)

array([[111070,   2796],
       [  1956,   2286]])

In [263]:
grid_search.predict_proba(eval)

array([[0.76257278, 0.23742722],
       [0.85262645, 0.14737355],
       [0.93805246, 0.06194754],
       ...,
       [0.63383674, 0.36616326],
       [0.857257  , 0.142743  ],
       [0.86376738, 0.13623262]])

In [135]:
# proba of fraud
pd.DataFrame(grid_search.predict_proba(eval))[1]

0         0.238184
1         0.148336
2         0.064474
3         0.459209
4         0.071390
5         0.343053
6         0.094122
7         0.364635
8         0.492562
9         0.344519
10        0.167948
11        0.071861
12        0.245590
13        0.117111
14        0.396127
15        0.275076
16        0.108043
17        0.392469
18        0.137760
19        0.074752
20        0.132523
21        0.129080
22        0.166727
23        0.203192
24        0.202553
25        0.099747
26        0.140150
27        0.326069
28        0.237415
29        0.070620
            ...   
118078    0.395208
118079    0.224262
118080    0.110923
118081    0.182751
118082    0.060195
118083    0.101059
118084    0.248723
118085    0.169330
118086    0.098593
118087    0.113322
118088    0.143750
118089    0.138110
118090    0.403234
118091    0.138607
118092    0.093104
118093    0.196095
118094    0.461894
118095    0.195753
118096    0.133804
118097    0.133699
118098    0.327816
118099    0.

In [129]:
grid_search.predict_proba(eval).shape

(118108, 2)

In [114]:
eval_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [264]:
#features importances
feature_importances = grid_search.best_estimator_.feature_importances_
attribs=train.columns
sorted(zip(feature_importances, attribs), reverse=True)

[(306, 'P_emaildomain'),
 (129, 'TransactionAmt'),
 (129, 'DeviceInfo'),
 (123, 'D2'),
 (103, 'C13'),
 (99, 'C14'),
 (92, 'TransactionDT'),
 (90, 'C1'),
 (89, 'card6'),
 (77, 'id_31'),
 (77, 'card5'),
 (74, 'id_33'),
 (74, 'id_30'),
 (65, 'V258'),
 (65, 'R_emaildomain'),
 (61, 'card2'),
 (59, 'D15'),
 (58, 'C6'),
 (53, 'D4'),
 (44, 'C2'),
 (42, 'D3'),
 (42, 'C11'),
 (38, 'D11'),
 (37, 'V317'),
 (37, 'C5'),
 (35, 'M5'),
 (34, 'card1'),
 (34, 'C9'),
 (29, 'V310'),
 (29, 'V308'),
 (29, 'ProductCD'),
 (28, 'V294'),
 (26, 'V102'),
 (24, 'D10'),
 (23, 'card3'),
 (23, 'V307'),
 (23, 'M4'),
 (22, 'dist1'),
 (22, 'V70'),
 (22, 'M6'),
 (21, 'addr1'),
 (21, 'V285'),
 (20, 'V48'),
 (20, 'D8'),
 (18, 'D5'),
 (16, 'card4'),
 (16, 'V283'),
 (16, 'V128'),
 (16, 'D1'),
 (12, 'V281'),
 (11, 'V91'),
 (11, 'V53'),
 (11, 'V320'),
 (11, 'V315'),
 (11, 'V129'),
 (11, 'V127'),
 (10, 'V62'),
 (10, 'V312'),
 (9, 'V96'),
 (9, 'V54'),
 (9, 'V296'),
 (9, 'V160'),
 (8, 'V313'),
 (8, 'V309'),
 (8, 'V165'),
 (8, 'M8'

## SUBMISSION KAGGLE

In [115]:
ls ../data/

sample_submission.csv.zip  train_identity.csv.zip
test_identity.csv.zip      train_transaction.csv.zip
test_transaction.csv.zip


In [116]:
test_transaction=pd.read_csv("../data/test_transaction.csv.zip", engine="python")
test_identity=pd.read_csv("../data/test_identity.csv.zip", engine="python")

In [117]:
print(test_transaction.shape, test_identity.shape)

(506691, 393) (141907, 41)


In [118]:
test = test_transaction.merge(test_identity, on="TransactionID", how="left")

In [119]:
# delete les dataframes plus utilisés
del test_transaction, test_identity
gc.collect()

In [121]:
# downcoast des object en category
object_test_columns=test.select_dtypes(include=['object']).columns.tolist()
test[object_test_columns]=test[object_test_columns].apply(lambda x: x.astype('category'))

In [122]:
# keep id for submission
id=test["TransactionID"]
# drop useless column 
test=test.drop("TransactionID", axis=1)

In [123]:
# diff numeric & category columns
test_quali=test.select_dtypes(include=['category'])
test_quanti=test.select_dtypes(exclude=['category'])

In [269]:
# numeric feature engineering
# drop useless columns
test_quanti=test_quanti.drop(vars_quanti_to_drop, axis=1)

# log and box cox transfo
transfo_num(test_quanti, vars_quanti_skew)

# standard scaler
test_quanti_scaled=scaler.fit_transform(test_quanti)
test_quanti_scaled=pd.DataFrame(test_quanti_scaled, columns=test_quanti.columns)

  if any(x <= 0):
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)
  tmp1 = (x - w) * (fx - fv)
  tmp2 = (x - v) * (fx - fw)
  after removing the cwd from sys.path.


In [136]:
# impute quali column - lightgbm handle missing value for numeric columns
test_quali_imputed=imp.fit_transform(test_quali)

# label encode categorical columns
test_quali_encoded=pd.DataFrame(test_quali_imputed, columns=test_quali.columns).apply(encoder.fit_transform)

In [270]:
# concat
test_model=pd.concat([test_quali_encoded, test_quanti_scaled], axis=1)

In [271]:
test_model.shape

(506691, 425)

In [272]:
# predict proba with test sample
fraud_proba=pd.DataFrame(grid_search.predict_proba(test_model))[1]
fraud_proba

0         0.122183
1         0.174912
2         0.154981
3         0.120791
4         0.171584
5         0.096263
6         0.360772
7         0.243473
8         0.093188
9         0.138527
10        0.132626
11        0.089849
12        0.257203
13        0.115615
14        0.115023
15        0.101793
16        0.159050
17        0.130959
18        0.290419
19        0.239417
20        0.202784
21        0.142389
22        0.235350
23        0.161935
24        0.320891
25        0.181191
26        0.182706
27        0.170348
28        0.174272
29        0.167693
            ...   
506661    0.131930
506662    0.206248
506663    0.147880
506664    0.318026
506665    0.123142
506666    0.191237
506667    0.299561
506668    0.162577
506669    0.175968
506670    0.128555
506671    0.119867
506672    0.129653
506673    0.132329
506674    0.155783
506675    0.174274
506676    0.197951
506677    0.300085
506678    0.062305
506679    0.466352
506680    0.152439
506681    0.155783
506682    0.

In [273]:
submission_file=pd.concat([id, fraud_proba], axis=1).rename({1:'isFraud'}, axis=1)
submission_file

Unnamed: 0,TransactionID,isFraud
0,3663549,0.122183
1,3663550,0.174912
2,3663551,0.154981
3,3663552,0.120791
4,3663553,0.171584
5,3663554,0.096263
6,3663555,0.360772
7,3663556,0.243473
8,3663557,0.093188
9,3663558,0.138527


In [274]:
submission_file.to_csv('../submission/submission_2.csv', index=False)