In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import mannwhitneyu, kruskal, f_oneway
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, log_loss, confusion_matrix
import sklearn.metrics as metrics
from sklearn.utils import class_weight, shuffle



In [2]:
# Read the competition's training and test tables from the Kaggle input directory.
train_csv_path = "/kaggle/input/icr-identify-age-related-conditions/train.csv"
test_csv_path = "/kaggle/input/icr-identify-age-related-conditions/test.csv"
df_train, df_test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)


In [3]:
df_train.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [4]:
df_train.shape

(617, 58)

In [5]:
df_test.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df_test.shape

(5, 57)

### Fill NaNs with the column median

In [7]:
# Impute missing values column by column with that column's median.
# The target column "Class" is never imputed.
for column in df_train.columns:
    if column == "Class" or not df_train[column].isna().any():
        continue
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection df_train[column]: the in-place form works on an
    # intermediate object, which pandas deprecates and which does not update
    # the parent frame when Copy-on-Write is enabled.
    df_train[column] = df_train[column].fillna(df_train[column].median())

In [8]:
df_train.isna().sum().sum()

0

In [9]:
# Three hand-picked groups of feature columns that are removed from both the
# training and the test frame in the next cell.
# NOTE: the trailing spaces in 'FD ' and 'BD ' are part of the labels handed to
# DataFrame.drop, so they must be kept exactly as written.
cols_to_drop_1 = ['AH', 'CH', 'CL', 'CS', 'DV', 'EP', 'AR']
cols_to_drop_2 = ['EH', 'FD ']
cols_to_drop_3 = ['BD ', 'BZ']

# Single flat list used for the actual drop.
columns_to_drop_corr = cols_to_drop_1 + cols_to_drop_2 + cols_to_drop_3

In [10]:
# Remove the hand-picked columns from both frames so their feature sets stay in sync.
df_train = df_train.drop(columns_to_drop_corr, axis="columns")
df_test = df_test.drop(columns_to_drop_corr, axis="columns")

In [11]:
# Univariate screening on the numeric features only: the categorical 'EJ'
# column and the 'Id' row identifier are left out.
data_to_analyze = df_train.drop(['EJ', 'Id'], axis='columns')
class_labels = data_to_analyze.Class
data_class_0 = data_to_analyze[class_labels == 0]
data_class_1 = data_to_analyze[class_labels == 1]

features_to_drop_Utest, features_to_drop_Htest = [], []
columns = [name for name in data_to_analyze.columns.to_list() if name != 'Class']

for column in columns:
    sample_0, sample_1 = data_class_0[column], data_class_1[column]

    # Mann-Whitney U test: the feature is flagged when p > 0.05.
    U_rank, p_val1 = mannwhitneyu(sample_0, sample_1)
    if p_val1 > 0.05:
        features_to_drop_Utest.append(column)
        print(column, 'U rank:', U_rank, 'P value:', p_val1)

    # Kruskal-Wallis H test with the same 0.05 cut-off.
    H_rank, p_val2 = kruskal(sample_0, sample_1)
    if p_val2 > 0.05:
        features_to_drop_Htest.append(column)

# Check whether both tests flag exactly the same features (same order too).
features_to_drop_Htest == features_to_drop_Utest

AX U rank: 24896.5 P value: 0.12387252473131435
AY U rank: 29748.0 P value: 0.1042758173936365
AZ U rank: 26577.5 P value: 0.5891666444372265
BR U rank: 27889.5 P value: 0.8106634220534192
CB U rank: 28728.5 P value: 0.46026169940417083
CF U rank: 27832.0 P value: 0.8373050274357787
CW  U rank: 30251.5 P value: 0.09534622073233975
DN U rank: 29316.0 P value: 0.27689441334062126
DY U rank: 24242.5 P value: 0.05392796919927672
EG U rank: 27423.0 P value: 0.9703690109171661
EL U rank: 24846.0 P value: 0.10470843603402266
EU U rank: 25132.0 P value: 0.15867656516656026
FC U rank: 27383.0 P value: 0.9514240746529378
FS U rank: 26780.0 P value: 0.6708511431442183
GB U rank: 26190.0 P value: 0.44132864824650264
GE U rank: 30430.0 P value: 0.05208627521686564
GH U rank: 27298.0 P value: 0.911270308447415
GI U rank: 24678.5 P value: 0.09526084456731561


True

In [12]:
# One-way ANOVA on log1p-transformed values, class 0 versus class 1.
features_to_drop_ANOVA = []
columns = [name for name in data_to_analyze.columns.to_list() if name != 'Class']

for column in columns:
    log_class_0 = np.log1p(data_class_0[column])
    log_class_1 = np.log1p(data_class_1[column])
    F, p_val = f_oneway(log_class_0, log_class_1)
    # A feature is flagged when the test does not reach the 0.05 level.
    if p_val > 0.05:
        features_to_drop_ANOVA.append(column)
        print(column, 'F:', F, 'P value:', p_val)

# Check whether the U test and ANOVA flag exactly the same features.
features_to_drop_Utest == features_to_drop_ANOVA

AX F: 3.288271989539755 P value: 0.07026373631825462
AY F: 2.3607184918846453 P value: 0.12493916238381524
AZ F: 0.0007053205100172908 P value: 0.9788209906464335
BR F: 0.14146444750050208 P value: 0.7069591819194481
CB F: 0.40417486242386846 P value: 0.5251775161146854
CF F: 1.4361846527152875 P value: 0.2312193467137219
CW  F: 1.7219087512740827 P value: 0.1899370757777987
DN F: 0.9616555298034857 P value: 0.32715572514619407
EG F: 0.00031332948627996063 P value: 0.9858830317986427
EL F: 2.1465337372000763 P value: 0.1434031945216177
EU F: 1.8232246979392093 P value: 0.17742659822745485
FC F: 0.0017783489467820194 P value: 0.966376486358466
FS F: 0.7634489135682092 P value: 0.3825923939427739
GB F: 1.5632519875986108 P value: 0.21166521810832983
GH F: 0.05519165487111284 P value: 0.8143420059794999
GI F: 3.7100316863205536 P value: 0.054546953012873454


False

In [13]:
# Drop only the features flagged by BOTH the U test and ANOVA.
# The list is built in the order of the U-test results so that it is the same
# on every run; list(set(...)) has no guaranteed order for strings.
flagged_by_anova = set(features_to_drop_ANOVA)
features_to_drop_final = [
    feature for feature in features_to_drop_Utest if feature in flagged_by_anova
]
print(features_to_drop_final)

['FS', 'EU', 'CW ', 'DN', 'CB', 'GB', 'AX', 'EG', 'FC', 'AZ', 'EL', 'GH', 'AY', 'BR', 'GI', 'CF']


In [14]:
# Mean of the target per level of the categorical 'EJ' feature.
ej_and_target = ['EJ', 'Class']
data_obj = df_train[ej_and_target]
data_obj.groupby('EJ').mean()

Unnamed: 0_level_0,Class
EJ,Unnamed: 1_level_1
A,0.126126
B,0.202532


In [15]:
# Remove the statistically flagged features from the training frame.
df_train = df_train.drop(features_to_drop_final, axis="columns")

In [16]:
# Remove the same flagged features from the test frame.
df_test = df_test.drop(features_to_drop_final, axis="columns")

In [17]:
df_train.shape, df_test.shape

((617, 31), (5, 30))

In [18]:
import copy

def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Cleaning steps:
      1. Strip leading/trailing whitespace from every string value.
      2. Drop fully duplicated rows (the first occurrence is kept and the
         original index labels of the surviving rows are preserved).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with stripped strings and without duplicate rows.
    """
    df_copy = df.copy()

    def _strip_if_str(value):
        # Non-string entries (e.g. NaN inside a text column) pass through unchanged.
        return value.strip() if isinstance(value, str) else value

    # Only text-like columns can hold strings, so only those are visited.
    # This avoids the deprecated DataFrame.applymap and leaves the dtypes of
    # numeric columns untouched.
    for name in df_copy.columns:
        column = df_copy[name]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df_copy[name] = column.map(_strip_if_str)

    return df_copy.drop_duplicates()

In [19]:
def one_hot_encode(df, categories=("A", "B")):
    """Replace the categorical ``EJ`` column with one indicator column per level.

    The indicator columns are always produced for every level listed in
    ``categories`` (all zeros when a level does not occur in ``df``), so a
    training frame and a test frame end up with the same column layout even
    if one of them is missing a level. Levels that occur in ``df`` but are not
    listed in ``categories`` are appended after the listed ones, in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``EJ`` column. It is not modified.
    categories : sequence, default ("A", "B")
        Levels that must always get an indicator column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``EJ`` and with the indicator columns appended.
    """
    observed = df["EJ"].dropna().unique().tolist()
    unexpected = sorted(level for level in observed if level not in categories)
    levels = list(categories) + unexpected

    # A categorical dtype makes get_dummies emit a column for every level,
    # including levels that are absent from this particular frame.
    ej_as_category = df["EJ"].astype(pd.CategoricalDtype(categories=levels))
    eq_one_hot = pd.get_dummies(ej_as_category)
    eq_one_hot.columns = levels  # plain labels instead of a CategoricalIndex

    return df.drop(columns=["EJ"]).join(eq_one_hot)

In [20]:
# Replace the categorical 'EJ' column with indicator columns in both frames.
df_train = one_hot_encode(df_train)
df_test = one_hot_encode(df_test)

In [21]:
# Strip whitespace from string values and drop duplicate rows in both frames.
df_train = clean_data(df_train)
df_test = clean_data(df_test)

In [22]:
df_train.head()

Unnamed: 0,Id,AB,AF,AM,BC,BN,BP,BQ,CC,CD,...,FE,FI,FL,FR,GE,GF,GL,Class,A,B
0,000ff2bfdfe9,0.209377,3109.03329,22.394407,5.555634,22.5984,175.638726,152.707705,0.563481,23.3876,...,9028.291921,3.58345,7.298162,1.73855,72.611063,2003.810319,0.120343,1,0,1
1,007255e47698,0.145282,978.76416,36.968889,1.2299,19.4205,155.86803,14.75472,0.48471,50.628208,...,6785.003474,10.358927,0.173229,0.49706,72.611063,27981.56275,21.978,0,1,0
2,013f2bd269f5,0.47003,2635.10654,32.360553,1.2299,26.4825,128.988531,219.32016,0.495852,85.955376,...,8338.906181,11.626917,7.70956,0.97556,88.609437,13676.95781,0.196941,0,0,1
3,043ac50845d5,0.252107,3819.65177,77.112203,1.2299,23.6577,237.282264,11.05041,0.717882,88.15936,...,10965.76604,14.852022,6.122162,0.49706,82.416803,2094.262452,0.155829,0,0,1
4,044fb8a146ec,0.380297,3733.04844,14.103738,102.15198,24.0108,324.546318,149.717165,0.536467,72.644264,...,16198.04959,13.666727,8.153058,48.50134,146.109943,8524.370502,0.096614,1,0,1


In [23]:
# Target vector, and the feature matrix without the identifier and the target.
y = df_train['Class']
X = df_train.drop(["Id", "Class"], axis="columns")

In [24]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Select the float64 measurement columns; the one-hot encoded "EJ" indicators
# ('A', 'B') are added back unscaled below.
quantitative_features = X.select_dtypes(include=['float64'])

scaled_features = scaler.fit_transform(quantitative_features)

# Create a DataFrame from the scaler output and add the encoded "EJ" to it.
# The index of X is reused on purpose: the assignment of ['A', 'B'] below
# aligns on index labels, and y keeps df_train's labels as well. A fresh
# 0..n-1 index would misalign rows whenever df_train's index is not a plain
# range (for example after drop_duplicates removed rows).
X_scaled = pd.DataFrame(
    scaled_features,
    columns=quantitative_features.columns,
    index=quantitative_features.index,
)
X_scaled[['A', 'B']] = X[['A', 'B']]

X_scaled.head()

Unnamed: 0,AB,AF,AM,BC,BN,BP,BQ,CC,CD,CR,...,EE,FE,FI,FL,FR,GE,GF,GL,A,B
0,0.021082,0.102347,0.030632,0.002958,0.654545,0.04324,0.440929,0.098469,0.0,0.0,...,0.094302,0.052697,0.0,0.05172,0.000998,0.0,0.013846,0.005425,0,1
1,0.010541,0.027589,0.053864,0.0,0.490909,0.034915,0.0391,0.078406,0.044646,0.353002,...,0.031732,0.036862,0.209978,0.0,0.0,0.0,0.194527,1.0,1,0
2,0.063949,0.085715,0.046519,0.0,0.854545,0.023597,0.634957,0.081244,0.102545,0.212468,...,0.435754,0.047831,0.249274,0.054706,0.000385,0.011229,0.095035,0.00891,0,1
3,0.02811,0.127285,0.117854,0.0,0.709091,0.069197,0.02831,0.137796,0.106158,0.19083,...,0.195531,0.066374,0.349223,0.043183,0.0,0.006882,0.014475,0.007039,0,1
4,0.049192,0.124246,0.017417,0.069008,0.727273,0.105942,0.432218,0.091589,0.080729,0.210044,...,0.177654,0.103309,0.312489,0.057926,0.038597,0.051588,0.059198,0.004345,0,1


In [25]:
def balanced_log_loss(y_true, y_pred, prob_0, prob_1):
    """Class-balanced binary log loss.

    The log loss is averaged separately over the class-0 and the class-1
    samples and the two per-class averages are then averaged, so both classes
    contribute equally regardless of how many samples each one has.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        True labels.
    y_pred : any
        Unused; kept so existing callers keep working.
    prob_0 : array-like of float
        Predicted probability of class 0 for every sample.
    prob_1 : array-like of float
        Predicted probability of class 1 for every sample.

    Returns
    -------
    float
        The balanced log loss. Always finite: probabilities are clipped to
        [1e-15, 1 - 1e-15] before the logarithm, and a class with no samples
        contributes zero instead of causing a division by zero.
    """
    y_true = np.asarray(y_true, dtype=float)

    # Clip so that hard 0/1 probabilities do not yield log(0) = -inf
    # (which also turned masked-out terms into 0 * -inf = nan).
    eps = 1e-15
    p_0 = np.clip(np.asarray(prob_0, dtype=float), eps, 1 - eps)
    p_1 = np.clip(np.asarray(prob_1, dtype=float), eps, 1 - eps)

    # number of samples per class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # inverse class frequencies balance the classes; an absent class gets weight 0
    w_0 = 1 / N_0 if N_0 > 0 else 0.0
    w_1 = 1 / N_1 if N_1 > 0 else 0.0

    # summed log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))

    # average of the two per-class mean losses
    return (w_0 * log_loss_0 + w_1 * log_loss_1) / 2

In [26]:
# Shuffle features and target together; the fixed random_state makes the order repeatable.
X_shuffle, y_shuffle = shuffle(X_scaled, y, random_state = 3247)

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Fixed random-forest settings: forest size and maximum tree depth.
params = dict(n_estimators=610, max_depth=19)

rf = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
rf.fit(X_shuffle, y_shuffle)

In [28]:
# Out-of-fold predictions of the random forest from 4-fold cross-validation.
y_hat = cross_val_predict(rf, X_scaled, y, cv=4)
oof_accuracy = round(accuracy_score(y, y_hat), 3)
print('Accuracy:', oof_accuracy, '\n')
print(classification_report(y, y_hat), '\n')
print(confusion_matrix(y, y_hat))



Accuracy: 0.921 

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       509
           1       0.84      0.68      0.75       108

    accuracy                           0.92       617
   macro avg       0.89      0.82      0.85       617
weighted avg       0.92      0.92      0.92       617
 

[[495  14]
 [ 35  73]]


In [29]:
# Per-fold accuracy of the random forest (4 folds) and its spread.
acc_array = cross_val_score(rf, X_scaled, y, cv=4, scoring='accuracy')
print('Accuracies:', acc_array, '\n')
mean_acc, std_acc = acc_array.mean(), acc_array.std()
print('Mean accuracy:', round(mean_acc, 3), 'Std deviation:', round(std_acc, 3))

Accuracies: [0.92258065 0.91558442 0.9025974  0.94155844] 

Mean accuracy: 0.921 Std deviation: 0.014


In [30]:
# Base XGBoost classifier.
# The step size is given once, through learning_rate. `eta` is XGBoost's other
# name for the same parameter, so it is deliberately NOT passed as well:
# supplying eta = 0 next to learning_rate = 0.2 is contradictory, and a zero
# step size would stop the model from learning if that value took effect.
estimator_XG = xgb.XGBClassifier(booster = 'gbtree', 
                             eval_metric = 'logloss',
                             tree_method = 'exact', 
                             random_state = 123, 
                             grow_policy = 'lossguide',
                             learning_rate = 0.2, 
                             gamma = 0
                            )

# Every list holds a single value, so the search below evaluates exactly one
# parameter combination; it is used here to get a cross-validated f1 score
# and a fitted best_estimator_.
params = {'n_estimators' : [110], 
          'max_depth' : [5],  
          'subsample' : [1.0],
          'scale_pos_weight' : [9], 
          'base_score' : [0.3],
          'colsample_bynode' : [0.3],
          'colsample_bytree' : [0.6],
          'colsample_bylevel' : [0.8]
         }

search_XG = RandomizedSearchCV(estimator_XG, params, scoring = 'f1', cv = 4)
search_XG.fit(X_shuffle, y_shuffle)

print(search_XG.best_params_, '\n')
print('Best f1 score:', search_XG.best_score_, '\n')



{'subsample': 1.0, 'scale_pos_weight': 9, 'n_estimators': 110, 'max_depth': 5, 'colsample_bytree': 0.6, 'colsample_bynode': 0.3, 'colsample_bylevel': 0.8, 'base_score': 0.3} 

Best f1 score: 0.7894351680065965 



In [31]:
# Out-of-fold predictions of the tuned XGBoost model from 4-fold cross-validation.
xgb_model = search_XG.best_estimator_
y_hat = cross_val_predict(xgb_model, X_scaled, y, cv=4)
oof_accuracy = round(accuracy_score(y, y_hat), 3)
print('Accuracy:', oof_accuracy, '\n')
print(classification_report(y, y_hat), '\n')
print(confusion_matrix(y, y_hat))

Accuracy: 0.937 

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       509
           1       0.82      0.81      0.82       108

    accuracy                           0.94       617
   macro avg       0.89      0.89      0.89       617
weighted avg       0.94      0.94      0.94       617
 

[[490  19]
 [ 20  88]]


In [32]:
# Per-fold accuracy of the tuned XGBoost model (4 folds) and its spread.
acc_array = cross_val_score(search_XG.best_estimator_, X_scaled, y, cv=4, scoring='accuracy')
print('Accuracies:', acc_array, '\n')
mean_acc, std_acc = acc_array.mean(), acc_array.std()
print('Mean accuracy:', round(mean_acc, 3), 'Std deviation:', round(std_acc, 3))

Accuracies: [0.96129032 0.93506494 0.92207792 0.92857143] 

Mean accuracy: 0.937 Std deviation: 0.015


In [33]:
best_xgb = search_XG.best_estimator_

In [34]:
# Fixed hyper-parameters for the scikit-learn gradient boosting model.
gb_settings = dict(
    n_estimators=100,
    learning_rate=0.06851903253551589,
    max_depth=3,
    subsample=0.6500000000000001,
    min_samples_split=9,
    min_samples_leaf=6,
    min_weight_fraction_leaf=0.0,
    random_state=42,
)
best_gb = GradientBoostingClassifier(**gb_settings)

best_gb.fit(X_shuffle, y_shuffle)

In [35]:
# Out-of-fold predictions of the gradient boosting model from 4-fold cross-validation.
y_hat = cross_val_predict(best_gb, X_scaled, y, cv=4)
oof_accuracy = round(accuracy_score(y, y_hat), 3)
print('Accuracy:', oof_accuracy, '\n')
print(classification_report(y, y_hat), '\n')
print(confusion_matrix(y, y_hat))

Accuracy: 0.927 

              precision    recall  f1-score   support

           0       0.94      0.97      0.96       509
           1       0.84      0.72      0.78       108

    accuracy                           0.93       617
   macro avg       0.89      0.85      0.87       617
weighted avg       0.92      0.93      0.92       617
 

[[494  15]
 [ 30  78]]


In [36]:
# Per-fold accuracy of the gradient boosting model (4 folds) and its spread.
acc_array = cross_val_score(best_gb, X_scaled, y, cv=4, scoring='accuracy')
print('Accuracies:', acc_array, '\n')
mean_acc, std_acc = acc_array.mean(), acc_array.std()
print('Mean accuracy:', round(mean_acc, 3), 'Std deviation:', round(std_acc, 3))

Accuracies: [0.93548387 0.90909091 0.91558442 0.94805195] 

Mean accuracy: 0.927 Std deviation: 0.016


In [37]:
# Fixed LightGBM hyper-parameters: regularisation, tree size, row/column
# sub-sampling, step size and the weight of the positive class.
params = {
    "lambda_l1": 1.8213060450879508e-05,
    "lambda_l2": 0.0031582501792819773,
    "num_leaves": 94,
    "feature_fraction": 0.7652946325401702,
    "bagging_fraction": 0.5785618008941892,
    "bagging_freq": 4,
    "min_child_samples": 28,
    "learning_rate": 0.1833869044910331,
    "scale_pos_weight": 2.0,
}

best_lgb = lgb.LGBMClassifier(random_state=42, n_jobs=-1, **params)

best_lgb.fit(X_shuffle, y_shuffle)



In [38]:
# Out-of-fold predictions of the LightGBM model from 4-fold cross-validation.
y_hat = cross_val_predict(best_lgb, X_scaled, y, cv=4)
oof_accuracy = round(accuracy_score(y, y_hat), 3)
print('Accuracy:', oof_accuracy, '\n')
print(classification_report(y, y_hat), '\n')
print(confusion_matrix(y, y_hat))

Accuracy: 0.924 

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       509
           1       0.81      0.73      0.77       108

    accuracy                           0.92       617
   macro avg       0.88      0.85      0.86       617
weighted avg       0.92      0.92      0.92       617
 

[[491  18]
 [ 29  79]]


In [39]:
# Per-fold accuracy of the LightGBM model (4 folds) and its spread.
acc_array = cross_val_score(best_lgb, X_scaled, y, cv=4, scoring='accuracy')
print('Accuracies:', acc_array, '\n')
mean_acc, std_acc = acc_array.mean(), acc_array.std()
print('Mean accuracy:', round(mean_acc, 3), 'Std deviation:', round(std_acc, 3))

Accuracies: [0.92903226 0.93506494 0.9025974  0.92857143] 

Mean accuracy: 0.924 Std deviation: 0.013


In [40]:
# Soft-voting ensemble over the three boosted-tree models.
ensemble_members = [("gb", best_gb), ('xgb', best_xgb), ('lgb', best_lgb)]
soft_vote = VotingClassifier(voting='soft', estimators=ensemble_members)


In [41]:
soft_vote.fit(X_shuffle, y_shuffle)



In [42]:
# Out-of-fold predictions of the voting ensemble from 4-fold cross-validation.
# y_hat from this cell is the one reused by the later diagnostic cells.
y_hat = cross_val_predict(soft_vote, X_scaled, y, cv=4)
oof_accuracy = round(accuracy_score(y, y_hat), 3)
print('Accuracy:', oof_accuracy, '\n')
print(classification_report(y, y_hat), '\n')
print(confusion_matrix(y, y_hat))

Accuracy: 0.937 

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       509
           1       0.86      0.77      0.81       108

    accuracy                           0.94       617
   macro avg       0.90      0.87      0.89       617
weighted avg       0.94      0.94      0.94       617
 

[[495  14]
 [ 25  83]]


In [43]:
# Out-of-fold class probabilities of the ensemble: column 0 is taken as
# class 0 and column 1 as class 1.
probs = cross_val_predict(soft_vote, X_scaled, y, method='predict_proba', cv=4)
prob_0_train, prob_1_train = probs[:, 0], probs[:, 1]



In [44]:
balanced_log_loss(y, y_hat, prob_0_train, prob_1_train)

0.36021917921563285

In [45]:
final_estimator = soft_vote

In [46]:
# Out-of-fold probabilities of the final model, put side by side with the
# true label and the out-of-fold hard prediction for error analysis.
train_proba = cross_val_predict(final_estimator, X_scaled, y, method='predict_proba', cv=4)

df_check = (
    pd.DataFrame(train_proba, columns=['class_0', 'class_1'], index=y.index)
    .assign(y_train=y, y_predicted=y_hat)
)




In [47]:
# One view per predicted-probability column, each with the true and predicted labels.
label_cols = ['y_train', 'y_predicted']
df_class_0 = df_check[['class_0'] + label_cols]
df_class_1 = df_check[['class_1'] + label_cols]

In [48]:
df_class_0.groupby(['y_train', 'y_predicted']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_0
y_train,y_predicted,Unnamed: 2_level_1
0,0,0.969367
0,1,0.171069
1,0,0.764263
1,1,0.150246


In [49]:
df_class_0.groupby(['y_train', 'y_predicted']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_0
y_train,y_predicted,Unnamed: 2_level_1
0,0,0.999444
0,1,0.49526
1,0,0.997076
1,1,0.493954


In [50]:
df_class_0.groupby(['y_train', 'y_predicted']).min()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_0
y_train,y_predicted,Unnamed: 2_level_1
0,0,0.539435
0,1,0.020718
1,0,0.504858
1,1,0.002552


In [51]:
df_class_0[(df_check.y_train == 0) & (df_check.y_predicted == 1)]

Unnamed: 0,class_0,y_train,y_predicted
102,0.49526,0,1
190,0.123662,0,1
195,0.493862,0,1
220,0.121431,0,1
231,0.098088,0,1
292,0.199146,0,1
325,0.10032,0,1
337,0.131682,0,1
356,0.299366,0,1
367,0.074612,0,1


In [52]:
df_class_0[(df_check.y_train == 1) & (df_check.y_predicted == 0)]

Unnamed: 0,class_0,y_train,y_predicted
4,0.740893,1,0
31,0.780967,1,0
93,0.837211,1,0
95,0.530918,1,0
145,0.979848,1,0
181,0.609951,1,0
186,0.507181,1,0
193,0.909711,1,0
262,0.504858,1,0
267,0.988041,1,0


In [53]:
# Make sure both one-hot "EJ" indicator columns exist in the test frame.
# A missing indicator is added as all zeros; an indicator that is already
# present is left untouched, because overwriting it with 0 would erase every
# test row whose "EJ" value is that level.
for indicator in ('A', 'B'):
    if indicator not in df_test.columns:
        df_test[indicator] = 0

test_idxs = df_test['Id']
# Select all except the one-hot encoded "EJ" feature
quantitative_features = df_test.select_dtypes(include=['float64'])

# Missing values in the training frame were filled with column medians, so
# the test frame gets the same treatment, using the training medians.
quantitative_features = quantitative_features.fillna(
    df_train[quantitative_features.columns].median()
)

# Reuse the scaler fitted on the training features (transform only, no refit).
scaled_features = scaler.transform(quantitative_features)

# Create a DataFrame from the scaler output; the encoded "EJ" is added in the
# next cell. df_test's index is kept so that this later, label-aligned
# assignment matches rows correctly.
df_test_scaled = pd.DataFrame(
    scaled_features,
    columns=quantitative_features.columns,
    index=quantitative_features.index,
)

In [54]:
# Append the unscaled one-hot "EJ" indicators after the scaled features
# (the assignment aligns rows on the index labels of the two frames).
df_test_scaled[['A', 'B']] = df_test[['A', 'B']]

df_test_scaled.head()

Unnamed: 0,AB,AF,AM,BC,BN,BP,BQ,CC,CD,CR,...,EE,FE,FI,FL,FR,GE,GF,GL,A,B
0,-0.013352,-0.006759,-0.005065,-0.000841,-0.509091,-0.030717,-0.003877,-0.04505,-0.038331,-0.023305,...,-0.015866,-0.011034,-0.111054,-0.001257,-0.0004,-0.050964,-9.1e-05,-5.1e-05,1,0
1,-0.013352,-0.006759,-0.005065,-0.000841,-0.509091,-0.030717,-0.003877,-0.04505,-0.038331,-0.023305,...,-0.015866,-0.011034,-0.111054,-0.001257,-0.0004,-0.050964,-9.1e-05,-5.1e-05,1,0
2,-0.013352,-0.006759,-0.005065,-0.000841,-0.509091,-0.030717,-0.003877,-0.04505,-0.038331,-0.023305,...,-0.015866,-0.011034,-0.111054,-0.001257,-0.0004,-0.050964,-9.1e-05,-5.1e-05,1,0
3,-0.013352,-0.006759,-0.005065,-0.000841,-0.509091,-0.030717,-0.003877,-0.04505,-0.038331,-0.023305,...,-0.015866,-0.011034,-0.111054,-0.001257,-0.0004,-0.050964,-9.1e-05,-5.1e-05,1,0
4,-0.013352,-0.006759,-0.005065,-0.000841,-0.509091,-0.030717,-0.003877,-0.04505,-0.038331,-0.023305,...,-0.015866,-0.011034,-0.111054,-0.001257,-0.0004,-0.050964,-9.1e-05,-5.1e-05,1,0


In [55]:
# Class probabilities of the final (soft-voting) model for the test rows;
# the result has one row per test sample and one column per class.
X_t = df_test_scaled
prob_predictions = final_estimator.predict_proba(X_t)
prob_predictions

array([[0.54671434, 0.45328566],
       [0.54671434, 0.45328566],
       [0.54671434, 0.45328566],
       [0.54671434, 0.45328566],
       [0.54671434, 0.45328566]])

In [56]:
prob_0 = prob_predictions[:, 0]
prob_0

array([0.54671434, 0.54671434, 0.54671434, 0.54671434, 0.54671434])

In [57]:
# Post-processing: a class-0 probability above 0.74 is snapped to exactly 1;
# class 1 always receives the complement.
# np.where builds a new array. An in-place masked assignment on prob_0 would
# also modify prob_predictions, because prob_0 is a slice (view) of it.
prob_0 = np.where(prob_0 > 0.74, 1.0, prob_0)
prob_1 = 1 - prob_0
prob_1


array([0.45328566, 0.45328566, 0.45328566, 0.45328566, 0.45328566])

In [58]:
# Submission table: the test identifiers followed by the two class probabilities.
submission = df_test[['Id']].assign(class_0=prob_0, class_1=prob_1)

In [59]:
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.546714,0.453286
1,010ebe33f668,0.546714,0.453286
2,02fa521e1838,0.546714,0.453286
3,040e15f562a2,0.546714,0.453286
4,046e85c7cc7f,0.546714,0.453286


In [60]:
submission.to_csv('submission.csv',  index = False)