In [None]:
# Offline install of tabpfn: --no-index disables the package index and
# --find-links points pip at the package files attached under /kaggle/input.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# Create tabpfn's models_diff directory and copy the bundled checkpoint into it
# (presumably where TabPFNClassifier looks for its weights -- confirm for the installed version).
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm

In [None]:
## 0 Preparation 

# Load the competition tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
#train.drop([93,145,186,193,371,434,500,559,229,412,467], axis=0, inplace=True)  == downgrade to 1.95
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

# Encode the categorical EJ column as float codes. The encoder is fitted on
# train only and merely applied to test, so both frames share one
# category -> code mapping even when test does not contain every category.
# (transform raises ValueError if test holds a category absent from train.)
lb = LabelEncoder()
train['EJ'] = lb.fit_transform(train['EJ']).astype(float)
test['EJ'] = lb.transform(test['EJ']).astype(float)

# This cell models only the EJ == 1 rows: drop every training row whose
# encoded EJ is 0, together with the greeks rows carrying the same Id.
del_list = list(train.loc[train['EJ'] == 0, 'Id'])
gr_del_list = list(greeks[greeks['Id'].isin(del_list)].index)
tr_del_list = train[train['Id'].isin(del_list)].index
greeks.drop(index=gr_del_list, inplace=True)
train.drop(index=tr_del_list, inplace=True)

# Every column except the target and the row identifier is a predictor.
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']

x = train[predictor_columns]
y = train['Class']

from sklearn.model_selection import KFold as KF, GridSearchCV
# Shuffled K-fold splitters with a fixed seed; cv_inner is the one read by training().
cv_outer = KF(n_splits = 10, shuffle=True, random_state=42)
cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)

from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    """Return the log loss with each sample weighted by 1 / (size of its class).

    ``y_true`` must hold non-negative integer labels: it is passed to
    ``np.bincount`` and then used to index the resulting counts.
    ``y_pred`` is handed to ``log_loss`` unchanged.
    """
    class_sizes = np.bincount(y_true)
    per_sample_weight = 1 / class_sizes[y_true]
    return log_loss(y_true, y_pred, sample_weight=per_sample_weight, eps=1e-15)


def balanced_log_loss_e(y_true, y_pred):
    """Class-balanced log loss plus a report of the badly predicted samples.

    A sample counts as a "hard case" when its prediction differs from its
    true label by more than 0.4 in absolute value.

    Returns a 4-tuple:
        loss         -- log loss weighted by 1 / (size of the sample's class)
        hard_cases   -- positions (0-based, in iteration order) of the hard cases
        hard_scores  -- true labels at those positions
        hard_scores2 -- predictions at those positions
    """
    class_sizes = np.bincount(y_true)
    truths = list(y_true)
    preds = list(y_pred)

    # Positions are taken from the prediction list; the label is looked up by position.
    hard_cases = [pos for pos, pred in enumerate(preds) if abs(truths[pos] - pred) > 0.4]
    hard_scores = [truths[pos] for pos in hard_cases]
    hard_scores2 = [preds[pos] for pos in hard_cases]

    loss = log_loss(y_true, y_pred, sample_weight=1 / class_sizes[y_true], eps=1e-15)
    return loss, hard_cases, hard_scores, hard_scores2

class Ensemble():
    """Median-imputed average of two XGBoost and two TabPFN classifiers.

    ``fit`` imputes the features and fits every member on integer-encoded
    labels; ``predict_proba`` averages the members' probabilities and then
    rebalances column 0 against all remaining columns.
    """

    def __init__(self):
        # Missing values are replaced by the per-column median learned in fit().
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        #self.imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

        tuned_xgb = xgboost.XGBClassifier(
            n_estimators=200,
            max_depth=3,
            learning_rate=0.2,
            subsample=0.9,
            colsample_bytree=0.85,
        )
        self.classifiers = [
            tuned_xgb,
            xgboost.XGBClassifier(),
            TabPFNClassifier(N_ensemble_configurations=4),
            TabPFNClassifier(N_ensemble_configurations=4),
        ]

        #self.classifiers =[xgboost.XGBClassifier(n_estimators=200,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85)]

    def fit(self,X,y):
        """Fit the imputer and every member classifier.

        ``y`` must expose ``.values`` (e.g. a pandas Series). Its distinct
        values are stored in ``classes_`` and the members are trained on the
        corresponding integer codes.
        """
        labels = y.values
        self.classes_, encoded = np.unique(labels, return_inverse=True)
        imputed = self.imputer.fit_transform(X)
        for clf in self.classifiers:
            if isinstance(clf, TabPFNClassifier):
                # The TabPFN members are fitted with overwrite_warning=True.
                clf.fit(imputed, encoded, overwrite_warning=True)
            else:
                clf.fit(imputed, encoded)

    def predict_proba(self, x):
        """Return the rebalanced, row-normalised mean of the members' probabilities."""
        features = self.imputer.transform(x)
        per_model = [clf.predict_proba(features) for clf in self.classifiers]
        mean_proba = np.mean(np.stack(per_model), axis=0)

        # Estimated number of class-0 rows vs. rows of any other class in this batch.
        class_0_mass = mean_proba[:, 0].sum()
        other_mass = mean_proba[:, 1:].sum()

        # Weighted probabilities based on class imbalance: column 0 is scaled by
        # 1 / class_0_mass, every other column by 1 / other_mass.
        column_totals = [class_0_mass] + [other_mass] * (mean_proba.shape[1] - 1)
        inverse_totals = np.array([[1 / total for total in column_totals]])
        reweighted = mean_proba * inverse_totals

        # Renormalise so every row sums to 1 again.
        return reweighted / np.sum(reweighted, axis=1, keepdims=True)
    

def training(model, x,y,y_meta):
    """Cross-validate ``model`` over the module-level ``cv_inner`` splitter.

    For every fold an independent deep copy of ``model`` is fitted on the
    ``y_meta`` labels of the training part and scored on the ``y`` labels of
    the validation part. Previously the very same instance was refitted on
    each fold, so every entry of the returned list (and the "best" model)
    pointed at the last fit.

    The score is ``balanced_log_loss`` computed on hard 0/1 predictions:
    a row is predicted 0 when the first probability column is >= 0.5,
    otherwise 1.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict_proba(X)``; it is only
        copied, never fitted itself.
    x : DataFrame of features (indexed with ``.iloc``).
    y : Series of validation targets (indexed with ``.iloc``).
    y_meta : Series of training targets (indexed with ``.iloc``).

    Returns
    -------
    (best_model, models, scores) : the fitted copy with the lowest loss
        (``None`` if the splitter yields no fold), all fitted copies in fold
        order, and their losses in the same order.
    """
    import copy  # local import so the function does not rely on a file-level one

    best_loss = np.inf
    best_model = None
    models = []
    scores = []
    n_splits = cv_inner.get_n_splits(x)
    fold_iter = tqdm(cv_inner.split(x), total=n_splits)
    for split, (train_idx, val_idx) in enumerate(fold_iter, start=1):
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]

        # Fit a fresh copy so earlier folds' models are not overwritten.
        fold_model = copy.deepcopy(model)
        fold_model.fit(x_train, y_train)
        y_pred = fold_model.predict_proba(x_val)

        # Hard decision from the first probability column: >= 0.5 -> 0, else 1.
        p0 = y_pred[:, 0]
        y_p = np.where(p0 >= 0.5, 0, 1).astype(int)

        loss = balanced_log_loss(y_val, y_p)
        models.append(fold_model)
        scores.append(loss)
        if loss < best_loss:
            best_model = fold_model
            best_loss = loss
            print('best_model_saved')
        print('>val_loss=%.15f, split = %.1f' % (loss,split))
    print('LOSS: %.5f' % (np.mean(scores)))
    return best_model, models, scores

def training_nf(model, x,y,y_meta):
    """Fit ``model`` once on all of ``x`` / ``y`` ("no folds" variant of training).

    The return value mimics the shape of ``training``'s result so callers
    can treat both alike: the fitted model, a list holding that same model
    object five times, and the placeholder scores ``[0, 1, 2, 3, 4]``.
    ``y_meta`` is accepted for signature compatibility and is not used.

    The cross-validation loop that used to follow the ``return`` statement
    could never run (and referenced undefined names), so it was removed.
    """
    model.fit(x, y)
    best_model = model
    # Five references to the single fitted model, paired with dummy scores.
    models = [model] * 5
    scores = list(range(5))
    return best_model, models, scores


## 1 BASIC TRAIN 

from datetime import datetime

# Turn greeks.Epsilon ('%m/%d/%Y' strings or the literal 'Unknown') into
# ordinal day numbers, with NaN wherever the date is unknown.
has_date = greeks.Epsilon != 'Unknown'
times = greeks.Epsilon.copy()
times[has_date] = greeks.Epsilon[has_date].map(
    lambda stamp: datetime.strptime(stamp, '%m/%d/%Y').toordinal()
)
times[~has_date] = np.nan

# Append the date column to the training frame (rows are aligned on the index).
train_pred_and_time = pd.concat([train, times], axis=1)

'''
test['A'] = test['AB']*10000+test['AF']+test['AM']*100+test['AY']*10000
test['B'] = test['BC']*1000+test['BQ']*1000+test['BR']+test['BZ']
test['D'] = test['DU']*500
test['E'] = test['EH']*1000
test['F'] = test['FD ']*100 + test['FE']+ test['FL']*1000  + test['FR']*100
predictor_columns.append('A')     # useful
predictor_columns.append('B')     # useful
predictor_columns.append('D')     # useful
predictor_columns.append('E')     # useful
predictor_columns.append('F')     # useful

test['C']= test['CC'] + test['CD ']+ test['CF']+test['CH']+test['CL']+test['CR']+test['CU']+test['CW ']
test['F']= test['FC'] + test['FD ']+ test['FE']+test['FI']+test['FL']+test['FR']+test['FS']
#basic_features.append('id2')   # useful
predictor_columns.append('C')     # useful
predictor_columns.append('F')     # useful

if 'DA' in predictor_columns: predictor_columns.remove('DA') # useful
#if 'BN' in predictor_columns: predictor_columns.remove('BN') # useful
if 'BP' in predictor_columns: predictor_columns.remove('BP') # useful
if 'CC' in predictor_columns: predictor_columns.remove('CC') # useful
if 'CW ' in predictor_columns: predictor_columns.remove('CW ') # useful
if 'DI' in predictor_columns: predictor_columns.remove('DI') # useful
if 'GH' in predictor_columns: predictor_columns.remove('GH') # useful
if 'BC' in predictor_columns: predictor_columns.remove('BC') # useful
if 'AF' in predictor_columns: predictor_columns.remove('AF') # useful
'''
# Test-set features. A copy is taken so this frame is independent of `test`.
# EJ was already converted to numeric codes when `test` was prepared and is
# used as-is: re-encoding it relative to the first test row (as was done here
# before) could flip the 0/1 meaning with respect to the training encoding.
test_predictors = test[predictor_columns].copy()

# The test rows have no Epsilon date, so the extra time column is filled with
# one day past the latest date found in the training data.
test_time = np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1
test_pred_and_time = np.concatenate((test_predictors, test_time), axis=1)

ros = RandomOverSampler(random_state=42)

# Oversample the training rows using greeks.Alpha as the resampling target.
train_ros, y_ros = ros.fit_resample(train_pred_and_time, greeks.Alpha)
#print('Original dataset shape')
#print(greeks.Alpha.value_counts())
#print('Resample dataset shape')
#print( y_ros.value_counts())

# Model inputs: every resampled column except the target and the row identifier.
x_ros = train_ros.drop(['Class', 'Id'],axis=1)


'''
x_ros['A'] = x_ros['AB']*10000+x_ros['AF']+x_ros['AM']*100+x_ros['AY']*10000
x_ros['B'] = x_ros['BC']*1000+x_ros['BQ']*1000+x_ros['BR']+x_ros['BZ']
x_ros['D'] = x_ros['DU']*500
x_ros['E'] = x_ros['EH']*1000
x_ros['F'] = x_ros['FD ']*100 + x_ros['FE']+ x_ros['FL']*1000  + x_ros['FR']*100

x_ros['C']= x_ros['CC'] + x_ros['CD ']+ x_ros['CF']+x_ros['CH']+x_ros['CL']+x_ros['CR']+x_ros['CU']+x_ros['CW ']
x_ros['F']= x_ros['FC'] + x_ros['FD ']+ x_ros['FE']+x_ros['FI']+x_ros['FL']+x_ros['FR']+x_ros['FS']                          # test['FI']

x_ros.drop(['DA','BP','CC','CW ','DI','GH','BC','AF'],axis=1, inplace=True)
'''

# Fit the ensemble on the oversampled rows. training_nf fits on (x_ros, y_),
# i.e. on the Class target; y_ros is passed along but not used by it.
y_ = train_ros.Class
yt = Ensemble()
m, all_models, all_scores = training_nf(yt,x_ros,y_,y_ros)

# 2 predict for submission

# Collapse the prediction to two columns: column 0 and the sum of all others.
y_pred = m.predict_proba(test_pred_and_time)
probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)


# Rank the scores on a sorted *copy*. Sorting through an alias of all_scores
# (as before) reordered all_scores itself, which breaks the positional pairing
# between all_scores and all_models whenever the scores are not already sorted.
all_scores_s = sorted(all_scores)
index_of_second = all_scores.index(all_scores_s[1])
index_of_third = all_scores.index(all_scores_s[2])
index_of_forth = all_scores.index(all_scores_s[3])
index_of_five = all_scores.index(all_scores_s[4])


y_pred2 = all_models[index_of_second].predict_proba(test_pred_and_time)
probabilities2 = np.concatenate((y_pred2[:,:1], np.sum(y_pred2[:,1:], 1, keepdims=True)), axis=1)
y_pred3 = all_models[index_of_third].predict_proba(test_pred_and_time)
probabilities3 = np.concatenate((y_pred3[:,:1], np.sum(y_pred3[:,1:], 1, keepdims=True)), axis=1)
y_pred4 = all_models[index_of_forth].predict_proba(test_pred_and_time)
probabilities4 = np.concatenate((y_pred4[:,:1], np.sum(y_pred4[:,1:], 1, keepdims=True)), axis=1)
y_pred5 = all_models[index_of_five].predict_proba(test_pred_and_time)
probabilities5 = np.concatenate((y_pred5[:,:1], np.sum(y_pred5[:,1:], 1, keepdims=True)), axis=1)

# Averaged class-0 probability over the best model and the four runners-up.
p0 = (probabilities[:,:1] + probabilities2[:,:1]+ probabilities3[:,:1]+ probabilities4[:,:1]+ probabilities5[:,:1])/5

###p0[p0 > 0.10] = 1 # trying this out
###p0[p0 < 0.01] = 0

'''
zero_list = list(test.loc[train_pred_and_time['BQ'].isnull()].index)
for z in zero_list : p0[z] = 1
zero_list = list(test.loc[train_pred_and_time['BN'] < 15.5364].index)
for z in zero_list : p0[z] = 1
'''

# Submission frame for this cell's model: Id plus the two class probabilities.
submission_0 = test[["Id"]].copy()
submission_0["class_0"] = p0
submission_0["class_1"] = 1 - p0
submission_0.to_csv('submission_0.csv', index=False)

print ('------------------- submisson 0 completed ')


## 3 predict  for all TRAIN data 

# Feature columns used when scoring the training rows: everything except the
# target and the row identifier.
predictor_columns_2 = [col for col in train_pred_and_time.columns if col not in ('Class', 'Id')]

'''
train_pred_and_time['A'] = train_pred_and_time['AB']*10000+train_pred_and_time['AF']+train_pred_and_time['AM']*100+train_pred_and_time['AY']*10000
train_pred_and_time['B'] = train_pred_and_time['BC']*1000+train_pred_and_time['BQ']*1000+train_pred_and_time['BR']+train_pred_and_time['BZ']
train_pred_and_time['D'] = train_pred_and_time['DU']*500
train_pred_and_time['E'] = train_pred_and_time['EH']*1000
train_pred_and_time['F'] = train_pred_and_time['FD ']*100 + train_pred_and_time['FE']+ train_pred_and_time['FL']*1000  + train_pred_and_time['FR']*100
predictor_columns_2.append('A')     # useful
predictor_columns_2.append('B')     # useful
predictor_columns_2.append('D')     # useful
predictor_columns_2.append('E')     # useful
predictor_columns_2.append('F')     # useful

train_pred_and_time['C']= train_pred_and_time['CC'] + train_pred_and_time['CD ']+ train_pred_and_time['CF']+train_pred_and_time['CH']+train_pred_and_time['CL']+train_pred_and_time['CR']+train_pred_and_time['CU']+train_pred_and_time['CW ']
train_pred_and_time['F']= train_pred_and_time['FC'] + train_pred_and_time['FD ']+ train_pred_and_time['FE']+train_pred_and_time['FI']+train_pred_and_time['FL']+train_pred_and_time['FR']+train_pred_and_time['FS']
#basic_features.append('id2')   # useful
predictor_columns_2.append('C')     # useful
predictor_columns_2.append('F')     # useful

if 'DA' in predictor_columns_2: predictor_columns_2.remove('DA') # useful
#if 'BN' in predictor_columns_2: predictor_columns_2.remove('BN') # useful
if 'BP' in predictor_columns_2: predictor_columns_2.remove('BP') # useful
if 'CC' in predictor_columns_2: predictor_columns_2.remove('CC') # useful
if 'CW ' in predictor_columns_2: predictor_columns_2.remove('CW ') # useful
if 'DI' in predictor_columns_2: predictor_columns_2.remove('DI') # useful
if 'GH' in predictor_columns_2: predictor_columns_2.remove('GH') # useful
if 'BC' in predictor_columns_2: predictor_columns_2.remove('BC') # useful
if 'AF' in predictor_columns_2: predictor_columns_2.remove('AF') # useful
'''

# Score the training rows with the best model and the four runners-up.
# Each prediction is collapsed to two columns: column 0 and the sum of the rest.
train_features = train_pred_and_time[predictor_columns_2]

y_pred = m.predict_proba(train_features)
probabilities = np.hstack([y_pred[:, :1], y_pred[:, 1:].sum(axis=1, keepdims=True)])

y_pred2 = all_models[index_of_second].predict_proba(train_features)
probabilities2 = np.hstack([y_pred2[:, :1], y_pred2[:, 1:].sum(axis=1, keepdims=True)])
y_pred3 = all_models[index_of_third].predict_proba(train_features)
probabilities3 = np.hstack([y_pred3[:, :1], y_pred3[:, 1:].sum(axis=1, keepdims=True)])
y_pred4 = all_models[index_of_forth].predict_proba(train_features)
probabilities4 = np.hstack([y_pred4[:, :1], y_pred4[:, 1:].sum(axis=1, keepdims=True)])
y_pred5 = all_models[index_of_five].predict_proba(train_features)
probabilities5 = np.hstack([y_pred5[:, :1], y_pred5[:, 1:].sum(axis=1, keepdims=True)])

# Mean class-0 probability across the five predictions.
class_0_columns = [
    probabilities[:, :1],
    probabilities2[:, :1],
    probabilities3[:, :1],
    probabilities4[:, :1],
    probabilities5[:, :1],
]
p0 = sum(class_0_columns) / 5




###p0[p0 > 0.10] = 1 
###p0[p0 < 0.01] = 0

'''
zero_list = list(train_pred_and_time.loc[train_pred_and_time['BQ'].isnull()].index)
for z in zero_list : p0[z] = 1
zero_list = list(train_pred_and_time.loc[train_pred_and_time['BN'] < 15.5364].index)
for z in zero_list : p0[z] = 1
'''

# Class-1 probability is the complement of the averaged class-0 probability.
p1 = 1 - p0

# balanced_log_loss_e returns (loss, hard-case positions, their labels, their predictions).
overall_report = balanced_log_loss_e(train.Class, p1.flatten())
print('overall', overall_report)

#overall CV 0.0928  --  -- 24 & 64  >30 mins       0.13 LB 

# overall (0.02794048828990353,  [103, 134, 205, 229, 295, 299], [0, 0, 0, 0, 0, 0], [0.78, 0.40, 0.61, 0.76, 0.845, 0.5419956709666807])
# overall (0.020332229109128463, [8, 64, 87, 128, 189], [0, 0, 0, 0, 0], [0.4743714693172232, 0.5984776315191408, 0.8488388713780383, 0.4809832856073687, 0.7367310655403029])
#overall (9.992007221626415e-16, [], [], []) including thresholds
#overall (0.006685700891628771, [], [], [])  no thresholds at all 

In [None]:
## 0 Preparation 

# Load the competition tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
#train.drop([93,145,186,193,371,434,500,559,229,412,467], axis=0, inplace=True)  == downgrade to 1.95
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

# Encode the categorical EJ column as float codes. The encoder is fitted on
# train only and merely applied to test, so both frames share one
# category -> code mapping even when test does not contain every category.
# (transform raises ValueError if test holds a category absent from train.)
lb = LabelEncoder()
train['EJ'] = lb.fit_transform(train['EJ']).astype(float)
test['EJ'] = lb.transform(test['EJ']).astype(float)

# This cell models only the EJ == 0 rows: drop every training row whose
# encoded EJ is 1, together with the greeks rows carrying the same Id.
del_list = list(train.loc[train['EJ'] == 1, 'Id'])
gr_del_list = list(greeks[greeks['Id'].isin(del_list)].index)
tr_del_list = train[train['Id'].isin(del_list)].index
greeks.drop(index=gr_del_list, inplace=True)
train.drop(index=tr_del_list, inplace=True)

# Every column except the target and the row identifier is a predictor.
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']

x = train[predictor_columns]
y = train['Class']

from sklearn.model_selection import KFold as KF, GridSearchCV
# Shuffled K-fold splitters with a fixed seed; cv_inner is the one read by training().
cv_outer = KF(n_splits = 10, shuffle=True, random_state=42)
cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)

from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    """Return the log loss with each sample weighted by 1 / (size of its class).

    ``y_true`` must hold non-negative integer labels: it is passed to
    ``np.bincount`` and then used to index the resulting counts.
    ``y_pred`` is handed to ``log_loss`` unchanged.
    """
    class_sizes = np.bincount(y_true)
    per_sample_weight = 1 / class_sizes[y_true]
    return log_loss(y_true, y_pred, sample_weight=per_sample_weight, eps=1e-15)


def balanced_log_loss_e(y_true, y_pred):
    """Class-balanced log loss plus a report of the badly predicted samples.

    A sample counts as a "hard case" when its prediction differs from its
    true label by more than 0.4 in absolute value.

    Returns a 4-tuple:
        loss         -- log loss weighted by 1 / (size of the sample's class)
        hard_cases   -- positions (0-based, in iteration order) of the hard cases
        hard_scores  -- true labels at those positions
        hard_scores2 -- predictions at those positions
    """
    class_sizes = np.bincount(y_true)
    truths = list(y_true)
    preds = list(y_pred)

    # Positions are taken from the prediction list; the label is looked up by position.
    hard_cases = [pos for pos, pred in enumerate(preds) if abs(truths[pos] - pred) > 0.4]
    hard_scores = [truths[pos] for pos in hard_cases]
    hard_scores2 = [preds[pos] for pos in hard_cases]

    loss = log_loss(y_true, y_pred, sample_weight=1 / class_sizes[y_true], eps=1e-15)
    return loss, hard_cases, hard_scores, hard_scores2

class Ensemble():
    """Median-imputed average of two XGBoost and two TabPFN classifiers.

    ``fit`` imputes the features and fits every member on integer-encoded
    labels; ``predict_proba`` averages the members' probabilities and then
    rebalances column 0 against all remaining columns.
    """

    def __init__(self):
        # Missing values are replaced by the per-column median learned in fit().
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        #self.imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

        tuned_xgb = xgboost.XGBClassifier(
            n_estimators=200,
            max_depth=3,
            learning_rate=0.2,
            subsample=0.9,
            colsample_bytree=0.85,
        )
        self.classifiers = [
            tuned_xgb,
            xgboost.XGBClassifier(),
            TabPFNClassifier(N_ensemble_configurations=4),
            TabPFNClassifier(N_ensemble_configurations=4),
        ]

        #self.classifiers =[xgboost.XGBClassifier(n_estimators=200,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85)]

    def fit(self,X,y):
        """Fit the imputer and every member classifier.

        ``y`` must expose ``.values`` (e.g. a pandas Series). Its distinct
        values are stored in ``classes_`` and the members are trained on the
        corresponding integer codes.
        """
        labels = y.values
        self.classes_, encoded = np.unique(labels, return_inverse=True)
        imputed = self.imputer.fit_transform(X)
        for clf in self.classifiers:
            if isinstance(clf, TabPFNClassifier):
                # The TabPFN members are fitted with overwrite_warning=True.
                clf.fit(imputed, encoded, overwrite_warning=True)
            else:
                clf.fit(imputed, encoded)

    def predict_proba(self, x):
        """Return the rebalanced, row-normalised mean of the members' probabilities."""
        features = self.imputer.transform(x)
        per_model = [clf.predict_proba(features) for clf in self.classifiers]
        mean_proba = np.mean(np.stack(per_model), axis=0)

        # Estimated number of class-0 rows vs. rows of any other class in this batch.
        class_0_mass = mean_proba[:, 0].sum()
        other_mass = mean_proba[:, 1:].sum()

        # Weighted probabilities based on class imbalance: column 0 is scaled by
        # 1 / class_0_mass, every other column by 1 / other_mass.
        column_totals = [class_0_mass] + [other_mass] * (mean_proba.shape[1] - 1)
        inverse_totals = np.array([[1 / total for total in column_totals]])
        reweighted = mean_proba * inverse_totals

        # Renormalise so every row sums to 1 again.
        return reweighted / np.sum(reweighted, axis=1, keepdims=True)
    

def training(model, x,y,y_meta):
    """Cross-validate ``model`` over the module-level ``cv_inner`` splitter.

    For every fold an independent deep copy of ``model`` is fitted on the
    ``y_meta`` labels of the training part and scored on the ``y`` labels of
    the validation part. Previously the very same instance was refitted on
    each fold, so every entry of the returned list (and the "best" model)
    pointed at the last fit.

    The score is ``balanced_log_loss`` computed on hard 0/1 predictions:
    a row is predicted 0 when the first probability column is >= 0.5,
    otherwise 1.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict_proba(X)``; it is only
        copied, never fitted itself.
    x : DataFrame of features (indexed with ``.iloc``).
    y : Series of validation targets (indexed with ``.iloc``).
    y_meta : Series of training targets (indexed with ``.iloc``).

    Returns
    -------
    (best_model, models, scores) : the fitted copy with the lowest loss
        (``None`` if the splitter yields no fold), all fitted copies in fold
        order, and their losses in the same order.
    """
    import copy  # local import so the function does not rely on a file-level one

    best_loss = np.inf
    best_model = None
    models = []
    scores = []
    n_splits = cv_inner.get_n_splits(x)
    fold_iter = tqdm(cv_inner.split(x), total=n_splits)
    for split, (train_idx, val_idx) in enumerate(fold_iter, start=1):
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]

        # Fit a fresh copy so earlier folds' models are not overwritten.
        fold_model = copy.deepcopy(model)
        fold_model.fit(x_train, y_train)
        y_pred = fold_model.predict_proba(x_val)

        # Hard decision from the first probability column: >= 0.5 -> 0, else 1.
        p0 = y_pred[:, 0]
        y_p = np.where(p0 >= 0.5, 0, 1).astype(int)

        loss = balanced_log_loss(y_val, y_p)
        models.append(fold_model)
        scores.append(loss)
        if loss < best_loss:
            best_model = fold_model
            best_loss = loss
            print('best_model_saved')
        print('>val_loss=%.15f, split = %.1f' % (loss,split))
    print('LOSS: %.5f' % (np.mean(scores)))
    return best_model, models, scores

## 1 BASIC TRAIN 

from datetime import datetime

# Turn greeks.Epsilon ('%m/%d/%Y' strings or the literal 'Unknown') into
# ordinal day numbers, with NaN wherever the date is unknown.
has_date = greeks.Epsilon != 'Unknown'
times = greeks.Epsilon.copy()
times[has_date] = greeks.Epsilon[has_date].map(
    lambda stamp: datetime.strptime(stamp, '%m/%d/%Y').toordinal()
)
times[~has_date] = np.nan

# Append the date column to the training frame (rows are aligned on the index).
train_pred_and_time = pd.concat([train, times], axis=1)

'''
test['A'] = test['AB']*10000+test['AF']+test['AM']*100+test['AY']*10000
test['B'] = test['BC']*1000+test['BQ']*1000+test['BR']+test['BZ']
test['D'] = test['DU']*500
test['E'] = test['EH']*1000
test['F'] = test['FD ']*100 + test['FE']+ test['FL']*1000  + test['FR']*100
predictor_columns.append('A')     # useful
predictor_columns.append('B')     # useful
predictor_columns.append('D')     # useful
predictor_columns.append('E')     # useful
predictor_columns.append('F')     # useful

test['C']= test['CC'] + test['CD ']+ test['CF']+test['CH']+test['CL']+test['CR']+test['CU']+test['CW ']
test['F']= test['FC'] + test['FD ']+ test['FE']+test['FI']+test['FL']+test['FR']+test['FS']
#basic_features.append('id2')   # useful
predictor_columns.append('C')     # useful
predictor_columns.append('F')     # useful

if 'DA' in predictor_columns: predictor_columns.remove('DA') # useful
#if 'BN' in predictor_columns: predictor_columns.remove('BN') # useful
if 'BP' in predictor_columns: predictor_columns.remove('BP') # useful
if 'CC' in predictor_columns: predictor_columns.remove('CC') # useful
if 'CW ' in predictor_columns: predictor_columns.remove('CW ') # useful
if 'DI' in predictor_columns: predictor_columns.remove('DI') # useful
if 'GH' in predictor_columns: predictor_columns.remove('GH') # useful
if 'BC' in predictor_columns: predictor_columns.remove('BC') # useful
if 'AF' in predictor_columns: predictor_columns.remove('AF') # useful
'''
# Test-set features. A copy is taken so this frame is independent of `test`.
# EJ was already converted to numeric codes when `test` was prepared and is
# used as-is: re-encoding it relative to the first test row (as was done here
# before) could flip the 0/1 meaning with respect to the training encoding.
test_predictors = test[predictor_columns].copy()

# The test rows have no Epsilon date, so the extra time column is filled with
# one day past the latest date found in the training data.
test_time = np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1
test_pred_and_time = np.concatenate((test_predictors, test_time), axis=1)

ros = RandomOverSampler(random_state=42)

# Oversample the training rows using greeks.Alpha as the resampling target.
train_ros, y_ros = ros.fit_resample(train_pred_and_time, greeks.Alpha)
#print('Original dataset shape')
#print(greeks.Alpha.value_counts())
#print('Resample dataset shape')
#print( y_ros.value_counts())

# Model inputs: every resampled column except the target and the row identifier.
x_ros = train_ros.drop(['Class', 'Id'],axis=1)


'''
x_ros['A'] = x_ros['AB']*10000+x_ros['AF']+x_ros['AM']*100+x_ros['AY']*10000
x_ros['B'] = x_ros['BC']*1000+x_ros['BQ']*1000+x_ros['BR']+x_ros['BZ']
x_ros['D'] = x_ros['DU']*500
x_ros['E'] = x_ros['EH']*1000
x_ros['F'] = x_ros['FD ']*100 + x_ros['FE']+ x_ros['FL']*1000  + x_ros['FR']*100

x_ros['C']= x_ros['CC'] + x_ros['CD ']+ x_ros['CF']+x_ros['CH']+x_ros['CL']+x_ros['CR']+x_ros['CU']+x_ros['CW ']
x_ros['F']= x_ros['FC'] + x_ros['FD ']+ x_ros['FE']+x_ros['FI']+x_ros['FL']+x_ros['FR']+x_ros['FS']                          # test['FI']

x_ros.drop(['DA','BP','CC','CW ','DI','GH','BC','AF'],axis=1, inplace=True)
'''

# Fit the ensemble on the oversampled rows. training_nf fits on (x_ros, y_),
# i.e. on the Class target; y_ros is passed along but not used by it.
y_ = train_ros.Class
yt = Ensemble()
m, all_models, all_scores = training_nf(yt,x_ros,y_,y_ros)

# 2 predict for submission

# Collapse the prediction to two columns: column 0 and the sum of all others.
y_pred = m.predict_proba(test_pred_and_time)
probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)


# Rank the scores on a sorted *copy*. Sorting through an alias of all_scores
# (as before) reordered all_scores itself, which breaks the positional pairing
# between all_scores and all_models whenever the scores are not already sorted.
all_scores_s = sorted(all_scores)
index_of_second = all_scores.index(all_scores_s[1])
index_of_third = all_scores.index(all_scores_s[2])
index_of_forth = all_scores.index(all_scores_s[3])
index_of_five = all_scores.index(all_scores_s[4])

y_pred2 = all_models[index_of_second].predict_proba(test_pred_and_time)
probabilities2 = np.concatenate((y_pred2[:,:1], np.sum(y_pred2[:,1:], 1, keepdims=True)), axis=1)
y_pred3 = all_models[index_of_third].predict_proba(test_pred_and_time)
probabilities3 = np.concatenate((y_pred3[:,:1], np.sum(y_pred3[:,1:], 1, keepdims=True)), axis=1)
y_pred4 = all_models[index_of_forth].predict_proba(test_pred_and_time)
probabilities4 = np.concatenate((y_pred4[:,:1], np.sum(y_pred4[:,1:], 1, keepdims=True)), axis=1)
y_pred5 = all_models[index_of_five].predict_proba(test_pred_and_time)
probabilities5 = np.concatenate((y_pred5[:,:1], np.sum(y_pred5[:,1:], 1, keepdims=True)), axis=1)

# Averaged class-0 probability over the best model and the four runners-up.
p0 = (probabilities[:,:1] + probabilities2[:,:1]+ probabilities3[:,:1]+ probabilities4[:,:1]+ probabilities5[:,:1])/5

##p0[p0 > 0.10] = 1 # trying this out
##p0[p0 < 0.01] = 0

'''
zero_list = list(test.loc[train_pred_and_time['BQ'].isnull()].index)
for z in zero_list : p0[z] = 1
zero_list = list(test.loc[train_pred_and_time['BN'] < 15.5364].index)
for z in zero_list : p0[z] = 1
'''

# Submission frame for this cell's model: Id plus the two class probabilities.
submission_1 = pd.DataFrame(test["Id"], columns=["Id"])
submission_1["class_0"] = p0
submission_1["class_1"] = 1 - p0
submission_1.to_csv('submission_1.csv', index=False)

print ('------------------- submisson 1 completed ')

# The final file holds submission_0 only. The former
# `submission_0.append(submission_1)` call threw its result away
# (DataFrame.append returns a new frame) and the method no longer exists in
# pandas >= 2.0, so it was dropped; the written file is the same as before.
# NOTE(review): submission_1 is never merged into submission.csv. Both frames
# cover every test Id, so simply concatenating them would duplicate Ids --
# presumably a per-row choice between the two models (by EJ) was intended; confirm.
submission_0.to_csv('submission.csv', index=False)

print ('------------------- GENERAL submisson  completed ')

## 3 predict  for all TRAIN data 

# Feature columns used when scoring the training rows: everything except the
# target and the row identifier.
predictor_columns_2 = [col for col in train_pred_and_time.columns if col not in ('Class', 'Id')]

'''
train_pred_and_time['A'] = train_pred_and_time['AB']*10000+train_pred_and_time['AF']+train_pred_and_time['AM']*100+train_pred_and_time['AY']*10000
train_pred_and_time['B'] = train_pred_and_time['BC']*1000+train_pred_and_time['BQ']*1000+train_pred_and_time['BR']+train_pred_and_time['BZ']
train_pred_and_time['D'] = train_pred_and_time['DU']*500
train_pred_and_time['E'] = train_pred_and_time['EH']*1000
train_pred_and_time['F'] = train_pred_and_time['FD ']*100 + train_pred_and_time['FE']+ train_pred_and_time['FL']*1000  + train_pred_and_time['FR']*100
predictor_columns_2.append('A')     # useful
predictor_columns_2.append('B')     # useful
predictor_columns_2.append('D')     # useful
predictor_columns_2.append('E')     # useful
predictor_columns_2.append('F')     # useful

train_pred_and_time['C']= train_pred_and_time['CC'] + train_pred_and_time['CD ']+ train_pred_and_time['CF']+train_pred_and_time['CH']+train_pred_and_time['CL']+train_pred_and_time['CR']+train_pred_and_time['CU']+train_pred_and_time['CW ']
train_pred_and_time['F']= train_pred_and_time['FC'] + train_pred_and_time['FD ']+ train_pred_and_time['FE']+train_pred_and_time['FI']+train_pred_and_time['FL']+train_pred_and_time['FR']+train_pred_and_time['FS']
#basic_features.append('id2')   # useful
predictor_columns_2.append('C')     # useful
predictor_columns_2.append('F')     # useful

if 'DA' in predictor_columns_2: predictor_columns_2.remove('DA') # useful
#if 'BN' in predictor_columns_2: predictor_columns_2.remove('BN') # useful
if 'BP' in predictor_columns_2: predictor_columns_2.remove('BP') # useful
if 'CC' in predictor_columns_2: predictor_columns_2.remove('CC') # useful
if 'CW ' in predictor_columns_2: predictor_columns_2.remove('CW ') # useful
if 'DI' in predictor_columns_2: predictor_columns_2.remove('DI') # useful
if 'GH' in predictor_columns_2: predictor_columns_2.remove('GH') # useful
if 'BC' in predictor_columns_2: predictor_columns_2.remove('BC') # useful
if 'AF' in predictor_columns_2: predictor_columns_2.remove('AF') # useful
'''

# Score the training rows with the best model and the four runners-up.
# Each prediction is collapsed to two columns: column 0 and the sum of the rest.
train_features = train_pred_and_time[predictor_columns_2]

y_pred = m.predict_proba(train_features)
probabilities = np.hstack([y_pred[:, :1], y_pred[:, 1:].sum(axis=1, keepdims=True)])

y_pred2 = all_models[index_of_second].predict_proba(train_features)
probabilities2 = np.hstack([y_pred2[:, :1], y_pred2[:, 1:].sum(axis=1, keepdims=True)])
y_pred3 = all_models[index_of_third].predict_proba(train_features)
probabilities3 = np.hstack([y_pred3[:, :1], y_pred3[:, 1:].sum(axis=1, keepdims=True)])
y_pred4 = all_models[index_of_forth].predict_proba(train_features)
probabilities4 = np.hstack([y_pred4[:, :1], y_pred4[:, 1:].sum(axis=1, keepdims=True)])
y_pred5 = all_models[index_of_five].predict_proba(train_features)
probabilities5 = np.hstack([y_pred5[:, :1], y_pred5[:, 1:].sum(axis=1, keepdims=True)])

# Mean class-0 probability across the five predictions.
class_0_columns = [
    probabilities[:, :1],
    probabilities2[:, :1],
    probabilities3[:, :1],
    probabilities4[:, :1],
    probabilities5[:, :1],
]
p0 = sum(class_0_columns) / 5


###p0[p0 > 0.10] = 1 
###p0[p0 < 0.01] = 0

'''
zero_list = list(train_pred_and_time.loc[train_pred_and_time['BQ'].isnull()].index)
for z in zero_list : p0[z] = 1
zero_list = list(train_pred_and_time.loc[train_pred_and_time['BN'] < 15.5364].index)
for z in zero_list : p0[z] = 1
'''

# Class-1 probability is the complement of the averaged class-0 probability.
p1 = 1 - p0

# balanced_log_loss_e returns (loss, hard-case positions, their labels, their predictions).
overall_report = balanced_log_loss_e(train.Class, p1.flatten())
print('overall', overall_report)

#overall CV 0.0928  --  -- 24 & 64  >30 mins       0.13 LB 

# overall (0.02794048828990353,  [103, 134, 205, 229, 295, 299], [0, 0, 0, 0, 0, 0], [0.78, 0.40, 0.61, 0.76, 0.845, 0.5419956709666807])
# overall (0.020332229109128463, [8, 64, 87, 128, 189], [0, 0, 0, 0, 0], [0.4743714693172232, 0.5984776315191408, 0.8488388713780383, 0.4809832856073687, 0.7367310655403029])
# overall (0.011763950426382223, [], [], []) no thresholds at all no trhresholds at all, simple model