In [1]:
import pandas as pd
import numpy as np
import math
import os
import optuna
import lightgbm as lgb
pd.set_option('display.max_columns', 50)
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import gc
import warnings
warnings.filterwarnings("ignore")

In [2]:
def DataPreparation(dfs, missing_values = False):
    
    for df in dfs:
        df[['CabinDeck','CabinNum','CabinSide']] = df['Cabin'].str.split('/',expand=True)
        df['CabinNum'] = df['CabinNum'].astype('float64')
        cabinnumdiv = 500
        df['CabinNumGroup'] = (np.floor(df['CabinNum']/cabinnumdiv)*cabinnumdiv).astype('object')
        df.drop(['Name','Cabin','CabinNum'],axis=1,inplace=True)
        df['Adult'] = 1
        df.loc[df['Age']<18.0,'Adult']=0
        
        columnslist = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
        df['TotalSpent'] = df[columnslist].sum(axis=1)
        df['TotalSpent_IsZero_Cryo_IsFalse'] = 0
        df.loc[(df['TotalSpent']==0.0)&(df['CryoSleep']==False),'TotalSpent_IsZero_Cryo_IsFalse'] = 1
        for col in columnslist:
            df['Ratio_'+str(col)+'_TotalSpent'] = df[col].divide(df['TotalSpent'],axis=0,fill_value=np.nan)
    
        # df['LargestSpend'] = df[columnslist].max(axis=1)
        # df['LargestSpendingArea'] = df[columnslist].idxmax(axis=1)
        # df.loc[df['TotalSpent']==0,'LargestSpendingArea'] = np.nan
        
    train_noms, test_noms = LabelEncoding(dfs[0], dfs[1], 'PassengerId', 'Transported', missing_values = missing_values)    
    
    # for df in [train_noms,test_noms]:
    #     columnlist1 = [col for col in df.columns if col.startswith('CabinDeck')]
    #     columnlist2 = [col for col in df.columns if col.startswith('CabinNumGroup')]
    #     for a in columnlist1:
    #         df[[a+'_'+b for b in columnlist2]] = df[columnlist2].multiply(df[a],axis=0)        
        
    columnlist = list()
    for col in dfs[0].drop(['PassengerId','Transported'],axis=1).columns:
        if (dfs[0][col].dtypes!='float64') & (dfs[0][col].dtypes!='int64'):
            columnlist.append(col)
    train_nums = dfs[0].drop(columnlist+['Transported'], axis=1)
    test_nums = dfs[1].drop(columnlist, axis=1)
    
    dfs[0] = train_noms.merge(train_nums, on = 'PassengerId', how = 'left')
    dfs[1] = test_noms.merge(test_nums, on = 'PassengerId', how = 'left')
    return dfs[0],dfs[1] 

In [3]:
def LabelEncoding(df1, df2, ids, labels, missing_values = False):
    
    df1_finished = df1[[ids, labels]].copy()
    df2_finished = df2[[ids]].copy()
    
    if not(missing_values):
        for col in df1.drop([ids,labels],axis=1):
            if (df1[col].dtypes != 'float64') & (df1[col].dtypes != 'int64'):
                df1_dummies = pd.get_dummies(df1[[ids,col]],columns=[col])
                df2_dummies = pd.get_dummies(df2[[ids,col]],columns=[col])
                # if df1[col].isna().sum() !=0:
                #     df1_dummies[col+'_Nan'] = df1[col].isna().astype('int')
                #     df2_dummies[col+'_Nan'] = df2[col].isna().astype('int')
                df1_finished = df1_finished.merge(df1_dummies, on=ids, how= 'left')
                df2_finished = df2_finished.merge(df2_dummies, on=ids, how= 'left')
    elif missing_values:
        for col in df1.drop([ids,labels],axis=1):
            if (df1[col].dtypes != 'float64') & (df1[col].dtypes != 'int64'):
                if len(df1[col].dropna().unique().tolist())==2:
                    df1_dummies = pd.DataFrame({'PassengerId':df1[ids], col : np.full(df1.shape[0],np.nan,dtype='int')})
                    df2_dummies = pd.DataFrame({'PassengerId':df2[ids], col : np.full(df2.shape[0],np.nan,dtype='int')})
                    for ind,value in enumerate(df1[col].dropna().unique().tolist()):                      
                        df1_dummies.loc[df1[col]==value,col] = ind
                        df2_dummies.loc[df2[col]==value,col] = ind
                    df1_finished = df1_finished.merge(df1_dummies, on=ids, how= 'left')
                    df2_finished = df2_finished.merge(df2_dummies, on=ids, how= 'left')
                elif len(df1[col].dropna().unique().tolist())>2:
                    df1_dummies = pd.get_dummies(df1[[ids,col]],columns=[col])
                    df2_dummies = pd.get_dummies(df2[[ids,col]],columns=[col])
                    # if df1[col].isna().sum() !=0:
                    #     df1_dummies[col+'_Nan'] = df1[col].isna().astype('int')
                    #     df2_dummies[col+'_Nan'] = df2[col].isna().astype('int')
                    df1_finished = df1_finished.merge(df1_dummies, on=ids, how= 'left')
                    df2_finished = df2_finished.merge(df2_dummies, on=ids, how= 'left')                    
    
    df1_labels = df1_finished[labels]
    df1_finished, df2_finished = df1_finished.align(df2_finished, join = 'inner', axis = 1)
    df1_finished[labels] = df1_labels
    
    return df1_finished,df2_finished

In [4]:
def BaselineModels(df1, df2, ids, labels, n_folds = 5, seed = 42069):
    
    df1_ids = df1[ids].copy()
    df2_ids = df2[ids].copy()
    
    df1_labels = df1[labels].copy()
    
    df1_features = df1.fillna(0).drop([ids,labels],axis=1).copy()
    df2_features = df2.fillna(0).drop([ids], axis=1).copy()
    
    
    feat_names = df1_features.columns.tolist()
    
    df1_features = np.array(df1_features)
    df2_features = np.array(df2_features)
    
    out_of_fold = np.zeros(df1_features.shape[0])
    
    display(df1)
    
    logreg = LogisticRegression(max_iter = 20000, random_state = seed)
    gnb = GaussianNB()
    clftree = DecisionTreeClassifier(random_state = seed)
    
    modelsdict = {'Logistic Regression' : logreg, 'Naive Bayes' : gnb, 'Classification Tree' : clftree}
    
    scores_dict = dict()
    out_of_fold_dict = dict()
    for basemodel in modelsdict:
        scores_dict[basemodel] = np.zeros((n_folds+1,2),'float64')
        out_of_fold_dict[basemodel] = np.zeros(df1_features.shape[0],'float64')
        
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    k = 0
    for train_indices, valid_indices in k_fold.split(df1_features):
        
        
        train_features, train_labels = df1_features[train_indices], df1_labels[train_indices]
        valid_features, valid_labels = df1_features[valid_indices], df1_labels[valid_indices]
        
        for basemodel in modelsdict:
            
            modelsdict[basemodel].fit(train_features, train_labels) 

        
            train_preds = modelsdict[basemodel].predict_proba(train_features)[:,1]
            train_auc_fold = roc_auc_score(train_labels,train_preds)
            valid_preds = modelsdict[basemodel].predict_proba(valid_features)[:,1]
            valid_auc_fold = roc_auc_score(valid_labels,valid_preds)
            
            scores_dict[basemodel][k,0] = train_auc_fold 
            scores_dict[basemodel][k,1] = valid_auc_fold 
            out_of_fold_dict[basemodel][valid_indices] = valid_preds
        
        k += 1
    baseline_auc = np.zeros(len(modelsdict))
    l = 0
    for basemodel in modelsdict:
        print('*************************** '+str(basemodel)+' ***************************')
        for i in range(0,n_folds):
            print('Fold ' + str(i+1) + ' --- Train AUC: ' + str("%.6f" % round(scores_dict[basemodel][i,0], 6)) + '   Valid AUC: ' +  str("%.6f" % round(scores_dict[basemodel][i,1], 6)))
        valid_auc_all = roc_auc_score(df1_labels,out_of_fold_dict[basemodel])
        baseline_auc[l]= valid_auc_all
        l += 1
        print('Overall AUC: '+str("%.6f" % round(valid_auc_all, 6)))
    
    bestmodel = list(modelsdict.keys())[list(np.where(baseline_auc==np.amax(baseline_auc)))[0][0]]
    modelsdict[bestmodel].fit(df1_features,df1_labels)
    test_preds = modelsdict[bestmodel].predict(df2_features)
    submission = pd.DataFrame({'PassengerId' : df2_ids, 'Transported' : test_preds})
        
    return submission

In [5]:
def OptimizationObjectiveLogistic(trial, df1_ids, df1_labels, df1_features, n_folds = 5, seed = 42069):
    
    paramdict = {
        'max_iter': trial.suggest_categorical('max_iter',[5000]),
        'solver': trial.suggest_categorical('solver',['liblinear']),
        'tol': trial.suggest_categorical('tol',[1e-4]),
        'penalty': trial.suggest_categorical('penalty',['l1','l2']),
        'C': trial.suggest_float('C',0.1,1,step=0.005),
        }
    
    valid_auc = OptimizationLogisticModel(df1_ids, df1_labels, df1_features, params = paramdict, n_folds = n_folds , seed = seed)
    
    return valid_auc

In [6]:
def OptimizationLogisticModel(df1_ids, df1_labels, df1_features, params, n_folds = 5, seed = 42069):

    model = LogisticRegression(random_state = seed, **params)
    
    out_of_fold = np.zeros(df1_features.shape[0]) 
    
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    for train_indices, valid_indices in k_fold.split(df1_features):
        
        train_features, train_labels = df1_features[train_indices], df1_labels[train_indices]
        valid_features, valid_labels = df1_features[valid_indices], df1_labels[valid_indices]    
    
        model.fit(train_features, train_labels)
        valid_preds = model.predict_proba(valid_features)[:,1]
        out_of_fold[valid_indices] = valid_preds
        
    valid_auc = log_loss(df1_labels,out_of_fold)        
    
    return valid_auc

In [7]:
def FinalLogisticModel(df1_ids, df1_labels, df1_features, df2_ids, df2_features, params, n_folds = 5, seed = 42069):
    
    model = LogisticRegression(random_state = seed, **params)
    
    out_of_fold = np.zeros(df1_features.shape[0]) 
    
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    k=0
    print('*************************** Final Logistic Regression Model***************************')
    for train_indices, valid_indices in k_fold.split(df1_features):
        k+=1
        train_features, train_labels = df1_features[train_indices], df1_labels[train_indices]
        valid_features, valid_labels = df1_features[valid_indices], df1_labels[valid_indices]    
    
        model.fit(train_features, train_labels)
        train_preds = model.predict_proba(train_features)[:,1]
        train_auc_fold = roc_auc_score(train_labels,train_preds)
        valid_preds = model.predict_proba(valid_features)[:,1]
        valid_auc_fold = roc_auc_score(valid_labels,valid_preds)
        out_of_fold[valid_indices] = valid_preds
        
        print('   Fold '+str(k)+' ---- Train AUC: '+ str("%.6f" % round(train_auc_fold, 6)) + '   Valid AUC: ' +  str("%.6f" % round(valid_auc_fold, 6)))
        
    valid_auc = roc_auc_score(df1_labels,out_of_fold)
    print('   Overall AUC:   '+str("%.6f" % round(valid_auc, 6)))
    
    model.fit(df1_features,df1_labels)
    test_preds = model.predict(df2_features)
    submission = pd.DataFrame({'PassengerId' : df2_ids, 'Transported' : test_preds})
    
    return submission    

In [8]:
def OptimizationModel(df1, df2, ids, labels, n_folds = 5, seed = 42069):

    df1_ids = df1[ids].copy()
    df2_ids = df2[ids].copy()
    
    df1_labels = df1[labels].copy()
    
    df1_features = df1.fillna(0).drop([ids,labels],axis=1).copy()
    df2_features = df2.fillna(0).drop([ids], axis=1).copy()
    
    feat_names = df1_features.columns.tolist()
    
    df1_features = np.array(df1_features)
    df2_features = np.array(df2_features)

    # optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction='minimize', study_name='Logistic Regression')
    func = lambda trial: OptimizationObjectiveLogistic(trial, df1_ids, df1_labels, df1_features, n_folds = n_folds, seed = seed)
    study.optimize(func,n_trials=200)
    bestparamsdict = dict(study.best_params.items())
    
    submission = FinalLogisticModel(df1_ids, df1_labels, df1_features, df2_ids, df2_features, params= bestparamsdict, n_folds = n_folds, seed = seed)
    
    return submission

In [9]:
def OptimizationObjectiveLGBM(trial, df1_ids, df1_labels, df1_features, n_folds = 5, seed = 42069):
    
    paramdict = {
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'num_leaves': trial.suggest_int('num_leaves', 50, 250, step = 5),
        'max_depth': trial.suggest_int('max_depth', 3, 10, step = 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 1000, step=10),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 10, step=0.1),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 10, step=0.1),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 15, step = 0.5),
        'subsample': trial.suggest_float('subsample', 0.4, 0.9, step=0.1),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.95, step=0.1)
    } 
    
    valid_auc = OptimizationLGBM(df1_ids, df1_labels, df1_features, params = paramdict, n_folds = n_folds , seed = seed)
    
    return valid_auc

In [10]:
def OptimizationLGBM(df1_ids, df1_labels, df1_features, params, n_folds = 5, seed = 42069):
    

    model = lgb.LGBMClassifier(objective = 'binary', random_state = seed, **params)   
    
    out_of_fold = np.zeros(df1_features.shape[0]) 
    
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    for train_indices, valid_indices in k_fold.split(df1_features):
        
        train_features, train_labels = df1_features[train_indices], df1_labels[train_indices]
        valid_features, valid_labels = df1_features[valid_indices], df1_labels[valid_indices]    
    
        model.fit(train_features, train_labels, eval_metric = 'auc', eval_set = [(valid_features, valid_labels), (train_features, train_labels)], eval_names = ['valid', 'train'], early_stopping_rounds = 200, verbose = -1)   
        
        bestiteration = model.best_iteration_
        valid_preds = model.predict_proba(valid_features, num_iteration = bestiteration)[:,1]
        out_of_fold[valid_indices] = valid_preds
        
#         gc.enable()
#         del lgbmodel, train_features, valid_features
#         gc.collect()        
        
    valid_auc = roc_auc_score(df1_labels,out_of_fold)        
    
    return valid_auc

In [11]:
def FinalLGBM(df1_ids, df1_labels, df1_features, df2_ids, df2_features, params, n_folds = 5, seed = 42069):
    
    
    model = lgb.LGBMClassifier(objective = 'binary', random_state = seed, **params)
    
    out_of_fold = np.zeros(df1_features.shape[0])
    test_preds = np.zeros(df2_features.shape[0])
    
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    k=0
    print('*************************** Final Logistic Regression Model***************************')
    for train_indices, valid_indices in k_fold.split(df1_features):
        k+=1
        train_features, train_labels = df1_features[train_indices], df1_labels[train_indices]
        valid_features, valid_labels = df1_features[valid_indices], df1_labels[valid_indices]    
    
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], early_stopping_rounds = 300, verbose = -1)   
        
        bestiteration = model.best_iteration_        
        
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = bestiteration)[:, 1]        
        valid_auc_fold = model.best_score_['valid']['auc']
        train_auc_fold = model.best_score_['train']['auc']
        
        print('   Fold '+str(k)+' ---- Train AUC: '+ str("%.6f" % round(train_auc_fold, 6)) + '   Valid AUC: ' +  str("%.6f" % round(valid_auc_fold, 6)))
        
        test_preds += model.predict_proba(df2_features, num_iteration = bestiteration)[:, 1] / k_fold.n_splits
        
    valid_auc = roc_auc_score(df1_labels,out_of_fold)
    print('   Overall AUC:   '+str("%.6f" % round(valid_auc, 6)))
    
    # model.fit(train_features, train_labels, eval_metric = 'auc',
    #           eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
    #           eval_names = ['valid', 'train'], early_stopping_rounds = 200, verbose = -1)  
    # test_preds = model.predict(df2_features)
    submission = pd.DataFrame({'PassengerId' : df2_ids, 'Transported' : test_preds})
    
    return submission  

In [12]:
def OptimizationModelLGBM(df1, df2, ids, labels, n_folds = 5, seed = 42069):

    df1_ids = df1[ids].copy()
    df2_ids = df2[ids].copy()
    
    df1_labels = df1[labels].copy()
    
    
    # Change HOT encoding for binary nominals with nan values, LBM handles NaN value
    df1_features = df1.drop([ids,labels],axis=1).copy()
    df2_features = df2.drop([ids], axis=1).copy()
    
    feat_names = df1_features.columns.tolist()
    
    df1_features = np.array(df1_features)
    df2_features = np.array(df2_features)

    # optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction='maximize', study_name='LGBM Classifier')
    func = lambda trial: OptimizationObjectiveLGBM(trial, df1_ids, df1_labels, df1_features, n_folds = n_folds, seed = seed)
    study.optimize(func,n_trials=200)
    bestparamsdict = dict(study.best_params.items())
    
    submission = FinalLGBM(df1_ids, df1_labels, df1_features, df2_ids, df2_features, params= bestparamsdict, n_folds = n_folds, seed = seed)
    
    submission['Transported_class'] = False
    submission.loc[submission['Transported']>=0.5,'Transported_class'] = True
    submission['Transported'] = submission['Transported_class']
    submission.drop('Transported_class',axis=1,inplace=True)
    
    return submission

In [13]:
train = pd.read_csv(os.getcwd()+'\\train.csv')
test = pd.read_csv(os.getcwd()+'\\test.csv')
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [14]:
train,test = DataPreparation([train, test],missing_values = True)
train

Unnamed: 0,PassengerId,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide,CabinNumGroup_0.0,CabinNumGroup_500.0,CabinNumGroup_1000.0,CabinNumGroup_1500.0,Transported,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Adult,TotalSpent,TotalSpent_IsZero_Cryo_IsFalse,Ratio_RoomService_TotalSpent,Ratio_FoodCourt_TotalSpent,Ratio_ShoppingMall_TotalSpent,Ratio_Spa_TotalSpent,Ratio_VRDeck_TotalSpent
0,0001_01,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,False,39.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1,,,,,
1,0002_01,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,True,24.0,109.0,9.0,25.0,549.0,44.0,1,736.0,0,0.148098,0.012228,0.033967,0.745924,0.059783
2,0003_01,0,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,False,58.0,43.0,3576.0,0.0,6715.0,49.0,1,10383.0,0,0.004141,0.344409,0.000000,0.646730,0.004719
3,0003_02,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,False,33.0,0.0,1283.0,371.0,3329.0,193.0,1,5176.0,0,0.000000,0.247875,0.071677,0.643161,0.037287
4,0004_01,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,True,16.0,303.0,70.0,151.0,565.0,2.0,0,1091.0,0,0.277727,0.064161,0.138405,0.517874,0.001833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,False,41.0,0.0,6819.0,0.0,1643.0,74.0,1,8536.0,0,0.000000,0.798852,0.000000,0.192479,0.008669
8689,9278_01,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,False,18.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,,,,,
8690,9279_01,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,True,26.0,0.0,0.0,1872.0,1.0,0.0,1,1873.0,0,0.000000,0.000000,0.999466,0.000534,0.000000
8691,9280_01,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,False,32.0,0.0,1049.0,0.0,353.0,3235.0,1,4637.0,0,0.000000,0.226224,0.000000,0.076127,0.697649


In [15]:
sub = OptimizationModelLGBM(train, test, 'PassengerId', 'Transported')
sub.to_csv(os.getcwd()+'\\Submission14_SpS_Titanic.csv', index=False)

[32m[I 2022-06-29 19:10:24,155][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2022-06-29 19:10:27,297][0m Trial 0 finished with value: 0.8694447429393888 and parameters: {'n_estimators': 10000, 'learning_rate': 0.1457710042989261, 'num_leaves': 245, 'max_depth': 8, 'min_child_samples': 940, 'reg_alpha': 3, 'reg_lambda': 9, 'min_split_gain': 0.0, 'subsample': 0.5, 'subsample_freq': 1, 'colsample_bytree': 0.6000000000000001}. Best is trial 0 with value: 0.8694447429393888.[0m
[32m[I 2022-06-29 19:10:28,503][0m Trial 1 finished with value: 0.8754864864721796 and parameters: {'n_estimators': 10000, 'learning_rate': 0.4266458187758511, 'num_leaves': 250, 'max_depth': 3, 'min_child_samples': 310, 'reg_alpha': 8, 'reg_lambda': 0, 'min_split_gain': 7.0, 'subsample': 0.6000000000000001, 'subsample_freq': 1, 'colsample_bytree': 0.4}. Best is trial 1 with value: 0.8754864864721796.[0m
[32m[I 2022-06-29 19:10:29,385][0m Trial 2 finished with value: 0.883497996672

*************************** Final Logistic Regression Model***************************
   Fold 1 ---- Train AUC: 0.943462   Valid AUC: 0.896225
   Fold 2 ---- Train AUC: 0.949646   Valid AUC: 0.902057
   Fold 3 ---- Train AUC: 0.934627   Valid AUC: 0.899355
   Fold 4 ---- Train AUC: 0.942426   Valid AUC: 0.902678
   Fold 5 ---- Train AUC: 0.937660   Valid AUC: 0.886748
   Overall AUC:   0.897209


In [16]:
# sub = OptimizationModel(train, test, 'PassengerId', 'Transported')
# sub.to_csv(os.getcwd()+'\\Submission11_SpS_Titanic.csv', index=False)

In [17]:
# sub = BaselineModels(train, test, 'PassengerId', 'Transported', n_folds = 5, seed = 42069)
# sub.to_csv(os.getcwd()+'\\Submission9_SpS_Titanic.csv', index=False)

In [18]:
# testdict = {'1':1,'2':2}
# list(testdict.keys())