In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
import os
import gc
pd.set_option('display.max_columns', 50)
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [9]:
def DataPreparation(dfs):
    """Engineer features on the train/test frames and return model-ready frames.

    The input frames are NOT modified: every step runs on a copy, so the
    caller's raw data stays intact after the call.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        ``dfs[0]`` is used as the training frame and ``dfs[1]`` as the test
        frame. Every frame needs the columns ``Cabin``, ``Name``, ``Age``,
        ``CryoSleep`` and the five spending columns listed below; the frames
        passed on to ``LabelEncoding`` also need ``PassengerId`` and (training
        frame only) ``Transported``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        The one-hot encoded columns produced by ``LabelEncoding`` merged with
        the remaining ``float64`` columns on ``PassengerId``.
    """
    spend_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

    prepared = []
    for raw in dfs:
        # Copy first so the feature engineering never leaks into the caller's frame.
        df = raw.copy()

        # 'Cabin' is split on '/' into three parts; only deck and side are kept.
        df[['CabinDeck','CabinNum','CabinSide']] = df['Cabin'].str.split('/',expand=True)
        df = df.drop(['Name','Cabin','CabinNum'],axis=1)

        # Adult defaults to 1; a missing Age fails the '< 18' test and therefore stays 1.
        df['Adult'] = 1
        df.loc[df['Age']<18.0,'Adult']=0

        # Row-wise sum of the spending columns (DataFrame.sum skips NaN by default).
        df['TotalSpent'] = df[spend_columns].sum(axis=1)

        # Flag rows with zero total spend whose CryoSleep is explicitly False
        # (a missing CryoSleep does not compare equal to False, so it is not flagged).
        df['TotalSpent_IsZero_Cryo_IsFalse'] = 0
        df.loc[(df['TotalSpent']==0.0)&(df['CryoSleep']==False),'TotalSpent_IsZero_Cryo_IsFalse'] = 1

        prepared.append(df)

    train = prepared[0]
    test = prepared[1]

    # One-hot encode every non-float64 column.
    train_noms, test_noms = LabelEncoding(train, test, 'PassengerId', 'Transported')

    # The float64 columns are kept as they are; the split is decided from the training dtypes.
    non_float_columns = [
        col for col in train.drop(['PassengerId','Transported'],axis=1).columns
        if train[col].dtypes != 'float64'
    ]
    train_nums = train.drop(non_float_columns+['Transported'], axis=1)
    test_nums = test.drop(non_float_columns, axis=1)

    train = train_noms.merge(train_nums, on = 'PassengerId', how = 'left')
    test = test_noms.merge(test_nums, on = 'PassengerId', how = 'left')
    return train,test

In [3]:
def LabelEncoding(df1, df2, ids, labels):
    """One-hot encode the non-float64 columns of two frames, keeping shared dummies only.

    Despite its name, this function performs one-hot (dummy) encoding through
    ``pd.get_dummies``; it does not produce integer label codes.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the ``ids`` column, the ``labels`` column and the
        feature columns. Its dtypes alone decide which columns are encoded:
        every column other than ``ids``/``labels`` whose dtype is not
        ``float64``.
    df2 : pandas.DataFrame
        Frame holding the ``ids`` column and every column of ``df1`` that gets
        encoded.
    ids : str
        Name of the identifier column.
    labels : str
        Name of the target column (read from ``df1`` only).

    Returns
    -------
    (df1_finished, df2_finished) : tuple of pandas.DataFrame
        ``df2_finished`` has ``ids`` followed by the dummy columns present in
        BOTH frames; ``df1_finished`` has the same columns plus ``labels`` as
        its last column. float64 columns of the inputs are not included.
        Both frames carry a fresh 0..n-1 index and keep the input row order.
        Missing values get no dummy column, so such rows are all zeros for
        that feature.
    """
    categorical_cols = [
        col for col in df1.columns.drop([ids, labels])
        if df1[col].dtypes != 'float64'
    ]

    def _encode(frame):
        # Encode all selected columns of one frame in a single pass.
        if not categorical_cols:
            return frame[[ids]].reset_index(drop=True)
        # dtype is pinned so the dummies are 0/1 integers on every pandas
        # version (pandas >= 2.0 would otherwise default to bool).
        dummies = pd.get_dummies(
            frame[[ids] + categorical_cols],
            columns=categorical_cols,
            dtype='uint8',
        )
        return dummies.reset_index(drop=True)

    df1_encoded = _encode(df1)
    df2_encoded = _encode(df2)

    # Keep only the columns that exist in both frames (categories seen on one
    # side only are dropped), then put the labels back on the first frame.
    df1_finished, df2_finished = df1_encoded.align(df2_encoded, join = 'inner', axis = 1)
    df1_finished[labels] = df1[labels].reset_index(drop=True)

    return df1_finished,df2_finished

In [None]:
def nnmodel(df1, df2, ids, labels, n_folds = 5, seed = 42069):
    """Prepare the feature matrices and start a Keras Sequential model (unfinished).

    Only the data preparation and the model's input layer exist so far: no
    hidden/output layers are added, nothing is compiled or trained, and
    ``n_folds`` / ``seed`` are not used yet.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame containing the ``ids`` and ``labels`` columns.
    df2 : pandas.DataFrame
        Test frame containing the ``ids`` column.
    ids : str
        Name of the identifier column.
    labels : str
        Name of the target column.
    n_folds : int
        Currently unused.
    seed : int
        Currently unused.

    Returns
    -------
    tf.keras.models.Sequential
        A model that contains only the input specification, sized to the
        number of feature columns of ``df1``.
    """
    # The locals below mirror BaselineModels and are kept as scaffolding for
    # the training loop that has not been written yet.
    df1_ids = df1[ids].copy()
    df2_ids = df2[ids].copy()

    df1_labels = df1[labels].copy()

    # Missing values are replaced with 0 before the frames become arrays.
    df1_features = df1.fillna(0).drop([ids,labels],axis=1).copy()
    df2_features = df2.fillna(0).drop([ids], axis=1).copy()

    feat_names = df1_features.columns.tolist()

    df1_features = np.array(df1_features)
    df2_features = np.array(df2_features)

    out_of_fold = np.zeros(df1_features.shape[0])

    # The keyword is lowercase `shape`, and the input width is the number of
    # feature columns. The model gets its own local name so it does not
    # shadow this function.
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(df1_features.shape[1],)))

    return model

In [4]:
def BaselineModels(df1, df2, ids, labels, n_folds = 5, seed = 42069):
    """Cross-validate three baseline classifiers and predict with the best one.

    Logistic regression, Gaussian naive Bayes and a decision tree are each
    fitted on every fold of a shuffled K-fold split. Per-fold train/validation
    ROC AUC and the overall out-of-fold ROC AUC are printed. The model with the
    highest out-of-fold AUC (first one on ties) is refitted on all of ``df1``
    and used to predict ``df2``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training frame containing the ``ids`` and ``labels`` columns; every
        other column is used as a feature.
    df2 : pandas.DataFrame
        Test frame containing the ``ids`` column; every other column is used
        as a feature.
    ids : str
        Name of the identifier column.
    labels : str
        Name of the target column.
    n_folds : int
        Number of K-fold splits.
    seed : int
        Random state for the fold shuffle and the seeded models.

    Returns
    -------
    pandas.DataFrame
        Two columns named after ``ids`` and ``labels``: the identifiers of
        ``df2`` and the class predictions of the best model.
    """
    df2_ids = df2[ids].copy()

    # KFold.split yields positional indices. Holding the labels as a NumPy
    # array makes the fold selection positional too; indexing the Series
    # directly would be label-based and only right for a default 0..n-1 index.
    df1_labels = df1[labels].to_numpy()

    # Missing values are replaced with 0 before the frames become arrays.
    df1_features = np.array(df1.fillna(0).drop([ids,labels],axis=1))
    df2_features = np.array(df2.fillna(0).drop([ids], axis=1))

    modelsdict = {
        'Logistic Regression' : LogisticRegression(max_iter = 20000, random_state = seed),
        'Naive Bayes' : GaussianNB(),
        'Classification Tree' : DecisionTreeClassifier(random_state = seed),
    }

    # Per model: one (train AUC, valid AUC) row per fold, and the out-of-fold
    # validation probabilities for every training row.
    scores_dict = dict()
    out_of_fold_dict = dict()
    for basemodel in modelsdict:
        scores_dict[basemodel] = np.zeros((n_folds,2),'float64')
        out_of_fold_dict[basemodel] = np.zeros(df1_features.shape[0],'float64')

    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    for k, (train_indices, valid_indices) in enumerate(k_fold.split(df1_features)):

        train_features, train_labels = df1_features[train_indices], df1_labels[train_indices]
        valid_features, valid_labels = df1_features[valid_indices], df1_labels[valid_indices]

        for basemodel, model in modelsdict.items():

            # The same estimator object is refitted on every fold.
            model.fit(train_features, train_labels)

            train_preds = model.predict_proba(train_features)[:,1]
            valid_preds = model.predict_proba(valid_features)[:,1]

            scores_dict[basemodel][k,0] = roc_auc_score(train_labels,train_preds)
            scores_dict[basemodel][k,1] = roc_auc_score(valid_labels,valid_preds)
            out_of_fold_dict[basemodel][valid_indices] = valid_preds

    baseline_auc = np.zeros(len(modelsdict))
    for position, basemodel in enumerate(modelsdict):
        print('*************************** '+str(basemodel)+' ***************************')
        for i in range(0,n_folds):
            print('Fold ' + str(i+1) + ' --- Train AUC: ' + str("%.6f" % round(scores_dict[basemodel][i,0], 6)) + '   Valid AUC: ' +  str("%.6f" % round(scores_dict[basemodel][i,1], 6)))
        valid_auc_all = roc_auc_score(df1_labels,out_of_fold_dict[basemodel])
        baseline_auc[position] = valid_auc_all
        print('Overall AUC: '+str("%.6f" % round(valid_auc_all, 6)))

    # argmax returns the first maximum, i.e. the earliest model on ties.
    bestmodel = list(modelsdict.keys())[int(np.argmax(baseline_auc))]
    modelsdict[bestmodel].fit(df1_features,df1_labels)
    test_preds = modelsdict[bestmodel].predict(df2_features)

    # Column names follow the `ids` / `labels` arguments rather than being hard-coded.
    submission = pd.DataFrame({ids : df2_ids, labels : test_preds})

    return submission

In [5]:
# Read the raw train/test CSVs from the current working directory.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded Windows '\\', so the cell also runs on Linux/macOS.
train = pd.read_csv(os.path.join(os.getcwd(), 'train.csv'))
test = pd.read_csv(os.path.join(os.getcwd(), 'test.csv'))
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [6]:
# Build the model-ready frames. This rebinds `train` and `test`, so the raw
# frames are no longer reachable under these names: re-run the CSV loading
# cell before running this cell a second time.
train, test = DataPreparation(dfs=[train, test])
train

Unnamed: 0,PassengerId,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinNum_0,CabinNum_1,CabinNum_10,CabinNum_100,CabinNum_1001,CabinNum_1003,...,CabinNum_987,CabinNum_988,CabinNum_989,CabinNum_99,CabinNum_990,CabinNum_992,CabinNum_993,CabinNum_994,CabinNum_995,CabinNum_996,CabinNum_997,CabinSide_P,CabinSide_S,Adult_0,Adult_1,TotalSpent_IsZero_Cryo_IsFalse_0,TotalSpent_IsZero_Cryo_IsFalse_1,Transported,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpent
0,0001_01,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,False,39.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0002_01,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,True,24.0,109.0,9.0,25.0,549.0,44.0,736.0
2,0003_01,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,False,58.0,43.0,3576.0,0.0,6715.0,49.0,10383.0
3,0003_02,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,False,33.0,0.0,1283.0,371.0,3329.0,193.0,5176.0
4,0004_01,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,True,16.0,303.0,70.0,151.0,565.0,2.0,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,False,41.0,0.0,6819.0,0.0,1643.0,74.0,8536.0
8689,9278_01,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,False,18.0,0.0,0.0,0.0,0.0,0.0,0.0
8690,9279_01,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,True,26.0,0.0,0.0,1872.0,1.0,0.0,1873.0
8691,9280_01,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,False,32.0,0.0,1049.0,0.0,353.0,3235.0,4637.0


In [7]:
# sub = BaselineModels(train, test, 'PassengerId', 'Transported', n_folds = 5, seed = 42069)
# sub.to_csv(os.getcwd()+'\\Submission3_SpS_Titanic.csv', index=False)

In [8]:
# testdict = {'1':1,'2':2}
# list(testdict.keys())