In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [26]:
def read_data(path):
    df = pd.read_csv(path)
    return df

In [27]:
df_train = read_data('data/train.csv')
df_test = read_data('data/test.csv')
df_submision = read_data('data/sample_submission.csv')

In [None]:
def collumn_split(df):

    # Spliting 'PassengerId' collumn, creating new columns 'Group' and 'InGroup'
    df['Group'] = df['PassengerId'].str.split('_').str[0].astype(int)
    df['GroupSize'] = df.groupby('Group')['Group'].transform('size')
    df['InGroup'] = df['GroupSize'] > 1

    # Splitting 'Name' column, add new collumn 'WithFamily'
    df['Name'] = df['Name'].fillna('unknown unknown')
    df['Surname'] = df['Name'].str.split(' ').str[-1]
    surname_counts = df['Surname'].value_counts()
    if 'unknown' in surname_counts:
        surname_counts = surname_counts.drop('unknown')
    has_family_surname = df['Surname'].apply(lambda x: x != 'unknown' and surname_counts.get(x, 0) > 1)
    df['WithFamily'] = (df['InGroup'] & has_family_surname)

    # Spliting 'Cabin' collumn, creating new columns 'Deck', 'Num' and 'Side'
    df['Cabin'] = df['Cabin'].fillna('unknown/unknown/unknown')
    df['Deck'] = df['Cabin'].str.split('/').str[0]
    df['CabinNumber'] = df['Cabin'].str.split('/').str[1]
    df['Side'] = df['Cabin'].str.split('/').str[2]

    # Add columns "All_Expenses" and 'HasExpenses'
    df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].fillna(0)
    df["All_Expenses"] = df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
    df['HasExpenses'] = df['All_Expenses'].apply(lambda x: 1 if x>0 else 0)

    # Dropping splitted columns
    df = df.drop(columns=['PassengerId', 'Cabin', 'Name'])

    return df

In [29]:
def drop_unnecessary_columns(df):
    df = df.drop(columns=['Group', 'GroupSize', 'InGroup', 'Surname', 'CabinNumber'])
    return df

In [30]:
def converting_columns_to_correct_type(df):
    
    if target_column in df.columns.tolist():
        df[target_column] = df[target_column].astype(bool)
    for col in df.columns.tolist():
        if col in categorical_columns:
            df[col] = df[col].astype('object')
        elif col in continuous_columns:
            df[col] = df[col].astype('float32')
        elif col in boolean_columns:
            df[col] = df[col].astype(bool)
    
    return df

In [31]:
def simple_missing_data_fill(df):
    
    for col in df.columns.tolist():
        if df[col].isnull().sum() > 0:
            if col in categorical_columns:
                df[col] = df[col].fillna(df[col].mode()[0])
            elif col in continuous_columns:
                df[col] = df[col].fillna(df[col].median())
            elif col in boolean_columns:
                df[col] = df[col].fillna(df[col].mode()[0])
    
    return df

In [32]:
def data_cleaning_and_preparation(df):

    df = collumn_split(df)
    df = drop_unnecessary_columns(df)
    df = simple_missing_data_fill(df)
    df = converting_columns_to_correct_type(df)

    return df

In [33]:
def correlation_matrix(df):
    
    df = df.copy()
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    plt.figure(figsize=(20, 16))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, cbar_kws={"shrink": .8})
    plt.title("Correlation Matrix")
    plt.show()

In [34]:
def encoding(df_train, df_test):
    
    df_train = pd.get_dummies(df_train, columns=categorical_columns, drop_first=True)
    df_test = pd.get_dummies(df_test, columns=categorical_columns, drop_first=True)
    df_test = df_test.reindex(columns=df_train.columns)
    df_test = df_test.drop(columns=target_column)

    return df_train, df_test

In [35]:
def scaling(df_train, df_test):
    scaler = StandardScaler()
    df_train[continuous_columns] = scaler.fit_transform(df_train[continuous_columns])
    df_test[continuous_columns] = scaler.transform(df_test[continuous_columns])
    
    return df_train, df_test

In [36]:
def last_convert_for_tensorflow(df_train, df_test):
    
    df_train = df_train.astype('float32')
    df_test = df_test.astype('float32')

    return df_train, df_test

In [37]:
def train_test_encoding_scaling_converting_for_tf(df_train, df_test):

    df_train, df_test = encoding(df_train, df_test)
    df_train, df_test = scaling(df_train, df_test)
    df_train, df_test = last_convert_for_tensorflow(df_train, df_test)

    return df_train, df_test

In [38]:
def drop_low_correlation_features(df_train, df_test, threshold=0.1):

    correlations = df_train.corr()[target_column].abs()

    columns_to_keep = correlations[correlations >= threshold].index.tolist()
    
    # Make sure target column is in the columns to keep
    if target_column not in columns_to_keep:
        columns_to_keep.append(target_column)
    
    df_train = df_train[columns_to_keep]
    columns_to_keep.remove(target_column)
    df_test = df_test[columns_to_keep]

    return df_train, df_test

In [39]:
target_column = 'Transported'
categorical_columns = ['HomePlanet', 'Destination', 'Deck', 'Side']
continuous_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'All_Expenses', 'Age']
boolean_columns = ['CryoSleep', 'VIP', 'WithFamily', 'HasExpenses']

In [40]:
df_train = data_cleaning_and_preparation(df_train)


  df[col] = df[col].fillna(df[col].mode()[0])
  df[col] = df[col].fillna(df[col].mode()[0])


In [41]:
df_test = data_cleaning_and_preparation(df_test)

  df[col] = df[col].fillna(df[col].mode()[0])
  df[col] = df[col].fillna(df[col].mode()[0])


In [42]:
df_train, df_test = train_test_encoding_scaling_converting_for_tf(df_train, df_test)


In [44]:
df_train, df_test = drop_low_correlation_features(df_train, df_test)

In [45]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

def xgb_predictions(df_train, df_test):

    X = df_train.copy()
    y = X['Transported']
    X = X.drop(columns='Transported')
    X_train, X_val, y_train, y_val=train_test_split(X,y, test_size=0.2, random_state=42)

    param_grid = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [3, 4, 5, 6, 7],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': [0, 0.1, 0.2, 0.5]
    }

    xgb1 = xgb.XGBClassifier(random_state=42)
    random_search = RandomizedSearchCV(xgb1, param_distributions=param_grid, n_iter=20, scoring='roc_auc', cv=5, random_state=42, n_jobs=-1, verbose=1)
    random_search.fit(X_train, y_train)

    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best ROC-AUC: {random_search.best_score_:.4f}")

    focused_param_grid = {
        'n_estimators': [250, 300, 350],
        'max_depth': [5, 6, 7],
        'learning_rate': [0.05, 0.06, 0.07],
        'subsample': [0.9, 1.0],
        'colsample_bytree': [0.9, 1.0],
        'min_child_weight': [1, 2, 3],
        'gamma': [0.30, 0.35, 0.40]
    }

    xgb2 = xgb.XGBClassifier(random_state=42)
    grid_search = GridSearchCV(xgb2, param_grid=focused_param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best ROC-AUC: {grid_search.best_score_:.4f}")

    best_model = random_search.best_estimator_
    best_model.fit(X_train, y_train)

    submision = best_model.predict(df_test)
    df_submision['Transported'] = submision.astype(bool)
    df_submision.to_csv('data/xgb_submisions.csv', index=False)

In [46]:
from tensorflow import keras
from tensorflow.keras import layers # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore

def simple_nn_predictions(df_train, df_test):

    optimizer = Adam(learning_rate=0.001)

    X = df_train.copy()
    y = X[target_column]
    X = X.drop(columns=target_column)

    def create_improved_model(optimizer=optimizer, activation='relu', neurons=64):
        model = keras.Sequential()
        model.add(layers.InputLayer(input_shape=(X.shape[1],)))
        
        model.add(layers.Dense(neurons))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation(activation))
        model.add(layers.Dropout(0.3))
        
        model.add(layers.Dense(neurons // 2))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation(activation))
        model.add(layers.Dropout(0.2))
        
        model.add(layers.Dense(neurons // 4))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation(activation))
        
        model.add(layers.Dense(1, activation='sigmoid'))
        
        model.compile(optimizer=optimizer, 
                    loss='binary_crossentropy', 
                    metrics=['accuracy', keras.metrics.AUC()])
        return model

    model = create_improved_model()
    callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, mode='min'),
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=8, min_lr=1e-6),
        keras.callbacks.ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')
    ]

    history = model.fit(X, y, epochs=150, batch_size=32, validation_split=0.20, verbose=1, callbacks=callbacks)

    best_model = keras.models.load_model('best_model.h5')

    submision = best_model.predict(df_test)
    df_submision['Transported'] = (submision > 0.5).astype(bool)
    df_submision.to_csv('data/nn_submisions.csv', index=False)

In [47]:
from tensorflow import keras
from tensorflow.keras import layers # type: ignore
from tensorflow.keras.optimizers import Adam, AdamW # type: ignore
import optuna

def optuna_predictions(df_train, df_test):

    X = df_train.copy()
    y = X[target_column]
    X = X.drop(columns=target_column)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    def create_model(optimizer='adam', activation='relu', neurons=64, layer_count=1, dropout=0.0, learning_rate=0.001, weight_decay=0.01):

        model = keras.Sequential()
        model.add(layers.InputLayer(shape=(X.shape[1],)))
        for i in range(layer_count):
            model.add(layers.Dense(neurons, activation=activation))
            model.add(layers.Dropout(dropout))
        model.add(layers.Dense(1, activation='sigmoid'))
        if optimizer == 'adam':
            optimizer = Adam(learning_rate=learning_rate)
        elif optimizer == 'adamw':
            optimizer = AdamW(learning_rate=learning_rate, weight_decay=weight_decay)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        return model
    
    # ADDED: Pyramidal model architecture
    def create_pyramidal_model(optimizer='adam', neurons=16, dropout=0.2):

        model = keras.Sequential()
        model.add(layers.InputLayer(shape=(X.shape[1],)))
        model.add(layers.Dense(neurons * 8))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation('relu'))
        model.add(layers.Dropout(dropout))
        model.add(layers.Dense(neurons * 4))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation('relu'))
        model.add(layers.Dropout(dropout * 0.8))
        model.add(layers.Dense(neurons * 2))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation('relu'))
        model.add(layers.Dropout(dropout * 0.6))
        model.add(layers.Dense(neurons))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation('relu'))
        model.add(layers.Dense(1, activation='sigmoid'))
        if optimizer == 'adam':
            optimizer = Adam(learning_rate=0.001)
        elif optimizer == 'adamw':
            optimizer = AdamW(learning_rate=0.001, weight_decay=0.01)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    # ADDED: Residual model architecture
    def create_residual_model(optimizer='adam', neurons=16, layer_count=4, dropout=0.2):

        inputs = layers.Input(shape=(X.shape[1],))
        x = layers.LayerNormalization()(inputs)
        for i in range(layer_count):
            residual = x if i > 0 else None
            x = layers.Dense(neurons)(x)
            x = layers.BatchNormalization()(x)
            x = layers.Activation('relu')(x)
            x = layers.Dropout(dropout)(x)
            if residual is not None and x.shape[-1] == residual.shape[-1]:
                x = layers.Add()([x, residual])
        outputs = layers.Dense(1, activation='sigmoid')(x)
        model = keras.Model(inputs=inputs, outputs=outputs)
        if optimizer == 'adam':
            optimizer = Adam(learning_rate=0.001)
        elif optimizer == 'adamw':
            optimizer = AdamW(learning_rate=0.001, weight_decay=0.01)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        return model
    
    # ADDED: Ensemble model architecture
    def create_ensemble_model(optimizer='adam', neurons=16, layer_count=4, dropout=0.2):

        inputs = layers.Input(shape=(X.shape[1],))
        x1 = layers.Dense(neurons * 4, activation='relu')(inputs)
        x1 = layers.BatchNormalization()(x1)
        x1 = layers.Dropout(dropout)(x1)
        x1 = layers.Dense(neurons * 2, activation='relu')(x1)
        x2 = inputs
        for i in range(layer_count):
            x2 = layers.Dense(neurons, activation='relu')(x2)
            x2 = layers.BatchNormalization()(x2)
            x2 = layers.Dropout(dropout)(x2)
        merged = layers.Concatenate()([x1, x2])
        merged = layers.Dense(neurons // 2, activation='relu')(merged)
        merged = layers.BatchNormalization()(merged)
        outputs = layers.Dense(1, activation='sigmoid')(merged)
        model = keras.Model(inputs=inputs, outputs=outputs)
        if optimizer == 'adam':
            optimizer = Adam(learning_rate=0.001)
        elif optimizer == 'adamw':
            optimizer = AdamW(learning_rate=0.001, weight_decay=0.01)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        return model
    
    callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, mode='min'),
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=8, min_lr=1e-6),
    ]
    
    def objective(trial):

        # ADDED: Model type selection
        model_type = trial.suggest_categorical('model_type', ['standard', 'pyramidal'])
        optimizer = trial.suggest_categorical('optimizer', ['adamw'])
        neurons = trial.suggest_int('neurons', 48, 80, step=8)
        batch_size = trial.suggest_categorical('batch_size', [40, 48, 56])
        layer_count = trial.suggest_int('layer_count', 3, 5, step=1)
        dropout = trial.suggest_float("dropout", 0.1, 0.2, step=0.025)
        learning_rate = trial.suggest_float("learning_rate", 0.005, 0.015, log=True)
        weight_decay = trial.suggest_float("weight_decay", 0.001, 0.05, log=True)
        
        # ADDED: Model selection based on model_type
        if model_type == 'standard':
            model = create_model(optimizer=optimizer, neurons=neurons, layer_count=layer_count, dropout=dropout)
        elif model_type == 'pyramidal':
            model = create_pyramidal_model(optimizer=optimizer, neurons=neurons, dropout=dropout)
        elif model_type == 'residual':
            model = create_residual_model(optimizer=optimizer, neurons=neurons, layer_count=layer_count, dropout=dropout)
        else:  # ensemble
            model = create_ensemble_model(optimizer=optimizer, neurons=neurons, layer_count=layer_count, dropout=dropout)
        
        if optimizer == 'adamw':
            model.optimizer.learning_rate = learning_rate
            model.optimizer.weight_decay = weight_decay

        history = model.fit(X_train, y_train, epochs=100, batch_size=batch_size, validation_split=0.20, verbose=1, callbacks=callbacks)
        loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

        return accuracy
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20, n_jobs=1)
    print("Best hyperparameters: ", study.best_params)
    print("Best trial: ", study.best_trial)

    best_params = study.best_params
    model_type = best_params['model_type']
    optimizer = best_params['optimizer']
    neurons = best_params['neurons']
    batch_size = best_params['batch_size']
    layer_count = best_params['layer_count']
    dropout = best_params['dropout']
    learning_rate = best_params['learning_rate']

    if model_type == 'standard':
        best_model = create_model(optimizer=optimizer, neurons=neurons, layer_count=layer_count, dropout=dropout)
    elif model_type == 'pyramidal':
        best_model = create_pyramidal_model(optimizer=optimizer, neurons=neurons, dropout=dropout)
    elif model_type == 'residual':
        best_model = create_residual_model(optimizer=optimizer, neurons=neurons, layer_count=layer_count, dropout=dropout)
    else:  # ensemble
        best_model = create_ensemble_model(optimizer=optimizer, neurons=neurons, layer_count=layer_count, dropout=dropout)

    best_model.fit(X, y, epochs=100, batch_size=batch_size, validation_split=0.20, verbose=1, callbacks=callbacks)

    submision = best_model.predict(df_test)

    df_submision['Transported'] = (submision > 0.5).astype(bool)
    df_submision.to_csv('data/optuna_submisions.csv', index=False)
    

In [48]:
def plot_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # Plot loss
    ax1.plot(history.history['loss'], label='Training Loss')
    ax1.plot(history.history['val_loss'], label='Validation Loss')
    ax1.set_title('Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    
    # Plot accuracy
    ax2.plot(history.history['accuracy'], label='Training Accuracy')
    ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax2.set_title('Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

# plot_history(history)

In [49]:
# xgb_predictions(df_train, df_test)
# simple_nn_predictions(df_train, df_test)
# optuna_predictions(df_train, df_test)