In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tensorflow.keras import models, layers
from time import time

In [2]:
# Read the raw training table from disk.
dataset = pd.read_csv('../data/raw/train.csv')

# Separate the target column ('Exited') from the remaining columns.
y = dataset['Exited']
X = dataset.drop(columns=['Exited'])

# Hold out 20% of the rows as a test split. The split is stratified on the
# target and uses a fixed random_state so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Data preprocessing pipeline

In [3]:
# Column-wise feature preparation:
#   * 'Geography' and 'Gender' are one-hot encoded;
#   * the other listed columns are forwarded unchanged ('passthrough');
#   * columns that are not listed are dropped (ColumnTransformer's default
#     remainder behaviour).
# handle_unknown='ignore' makes transform() encode a category that was not seen
# during fit as an all-zero row instead of raising, so transforming new data
# (e.g. the submission set) cannot fail on an unexpected category value.
categorical_preprocessor = ColumnTransformer(
    transformers=[
        ('geography_encoder', OneHotEncoder(handle_unknown='ignore'), ['Geography']),
        ('gender_encoder', OneHotEncoder(handle_unknown='ignore'), ['Gender']),
        (
            'passthrough', 'passthrough',
            ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
        ),
    ]
)

In [4]:
# Standardizes the non-categorical feature columns with StandardScaler.
# The column list mirrors the passthrough columns of categorical_preprocessor;
# every name must exist in the input frame, otherwise fit() raises.
# NOTE(review): preprocessing_pipeline does not reference this transformer (it
# applies a plain StandardScaler to all columns instead) — confirm whether this
# object is still needed.
numerical_preprocessor = ColumnTransformer(
    transformers=[
        (
            'std_scaler',
            StandardScaler(),
            ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
        )
    ]
)

In [5]:
# Two-stage preprocessing:
#   1. the column transformer (one-hot encoding + passthrough columns);
#   2. a StandardScaler fitted on every column produced by stage 1, which
#      includes the one-hot indicator columns.
preprocessing_pipeline = Pipeline(
    steps=[
        ('categorical_preprocessor', categorical_preprocessor),
        ('std_scaler', StandardScaler()),
    ]
)

### Model creation

In [41]:
# Small feed-forward binary classifier: one hidden Dense layer of 10 ReLU
# units followed by a single sigmoid output unit.
model = models.Sequential()
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Adam optimizer with binary cross-entropy loss; AUC is tracked as the metric.
# NOTE(review): run_eagerly=True is usually a debugging setting and tends to
# slow training down — confirm whether it is still required here.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['AUC'],
    run_eagerly=True,
)

In [20]:
# Fit the preprocessing pipeline on the training split only and transform that
# same split in one call.
X_preprocessed_train = preprocessing_pipeline.fit_transform(X=X_train)

In [21]:
# Train for 10 epochs with mini-batches of 512 rows. validation_split=0.2 asks
# Keras to set aside 20% of the supplied training data for validation.
model.fit(
    x=X_preprocessed_train,
    y=y_train,
    batch_size=512,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f90ac75b490>

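### Test model

**TODO:** This section is still empty — the held-out `X_test` / `y_test` split created above is not evaluated anywhere in this notebook yet.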

### Export solution

In [32]:
# Load the rows to predict on and the sample submission file.
X_submission_dataset = pd.read_csv('../data/raw/test.csv')
submission_dataset = pd.read_csv('../data/raw/sample_submission.csv')

# Apply the preprocessing pipeline in transform-only mode (no refit); it is
# expected to have been fitted on the training split beforehand.
X_preprocessed_submission_dataset = preprocessing_pipeline.transform(X_submission_dataset)

# Raw model outputs (sigmoid activations) for every submission row.
y_submission_predictions = model.predict(X_preprocessed_submission_dataset)

# Turn the outputs into hard 0/1 labels: strictly greater than the threshold
# maps to 1, everything else to 0.
threshold = 0.5
is_above_threshold = y_submission_predictions > threshold
y_predicted = is_above_threshold.astype(int)



In [35]:
# Store the thresholded 0/1 predictions in the 'Exited' column of the submission table.
# NOTE(review): this assumes the rows of sample_submission.csv and test.csv are
# in the same order — confirm, since nothing here aligns them by an id column.
submission_dataset['Exited'] = y_predicted

In [40]:
# Write the submission table without its index. The file name embeds the
# current Unix time rounded to whole seconds.
submission_path = f'../data/output/submission_{round(time())}.csv'
submission_dataset.to_csv(submission_path, index=False)