In [20]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import concurrent.futures
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
from tensorflow.keras.losses import BinaryCrossentropy


In [14]:
df = pd.read_csv("C:\\Users\\joonw\\trav\\trav_dataset1.csv")

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, stratify=df['convert_ind'], random_state=24)

train_set = train_set.drop(columns=['split'])
test_set = test_set.drop(columns=['split'])

# Separate features and target from the entire training set
y_train = train_set['convert_ind'].values
train_x = train_set.drop(columns=['convert_ind'])

y_test = test_set['convert_ind'].values
test_x = test_set.drop(columns=['convert_ind'])

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_x)
X_test = scaler.transform(test_x)

# Reshape the data for Conv1D (add a channel dimension)
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

In [15]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))
class_weights


early_stopping = EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True, mode='max')


In [44]:
# Build the CNN model
model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Dropout(0.3),
    
    Conv1D(64, kernel_size=3, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    Conv1D(128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model
# if model is not learning, reduce learning_rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=[AUC(name='auc')])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=25,
    batch_size=32,
    verbose=1,
    class_weight=class_weights,
    callbacks=[early_stopping]
)

Epoch 1/25


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 8ms/step - auc: 0.5248 - loss: 0.8966 - val_auc: 0.6158 - val_loss: 0.6776
Epoch 2/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - auc: 0.5607 - loss: 0.7424 - val_auc: 0.6329 - val_loss: 0.6897
Epoch 3/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - auc: 0.5811 - loss: 0.7032 - val_auc: 0.6396 - val_loss: 0.6717
Epoch 4/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - auc: 0.6059 - loss: 0.6892 - val_auc: 0.6492 - val_loss: 0.6976
Epoch 5/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - auc: 0.6057 - loss: 0.6797 - val_auc: 0.6474 - val_loss: 0.6938
Epoch 6/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - auc: 0.6121 - loss: 0.6781 - val_auc: 0.6556 - val_loss: 0.7010
Epoch 7/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/

In [45]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions
predictions = model.predict(X_test)
mm = np.mean(df['convert_ind'])
test_pred = [1 if x >=0.083 else 0 for x in predictions]

# Calculate AUC
auc = roc_auc_score(y_test, predictions)
auc2 = roc_auc_score(y_test, test_pred)
print(f"Test AUC: {auc:.4f}")
print(f"Test AUC: {auc2:.4f}")

Test Accuracy: 0.7196
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Test AUC: 0.7196
Test AUC: 0.5028


Add cross validation structure

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, Dropout, Add, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC

# Load and preprocess the data
df = pd.read_csv("C:\\Users\\joonw\\trav\\trav_dataset1.csv")
# Split features and target
y = df['convert_ind'].values
X = df.drop(columns=['convert_ind']).select_dtypes(include=[np.number]).values

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = X[:, :, np.newaxis]  # Reshape for Conv1D

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)
class_weights = dict(enumerate(class_weights))

# Define the residual CNN model
def build_model(input_shape):
    input_layer = Input(shape=input_shape)
    
    # First Convolutional Block
    x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(input_layer)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    # Residual Block
    residual = Conv1D(32, kernel_size=3, activation='relu', padding='same')(x)
    residual = BatchNormalization()(residual)
    x = Add()([x, residual])
    x = Dropout(0.3)(x)
    
    # Second Convolutional Block
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    # Flatten and Dense Layers
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
auc_scores = []

for train_idx, val_idx in skf.split(X, y):
    print(f"Training Fold {fold}...")
    
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile the model
    model = build_model(input_shape=(X_train.shape[1], 1))
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=[AUC(name='auc')])
    
    # Early stopping
    early_stopping = EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True, mode='max')
    
    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=25,
        batch_size=32,
        verbose=1,
        class_weight=class_weights,
        callbacks=[early_stopping]
    )
    
    # Evaluate the model
    val_auc = model.evaluate(X_val, y_val, verbose=0)[1]
    print(f"Fold {fold} AUC: {val_auc:.4f}")
    auc_scores.append(val_auc)
    fold += 1

# Summary of results
print(f"Average AUC across folds: {np.mean(auc_scores):.4f}")


Training Fold 1...
Epoch 1/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - auc: 0.5370 - loss: 0.9213 - val_auc: 0.6284 - val_loss: 0.8182
Epoch 2/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - auc: 0.5705 - loss: 0.7143 - val_auc: 0.6318 - val_loss: 0.7796
Epoch 3/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - auc: 0.5850 - loss: 0.7092 - val_auc: 0.6337 - val_loss: 0.7341
Epoch 4/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - auc: 0.5935 - loss: 0.6841 - val_auc: 0.6413 - val_loss: 0.7540
Epoch 5/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - auc: 0.6049 - loss: 0.6829 - val_auc: 0.6414 - val_loss: 0.7306
Epoch 6/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - auc: 0.6136 - loss: 0.6723 - val_auc: 0.6450 - val_loss: 0.7390
Epoch 7/25
[1m1230/1230[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [43]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions
predictions = model.predict(X_test)
mm = np.mean(df['convert_ind'])
test_pred = [1 if x >=0.083 else 0 for x in predictions]

# Calculate AUC
auc = roc_auc_score(y_test, predictions)
auc2 = roc_auc_score(y_test, test_pred)
print(f"Test AUC: {auc:.4f}")
print(f"Test AUC: {auc2:.4f}")

Test Accuracy: 0.7036
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test AUC: 0.7037
Test AUC: 0.5013


## Tabnet

### !pip install pytorch-tabnet

Tabnet requires np array

In [10]:
# Split the data
train_set, test_set = train_test_split(df, test_size=0.2, stratify=df['convert_ind'], random_state=24)

# Drop the 'split' column if it exists
if 'split' in train_set.columns:
    train_set = train_set.drop(columns=['split'])
    test_set = test_set.drop(columns=['split'])

# Separate features and target
y_train = train_set['convert_ind'].values
train_x = train_set.drop(columns=['convert_ind'])

y_test = test_set['convert_ind'].values
test_x = test_set.drop(columns=['convert_ind'])

# Select numeric columns
train_x = train_x.select_dtypes(include=[np.number])
test_x = test_x.select_dtypes(include=[np.number])

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_x)
X_test = scaler.transform(test_x)

# Ensure inputs are NumPy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)



Hyper parameter optimization

In [None]:

from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score

best_auc = 0
best_params = {}

# Parameter ranges
param_grid = {
    'n_d': [8, 16, 32],
    'n_a': [8, 16, 32],
    'n_steps': [3, 5, 7],
    'gamma': [1.0, 1.5, 2.0],
    'lambda_sparse': [1e-4, 1e-3, 1e-2],
    'learning_rate': [0.01, 0.05, 0.1],
}

# Manual loop for tuning
for n_d in param_grid['n_d']:
    for n_a in param_grid['n_a']:
        for n_steps in param_grid['n_steps']:
            for gamma in param_grid['gamma']:
                for lambda_sparse in param_grid['lambda_sparse']:
                    for lr in param_grid['learning_rate']:
                        # Define the model
                        model = TabNetClassifier(
                            n_d=n_d,
                            n_a=n_a,
                            n_steps=n_steps,
                            gamma=gamma,
                            lambda_sparse=lambda_sparse,
                            optimizer_params=dict(lr=lr),
                        )
                        
                        # Train the model
                        model.fit(
                            X_train, y_train,
                            eval_set=[(X_test, y_test)],
                            eval_metric=['auc'],
                            max_epochs=100,
                            batch_size=128,
                            patience=10
                        )
                        
                        # Evaluate the model
                        preds = model.predict_proba(X_test)[:, 1]
                        auc = roc_auc_score(y_test, preds)
                        print(f"Params: n_d={n_d}, n_a={n_a}, n_steps={n_steps}, gamma={gamma}, lambda_sparse={lambda_sparse}, lr={lr}, AUC={auc:.4f}")

                        # Track the best parameters
                        if auc > best_auc:
                            best_auc = auc
                            best_params = {
                                'n_d': n_d,
                                'n_a': n_a,
                                'n_steps': n_steps,
                                'gamma': gamma,
                                'lambda_sparse': lambda_sparse,
                                'learning_rate': lr,
                            }

print("Best AUC:", best_auc)
print("Best Parameters:", best_params)


### Modeling after optimizing hyper parameter

In [12]:
n_d = 8 
n_a = 8
n_steps =3
gamma=1.5
lambda_sparse=1e-4
lr = 0.1

model = TabNetClassifier(
    n_d=n_d,
    n_a=n_a,
    n_steps=n_steps,
    gamma=gamma,
    lambda_sparse= lambda_sparse,
    optimizer_params=dict(lr= lr),
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['auc'],
    max_epochs=100,
    batch_size=128,
    patience=10
)

# Evaluate the model
preds = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, preds)
print(f"Params: n_d={n_d}, n_a={n_a}, n_steps={n_steps}, gamma={gamma}, lambda_sparse={lambda_sparse}, lr={lr}, AUC={auc:.4f}")




epoch 0  | loss: 0.29491 | val_0_auc: 0.58021 |  0:00:07s
epoch 1  | loss: 0.29021 | val_0_auc: 0.55617 |  0:00:16s
epoch 2  | loss: 0.29039 | val_0_auc: 0.55875 |  0:00:25s
epoch 3  | loss: 0.28932 | val_0_auc: 0.58041 |  0:00:32s
epoch 4  | loss: 0.28831 | val_0_auc: 0.61111 |  0:00:40s
epoch 5  | loss: 0.28703 | val_0_auc: 0.60974 |  0:00:48s
epoch 6  | loss: 0.28838 | val_0_auc: 0.57119 |  0:00:56s
epoch 7  | loss: 0.29055 | val_0_auc: 0.55653 |  0:01:04s
epoch 8  | loss: 0.28899 | val_0_auc: 0.55708 |  0:01:12s
epoch 9  | loss: 0.28861 | val_0_auc: 0.57897 |  0:01:20s
epoch 10 | loss: 0.28858 | val_0_auc: 0.58418 |  0:01:27s
epoch 11 | loss: 0.28796 | val_0_auc: 0.58675 |  0:01:36s
epoch 12 | loss: 0.28673 | val_0_auc: 0.59368 |  0:01:44s
epoch 13 | loss: 0.28664 | val_0_auc: 0.59614 |  0:01:52s
epoch 14 | loss: 0.28656 | val_0_auc: 0.60152 |  0:02:00s

Early stopping occurred at epoch 14 with best_epoch = 4 and best_val_0_auc = 0.61111




Params: n_d=8, n_a=8, n_steps=3, gamma=2.0, lambda_sparse=0.001, lr=0.1, AUC=0.6111


In [41]:
preds = [1 if x>0.083 else 0 for x in preds]

auc = roc_auc_score(y_test, preds)
print(f"Params: n_d={n_d}, n_a={n_a}, n_steps={n_steps}, gamma={gamma}, lambda_sparse={lambda_sparse}, lr={lr}, AUC={auc:.4f}")


Params: n_d=8, n_a=8, n_steps=3, gamma=2.0, lambda_sparse=0.001, lr=0.1, AUC=0.5687
