In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, mean_squared_error, f1_score

# Load the dataset
realistic_file_path = "C:\\Users\\Aman Kumar\\OneDrive\\Desktop\\Agenix\\aadvanced_meaningful_funnel_conversion_data.csv"
data = pd.read_csv(realistic_file_path)

# Identify categorical columns
categorical_columns = ["Traffic_Source", "Purchase_History", "Device_Type", "Time_of_Day", "Discount_Usage"]

# Encode categorical features
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Separate features and target
X = data.drop("Conversion", axis=1)
y = data["Conversion"]

# Normalize numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into Train/Validation/Test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_val, y_train_series, y_val_series = train_test_split(X_trainval, y_trainval, stratify=y_trainval, test_size=0.25, random_state=42)

y_train = y_train_series.to_numpy()
y_val = y_val_series.to_numpy()
y_test = y_test.to_numpy()

# Model Definition
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, kernel_regularizer=tf.keras.regularizers.l2(0.0005), input_shape=(X_train.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.0005)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),
    tf.keras.layers.Dropout(0.2),
    
    tf.keras.layers.Dense(64, kernel_regularizer=tf.keras.regularizers.l2(0.0005)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(alpha=0.1),
    tf.keras.layers.Dropout(0.1),
    
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# Callbacks for Early Stopping and LR Reduction
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# Save the Model
os.makedirs("saved_model", exist_ok=True)

# Save in Keras format (recommended)
model.save("saved_model/my_model.keras")
print("Model saved in Keras native format at 'saved_model/my_model.keras'")

# Save in HDF5 format
model.save("saved_model/my_model.h5")
print("Model saved in HDF5 format at 'saved_model/my_model.h5'")

# Save preprocessing artifacts
with open("saved_model/label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

with open("saved_model/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Preprocessing artifacts saved.")

# Threshold Tuning
y_val_pred_prob = model.predict(X_val).ravel()

best_threshold = 0.5
best_f1 = 0.0

for threshold in np.arange(0.1, 1.0, 0.01):
    y_val_pred = (y_val_pred_prob > threshold).astype(int)
    f1 = f1_score(y_val, y_val_pred, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Best Threshold on Validation Set: {best_threshold:.2f}, F1: {best_f1:.4f}")

# Final Evaluation
y_test_pred_prob = model.predict(X_test).ravel()
y_test_pred = (y_test_pred_prob > best_threshold).astype(int)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, zero_division=0)
auc = roc_auc_score(y_test, y_test_pred_prob)
mse = mean_squared_error(y_test, y_test_pred_prob)
f1 = f1_score(y_test, y_test_pred, zero_division=0)

# Calculate TP and FP
true_positive_count = sum((y_test_pred == 1) & (y_test == 1))
false_positive_count = sum((y_test_pred == 1) & (y_test == 0))
total_positive_predictions = len(y_test_pred[y_test_pred == 1])

true_positive_percentage = (true_positive_count / total_positive_predictions) * 100 if total_positive_predictions > 0 else 0
false_positive_percentage = (false_positive_count / total_positive_predictions) * 100 if total_positive_predictions > 0 else 0

print("\nFinal Test Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MSE: {mse:.4f}")
print(f"True Positive Percentage: {true_positive_percentage:.2f}%")
print(f"False Positive Percentage: {false_positive_percentage:.2f}%")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.6494 - auc: 0.5805 - loss: 0.7727 - val_accuracy: 0.7605 - val_auc: 0.7069 - val_loss: 0.7033 - learning_rate: 5.0000e-04
Epoch 2/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7449 - auc: 0.6581 - loss: 0.6654 - val_accuracy: 0.7615 - val_auc: 0.7113 - val_loss: 0.6471 - learning_rate: 5.0000e-04
Epoch 3/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7458 - auc: 0.6809 - loss: 0.6617 - val_accuracy: 0.7595 - val_auc: 0.7120 - val_loss: 0.6333 - learning_rate: 5.0000e-04
Epoch 4/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7635 - auc: 0.6960 - loss: 0.6417 - val_accuracy: 0.7625 - val_auc: 0.7068 - val_loss: 0.6309 - learning_rate: 5.0000e-04
Epoch 5/100
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7608 - auc: 0.7057 - loss: 0.




Model saved in HDF5 format at 'saved_model/my_model.h5'
Preprocessing artifacts saved.
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Best Threshold on Validation Set: 0.40, F1: 0.8669
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 784us/step

Final Test Metrics:
Accuracy: 0.7630
Precision: 0.7686
F1 Score: 0.8632
AUC: 0.6704
MSE: 0.1726
True Positive Percentage: 76.86%
False Positive Percentage: 23.14%
