In [33]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [34]:
import pandas as pd
train=pd.read_csv('../data/training_data.csv',low_memory=True, dtype=np.float32)
test=pd.read_csv('../data/testing_data.csv',low_memory=True, dtype=np.float32)



In [35]:
dep = ['Label']
X_train = train.loc[:,~train.columns.isin(dep)]
y_train = train.loc[:,train.columns.isin(dep)]


In [36]:
X_val = test.loc[:,~test.columns.isin(dep)]
y_val = test.loc[:,test.columns.isin(dep)]

In [37]:
# Convert y to 1D array if it's a DataFrame
# y_train = y_train.values.ravel() if hasattr(y_train, "values") else y_train
# y_val = y_val.values.ravel() if hasattr(y_val, "values") else y_val

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterSampler
from scipy.stats import randint, uniform
import numpy as np
from tensorflow.keras.initializers import GlorotUniform

# Step 1: Fit base model on all features
base_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
base_model.fit(X_train, y_train.values.ravel())

# Step 2: Select important features only
selector = SelectFromModel(base_model, threshold="median", prefit=True)
X_train_sel = selector.transform(X_train)
X_val_sel = selector.transform(X_val)




In [74]:
from sklearn.preprocessing import MinMaxScaler

# Initialize scaler
scaler = MinMaxScaler()

# Fit on training data and transform both train and val
X_train_scaled = scaler.fit_transform(X_train_sel)
X_val_scaled = scaler.transform(X_val_sel)


In [75]:
y_train.mean()

Label    0.073314
dtype: float32

In [None]:
# 2. Define hyperparameter search space (includes number of layers)
param_dist = {
    'learning_rate': uniform(1e-4, 3e-3),
    'dropout_rate': uniform(0.2, 0.3),
    'num_units': randint(10, 64),
    'num_layers': randint(5, 16),       
    'batch_size': [64],
    'epochs': [10]
}
param_list = list(ParameterSampler(param_dist, n_iter=10, random_state=42))

best_score = -1
best_model = None
best_params = None

# 3. Training loop
for i, params in enumerate(param_list):
    print(f"\nTrying params: {params}")
    
    model = Sequential()
    model.add(Dense(params['num_units'], activation='relu', input_shape=(X_train_scaled.shape[1],),kernel_initializer=GlorotUniform(),bias_initializer='zeros'))
    model.add(Dropout(params['dropout_rate']))

    # Add additional hidden layers based on hyperparameter
    for _ in range(params['num_layers'] - 1):
        model.add(Dense(params['num_units'], activation='relu',kernel_initializer=GlorotUniform(),bias_initializer='zeros'))
        model.add(Dropout(params['dropout_rate']))

    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC'])

    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=0
    )

    y_val_prob = model.predict(X_val_scaled).ravel()
    val_auc = roc_auc_score(y_val, y_val_prob)
    print(f"Validation AUC: {val_auc:.4f}")

    if val_auc > best_score:
        best_score = val_auc
        best_model = model
        best_params = params

print(f"\n Best AUC: {best_score:.4f}")
print("Best Parameters:", best_params)





Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.3123620356542087), 'epochs': 10, 'learning_rate': np.float64(0.0029521429192297484), 'num_layers': 12, 'num_units': 17}
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Validation AUC: 0.5474

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.379597545259111), 'epochs': 10, 'learning_rate': np.float64(0.0005680559213273096), 'num_layers': 12, 'num_units': 32}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Validation AUC: 0.5622

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.21742508365045984), 'epochs': 10, 'learning_rate': np.float64(0.0026985284373248052), 'num_layers': 13, 'num_units': 49}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Validation AUC: 0.5641

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.4124217733388137), 'epochs': 10, 'learning_rate': np.float64(0.00016175348288740736), 'num_layers': 11, 'num_units': 33}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Validation AUC: 0.5582

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.4497327922401265), 'epochs': 10, 'learning_rate': np.float64(0.0007370173320348285), 'num_layers': 13, 'num_units': 30}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Validation AUC: 0.5572

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.385244452888315), 'epochs': 10, 'learning_rate': np.float64(0.0019349594814648428), 'num_layers': 14, 'num_units': 53}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Validation AUC: 0.5626

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.20691872751242474), 'epochs': 10, 'learning_rate': np.float64(0.0016743239807751676), 'num_layers': 11, 'num_units': 37}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Validation AUC: 0.5593

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.49212665565243774), 'epochs': 10, 'learning_rate': np.float64(0.0007983140212909128), 'num_layers': 15, 'num_units': 56}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Validation AUC: 0.5563

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.3855158027999262), 'epochs': 10, 'learning_rate': np.float64(0.0012473859738014882), 'num_layers': 13, 'num_units': 12}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Validation AUC: 0.5000

Trying params: {'batch_size': 64, 'dropout_rate': np.float64(0.45798212202089617), 'epochs': 10, 'learning_rate': np.float64(0.002140922615763339), 'num_layers': 10, 'num_units': 48}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step
Validation AUC: 0.5610

✅ Best AUC: 0.5641
Best Parameters: {'batch_size': 64, 'dropout_rate': np.float64(0.21742508365045984), 'epochs': 10, 'learning_rate': np.float64(0.0026985284373248052), 'num_layers': 13, 'num_units': 49}


In [81]:
#best_model.save_model("../model-objects/RF_final_model.json")  # Saves model to JSON format

import pickle

with open('../model-objects/NN_final_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# #loaded_model = XGBClassifier()
# #loaded_model.load_model("../model-objects/xgb_final_model.json")


In [86]:
# Step 6: Get predictions on train/val
y_train_prob = model.predict(X_train_scaled).ravel()
y_val_prob = model.predict(X_val_scaled).ravel()
train_auc = roc_auc_score(y_train, y_train_prob)
val_auc = roc_auc_score(y_val, y_val_prob)

[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step


In [87]:
data_ap_train = pd.DataFrame({'Actual':y_train.values.ravel(),'Predicted':y_train_prob})
data_ap_val = pd.DataFrame({'Actual':y_val.values.ravel(),'Predicted':y_val_prob})

**KS AND RANK ORDER**

In [88]:
def ks(data=None,target=None, prob=None):
    data['target0'] = 1 - data[target]
    data['bucket'] = pd.qcut(data[prob], 10)
    grouped = data.groupby('bucket', as_index = False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()[prob]
    kstable['max_prob'] = grouped.max()[prob]
    kstable['events']   = grouped.sum()[target]
    kstable['nonevents'] = grouped.sum()['target0']
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)
    kstable['event_rate'] = (kstable.events / data[target].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate']=(kstable.events / data[target].sum()).cumsum()
    kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    #Formating
    kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
    kstable.index = range(1,11)
    kstable.index.rename('Decile', inplace=True)
    pd.set_option('display.max_columns', 9)
    print(kstable)
    
    #Display KS
    from colorama import Fore
    print(Fore.RED + "KS is " + str(max(kstable['KS']))+"%"+ " at decile " + str((kstable.index[kstable['KS']==max(kstable['KS'])][0])))
    return(kstable)

In [89]:
ks_train=ks(data=data_ap_train,target='Actual',prob='Predicted')

ValueError: Bin edges must be unique: Index([0.0064096637070178986,   0.05105820782482624,   0.07025631815195084,
         0.08795567452907563,   0.10524910688400269,   0.11974748596549034,
         0.12632262110710143,   0.12638522684574127,   0.12638522684574127,
         0.12638522684574127,   0.12638522684574127],
      dtype='float64', name='Predicted').
You can drop duplicate edges by setting the 'duplicates' kwarg

In [90]:
ks_val=ks(data=data_ap_val,target='Actual',prob='Predicted')

ValueError: Bin edges must be unique: Index([0.008170580491423607,  0.05907974503934384,  0.08692148476839066,
        0.11214669421315195,   0.1262371599674225,  0.12638522684574127,
        0.12638522684574127,  0.12638522684574127,  0.12638522684574127,
        0.12638522684574127,  0.12638522684574127],
      dtype='float64', name='Predicted').
You can drop duplicate edges by setting the 'duplicates' kwarg

In [63]:
ks_train.to_csv('../data-analysis/ks_table_train_nn.csv')
ks_val.to_csv('../data-analysis/ks_table_val_nn.csv')

NameError: name 'ks_val' is not defined