In [133]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from scikeras.wrappers import KerasClassifier
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import VarianceThreshold

In [134]:
# Load the pre-split train/test arrays from .npy files.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code from a malicious file — only load files from a trusted source.
# The feature matrices are transposed on load; downstream code reads X.shape[1] as the
# number of features, so presumably the files store (features, samples) — TODO confirm.
X_train = np.load("../Data/X_train.npy", allow_pickle=True).T
X_test = np.load("../Data/X_test.npy", allow_pickle=True).T
y_train = np.load("../Data/y_train.npy", allow_pickle=True)
y_test = np.load("../Data/y_test.npy", allow_pickle=True)



In [135]:
def create_model(model_type, input_dim=None, **kwargs):
    """
    Build an untrained model of the requested type.

    :param model_type: One of 'neural_network', 'random_forest', 'svm', 'xgboost'.
    :param input_dim: Number of input features. Required for 'neural_network';
        not used by the other model types.
    :param kwargs: Extra keyword arguments forwarded to the RandomForestClassifier,
        SVC or XGBClassifier constructor. The 'neural_network' branch does not use them.
    :return: A compiled Keras model for 'neural_network', otherwise an unfitted
        RandomForestClassifier, SVC (with probability=True) or XGBClassifier.
    :raises ValueError: If model_type is not supported, or if 'neural_network' is
        requested without input_dim.
    """
    if model_type == 'neural_network':
        if input_dim is None:
            # Fail early with a clear message instead of an obscure Keras shape error.
            raise ValueError("input_dim is required for model_type 'neural_network'")

        inputs = Input(shape=(input_dim,))
        # The Input layer already fixes the feature dimension, so the first Dense
        # layer does not take an additional input_dim argument.
        x = Dense(64, activation='relu')(inputs)
        x = Dropout(0.5)(x)
        x = Dense(32, activation='relu')(x)
        x = Dropout(0.5)(x)
        # Single sigmoid unit: binary classification output.
        outputs = Dense(1, activation='sigmoid')(x)

        model = Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    elif model_type == 'random_forest':
        model = RandomForestClassifier(**kwargs)
    elif model_type == 'svm':
        # probability=True is needed so the fitted model exposes predict_proba.
        model = SVC(probability=True, **kwargs)
    elif model_type == 'xgboost':
        model = xgb.XGBClassifier(eval_metric='logloss', **kwargs)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    return model

In [136]:
def train_model(X_train, y_train, model_type, epochs=100, batch_size=32, validation_split=0.1, **kwargs):
    """
    Create and fit a model of the requested type.

    :param X_train: Training features; X_train.shape[1] is used as the input dimension.
    :param y_train: Training labels.
    :param model_type: Model type accepted by create_model
        ('neural_network', 'random_forest', 'svm', 'xgboost').
    :param epochs: Number of training epochs (neural network only).
    :param batch_size: Mini-batch size (neural network only).
    :param validation_split: Fraction of the training data Keras holds out for
        validation (neural network only).
    :param kwargs: Extra keyword arguments forwarded to create_model.
    :return: Tuple (model, history). history is the object returned by Keras
        model.fit() for the neural network and None for every other model type.
    """
    input_dim = X_train.shape[1]
    model = create_model(model_type, input_dim, **kwargs)

    if model_type != 'neural_network':
        # Non-TensorFlow models: plain fit, no training history available.
        model.fit(X_train, y_train)
        return model, None

    # Balance the classes. Key the weights by the actual class labels (not by their
    # position in the sorted label array) so each weight is tied to its own class.
    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, weights)}

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        class_weight=class_weight_dict,
        validation_split=validation_split,
        verbose=1  # Set to 0 for no output, 1 for progress bar, 2 for one line per epoch
    )
    return model, history

In [137]:
def train_model_feature_selection(X_train, y_train, X_test, model_type, epochs=100, batch_size=32, validation_split=0.1, **kwargs):
    """
    Select features, then create and fit a model on the reduced feature set.

    Features are first passed through VarianceThreshold and then reduced to at most
    250 columns with SelectKBest(f_classif). Both selectors are fitted on the training
    data only and then applied to the test data.

    :param X_train: Training features.
    :param y_train: Training labels.
    :param X_test: Test features; transformed with the selectors fitted on X_train.
    :param model_type: Model type accepted by create_model
        ('neural_network', 'random_forest', 'svm', 'xgboost').
    :param epochs: Number of training epochs (neural network only).
    :param batch_size: Mini-batch size (neural network only).
    :param validation_split: Fraction of training data held out for validation
        (neural network only).
    :param kwargs: Extra keyword arguments forwarded to the model constructor.
    :return: Tuple (model, history, X_test_selected). history is None for
        non-neural-network models.
    """
    variance_filter = VarianceThreshold()
    X_train_filtered = variance_filter.fit_transform(X_train)
    X_test_filtered = variance_filter.transform(X_test)

    # Select top k features; k must not exceed the number of remaining features.
    k = min(250, X_train_filtered.shape[1])
    k_best = SelectKBest(f_classif, k=k)
    X_train_selected = k_best.fit_transform(X_train_filtered, y_train)
    X_test_selected = k_best.transform(X_test_filtered)

    # Delegate to train_model so every model type is actually fitted and `history`
    # is always defined (None for non-neural-network models).
    model, history = train_model(
        X_train_selected, y_train, model_type,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        **kwargs
    )
    return model, history, X_test_selected
    
    

In [138]:
def plot_training_history(history, metrics=None):
    """
    Plot training history of a TensorFlow model, one subplot per metric.

    :param history: History object returned by model.fit(), or its ``history`` dict
        mapping metric names to per-epoch value lists.
    :param metrics: Metric name or list of metric names to plot (optional). If None,
        plots every metric in the history that does not start with 'val_'.
        The matching 'val_<metric>' series is overlaid when present.
    :raises ValueError: If history is None or empty, or there are no metrics to plot.
    :raises KeyError: If a requested metric is not present in the history.
    """
    if history is None:
        # train_model returns history=None for non-neural-network models.
        raise ValueError("No training history to plot (history is None)")
    if not isinstance(history, dict):
        history = history.history
    if not history:
        raise ValueError("Training history is empty")

    if metrics is None:
        metrics = [m for m in history.keys() if not m.startswith('val_')]
    elif isinstance(metrics, str):
        metrics = [metrics]
    else:
        metrics = list(metrics)

    if not metrics:
        raise ValueError("No metrics to plot")

    # Validate before creating a figure so a typo does not leave an empty figure behind.
    missing = [m for m in metrics if m not in history]
    if missing:
        raise KeyError(f"Metrics not found in history: {missing}; available: {sorted(history)}")

    epochs = range(1, len(next(iter(history.values()))) + 1)

    # squeeze=False keeps `axes` two-dimensional even when there is a single metric.
    fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 4 * len(metrics)), squeeze=False)

    for ax, metric in zip(axes[:, 0], metrics):
        ax.plot(epochs, history[metric], 'bo-', label=f'Training {metric}')
        val_key = f'val_{metric}'
        if val_key in history:
            ax.plot(epochs, history[val_key], 'ro-', label=f'Validation {metric}')

        ax.set_title(f'{metric.capitalize()} vs. Epochs')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(metric.capitalize())
        ax.legend()

        if metric == 'loss':
            ax.set_yscale('log')  # Use log scale for loss

    fig.tight_layout()
    plt.show()

In [139]:
def test_model(model, X_test, y_test, model_type):
    """
    Test the model on the test set and return various performance metrics.

    :param model: Trained model
    :param X_test: Test features
    :param y_test: True labels for test set
    :param model_type: Type of the model ('neural_network', 'random_forest', 'svm', 'xgboost')
    :return: Dictionary with keys 'accuracy', 'precision', 'recall', 'f1_score',
        'auc_roc' and 'loss'. 'loss' is None for non-neural-network models.
    """
    is_neural_network = model_type == 'neural_network'

    # Make predictions
    if is_neural_network:
        # The network built by create_model ends in a single sigmoid unit, so predict()
        # yields one column per sample; flatten it so both branches hand 1-D arrays
        # to the metric functions.
        y_pred_proba = np.ravel(model.predict(X_test))
        y_pred = (y_pred_proba > 0.5).astype(int)
    else:
        y_pred = model.predict(X_test)
        # Column 1 holds the probability of the positive class.
        y_pred_proba = model.predict_proba(X_test)[:, 1]

    # For TensorFlow models, also get the loss (first entry returned by evaluate()).
    loss = model.evaluate(X_test, y_test)[0] if is_neural_network else None

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'auc_roc': roc_auc_score(y_test, y_pred_proba),
        'loss': loss
    }

SyntaxError: unterminated string literal (detected at line 12) (937199572.py, line 12)

In [None]:
# Train the neural network on all features in X_train for 100 epochs.
model, history = train_model(X_train, y_train, model_type='neural_network', epochs=100)

In [None]:
# Plot the loss and accuracy curves stored in the current `history`.
plot_training_history(history, metrics=['loss', 'accuracy'])

In [None]:
# Evaluate the current neural network on the test set and print each metric.
test_results = test_model(model, X_test, y_test, model_type='neural_network')
for metric, value in test_results.items():
    print(f"{metric}: {value}")

In [None]:
# Retrain the neural network on the feature-selected data for 250 epochs.
# The third return value is X_test reduced to the same selected features.
model, history, X_test_feature_selection = train_model_feature_selection(X_train, y_train, X_test, model_type='neural_network', epochs=250)

In [None]:
# Plot the loss and accuracy curves stored in the current `history`.
plot_training_history(history, metrics=['loss', 'accuracy'])

In [None]:
# Evaluate on the feature-selected test matrix (must match the features the model was trained on).
test_results = test_model(model, X_test_feature_selection, y_test, model_type='neural_network')
for metric, value in test_results.items():
    print(f"{metric}: {value}")

In [None]:
# Train and evaluate XGBoost. `epochs` only applies to the neural network in
# train_model, so it is not passed here; `history` is None for this model type.
model, history = train_model(X_train, y_train, model_type='xgboost')
test_results = test_model(model, X_test, y_test, model_type='xgboost')
for metric, value in test_results.items():
    print(f"{metric}: {value}")

In [None]:
# Train and evaluate a random forest. `epochs` only applies to the neural network in
# train_model, so it is not passed here; `history` is None for this model type.
model, history = train_model(X_train, y_train, model_type='random_forest')
test_results = test_model(model, X_test, y_test, model_type='random_forest')
for metric, value in test_results.items():
    print(f"{metric}: {value}")

In [None]:
# Train and evaluate an SVM. `epochs` only applies to the neural network in
# train_model, so it is not passed here; `history` is None for this model type.
model, history = train_model(X_train, y_train, model_type='svm')
test_results = test_model(model, X_test, y_test, model_type='svm')
for metric, value in test_results.items():
    print(f"{metric}: {value}")