In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

Predictions (deep learning model)

In [16]:
emb_df = pd.read_csv("sbert_embeddings.csv", header=None) # embeddings
data_df = pd.read_json("dataset.json") # targets
print(emb_df.shape, data_df.shape)

(148122, 1536) (148122, 6)


In [17]:
X = np.array(emb_df.values.tolist())[:]
Y = pd.get_dummies(data_df['category']).values[:]  # One-hot encoding of categories
print(X.shape, Y.shape)

(148122, 1536) (148122, 15)


In [34]:
# Split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# TensorFlow model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(Y.shape[1], activation='softmax')  # Prediction head
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, epochs=10, batch_size=32)

# Make predictions
predictions = model.predict(X_test)
predicted_categories = np.argmax(predictions, axis=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
N = len(predicted_categories)
c = 0
for pred, y in zip(predicted_categories, np.argmax(Y_test, axis=1)):
    if pred == y:
        c += 1
print(c, N)
print("ACC", c/N)

1712 2000
ACC 0.856


# Preliminary hyperparameter tuning

In [None]:
X_train_global, X_test_global, Y_train_global, Y_test_global = train_test_split(X, Y, test_size=0.2, random_state=42)

In [101]:
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier 
from sklearn.model_selection import StratifiedKFold, cross_val_score
from kerastuner.tuners import Hyperband

# Define the model creation function with hyperparameters
def create_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_1', min_value=256, max_value=1024, step=128),
                    activation='relu', kernel_regularizer=regularizers.l2(hp.Float('weight_decay', min_value=1e-6, max_value=1e-2, sampling='log'))))
    model.add(Dropout(rate=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=128, activation='relu'))
    model.add(Dense(units=Y.shape[1], activation='softmax'))

    optimizer = Adam(learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log'))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Initialize KerasClassifier with the model creation function
model = KerasClassifier(build_fn=create_model, batch_size=32, verbose=0)

# Perform hyperparameter tuning using Hyperband tuner
tuner = Hyperband(
    create_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='hyperband',
    project_name='my_tuning_project'
)

tuner.search_space_summary()

# Run the hyperparameter search
tuner.search(X_train_global, Y_train_global, epochs=10, batch_size=32, validation_split=0.3, callbacks=[EarlyStopping('val_loss', patience=5)])

# Get the best hyperparameters
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_hp.values)

# Retrain the model with the best hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
# Ensure correct dimensions
best_model.fit(X[0], Y[0], epochs=1, batch_size=32, verbose=False)
best_model.save("data/best_model")

Reloading Tuner from hyperband/my_tuning_project/tuner0.json
Search space summary
Default search space size: 4
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 256, 'max_value': 1024, 'step': 128, 'sampling': 'linear'}
weight_decay (Float)
{'default': 1e-06, 'conditions': [], 'min_value': 1e-06, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
dropout (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
learning_rate (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
Best Hyperparameters:
{'units_1': 896, 'weight_decay': 4.302778030652062e-05, 'dropout': 0.30000000000000004, 'learning_rate': 0.0004055961741658466, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
INFO:tensorflow:Assets written to: data/best_model/assets


INFO:tensorflow:Assets written to: data/best_model/assets


# Cross validation

The last batch is important - it represents the batch not used when hypertuning - hyperparams can also overfit!!

In [103]:
## HYPER CROSS VALIDATION

import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


num_folds = 3
random_state = 42

# Initialize StratifiedKFold
stratkf = StratifiedKFold(n_splits=num_folds, shuffle=False, random_state=random_state)

# Initialize lists to store metrics for each iteration
accuracies, precisions, recalls, f1_scores = [], [], [], []

# Loop over folds
for fold, (train_indices, test_indices) in enumerate(stratkf.split(X_train_global, np.argmax(Y_train_global, axis=1))):
    print(f"Training on fold {fold + 1}...")

    # Split the data for this fold
    X_train, X_test = X[train_indices], X[test_indices]
    Y_train, Y_test = Y[train_indices], Y[test_indices]

    model = tf.keras.models.load_model("data/best_model")
    model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=True)

    # Train the model
    # model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=True)

    # Make predictions on the test set
    predictions = model.predict(X_test)
    predicted_categories = np.argmax(predictions, axis=1)

    # Calculate metrics for this iteration
    accuracy = accuracy_score(np.argmax(Y_test, axis=1), predicted_categories)
    precision = precision_score(np.argmax(Y_test, axis=1), predicted_categories, average='weighted')
    recall = recall_score(np.argmax(Y_test, axis=1), predicted_categories, average='weighted')
    f1 = f1_score(np.argmax(Y_test, axis=1), predicted_categories, average='weighted')

    # Append metrics to the lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    # Display metrics for this iteration
    print(f"Accuracy for fold {fold + 1}: {accuracy:.4f}")
    print(f"Precision for fold {fold + 1}: {precision:.4f}")
    print(f"Recall for fold {fold + 1}: {recall:.4f}")
    print(f"F1 Score for fold {fold + 1}: {f1:.4f}\n")

# Calculate and output the average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)

print(f"\nAverage Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

Training on fold 1...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Sucess
Accuracy for fold 1: 0.8568
Precision for fold 1: 0.8574
Recall for fold 1: 0.8568
F1 Score for fold 1: 0.8530

Training on fold 2...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Sucess
Accuracy for fold 2: 0.8609
Precision for fold 2: 0.8607
Recall for fold 2: 0.8609
F1 Score for fold 2: 0.8583

Training on fold 3...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Sucess
Accuracy for fold 3: 0.8569
Precision for fold 3: 0.8573
Recall for fold 3: 0.8569
F1 Score for fold 3: 0.8546


Average Accuracy: 0.8582
Average Precision: 0.8585
Average Recall: 0.8582
Average F1 Score: 0.8553


# Global testing

In [19]:
model = tf.keras.models.load_model("data/best_model")
model.fit(X_train_global, Y_train_global, epochs=10, batch_size=32, verbose=True)
# Train the model
# model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=True)

# Make predictions on the test set
predictions = model.predict(X_test_global)
predicted_categories = np.argmax(predictions, axis=1)

# Calculate metrics for this iteration
accuracy = accuracy_score(np.argmax(Y_test_global, axis=1), predicted_categories)
precision = precision_score(np.argmax(Y_test_global, axis=1), predicted_categories, average='weighted')
recall = recall_score(np.argmax(Y_test_global, axis=1), predicted_categories, average='weighted')
f1 = f1_score(np.argmax(Y_test_global, axis=1), predicted_categories, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}\n")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.8001
Precision: 0.8010
Recall: 0.8001
F1 Score: 0.7947



# Backup

In [95]:
## BACKUP CROSS VALIDATION

import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras import regularizers

# Assuming 'X' is your feature matrix and 'Y' is the target variable

# Define hyperparameters
num_folds = 5
epochs = 10
batch_size = 32
learning_rate = 0.001
weight_decay = 1e-5
random_state = 42

# Initialize StratifiedKFold
stratkf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)

# Initialize lists to store metrics for each iteration
accuracies, precisions, recalls, f1_scores = [], [], [], []

# Define your TensorFlow model with hyperparameters
def create_model(input_dim, output_dim, learning_rate, weight_decay):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
        tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(output_dim, activation='softmax')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Loop over folds
for fold, (train_indices, test_indices) in enumerate(stratkf.split(X, np.argmax(Y, axis=1))):
    print(f"Training on fold {fold + 1}...")

    # Split the data for this fold
    X_train, X_test = X[train_indices], X[test_indices]
    Y_train, Y_test = Y[train_indices], Y[test_indices]

    # Create and compile the model
    model = create_model(X_train.shape[1], Y_train.shape[1], learning_rate, weight_decay)

    # Train the model
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=True)

    # Make predictions on the test set
    predictions = model.predict(X_test)
    predicted_categories = np.argmax(predictions, axis=1)

    # Calculate metrics for this iteration
    accuracy = accuracy_score(np.argmax(Y_test, axis=1), predicted_categories)
    precision = precision_score(np.argmax(Y_test, axis=1), predicted_categories, average='weighted')
    recall = recall_score(np.argmax(Y_test, axis=1), predicted_categories, average='weighted')
    f1 = f1_score(np.argmax(Y_test, axis=1), predicted_categories, average='weighted')

    # Append metrics to the lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    # Display metrics for this iteration
    print(f"Accuracy for fold {fold + 1}: {accuracy:.4f}")
    print(f"Precision for fold {fold + 1}: {precision:.4f}")
    print(f"Recall for fold {fold + 1}: {recall:.4f}")
    print(f"F1 Score for fold {fold + 1}: {f1:.4f}\n")

# Calculate and output the average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)

print(f"\nAverage Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")



Training on fold 1...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 1: 0.8700
Precision for fold 1: 0.8721
Recall for fold 1: 0.8700
F1 Score for fold 1: 0.8571

Training on fold 2...
Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 2: 0.9150
Precision for fold 2: 0.9154
Recall for fold 2: 0.9150
F1 Score for fold 2: 0.9057

Training on fold 3...
Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 3: 0.9000
Precision for fold 3: 0.8947
Recall for fold 3: 0.9000
F1 Score for fold 3: 0.8957

Training on fold 4...
Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 4: 0.8950
Precision for fold 4: 0.8965
Recall for fold 4: 0.8950
F1 Score for fold 4: 0.8891

Training on fold 5...
Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 5: 0.8800
Precision for fold 5: 0.8596
Recall for fold 5: 0.8800
F1 Score for fold 5: 0.8628


Average Accuracy: 0.8920
Average Precision: 0.8877
Average Recall: 0.8920
Average F1 Score: 0.8821


  _warn_prf(average, modifier, msg_start, len(result))
