In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

2024-01-06 21:08:27.032693: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Predictions (deep learning model)

In [2]:
# Read both inputs in one step: the header-less CSV holds the embeddings,
# the JSON file holds the targets.
emb_df, data_df = (
    pd.read_csv("embeddingsCTXencode_plus_Sbert.csv", header=None),
    pd.read_json("dataset.json"),
)

In [3]:
# Features and targets are paired purely by row position when X and Y are built,
# so a row-count mismatch would silently mislabel samples; fail fast instead.
assert len(emb_df) == len(data_df), (
    f"Row mismatch: {len(emb_df)} embedding rows vs {len(data_df)} dataset rows"
)
print(emb_df.shape, data_df.shape)

(148122, 1536) (148122, 6)


In [4]:
# Number of leading rows kept for fast experimentation.
N_SAMPLES = 10000

# Slice first, then convert: this avoids materialising the complete embedding
# frame as nested Python lists only to discard most of it.
X = emb_df.iloc[:N_SAMPLES].to_numpy()

# One-hot encode over the full 'category' column *before* slicing so that every
# category keeps its own column even if it is absent from the first N_SAMPLES rows.
Y = pd.get_dummies(data_df['category']).values[:N_SAMPLES]

print(X.shape, Y.shape)

(10000, 1536) (10000, 15)


In [5]:
# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout and
# batch shuffling are repeatable; previously only the train/test split was seeded.
tf.keras.utils.set_random_seed(42)

# Split the dataset: 20% of the samples are held out for evaluation
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# TensorFlow model: two ReLU hidden layers, dropout, then a softmax head with
# one unit per one-hot column of Y
model = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(Y.shape[1], activation='softmax')  # Prediction head
])

# Categorical cross-entropy matches the one-hot encoded targets in Y
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, epochs=10, batch_size=32)

# Make predictions: class probabilities per sample, reduced to the most likely class index
predictions = model.predict(X_test)
predicted_categories = np.argmax(predictions, axis=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
  Target: ENTERTAINMENT,                 Prediction: ENTERTAINMENT,                 Correctness: True
  Target: SPORTS,                 Prediction: ENTERTAINMENT,                 Correctness: False
  Target: BUSINESS,                 Prediction: HEALTHY LIVING,                 Correctness: False
  Target: QUEER VOICES,                 Prediction: QUEER VOICES,                 Correctness: True
  Target: ENTERTAINMENT,                 Prediction: ENTERTAINMENT,                 Correctness: True
  Target: HOME & LIVING,                 Prediction: HOME & LIVING,                 Correctness: True
  Target: QUEER VOICES,                 Prediction: QUEER VOICES,                 Correctness: True
  Target: QUEER VOICES,                 Prediction: QUEER VOICES,                 Correctness: True
  Target: QUEER VOICES,                 Prediction: QUEER VOICES,                 Correc

In [6]:
# Count the test samples whose predicted class index equals the argmax of the
# one-hot target row, then report the hit count, the total and their ratio.
N = len(predicted_categories)
c = sum(
    1
    for guess, truth in zip(predicted_categories, np.argmax(Y_test, axis=1))
    if guess == truth
)
print(c, N)
print("ACC", c/N)

1712 2000
ACC 0.856


# Hyperparameter tuning

In [11]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, precision_score
from tensorflow.keras import regularizers
from scikeras.wrappers import KerasClassifier

# X (feature matrix) and Y (target variable) are expected to exist already.

# Settings shared by the splitter and the wrapped estimator
random_state = 42
num_folds = 5
epochs = 10
batch_size = 32

# Shuffled, stratified K-fold splitter with a fixed seed
stratkf = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=random_state,
)

# Define your TensorFlow model with hyperparameters
def create_model(input_dim, output_dim, learning_rate=0.001, weight_decay=1e-5):
    """Build and compile the dense softmax classifier used for tuning.

    Args:
        input_dim: Number of features per sample; declares the model's input shape.
        output_dim: Number of units in the softmax output layer.
        learning_rate: Learning rate handed to the Adam optimizer.
        weight_decay: L2 regularisation factor applied to the kernels of both
            hidden layers.

    Returns:
        A ``tf.keras.Sequential`` model compiled with categorical cross-entropy
        loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        # Previously `input_dim` was accepted but ignored; declaring the input
        # shape builds the weights up front instead of lazily on the first batch.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
        tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(output_dim, activation='softmax')
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'weight_decay': [1e-5, 1e-4, 1e-3]
}

# Create the scikit-learn compatible wrapper around create_model.
# - `model=` replaces the deprecated `build_fn=` argument.
# - `loss` is declared on the wrapper so that the integer class labels passed to
#   fit() are one-hot encoded to match the categorical cross-entropy loss that
#   create_model compiles with.
# - `learning_rate` and `weight_decay` are declared here with their defaults so
#   that GridSearchCV is able to set them; parameters unknown to the wrapper are
#   rejected by set_params().
model = KerasClassifier(model=create_model,
                        loss='categorical_crossentropy',
                        input_dim=X.shape[1],
                        output_dim=Y.shape[1],
                        learning_rate=0.001,
                        weight_decay=1e-5,
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=0)

# Define scoring metric for hyperparameter tuning (you can change it based on your needs).
# zero_division=0 keeps the score of never-predicted classes at 0 without emitting warnings.
scorer = make_scorer(precision_score, average='weighted', zero_division=0)

# Perform GridSearchCV; the one-hot targets are collapsed to class indices,
# which both the stratified splitter and the scorer operate on
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=stratkf)
grid_result = grid.fit(X, np.argmax(Y, axis=1))

# Print the best hyperparameters
print(f"Best Hyperparameters: {grid_result.best_params_}")

# Print the best precision score
print(f"Best Precision Score: {grid_result.best_score_:.4f}")

AttributeError: module 'tensorflow.keras' has no attribute 'wrappers'

# Cross validation

In [9]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# X (feature matrix) and Y (target variable) are expected to exist already.

# Stratified, shuffled cross-validation with a fixed seed
num_folds = 5
stratkf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# One list per metric; each fold appends a single value
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Define your TensorFlow model
def create_model():
    """Return a freshly compiled dense classifier.

    The width of the softmax output layer is read from the global ``Y``
    (one unit per column).
    """
    n_classes = Y.shape[1]
    net = tf.keras.Sequential()
    for layer in (
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ):
        net.add(layer)
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net

# Class indices derived once from the one-hot targets; used both for
# stratification and as ground truth when scoring each fold.
class_labels = np.argmax(Y, axis=1)

# Loop over folds
for fold, (train_indices, test_indices) in enumerate(stratkf.split(X, class_labels)):
    print(f"Training on fold {fold + 1}...")

    # Split the data for this fold
    X_train, X_test = X[train_indices], X[test_indices]
    Y_train, Y_test = Y[train_indices], Y[test_indices]
    y_true = class_labels[test_indices]

    # Release the Keras state left over from the previous fold's model so that
    # memory does not keep growing as models are created in the loop.
    tf.keras.backend.clear_session()

    # Create and compile the model (fresh weights for every fold)
    model = create_model()

    # Train the model
    model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=1)

    # Make predictions on the test set
    predictions = model.predict(X_test)
    predicted_categories = np.argmax(predictions, axis=1)

    # Calculate metrics for this iteration; zero_division=0 scores classes that
    # were never predicted as 0 without emitting warnings.
    accuracy = accuracy_score(y_true, predicted_categories)
    precision = precision_score(y_true, predicted_categories, average='weighted', zero_division=0)
    recall = recall_score(y_true, predicted_categories, average='weighted', zero_division=0)
    f1 = f1_score(y_true, predicted_categories, average='weighted', zero_division=0)

    # Append metrics to the lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    # Display metrics for this iteration
    print(f"Accuracy for fold {fold + 1}: {accuracy:.4f}")
    print(f"Precision for fold {fold + 1}: {precision:.4f}")
    print(f"Recall for fold {fold + 1}: {recall:.4f}")
    print(f"F1 Score for fold {fold + 1}: {f1:.4f}\n")

# Average every per-fold metric list, then print the cross-validation summary
average_accuracy, average_precision, average_recall, average_f1 = (
    np.mean(fold_values)
    for fold_values in (accuracies, precisions, recalls, f1_scores)
)

print(f"\nAverage Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

Training on fold 1...


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 1: 0.8485
Precision for fold 1: 0.8437
Recall for fold 1: 0.8485
F1 Score for fold 1: 0.8443

Training on fold 2...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 2: 0.8675
Precision for fold 2: 0.8677
Recall for fold 2: 0.8675
F1 Score for fold 2: 0.8638

Training on fold 3...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 3: 0.8590
Precision for fold 3: 0.8592
Recall for fold 3: 0.8590
F1 Score for fold 3: 0.8577

Training on fold 4...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy for fold 4: 0.8570
Precision for fold 4: 0.8585
Recall for fold 4: 0.8570
F1 Score for fold 4: 0.8540

Training on fold 5...
Epoch 1/10
Epoch 2/1