In [1]:
import numpy as np

# Function to calculate the Hoeffding bound
def hoeffding_bound(n, epsilon):
    return 2 * np.exp(-2 * n * epsilon**2)

# Sample usage
n = 100  # sample size
epsilon = 0.05  # desired deviation from true mean
confidence_bound = hoeffding_bound(n, epsilon)
print(f"Probability that the sample mean deviates from the true mean by more than {epsilon}: {confidence_bound}")

Probability that the sample mean deviates from the true mean by more than 0.05: 1.2130613194252666


In [2]:
import math

# Function to calculate minimum sample size for desired epsilon and delta
def required_sample_size(epsilon, delta):
    return math.ceil((1 / (2 * epsilon**2)) * np.log(2 / delta))

# Sample usage
epsilon = 0.05  # maximum allowed deviation
delta = 0.01    # confidence level (1 - delta)
n_required = required_sample_size(epsilon, delta)
print(f"Required sample size for epsilon = {epsilon} and delta = {delta}: {n_required}")


Required sample size for epsilon = 0.05 and delta = 0.01: 1060


In [3]:
# Function to calculate generalization error bound using Union Bound
def generalization_error_bound(n, epsilon, num_models):
    return 2 * num_models * np.exp(-2 * n * epsilon**2)

# Sample usage
n = 100  # sample size
epsilon = 0.05  # deviation tolerance
num_models = 3  # number of models in hypothesis space
gen_error_bound = generalization_error_bound(n, epsilon, num_models)
print(f"Generalization error bound with {num_models} models: {gen_error_bound}")


Generalization error bound with 3 models: 3.6391839582758


In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
import math

# Load the data
X = np.load('Datasets/kryptonite-9-X.npy')
y = np.load('Datasets/kryptonite-9-y.npy')

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_scaled = scaler.fit_transform(X)  # Scale the entire dataset for polynomial feature transformation

# Convert data to PyTorch tensors for the neural network
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

# PyTorch DataLoader
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

# Define Neural Network model
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return self.sigmoid(x)

# Initialize model, loss, and optimizer
input_dim = X_train.shape[1]
model = MLP(input_dim)
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train Neural Network
num_epochs = 50
best_val_accuracy = 0.0
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation accuracy
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_preds = (val_outputs > 0.5).float()
        val_accuracy = accuracy_score(y_val_tensor, val_preds)
        
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model.pth')

# Load best model
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

# Extra Trees with Polynomial Features
poly = PolynomialFeatures(degree=4, include_bias=False)
X_poly = poly.fit_transform(X_scaled)
et_model = ExtraTreesClassifier(n_estimators=100, random_state=42)

# K-Fold Cross-Validation for Extra Trees
kf = KFold(n_splits=5, shuffle=True, random_state=42)
et_accuracies = []

for train_index, val_index in kf.split(X_poly):
    X_train_fold, X_val_fold = X_poly[train_index], X_poly[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]
    et_model.fit(X_train_fold, y_train_fold)
    y_val_pred = et_model.predict(X_val_fold)
    fold_accuracy = accuracy_score(y_val_fold, y_val_pred)
    et_accuracies.append(fold_accuracy)

mean_et_accuracy = np.mean(et_accuracies)
print("Extra Trees Cross-Validation Mean Accuracy:", mean_et_accuracy)

# Hoeffding's Inequality calculations

# 1. Confidence Interval Bound
def hoeffding_bound(n, epsilon):
    return 2 * np.exp(-2 * n * epsilon**2)

n_samples = len(y_val)  # For neural network validation set or k-fold for Extra Trees
epsilon = 0.05  # Desired deviation from true mean

confidence_bound_nn = hoeffding_bound(n_samples, epsilon)
confidence_bound_et = hoeffding_bound(len(et_accuracies), epsilon)

print(f"Neural Network confidence bound with epsilon = {epsilon}: {confidence_bound_nn}")
print(f"Extra Trees confidence bound with epsilon = {epsilon}: {confidence_bound_et}")

# 2. Sample Complexity Estimation
def required_sample_size(epsilon, delta):
    return math.ceil((1 / (2 * epsilon**2)) * np.log(2 / delta))

# Define parameters for sample size estimation
delta = 0.01  # Confidence level (1 - delta)
n_required = required_sample_size(epsilon, delta)
print(f"Required sample size for epsilon = {epsilon} and delta = {delta}: {n_required}")

# 3. Generalization Error Bound with Union Bound
def generalization_error_bound(n, epsilon, num_models):
    return 2 * num_models * np.exp(-2 * n * epsilon**2)

num_models = 2  # Number of models (Neural Network and Extra Trees)
gen_error_bound = generalization_error_bound(n_samples, epsilon, num_models)
print(f"Generalization error bound with {num_models} models: {gen_error_bound}")

# Final Classification Reports for Best Models
with torch.no_grad():
    val_preds_nn = model(X_val_tensor)
    val_preds_nn = (val_preds_nn > 0.5).float()
    print("Neural Network Classification Report:")
    print(classification_report(y_val_tensor, val_preds_nn))
    print("Validation Accuracy of Best Neural Network Model:", accuracy_score(y_val, val_preds_nn))

print("Extra Trees Cross-Validation Classification Report:")
for i, acc in enumerate(et_accuracies, 1):
    print(f"Fold {i} Accuracy: {acc}")
print(f"Mean Accuracy of Extra Trees: {mean_et_accuracy}")



Extra Trees Cross-Validation Mean Accuracy: 0.9577777777777777
Neural Network confidence bound with epsilon = 0.05: 3.0459959489425146e-08
Extra Trees confidence bound with epsilon = 0.05: 1.9506198240566652
Required sample size for epsilon = 0.05 and delta = 0.01: 1060
Generalization error bound with 2 models: 6.091991897885029e-08
Neural Network Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      1808
         1.0       0.96      0.96      0.96      1792

    accuracy                           0.96      3600
   macro avg       0.96      0.96      0.96      3600
weighted avg       0.96      0.96      0.96      3600

Validation Accuracy of Best Neural Network Model: 0.9575
Extra Trees Cross-Validation Classification Report:
Fold 1 Accuracy: 0.9575
Fold 2 Accuracy: 0.9613888888888888
Fold 3 Accuracy: 0.9591666666666666
Fold 4 Accuracy: 0.9580555555555555
Fold 5 Accuracy: 0.9527777777777777
Mean Accuracy of Extra 

In [6]:
# Total number of samples in the dataset
n_samples = X.shape[0]
epsilon = 0.05  # Desired deviation from the true mean

# Hoeffding's Inequality confidence bound calculation
def hoeffding_bound(n, epsilon):
    return 2 * np.exp(-2 * n * epsilon**2)

confidence_bound = hoeffding_bound(n_samples, epsilon)
print(f"Confidence bound with epsilon = {epsilon} and n = {n_samples}: {confidence_bound}")


Confidence bound with epsilon = 0.05 and n = 18000: 1.6388025247980798e-39


In [7]:
def required_sample_size(epsilon, delta):
    return math.ceil((1 / (2 * epsilon**2)) * np.log(2 / delta))

# Define epsilon and delta for sample complexity estimation
delta = 0.01  # Confidence level (1 - delta)
n_required = required_sample_size(epsilon, delta)
print(f"Required sample size for epsilon = {epsilon} and delta = {delta}: {n_required}")


Required sample size for epsilon = 0.05 and delta = 0.01: 1060
