In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Root directory of the competition dataset
base_path = '/kaggle/input/qcd-tt-jet-tagging-co-da-s-hep'

# Tabular cluster features for the two labelled splits
train_df = pd.read_csv(f'{base_path}/train/features/cluster_features.csv')
val_df = pd.read_csv(f'{base_path}/val/features/cluster_features.csv')

# Target labels for the same two splits
train_labels = np.load(f'{base_path}/train/labels/labels.npy')
val_labels = np.load(f'{base_path}/val/labels/labels.npy')

# Row identifiers for the same two splits
train_ids = np.load(f'{base_path}/train/ids/ids.npy')
val_ids = np.load(f'{base_path}/val/ids/ids.npy')

# Stack train on top of val (train rows first) so cross-validation can
# draw its folds from every labelled example; the index is renumbered 0..N-1.
X = pd.concat([train_df, val_df], axis=0, ignore_index=True)
y = np.concatenate((train_labels, val_labels))
ids = np.concatenate((train_ids, val_ids))

def add_cluster_features(df, eps=1e-6):
    """Return a copy of ``df`` with the seven derived cluster features added.

    The labelled data and the test data must receive exactly the same
    transformation, so the recipe lives in one place instead of being
    copy-pasted per frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``total_pt``, ``n_clusters``,
        ``max_cluster_size``, ``mean_cluster_size``, ``std_cluster_pt``,
        ``mean_cluster_pt`` and ``max_cluster_pt``.
    eps : float, default 1e-6
        Small constant added to every denominator and to the argument of
        the logarithm to avoid division by zero / log(0).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by
        ``pt_per_cluster``, ``cluster_size_diff``, ``pt_std_ratio``,
        ``pt_ratio``, ``cluster_density``, ``inv_mean_size`` and
        ``pt_entropy`` (in that order). A column that already exists under
        one of these names is overwritten in place.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """
    # NOTE(review): np.log yields NaN when mean_cluster_pt + eps is negative;
    # presumably pt values are non-negative -- confirm against the data.
    return df.assign(
        pt_per_cluster=df['total_pt'] / (df['n_clusters'] + eps),
        cluster_size_diff=df['max_cluster_size'] - df['mean_cluster_size'],
        pt_std_ratio=df['std_cluster_pt'] / (df['mean_cluster_pt'] + eps),
        pt_ratio=df['max_cluster_pt'] / (df['total_pt'] + eps),
        cluster_density=df['n_clusters'] / (df['max_cluster_size'] + eps),
        inv_mean_size=1 / (df['mean_cluster_size'] + eps),
        pt_entropy=-df['std_cluster_pt'] * np.log(df['mean_cluster_pt'] + eps),
    )


# Derived features for the combined train + val frame
X = add_cluster_features(X)

# Cross-validation training
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_df = pd.read_csv(f'{base_path}/test/features/cluster_features.csv')
test_ids = np.load(f'{base_path}/test/ids/ids.npy')

# Apply same features to test set (same helper, so the columns and their
# order match the training frame)
test_df = add_cluster_features(test_df)



# Accumulators: fold-averaged test probabilities, plus the pooled
# out-of-fold predictions and their matching targets.
test_preds_all = np.zeros(len(test_df))
val_preds_all = []
val_targets_all = []

# Hyperparameters shared by every fold; only the seed changes per fold.
xgb_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
)

for fold, (fit_rows, holdout_rows) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[fit_rows], y[fit_rows]
    X_val, y_val = X.iloc[holdout_rows], y[holdout_rows]

    # The fold number doubles as the model's random seed.
    model = XGBClassifier(random_state=fold, **xgb_params)
    model.fit(X_train, y_train)

    # Positive-class probability on the held-out fold
    val_preds = model.predict_proba(X_val)[:, 1]
    val_preds_all.extend(val_preds)
    val_targets_all.extend(y_val)

    # Each fold contributes an equal share to the averaged test prediction.
    fold_test_proba = model.predict_proba(test_df)[:, 1]
    test_preds_all += fold_test_proba / skf.n_splits

# ROC AUC computed once over the pooled out-of-fold predictions
cv_auc = roc_auc_score(val_targets_all, val_preds_all)
print(f'Cross-Validated AUC: {cv_auc:.4f}')

# Persist the fold-averaged test probabilities for the ensemble cell
xgb_preds = test_preds_all.copy()
np.save('xgb_preds.npy', xgb_preds)

# Histogram of the predicted probabilities on the test set
fig, ax = plt.subplots()
ax.hist(test_preds_all, bins=50)
ax.set_title("Test Set Prediction Distribution")
ax.set_xlabel("Probability of tt̄")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()


In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Re-read the labelled feature tables (train rows first, then val) so this
# cell starts from the raw CSV columns; the index is renumbered 0..N-1.
X = pd.concat(
    [
        pd.read_csv(f'{base_path}/train/features/cluster_features.csv'),
        pd.read_csv(f'{base_path}/val/features/cluster_features.csv'),
    ],
    ignore_index=True,
)

# Labels in the same train-then-val order
y = np.concatenate((
    np.load(f'{base_path}/train/labels/labels.npy'),
    np.load(f'{base_path}/val/labels/labels.npy'),
))

# Same feature engineering as for the XGBoost model: every derived column
# is computed from the raw columns only, with 1e-6 guarding each
# denominator and the log argument.
X = X.assign(
    pt_per_cluster=X['total_pt'] / (X['n_clusters'] + 1e-6),
    cluster_size_diff=X['max_cluster_size'] - X['mean_cluster_size'],
    pt_std_ratio=X['std_cluster_pt'] / (X['mean_cluster_pt'] + 1e-6),
    pt_ratio=X['max_cluster_pt'] / (X['total_pt'] + 1e-6),
    cluster_density=X['n_clusters'] / (X['max_cluster_size'] + 1e-6),
    inv_mean_size=1 / (X['mean_cluster_size'] + 1e-6),
    pt_entropy=-X['std_cluster_pt'] * np.log(X['mean_cluster_pt'] + 1e-6),
)

# Train-validation split for DNN.
# The split is done BEFORE fitting the scaler: fitting StandardScaler on all
# rows would let the validation rows influence the mean/std applied to the
# training data, leaking information into the set used for early stopping.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features: statistics come from the training rows only and are then
# re-applied unchanged to the validation rows (and later to the test rows).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full labelled matrix under the same train-fitted scaling; kept so the
# name `X_scaled` remains available to any later code that reads it.
X_scaled = scaler.transform(X)

# Seed TensorFlow's global random generator so that randomly initialised
# weights and dropout masks are repeatable from run to run, in line with the
# fixed random_state already used for the data split.
# NOTE(review): batch shuffling and GPU kernels may still introduce
# run-to-run variation -- confirm if bit-exact reproducibility is required.
tf.random.set_seed(42)

# Build the DNN model: two hidden blocks (Dense -> BatchNorm -> Dropout)
# of width 64 and 32, followed by a single sigmoid unit that outputs the
# positive-class probability.
model_dnn = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),  # one input per feature column
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy loss; AUC is tracked as a reporting metric only.
model_dnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['AUC']
)

# Train for at most 30 epochs. EarlyStopping is left on its default monitored
# quantity, stops after 5 epochs without improvement and restores the
# weights of the best epoch.
model_dnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=2
)


# Raw test-set feature table
X_test = pd.read_csv(f'{base_path}/test/features/cluster_features.csv')

# Derive the same seven columns, in the same order, as for the training
# frame so the fitted scaler sees an identical layout.
X_test = X_test.assign(
    pt_per_cluster=X_test['total_pt'] / (X_test['n_clusters'] + 1e-6),
    cluster_size_diff=X_test['max_cluster_size'] - X_test['mean_cluster_size'],
    pt_std_ratio=X_test['std_cluster_pt'] / (X_test['mean_cluster_pt'] + 1e-6),
    pt_ratio=X_test['max_cluster_pt'] / (X_test['total_pt'] + 1e-6),
    cluster_density=X_test['n_clusters'] / (X_test['max_cluster_size'] + 1e-6),
    inv_mean_size=1 / (X_test['mean_cluster_size'] + 1e-6),
    pt_entropy=-X_test['std_cluster_pt'] * np.log(X_test['mean_cluster_pt'] + 1e-6),
)

# Re-use the already fitted scaler (transform only, never re-fit on test)
X_test_scaled = scaler.transform(X_test)

# The network emits one column; flatten it to a 1-D probability vector
dnn_output = model_dnn.predict(X_test_scaled)
dnn_preds = dnn_output.flatten()

# Persist for the ensemble cell
np.save('dnn_preds.npy', dnn_preds)
print("dnn_preds generated and saved")


In [None]:
import h5py



# Read the jet images of both labelled splits fully into memory.
# `train_images` / `val_images` keep the raw, unscaled pixel values.
with h5py.File(f'{base_path}/train/images/jet_images.h5', 'r') as f:
    train_images = f['images'][:]  # shape: (N, 32, 32)
with h5py.File(f'{base_path}/val/images/jet_images.h5', 'r') as f:
    val_images = f['images'][:]

# Stack train first, then val, for both the images and their labels
images = np.concatenate((train_images, val_images))
labels = np.concatenate((
    np.load(f'{base_path}/train/labels/labels.npy'),
    np.load(f'{base_path}/val/labels/labels.npy'),
))

# Divide by the global maximum pixel value, then append a trailing channel
# axis so the array becomes (N, 32, 32, 1) as the CNN input expects.
images = images.astype('float32') / images.max()
images = images[..., np.newaxis]

# Stratified 80/20 hold-out used as the validation set during training
X_train, X_val, y_train, y_val = train_test_split(
    images,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Seed TensorFlow's global random generator so that randomly initialised
# kernels and dropout masks are repeatable from run to run, in line with the
# fixed random_state already used for the data split.
# NOTE(review): batch shuffling and GPU kernels may still introduce
# run-to-run variation -- confirm if bit-exact reproducibility is required.
tf.random.set_seed(42)

# Build CNN model: two Conv -> MaxPool -> BatchNorm stages (32 then 64
# filters, 3x3 kernels), followed by a 64-unit dense head with dropout and a
# single sigmoid unit that outputs the positive-class probability.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(32, 32, 1)),  # single-channel 32x32 image
    tf.keras.layers.Conv2D(32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Conv2D(64, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy loss; AUC is tracked as a reporting metric only.
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Train the CNN for at most 20 epochs. EarlyStopping is left on its default
# monitored quantity, stops after 4 epochs without improvement and restores
# the weights of the best epoch.
model_cnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],
    verbose=2
)

# Load test images
with h5py.File(f'{base_path}/test/images/jet_images.h5', 'r') as f:
    test_images = f['images'][:]

# Normalize with the SAME constant that scaled the training images.
# The CNN was fitted on pixels divided by the maximum over train + val;
# dividing the test set by its own maximum would feed the network inputs on
# a different scale whenever the two maxima differ. `train_images` and
# `val_images` still hold the raw pixel values, so the training-time
# constant can be recomputed from them.
train_image_scale = max(np.max(train_images), np.max(val_images))
test_images = test_images.astype('float32') / train_image_scale

# Add the trailing channel axis expected by the CNN input layer
test_images = np.expand_dims(test_images, axis=-1)

# Predict: the network emits one column; flatten to a 1-D probability vector
cnn_preds = model_cnn.predict(test_images).flatten()

# Persist for the ensemble cell
np.save('cnn_preds.npy', cnn_preds)
print("cnn_preds generated and saved")


In [None]:


# Per-model test-set probabilities written to disk by the earlier cells,
# plus the ids that label each test row.
xgb_preds = np.load('xgb_preds.npy')
dnn_preds = np.load('dnn_preds.npy')
cnn_preds = np.load('cnn_preds.npy')
test_ids = np.load('/kaggle/input/qcd-tt-jet-tagging-co-da-s-hep/test/ids/ids.npy')

# Blend weights in the order XGB, DNN, CNN; rescaled so they always sum to 1
weights = np.array([0.2, 0.45, 0.35])
weights = weights / weights.sum()
w_xgb, w_dnn, w_cnn = weights

# Weighted average of the three models, accumulated one term at a time
final_preds = w_xgb * xgb_preds
final_preds = final_preds + w_dnn * dnn_preds
final_preds = final_preds + w_cnn * cnn_preds

# Write the submission file: one row per test id with its blended score
submission = pd.DataFrame({'id': test_ids, 'label': final_preds})
submission.to_csv('/kaggle/working/submission.csv', index=False)

print(f"Submission saved with weights: XGB={weights[0]:.2f}, DNN={weights[1]:.2f}, CNN={weights[2]:.2f}")
