In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import joblib

# Sequence columns that every training row must have populated
features = ["CDR3a", "CDR3b", "Peptide"]

# Read the two positive-example tables and the test table from the Kaggle input directory
train_iedb = pd.read_csv("/kaggle/input/d/dennistheuri/immrep25/iedb_positives.csv")
train_vdjdb = pd.read_csv("/kaggle/input/d/dennistheuri/immrep25/vdjdb_positives.csv")
test_data = pd.read_csv("/kaggle/input/d/dennistheuri/immrep25/test.csv")

# Stack both positive tables under a fresh 0..n-1 index, then discard rows that
# lack any of the sequence columns (the surviving rows keep their stacked index
# labels, so the index may contain gaps afterwards)
positive_tables = [train_iedb, train_vdjdb]
train_data = pd.concat(positive_tables, ignore_index=True).dropna(subset=features)

# Collapse a sequence into one number: the average character code of its text
def encode_sequence(seq):
    """Return the mean ``ord`` value of the characters in ``str(seq)``.

    The argument is converted with ``str`` first, so non-string values are
    encoded through their text form. Because only the average is kept, two
    inputs made of the same characters in a different order encode to the
    same value. An empty string produces NaN (NumPy's mean of an empty list).
    """
    char_codes = list(map(ord, str(seq)))
    return np.mean(char_codes)

# Encode training data: each raw sequence column gets a numeric "<name>_encoded" column
encoded_from = {
    "CDR3a_encoded": "CDR3a",
    "CDR3b_encoded": "CDR3b",
    "Peptide_encoded": "Peptide",
}
for encoded_name, raw_name in encoded_from.items():
    train_data[encoded_name] = train_data[raw_name].apply(encode_sequence)

# Create negative samples (assumed non-binding pairs) by shuffling every encoded
# column independently, which breaks the observed CDR3a / CDR3b / peptide
# combinations. A seeded generator is used so the negative set -- and therefore
# the validation AUC and the submission -- is identical from run to run, in line
# with the random_state=42 already used for the split and the forest.
negative_rng = np.random.default_rng(42)
neg_samples = train_data.copy()
for encoded_name in encoded_from:
    # Assign a plain ndarray: assigning a Series would be re-aligned on the
    # index and silently undo the shuffle.
    neg_samples[encoded_name] = negative_rng.permutation(
        neg_samples[encoded_name].to_numpy()
    )
neg_samples["Label"] = 0  # Non-binding

# NOTE: the raw sequence columns of neg_samples are copied unchanged, so they no
# longer correspond to the shuffled *_encoded values in the same row.
# NOTE(review): a shuffled row can reproduce a combination that also occurs
# among the positives and will still be labelled 0; such rows are not filtered.

# Add labels to the observed pairs, then append the shuffled negatives
train_data["Label"] = 1  # Binding pairs
train_data = pd.concat([train_data, neg_samples], ignore_index=True)

# Model inputs: the three encoded columns as features, Label as the target
encoded_columns = ["CDR3a_encoded", "CDR3b_encoded", "Peptide_encoded"]
X = train_data[encoded_columns]
y = train_data["Label"]  # 1 = observed pair, 0 = shuffled pair

# Hold out 20% of the rows for validation, preserving the 0/1 label ratio
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a 100-tree random forest on the training split (fit returns the estimator)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split with ROC AUC, using the predicted probability of
# the second class (label 1) as the ranking score
y_val_pred = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_val_pred)
print(f"Validation AUC Score: {auc_score:.4f}")

# Derive the same three encoded feature columns for the test table
test_encoded_from = {
    "CDR3a_encoded": "CDR3a",
    "CDR3b_encoded": "CDR3b",
    "Peptide_encoded": "Peptide",
}
for encoded_name, raw_name in test_encoded_from.items():
    test_data[encoded_name] = test_data[raw_name].apply(encode_sequence)

# Feature matrix for the test rows, columns in the order used for training
X_test = test_data[list(test_encoded_from)]

# Predicted probability of the second class (label 1) for every test row
test_probabilities = model.predict_proba(X_test)
test_data["Prediction"] = test_probabilities[:, 1]

# Write the ID / Prediction pairs to the submission file, without the index
submission = test_data[["ID", "Prediction"]]
submission.to_csv("submission.csv", index=False)
print("Submission file saved as 'submission.csv'!")

Validation AUC Score: 0.5831
Submission file saved as 'submission.csv'!
