In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score
import lightgbm as lgb

In [2]:
def load_data():
    """Read the three competition CSV files from the current working directory.

    Returns:
        tuple: ``(train_df, test1_df, test2_df)`` as pandas DataFrames, or
        ``(None, None, None)`` when any of the files cannot be found.
    """
    try:
        train = pd.read_csv("train.csv")
        test_one = pd.read_csv("test_1.csv")
        test_two = pd.read_csv("test_2.csv")
    except FileNotFoundError as e:
        # Report the problem instead of raising so the caller can bail out cleanly.
        print(f"Error loading data: {e}. Please ensure train.csv, test_1.csv, and test_2.csv are in the same directory.")
        return None, None, None

    print("Data loaded successfully.")
    print(f"Train shape: {train.shape}")
    print(f"Test1 shape: {test_one.shape}")
    print(f"Test2 shape: {test_two.shape}")
    return train, test_one, test_two



In [3]:
def identify_feature_columns(df):
    """Split the columns of ``df`` into molecular, ECFP and FCFP feature groups.

    Fingerprint columns are recognised by their name prefix (``ecfc_`` for the
    ECFP group, ``fcfc_`` for the FCFP group). Molecular features are located
    with three strategies, tried in this order:

    1. The contiguous column range from ``'BalabanJ'`` to ``'qed'`` (inclusive),
       when both marker columns exist and appear in that order.
    2. If a marker is missing: every column that is neither an id/label column
       (``smiles``, ``class``, ``series``) nor a fingerprint column, provided
       the frame has more than 200 columns, no fingerprint column occurs in
       positions 1..199, and exactly 199 such columns exist.
    3. Last resort (announced with a "Critical Warning"): the same
       non-id, non-fingerprint columns regardless of how many there are.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        tuple: ``(molecular_features, ecfp_features, fcfp_features,
        all_features)`` where each element is a list of column names and
        ``all_features`` is the concatenation of the first three, in that order.
    """
    start_marker = 'BalabanJ'
    end_marker = 'qed'
    fingerprint_prefixes = ('ecfc_', 'fcfc_')
    id_columns = ('smiles', 'class', 'series')  # 'class' and 'series' might not be in all DFs
    columns = df.columns

    ecfp_features = [col for col in columns if col.startswith('ecfc_')]
    fcfp_features = [col for col in columns if col.startswith('fcfc_')]

    # Candidate molecular columns for both fallbacks: everything that is not an
    # id/label column and not a fingerprint column, in original column order.
    non_fingerprint_cols = [
        col for col in columns
        if col not in id_columns and not col.startswith(fingerprint_prefixes)
    ]

    molecular_features = []
    if start_marker in columns and end_marker in columns:
        start_idx = columns.get_loc(start_marker)
        end_idx = columns.get_loc(end_marker)
        if start_idx <= end_idx:
            molecular_features = columns[start_idx : end_idx + 1].tolist()
        else:
            print(f"Warning: Molecular feature end '{end_marker}' found before start '{start_marker}'.")
    else:
        print(f"Warning: Could not find '{start_marker}' or '{end_marker}' in columns.")
        # Heuristic: accept the candidates silently only when the layout matches
        # the expected one (199 descriptor columns ahead of the fingerprints).
        layout_looks_standard = len(columns) > 200 and not any(
            col.startswith(fingerprint_prefixes) for col in columns[1:200]
        )
        if layout_looks_standard and len(non_fingerprint_cols) == 199:
            molecular_features = list(non_fingerprint_cols)

    if not molecular_features:
        print("Critical Warning: Molecular features could not be identified robustly. Predictions may be unreliable.")
        # Last resort: take every non-id, non-fingerprint column.
        molecular_features = list(non_fingerprint_cols)
        print(f"Fallback: Identified {len(molecular_features)} potential molecular features.")


    print(f"Identified {len(molecular_features)} molecular features.")
    print(f"Identified {len(ecfp_features)} ECFP features.")
    print(f"Identified {len(fcfp_features)} FCFP features.")

    all_features = molecular_features + ecfp_features + fcfp_features
    return molecular_features, ecfp_features, fcfp_features, all_features

In [4]:
def preprocess_data(train_df, test1_df, test2_df, molecular_features, all_features):
    """Build the model inputs and standardise the numeric molecular features.

    Args:
        train_df: Training frame; must contain ``all_features`` and ``'class'``.
        test1_df: First test frame; must contain ``all_features``.
        test2_df: Second test frame; must contain ``all_features``.
        molecular_features: Column names considered for scaling. Only those
            that are numeric in the training frame are actually scaled.
        all_features: Column names kept in the returned feature matrices.

    Returns:
        tuple: ``(X_train_full, y_train_full, X_test1, X_test2)``. The feature
        matrices are copies, so the input frames are left untouched.
    """
    X_train_full = train_df[all_features].copy()
    y_train_full = train_df['class'].copy()
    X_test1, X_test2 = test1_df[all_features].copy(), test2_df[all_features].copy()

    # Restrict scaling to molecular columns that are numeric in the training data.
    scalable_cols = list(
        X_train_full[molecular_features].select_dtypes(include=np.number).columns
    )

    if scalable_cols:
        # Fit on the training data only, then reuse the same statistics for both test sets.
        scaler = StandardScaler()
        X_train_full[scalable_cols] = scaler.fit_transform(X_train_full[scalable_cols])
        for X_test in (X_test1, X_test2):
            X_test[scalable_cols] = scaler.transform(X_test[scalable_cols])
        print("Molecular features scaled.")
    else:
        print("Warning: No numeric molecular features found to scale.")

    return X_train_full, y_train_full, X_test1, X_test2

In [6]:
def train_model(X_train, y_train):
    """Fit a LightGBM binary classifier on the supplied training data.

    Class imbalance is handled through ``scale_pos_weight``, set to
    ``count(label 0) / count(label 1)``. When either label is absent from
    ``y_train`` (or has a zero count), the neutral weight ``1`` is used
    instead, so a single-class input does not raise ``KeyError`` and never
    yields a zero weight.

    No validation set is used here: the model is trained for the fixed
    ``n_estimators`` without early stopping.

    Args:
        X_train: Feature matrix passed unchanged to ``LGBMClassifier.fit``.
        y_train: pandas Series of labels (0/1 expected for the weighting).

    Returns:
        The fitted ``lgb.LGBMClassifier`` instance.
    """
    # Calculate scale_pos_weight for imbalanced classes
    class_counts = y_train.value_counts()
    # Membership is checked before indexing: value_counts() omits labels that
    # never occur, so a plain class_counts[0] can raise KeyError.
    both_present = 0 in class_counts and 1 in class_counts
    if both_present and class_counts[0] > 0 and class_counts[1] > 0:
        scale_pos_weight = class_counts[0] / class_counts[1]
    else:
        scale_pos_weight = 1

    model = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss', # Common metric for binary classification
        n_estimators=1000,      # Can be tuned
        learning_rate=0.05,     # Can be tuned
        num_leaves=31,          # Can be tuned
        max_depth=-1,           # No limit
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight, # Address class imbalance
        # colsample_bytree=0.8, # Feature fraction
        # subsample=0.8,        # Bagging fraction
        # reg_alpha=0.1,        # L1 regularization
        # reg_lambda=0.1        # L2 regularization
    )

    # Early stopping would need a held-out validation set; for simplicity the
    # model is fitted on all of X_train for the fixed number of estimators.
    # Proper hyperparameter tuning would involve CV.
    model.fit(X_train, y_train)
    print("Model trained successfully.")
    return model

In [7]:
def main():
    """Run the complete pipeline: load, preprocess, train, predict and report.

    The function returns ``None`` in every case and communicates through
    printed output only. It stops early when the data cannot be loaded or when
    no feature columns are identified. Otherwise it:

    * trains a final model on the full training data and predicts on both
      test sets,
    * trains a second model on an 80/20 stratified split of the training data
      to print a Cohen kappa score and a "most reliable predictions" accuracy
      as demonstrations,
    * ranks the test_1 predictions by distance of the class-1 probability
      from 0.5.

    No files are written; every ``to_csv`` call is left commented out.
    """

    def rank_by_confidence(frame):
        # Most confident rows (largest 'confidence') first.
        return frame.sort_values(by='confidence', ascending=False)

    def emit(*lines):
        # One print call per line, in order.
        for line in lines:
            print(line)

    train_df, test1_df, test2_df = load_data()
    if train_df is None:
        return

    mol_cols, _ecfp_cols, _fcfp_cols, feature_cols = identify_feature_columns(train_df)
    if not feature_cols:
        print("Critical error: No features identified. Exiting.")
        return

    X_train_full, y_train_full, X_test1, X_test2 = preprocess_data(
        train_df.copy(), test1_df.copy(), test2_df.copy(), mol_cols, feature_cols
    )

    # --- Final model, fitted on every training row ---
    print("\n--- Training Final Model on Full Training Data ---")
    final_model = train_model(X_train_full, y_train_full)

    # --- Task 1: class predictions for test set 1 ---
    print("\n--- Task 1: Predictions for test_1.csv ---")
    task1_labels = final_model.predict(X_test1)
    task1_prob_class1 = final_model.predict_proba(X_test1)[:, 1]  # column 1 = class 1

    # Keep the predictions next to the raw rows for a possible submission file.
    test1_df['predicted_class_task1'] = task1_labels
    # test1_df[['smiles', 'predicted_class_task1']].to_csv("task1_predictions.csv", index=False)
    emit(
        "Predictions for test_1.csv generated.",
        f"Example predictions for test_1.csv: {task1_labels[:10]}",
    )

    # --- Task 1: Cohen kappa on an internal hold-out (demonstration only) ---
    print("\n--- Task 1: Cohen Kappa Score (Demonstration) ---")
    X_demo_train, X_demo_val, y_demo_train, y_demo_val = train_test_split(
        X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
    )

    print("Training a temporary model for demonstration purposes...")
    demo_model = train_model(X_demo_train, y_demo_train)
    demo_labels = demo_model.predict(X_demo_val)
    demo_kappa = cohen_kappa_score(y_demo_val, demo_labels)
    emit(
        f"Demonstration Cohen Kappa Score on internal validation set: {demo_kappa:.4f}",
        "Note: This Kappa score is on an internal split of the training data for demonstration.",
        "The actual Task 1 Cohen Kappa would be calculated using the predictions on test_1.csv and its true (hidden) labels.",
    )

    # --- Task 2: class predictions for test set 2 ---
    print("\n--- Task 2: Predictions for test_2.csv ---")
    task2_labels = final_model.predict(X_test2)

    test2_df['predicted_class_task2'] = task2_labels
    # test2_df[['smiles', 'series', 'predicted_class_task2']].to_csv("task2_predictions.csv", index=False)
    print("Predictions for test_2.csv generated.")

    # Show a few predictions for each series.
    per_series_view = test2_df[['smiles', 'series', 'predicted_class_task2']].copy()
    print("\nExample predictions for test_2.csv grouped by series:")
    for series_id, series_rows in per_series_view.groupby('series'):
        print(f"\nSeries {series_id}:")
        print(series_rows.head())

    # The Task 2 metric itself cannot be computed here because test_2.csv has
    # no true labels. With a label column available it would be the mean of
    # accuracy_score(true, predicted) taken per group of test2_df.groupby('series').
    emit(
        "\n--- Task 2: Evaluation Metric (Average Accuracy by Series) ---",
        "To calculate the actual Task 2 metric (average accuracy across 6 series):",
        "1. Obtain the true labels for test_2.csv.",
        "2. For each of the 6 molecular series:",
        "   a. Filter predictions and true labels for that series.",
        "   b. Calculate accuracy_score(true_labels_series, predicted_labels_series).",
        "3. Average these 6 accuracy scores.",
        "Demonstration of grouping (actual calculation requires true labels for test_2.csv):",
    )

    # --- Task 3: the 200 most reliable predictions on test set 1 ---
    print("\n--- Task 3: Identify 200 Most Reliable Predictions from test_1.csv ---")
    # Confidence = distance of the class-1 probability from the 0.5 decision point.
    task1_confidence = np.abs(task1_prob_class1 - 0.5)

    test1_predictions_df = pd.DataFrame({
        'smiles': test1_df['smiles'],
        'predicted_class': task1_labels,
        'probability_class1': task1_prob_class1,
        'confidence': task1_confidence
    })

    top_200_reliable = rank_by_confidence(test1_predictions_df).head(200)
    # top_200_reliable[['smiles', 'predicted_class']].to_csv("task3_top_200_predictions.csv", index=False)
    emit(
        f"Identified {len(top_200_reliable)} most reliable predictions for test_1.csv.",
        "Top 5 most reliable predictions for test_1.csv:",
    )
    print(top_200_reliable.head())

    print("\n--- Task 3: Accuracy on Most Reliable (Demonstration) ---")
    # Same confidence measure, applied to the demonstration model's hold-out rows.
    demo_prob_class1 = demo_model.predict_proba(X_demo_val)[:, 1]
    demo_confidence = np.abs(demo_prob_class1 - 0.5)

    demo_val_results_df = pd.DataFrame({
        'true_class': y_demo_val,
        'predicted_class': demo_labels,
        'probability_class1': demo_prob_class1,
        'confidence': demo_confidence
    })

    # Keep the same fraction of the hold-out as 200 rows represent of test set 1.
    reliable_fraction = 200 / len(X_test1)
    N_demo = int(np.ceil(len(X_demo_val) * reliable_fraction))

    demo_top_rows = rank_by_confidence(demo_val_results_df).head(N_demo)
    demo_top_accuracy = accuracy_score(demo_top_rows['true_class'], demo_top_rows['predicted_class'])
    emit(
        f"Demonstration Accuracy on top {N_demo} most reliable predictions (from internal validation set): {demo_top_accuracy:.4f}",
        "Note: This accuracy is on an internal split of the training data for demonstration.",
        "The actual Task 3 Accuracy would be calculated using the top 200 predictions on test_1.csv and its true (hidden) labels.",
        "\n\n--- Summary of Files to be Generated for Submission (if uncommented) ---",
        "1. task1_predictions.csv: Contains 'smiles' and 'predicted_class_task1' for all molecules in test_1.csv.",
        "   (The submission file for Task 3 should be the first 200 rows of this file, if sorted by reliability).",
        "   Alternatively, a separate file like task3_top_200_predictions.csv can be made.",
        "2. task2_predictions.csv: Contains 'smiles', 'series', and 'predicted_class_task2' for all molecules in test_2.csv.",
        "\nTo create the submission file for Task 3 (first 200 rows of test_1.csv, ordered by reliability):",
        "  submission_task3_df = top_200_reliable[['smiles', 'predicted_class']]",
        "  # submission_task3_df.to_csv('submission_task3.csv', index=False)",
        "  The 'task1_predictions.csv' file, if it were to be the base for Task 3, would need to be sorted by reliability first, then the top 200 taken.",
        "  A common approach for Kaggle-like challenges is to submit a single prediction file for test_set_1, and the platform evaluates Task 1 and Task 3 from it.",
        "  For Task 3, it implies the submitted predictions for test_set_1 should be ordered by reliability, most reliable first.",
        "  Let's prepare a test_1_submission_ordered.csv for this purpose:",
    )

    test1_submission_ordered_df = rank_by_confidence(test1_predictions_df)[['smiles', 'predicted_class']]
    # test1_submission_ordered_df.to_csv("test_1_submission_ordered_by_reliability.csv", index=False, header=False) # Often no header for submission
    emit(
        "  'test_1_submission_ordered_by_reliability.csv' (if uncommented) would be structured for Task 1 and Task 3 evaluation.",
        f"  Shape of ordered test_1 submission: {test1_submission_ordered_df.shape}",
    )


In [8]:
# Run the whole pipeline. load_data() reads train.csv, test_1.csv and test_2.csv
# via relative paths, so they must be in the current working directory.
main()

Data loaded successfully.
Train shape: (9415, 4297)
Test1 shape: (750, 4296)
Test2 shape: (478, 4297)
Identified 199 molecular features.
Identified 2048 ECFP features.
Identified 2048 FCFP features.
Molecular features scaled.

--- Training Final Model on Full Training Data ---
[LightGBM] [Info] Number of positive: 4816, number of negative: 4599
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.414011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31564
[LightGBM] [Info] Number of data points in the train set: 9415, number of used features: 3530
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511524 -> initscore=0.046105
[LightGBM] [Info] Start training from score 0.046105
Model trained successfully.

--- Task 1: Predictions for test_1.csv ---
Predictions for test_1.csv generated.
Example predictions for test_1.csv: [0 0 0 1 1 1 1 0 0 1]

--- Task 1: Cohen Kappa Score (Demonstration) ---
Training a 