In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import seaborn as sns

In [31]:
df = pd.read_csv("highdimensionbigdata.csv")
df.replace(['NaN'],np.nan,inplace=True)
dropped = df.copy()
dropped.drop(columns=["wb","moth"],inplace=True)
dropped['trial'] = dropped['trial'].replace({'pre': 0, 'post': 1})


In [33]:
def train_xgboost_classifier(df, target_col, feature_cols, test_size=0.2, random_state=42):
    """
    Train an XGBoost classifier that handles missing values.
    Compatible with older XGBoost versions.
    
    Parameters:
    df: pandas DataFrame containing the data
    target_col: string, name of the target column
    feature_cols: list of strings, names of feature columns
    test_size: float, proportion of data to use for testing
    random_state: int, random seed for reproducibility
    
    Returns:
    model: trained XGBoost model
    X_test: test features
    y_test: test labels
    feature_importance: DataFrame of feature importance scores
    """
    
    # Prepare features and target
    X = df[feature_cols]
    y = df[target_col]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Create DMatrix objects for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    # Set parameters
    params = {
        'objective': 'binary:logistic',
        'max_depth': 4,
        'learning_rate': 0.1,
        'eval_metric': ['logloss', 'auc'],
        'silent': 1,
        'seed': random_state
    }
    
    # Set up evaluation list
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    
    # Train the model
    num_rounds = 100
    bst = xgb.train(
        params,
        dtrain,
        num_rounds,
        evallist,
        early_stopping_rounds=20,
        verbose_eval=False
    )
    
    # Convert to sklearn-style classifier for convenience
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        missing=np.nan,
        max_depth=4,
        learning_rate=0.1,
        n_estimators=bst.best_ntree_limit if hasattr(bst, 'best_ntree_limit') else num_rounds,
        random_state=random_state
    )
    
    # Fit the sklearn model for compatibility
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Print performance metrics
    print("\nModel Performance:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))
    
    return model, X_test, y_test, feature_importance


In [28]:
features = dropped.columns.tolist()
features.remove('trial')

In [34]:
model, X_test, y_test, importance = train_xgboost_classifier(
    df=dropped,
    target_col='trial',
    feature_cols=features
)

Parameters: { "silent" } are not used.




Model Performance:

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       269
           1       0.98      0.97      0.98       269

    accuracy                           0.98       538
   macro avg       0.98      0.98      0.98       538
weighted avg       0.98      0.98      0.98       538


ROC AUC Score: 0.9960061359019361


In [None]:
def plot_importance(importance_df, top_n=None):
    """
    Plot feature importance scores in a clear, visual way.
    """
    plt.figure(figsize=(10, 6))
    
    # Take top N features if specified
    if top_n:
        plot_data = importance_df.head(top_n)
    else:
        plot_data = importance_df
        
    # Create bar plot
    sns.barplot(
        data=plot_data,
        x='importance',
        y='feature',
        palette='viridis'
    )
    
    plt.title('Feature Importance Scores')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()


Unnamed: 0,feature,importance
34,rsa1,0.086032
0,lax_count,0.083788
11,lax2,0.068192
39,rba2,0.067911
19,lsa1,0.056768
42,rax1,0.056308
7,rsa_count,0.052986
10,lax1,0.049137
32,rdvm2,0.04293
15,lba2,0.041092
