# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

def extract_advanced_features(data):
    """
    Add first-difference features to ``data`` and return it.

    Two columns are written onto the object that was passed in (the input is
    modified in place, not copied), and that same object is returned:

    * ``acceleration``  - row-to-row change of the ``speed`` column
    * ``turning_angle`` - row-to-row change of the ``direction`` column

    The first row has no predecessor, so its difference is missing; that and
    any other missing difference is replaced with 0.
    """
    derived_from = (
        ('acceleration', 'speed'),
        ('turning_angle', 'direction'),
    )
    for new_column, source_column in derived_from:
        step_change = data[source_column].diff()
        data[new_column] = step_change.fillna(0)
    return data

def hyperparameter_tuning(X_train, y_train):
    """
    Exhaustively search an XGBoost hyperparameter grid with GridSearchCV.

    Every combination in the grid below is scored by accuracy under 5-fold
    stratified cross-validation on ``(X_train, y_train)``. The winning
    combination is printed and the search's ``best_estimator_`` is returned.
    """
    # 7 hyperparameters with 3 candidate values each -> 3**7 = 2187
    # combinations, each of which is fitted once per fold.
    search_space = {
        'n_estimators': [100, 300, 500],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 3, 5],
    }

    base_classifier = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
    )
    fold_splitter = StratifiedKFold(n_splits=5)

    search = GridSearchCV(
        estimator=base_classifier,
        param_grid=search_space,
        cv=fold_splitter,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    print("Best parameters found: ", search.best_params_)
    return search.best_estimator_

def train_xgb_model(X, y, test_size=0.2, random_state=42, feature_names=None,
                    val_size=0.2, early_stopping_rounds=20):
    """
    train optimized XGB model with early stopping and feature importance visualization

    Steps: stratified train/test split, grid-search tuning on the training
    portion, a final fit with early stopping monitored on a validation split
    carved out of the training portion, a printed evaluation on the held-out
    test set, and a feature-importance plot.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels; used to stratify both splits.
    test_size : float, default 0.2
        Fraction of the data held out for the final evaluation.
    random_state : int, default 42
        Seed used for both splits.
    feature_names : sequence of str, optional
        Labels for the feature-importance plot. When omitted, the names are
        taken from ``X.columns`` if ``X`` has them, otherwise from a
        module-level ``feature_columns`` variable if one exists (the legacy
        behaviour of this function), otherwise generic ``f0, f1, ...`` names
        are generated.
    val_size : float, default 0.2
        Fraction of the *training* portion set aside to monitor early
        stopping.
    early_stopping_rounds : int, default 20
        Stop boosting after this many rounds without improvement on the
        validation split.

    Returns
    -------
    The fitted XGBoost classifier.
    """
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # hyperparameter tuning (cross-validated on the training portion only)
    best_xgb_model = hyperparameter_tuning(X_train, y_train)

    # Early stopping must not watch the test set: choosing the number of
    # boosting rounds on the same rows that are later used for the reported
    # metrics makes those metrics optimistic. Monitor a validation split
    # taken from the training portion instead.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state,
        stratify=y_train
    )

    # train the best model with early stopping
    fit_kwargs = {"eval_set": [(X_val, y_val)], "verbose": False}
    if "early_stopping_rounds" in best_xgb_model.get_params():
        # Releases that expose `early_stopping_rounds` as an estimator
        # parameter expect it there; newer ones reject it as a fit() keyword.
        best_xgb_model.set_params(early_stopping_rounds=early_stopping_rounds)
    else:
        # Older releases only accept it as a fit() keyword.
        fit_kwargs["early_stopping_rounds"] = early_stopping_rounds
    best_xgb_model.fit(X_fit, y_fit, **fit_kwargs)

    # make predictions
    y_pred = best_xgb_model.predict(X_test)

    # evaluate the model
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # Resolve plot labels without requiring a hidden global to exist.
    if feature_names is None:
        if hasattr(X, "columns"):
            feature_names = list(X.columns)
        else:
            # Backward compatibility: earlier versions read this global.
            feature_names = globals().get("feature_columns")
    if feature_names is None:
        feature_names = [f"f{i}" for i in range(np.asarray(X).shape[1])]

    # plot feature importance
    plot_feature_importance(best_xgb_model, feature_names)

    return best_xgb_model

def plot_feature_importance(model, feature_names):
    """
    Show a horizontal bar chart of ``model.feature_importances_``.

    Bars are plotted in ascending order of importance and labelled with the
    matching entry of ``feature_names``; ``feature_names[i]`` is taken to
    describe ``model.feature_importances_[i]``.
    """
    plt.figure(figsize=(8, 5))

    scores = model.feature_importances_
    ascending = np.argsort(scores)
    labels = np.array(feature_names)[ascending]

    plt.barh(labels, scores[ascending], color='#789e82')
    plt.title("XGBoost Feature Importance")
    plt.xlabel("Feature Importance Score")
    plt.ylabel("Features")
    plt.show()

if __name__ == "__main__":
    # Script entry point: load the CSV, derive extra features, scale the
    # feature columns, and train/evaluate the XGBoost model.

    # load dataset
    dataset_csv = "fly_features_og.csv"  
    dataset = pd.read_csv(dataset_csv)
    
    # feature engineering: adds the 'acceleration' and 'turning_angle' columns
    dataset = extract_advanced_features(dataset)
    
    # normalize features
    # NOTE: train_xgb_model() looks up `feature_columns` as a module-level
    # name when plotting feature importance, so keep it defined under this name.
    feature_columns = ["dx", "dy", "speed", "direction", "acceleration", "turning_angle"]
    # NOTE(review): the scaler is fitted on the full dataset here, before
    # train_xgb_model() splits off its test set, so test rows contribute to
    # the scaling statistics - confirm this is acceptable.
    scaler = StandardScaler()
    dataset[feature_columns] = scaler.fit_transform(dataset[feature_columns])
    
    # prepare features (X) and labels (y) as plain arrays via `.values`
    X = dataset[feature_columns].values
    y = dataset["label"].values  
    
    # train the optimized XGBoost model
    model = train_xgb_model(X, y)