In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import numpy as np

# Load the dataset. `data_path` is a placeholder and must point at a CSV file
# that contains a 'label' column before this cell can run.
data_path = ''
data = pd.read_csv(data_path)

# Cleaning: treat +/-inf as missing, then discard every row that has a missing value.
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Features are all columns except 'label'; the target is 'label' encoded as integers.
X = data.drop(columns='label')
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['label'])

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Preprocessing pipeline: mean-impute -> SMOTE -> standardize.
# NOTE: imblearn applies samplers (SMOTE) only while fitting. `transform()` skips
# them, so the arrays returned by `pipeline.transform(...)` are NOT oversampled.
# Here SMOTE therefore only affects the statistics the scaler is fitted on.
pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
])
pipeline.fit(X_train, y_train)

# Row-aligned (non-resampled) views: one output row per input row, so these stay
# aligned with y_train / y_test and can be reused by later cells.
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Rebuild the balanced training set explicitly so that the synthetic minority
# samples (and their labels) actually reach the classifier. Using the same seed
# as the pipeline's SMOTE step reproduces the resampling the scaler was fitted on.
fitted_imputer = pipeline.named_steps['imputer']
fitted_scaler = pipeline.named_steps['scaler']
X_train_balanced, y_train_balanced = SMOTE(random_state=42).fit_resample(
    fitted_imputer.transform(X_train),
    y_train,
)
X_train_balanced = fitted_scaler.transform(X_train_balanced)

# XGBoost classifier, trained on the SMOTE-balanced data.
# random_state is fixed for reproducibility, matching the reduced-feature models
# trained later in the notebook.
classifier = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)
classifier.fit(X_train_balanced, y_train_balanced)

# Predict labels for the held-out rows with the fitted classifier.
y_pred = classifier.predict(X_test_transformed)

# Overall accuracy on the test set.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", accuracy)

# Per-class metrics; classes appear as the integer codes produced by the label encoder.
print("Classification Report on test set:")
test_report = classification_report(y_test, y_pred)
print(test_report)

# Pair every feature name with the importance score reported by the fitted classifier.
features = X.columns
importances = classifier.feature_importances_
importances_df = pd.DataFrame(
    {
        'Feature': features,
        'Importance': importances,
    }
)

# Rank features from most to least important.
sorted_importances = importances_df.sort_values('Importance', ascending=False)

# Show the ranked table.
print("Sorted Feature Importances:")
print(sorted_importances)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Collects one (feature_count, accuracy) tuple per reduced-feature model.
results = []

# Wrap the transformed arrays in DataFrames labelled with the original feature
# names, so that columns can be selected by name below.
feature_names = X.columns
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)

# Train and score one reduced-feature model per feature count in the range below.
# As written, range(30, 31) yields only the value 30, so a single top-30 model is
# evaluated; widen the range to compare several feature counts.
for top_n in range(30, 31):
    # Names of the `top_n` highest-ranked features (sorted_importances is ordered
    # by descending importance).
    top_features = sorted_importances['Feature'].iloc[:top_n].tolist()

    # Restrict both splits to those columns.
    X_train_reduced = X_train_transformed.loc[:, top_features]
    X_test_reduced = X_test_transformed.loc[:, top_features]

    # Standardize the subset using statistics from the training split only.
    scaler = StandardScaler().fit(X_train_reduced)
    X_train_reduced_scaled = scaler.transform(X_train_reduced)
    X_test_reduced_scaled = scaler.transform(X_test_reduced)

    # Fit a fresh XGBoost classifier on the reduced, scaled training data.
    model_reduced = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
    )
    model_reduced.fit(X_train_reduced_scaled, y_train)

    # Score the reduced model on the held-out rows and record the result.
    y_pred_reduced = model_reduced.predict(X_test_reduced_scaled)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
    results.append((top_n, accuracy_reduced))

    # Report progress after each model run.
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of every (feature count, accuracy) pair collected above.
print("\nFinal List of Accuracies for Each Feature Count:")
for result in results:
    print(f"Top {result[0]} Features: Accuracy = {result[1]:.4f}")
