In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
from xgboost import XGBClassifier
from imblearn.over_sampling import ADASYN, SMOTE  # Import ADASYN and SMOTE for oversampling
import sklearn

# Load dataset
df = pd.read_csv("dataset.csv")

# Drop rows with NaN in the target column (last column)
df = df.dropna(subset=[df.columns[-1]])

# Features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

#  Fixing class imbalance using ADASYN or SMOTE
#  Convert continuous target to categorical using binning or other suitable method
# For example, using quantile-based binning:
num_bins = 5  # Choose the number of bins
y = pd.qcut(y, q=num_bins, labels=False, duplicates='drop')  # labels=False to get numerical labels

# Check class distribution before applying ADASYN or SMOTE
print("Class distribution before resampling:", np.unique(y, return_counts=True))

# Decide whether to apply ADASYN or SMOTE based on class distribution
if len(np.unique(y)) > 1:  # Check if there are more than one class
    # Check if the class distribution is significantly imbalanced
    counts = np.unique(y, return_counts=True)[1]
    mean_count = np.mean(counts)
    if np.any(counts < mean_count * 0.75):  # If any class is less than 75% of the mean
        print("Resampling the dataset due to imbalance.")
        # You can choose between ADASYN or SMOTE, or use one for resampling
        # Here, using ADASYN to resample the minority class
        adasyn = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5)
        X_resampled, y_resampled = adasyn.fit_resample(X, y)
        print("Resampled with ADASYN.")
    else:
        print("Classes are balanced. Skipping resampling.")
        X_resampled, y_resampled = X, y  # If the class distribution is already balanced
else:
    print("Warning: Only one class detected. Skipping ADASYN and SMOTE.")
    X_resampled, y_resampled = X, y  # If only one class, skip resampling

#  Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 🚿 Clean column names (remove special characters)
X_train.columns = X_train.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)

#  Base models with early stopping and reduced complexity
base_models = [
    ("lightgbm", lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)), #Removed early_stopping_rounds
    ("random_forest", RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)),
    ("xgboost", XGBClassifier(n_estimators=100, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)) #Removed early_stopping_rounds
]

# Use GradientBoostingClassifier for the meta model to speed things up
meta_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=42)

# Faster stacking model with reduced cross-validation
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=2, n_jobs=-1)

#  Train the stacked model on the resampled data
stacked_model.fit(X_train, y_train)

#  Make predictions
stacked_pred = stacked_model.predict(X_test)

# 🎯 Evaluation
stacked_acc = accuracy_score(y_test, stacked_pred)
print(f" Stacked Model Accuracy with ADASYN/SMOTE Resampling: {stacked_acc:.4f}")
print("\n Classification Report:\n")
print(classification_report(y_test, stacked_pred))

Class distribution before resampling: (array([0, 1, 2, 3, 4], dtype=int64), array([85525, 85526, 85524, 85774, 85276], dtype=int64))
Classes are balanced. Skipping resampling.
🔥 Stacked Model Accuracy with ADASYN/SMOTE Resampling: 0.9110

💌 Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.92      0.90     16953
           1       0.87      0.87      0.87     17064
           2       0.91      0.88      0.89     17006
           3       0.96      0.92      0.94     17392
           4       0.94      0.96      0.95     17110

    accuracy                           0.91     85525
   macro avg       0.91      0.91      0.91     85525
weighted avg       0.91      0.91      0.91     85525



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier

# Load dataset
df = pd.read_csv("dataset.csv")

# Drop unnecessary columns
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Remove duplicates
df = df.drop_duplicates()

# Separate features & target variable
X = df.drop(columns=["label"])
y = df["label"]

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize models
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss", random_state=42)

# Train models
log_reg.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Get predictions
log_reg_pred = log_reg.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

# Evaluate base models
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, log_reg_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, log_reg_pred))

print(f"\nRandom Forest Accuracy: {accuracy_score(y_test, rf_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, rf_pred))

print(f"\nXGBoost Accuracy: {accuracy_score(y_test, xgb_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, xgb_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Logistic Regression Accuracy: 0.9654
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     52311
           1       0.84      0.67      0.74       424
           2       1.00      1.00      1.00      9894
           3       0.86      0.76      0.81      6416

    accuracy                           0.97     69045
   macro avg       0.92      0.85      0.88     69045
weighted avg       0.96      0.97      0.96     69045


Random Forest Accuracy: 0.9798
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     52311
           1       0.99      0.99      0.99       424
           2       1.00      1.00      1.00      9894
           3       0.89      0.89      0.89      6416

    accuracy                           0.98     69045
   macro avg       0.97      0.97      0.97     69045
weighted avg       0.98      0.98      0.98     69045


XGBoost Accu