In [30]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

## Ensemble Learning Voting

In [13]:
raw_data = pd.read_csv('feature_data.csv')
raw_data.head()
data = raw_data.iloc[:, 2:]
labels = raw_data.iloc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42, shuffle=False)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create base models

random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_leaf=1, min_samples_split=2)
knn_model = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=10)
svm_model = SVC(kernel='rbf', C=10, probability=True)
xgb_model = XGBClassifier(learning_rate=0.036, n_estimators=56, max_depth=17, min_child_weight=1, gamma=0.72, subsample=0.58)

# Create an ensemble model using a Voting Classifier
ensemble_model = VotingClassifier(estimators=[
    ('rf', random_forest_model),
    # ('knn', knn_model),
    # ('svm', svm_model),
    # ('xgb', xgb_model)
], voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

In [14]:
# Make predictions
y_pred = ensemble_model.predict(X_test)

# Classification accuracy
print(classification_report(y_test, y_pred))
roc_auc_score(y_test, ensemble_model.predict(X_test))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      8711
           1       0.70      0.47      0.56       128

    accuracy                           0.99      8839
   macro avg       0.84      0.73      0.78      8839
weighted avg       0.99      0.99      0.99      8839



0.7328826340259442

# Using UnderSampling

In [21]:
from imblearn.under_sampling import NearMiss

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42, shuffle=False)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
undersample = NearMiss(version=3, n_neighbors=10)
X_under, y_under = undersample.fit_resample(X_train, y_train)

# Create base models
decision_tree_model = DecisionTreeClassifier(max_depth=5)
knn_model = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=10)
svm_model = SVC(kernel='rbf', C=10, probability=True)

# Create an ensemble model using a Voting Classifier
ensemble_model = VotingClassifier(estimators=[
    ('dt', decision_tree_model),
    ('knn', knn_model),
    ('svm', svm_model)
], voting='soft')  # 'soft' for probability voting, 'hard' for majority voting

# Train the ensemble model
ensemble_model.fit(X_under, y_under)

In [22]:
# Make predictions
y_pred = ensemble_model.predict(X_test)

# Classification accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89      8711
           1       0.06      0.83      0.11       128

    accuracy                           0.80      8839
   macro avg       0.53      0.81      0.50      8839
weighted avg       0.98      0.80      0.88      8839



# Using Stacking

In [27]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42, shuffle=False)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create base models
random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_leaf=1, min_samples_split=2)
knn_model = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=10)
svm_model = SVC(kernel='rbf', C=10, probability=True)
xgb_model = XGBClassifier(learning_rate=0.036, n_estimators=56, max_depth=17, min_child_weight=1, gamma=0.72, subsample=0.58)

# Create an ensemble model using a Voting Classifier
ensemble_model = StackingClassifier(
    estimators=[
    ('knn', knn_model),
    ('svm', svm_model),
    ('xgb', xgb_model)
    ],
    final_estimator=RandomForestClassifier(n_estimators=300, max_depth=20)
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

In [28]:
# Make predictions
y_pred = ensemble_model.predict(X_test)

# Classification accuracy
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, ensemble_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      8711
           1       0.62      0.44      0.51       128

    accuracy                           0.99      8839
   macro avg       0.81      0.72      0.75      8839
weighted avg       0.99      0.99      0.99      8839

0.7167984444954655


# Using Undersampling

In [34]:
from imblearn.under_sampling import NearMiss
# Prepare data
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42, shuffle=False)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
undersample = NearMiss(version=3, n_neighbors=10)
X_under, y_under = undersample.fit_resample(X_train, y_train)

# Create base models
random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_leaf=1, min_samples_split=2)
knn_model = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=10)
svm_model = SVC(kernel='rbf', C=10, probability=True)
xgb_model = XGBClassifier(learning_rate=0.036, n_estimators=56, max_depth=17, min_child_weight=1, gamma=0.72, subsample=0.58)

# Create an ensemble model using a Voting Classifier
ensemble_model = StackingClassifier(
    estimators=[
    ('knn', knn_model),
    ('svm', svm_model),
    ('xgb', xgb_model)
    ],
    final_estimator=RandomForestClassifier(n_estimators=300, max_depth=20)
)

# Train the ensemble model
ensemble_model.fit(X_under, y_under)

In [36]:
# Make predictions
y_pred = ensemble_model.predict(X_test)

# Classification accuracy
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, ensemble_model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92      8711
           1       0.08      0.82      0.15       128

    accuracy                           0.86      8839
   macro avg       0.54      0.84      0.54      8839
weighted avg       0.98      0.86      0.91      8839

0.841335219119504


# Oversampling

In [37]:
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X_train, y_train)


# Create base models
random_forest_model = RandomForestClassifier(n_estimators=300, max_depth=20, min_samples_leaf=1, min_samples_split=2)
knn_model = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=10)
svm_model = SVC(kernel='rbf', C=10, probability=True)
xgb_model = XGBClassifier(learning_rate=0.036, n_estimators=56, max_depth=17, min_child_weight=1, gamma=0.72, subsample=0.58)

# Create an ensemble model using a Voting Classifier
ensemble_model = StackingClassifier(
    estimators=[
    ('knn', knn_model),
    ('svm', svm_model),
    ('xgb', xgb_model)
    ],
    final_estimator=RandomForestClassifier(n_estimators=300, max_depth=20)
)

# Train the ensemble model
ensemble_model.fit(X_over, y_over)

In [38]:
# Make predictions
y_pred = ensemble_model.predict(X_test)

# Classification accuracy
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, ensemble_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      8711
           1       0.56      0.17      0.26       128

    accuracy                           0.99      8839
   macro avg       0.78      0.58      0.63      8839
weighted avg       0.98      0.99      0.98      8839

0.5849617222477327


# Test with RF

In [38]:

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42, shuffle=False)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create base models
random_forest_model = RandomForestClassifier()
knn_model = KNeighborsClassifier(n_neighbors=5, p=1, leaf_size=10)
svm_model = SVC(kernel='rbf', C=10, probability=True)

# Create an ensemble model using a Voting Classifier
ensemble_model = StackingClassifier(
    estimators=[
    ('rf', random_forest_model),
    ('knn', knn_model),
    ('svm', svm_model)
    ],
    final_estimator=RandomForestClassifier(n_estimators=20, random_state=42)
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

In [39]:
# Make predictions
y_pred = ensemble_model.predict(X_test)

# Classification accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8711
           1       0.21      0.23      0.22       128

    accuracy                           0.98      8839
   macro avg       0.60      0.61      0.60      8839
weighted avg       0.98      0.98      0.98      8839

