In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

In [2]:
df_train = pd.read_csv('..\\BT4240\\train_df.csv')
df_test = pd.read_csv('..\\BT4240\\test_df.csv')

In [3]:
smote = SMOTE(random_state=42)

X_train_before_SMOTE = df_train.drop('Target', axis=1)
y_train_before_SMOTE = df_train['Target']

X_train, y_train = smote.fit_resample(X_train_before_SMOTE, y_train_before_SMOTE)

X_test = df_test.drop('Target',axis=1)
y_test = df_test['Target']

In [4]:
print("Original class distribution:")
print(y_train_before_SMOTE.value_counts())
print("\nAfter SMOTE:")
print(pd.Series(y_train).value_counts())

Original class distribution:
Target
0    2102
1     994
Name: count, dtype: int64

After SMOTE:
Target
0    2102
1    2102
Name: count, dtype: int64


# Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=35,
    min_samples_leaf=8,
    criterion='gini',
    splitter='best',
    random_state=42
)

clf.fit(X_train, y_train)

decisionTree_y_pred = clf.predict(X_test)

decisionTree_y_proba = clf.predict_proba(X_test)[:, 1]

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features=None,
    random_state=42
)

# Fit the model to your training data
rf_clf.fit(X_train, y_train)

rf_clf_y_pred = rf_clf.predict(X_test)
rf_clf_y_proba = rf_clf.predict_proba(X_test)[:, 1]

# SVM

In [7]:
from sklearn.svm import SVC

# Initialize the SVM model with a linear kernel and probability estimates enabled
svm_clf = SVC(C=1, kernel='linear', probability=True, random_state=42)

# Train the model
svm_clf.fit(X_train, y_train)

# Get class predictions
svm_y_pred = svm_clf.predict(X_test)

# Get class probability estimates (for class 1)
svm_y_proba = svm_clf.predict_proba(X_test)[:, 1]


# XGBoost

In [8]:
from xgboost import XGBClassifier

# Initialize the XGBoost model with the given parameters
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=3,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=10,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'  # To suppress warning in newer versions
)

# Train the model
xgb_clf.fit(X_train, y_train)

# Get class predictions
xgb_y_pred = xgb_clf.predict(X_test)

# Get class probabilities for positive class (1)
xgb_y_proba = xgb_clf.predict_proba(X_test)[:, 1]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# Neural network

In [9]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

import tensorflow as tf

# Set the random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Convert features to float32 tensors (required by TensorFlow models)
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test.values, dtype=tf.float32)

# Convert labels to float32 tensors for binary classification
y_train_tensor = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.float32)


# Define the model
def create_model():
    model = Sequential([
        Dense(256, input_dim=X_train.shape[1], activation='tanh'),
        Dense(64, activation='tanh'),
        Dense(32, activation='tanh'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = create_model()
model.fit(X_train_tensor, y_train_tensor, epochs=10, batch_size=32, verbose=0)

# Get predicted probabilities and predictions
nn_y_proba = model.predict(X_test_tensor).flatten()
nn_y_pred = (nn_y_proba >= 0.5).astype(int)




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [10]:
# Ensemble

In [11]:
ensemble_proba_array = np.column_stack([
    decisionTree_y_proba,
    rf_clf_y_proba,   # rf_clf_y_proba
    svm_y_proba,
    xgb_y_proba,
    nn_y_proba
])

ensemble_proba_df = pd.DataFrame(
    ensemble_proba_array,
    columns=['DecisionTree', 'RandomForest', 'SVM', 'XGBoost', 'NeuralNet']
)

## Averaging

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Average the probabilities across models
ensemble_avg_proba = ensemble_proba_df.mean(axis=1)

# Convert probabilities to final predictions using threshold 0.5
ensemble_avg_pred = (ensemble_avg_proba >= 0.5).astype(int)

# Evaluate against true labels
accuracy = accuracy_score(y_test, ensemble_avg_pred)
precision = precision_score(y_test, ensemble_avg_pred)
recall = recall_score(y_test, ensemble_avg_pred)
f1 = f1_score(y_test, ensemble_avg_pred)

# Display the results
print(f"Ensemble (Averaging) Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


Ensemble (Averaging) Evaluation Metrics:
Accuracy : 0.8803
Precision: 0.8221
Recall   : 0.8009
F1 Score : 0.8114


## Voting

In [13]:
from scipy.stats import mode

# Convert all model probabilities to binary predictions (threshold = 0.5)
ensemble_preds = (ensemble_proba_df >= 0.5).astype(int)

# Perform majority voting across rows (axis=1)
voting_pred, _ = mode(ensemble_preds, axis=1, keepdims=False)

# Evaluate voting ensemble against y_test
accuracy = accuracy_score(y_test, voting_pred)
precision = precision_score(y_test, voting_pred)
recall = recall_score(y_test, voting_pred)
f1 = f1_score(y_test, voting_pred)

# Display results
print("Ensemble (Voting) Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


Ensemble (Voting) Evaluation Metrics:
Accuracy : 0.8855
Precision: 0.8345
Recall   : 0.8033
F1 Score : 0.8186


## Weighted probabilities according to f1-score

In [14]:
# Define the weights in the correct order
weights = np.array([0.76364, 0.7819, 0.8629, 0.849390, 0.7687])

# Apply weighted probability averaging
weighted_proba = (ensemble_proba_df.values * weights).sum(axis=1) / weights.sum()

# Convert to binary predictions using threshold 0.5
weighted_pred = (weighted_proba >= 0.5).astype(int)

# Evaluate
accuracy = accuracy_score(y_test, weighted_pred)
precision = precision_score(y_test, weighted_pred)
recall = recall_score(y_test, weighted_pred)
f1 = f1_score(y_test, weighted_pred)

# Print results
print("Ensemble (Weighted Probabilities) Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")


Ensemble (Weighted Probabilities) Evaluation Metrics:
Accuracy : 0.8788
Precision: 0.8197
Recall   : 0.7986
F1 Score : 0.8090


## Weighted majority voting according to f1-score

In [15]:
# Define model weights in order: [Decision Tree, Random Forest, SVM, XGBoost, Neural Net]
weights = np.array([0.76364, 0.7819, 0.8629, 0.849390, 0.7687])
total_weight = weights.sum()

# Convert model probabilities to binary predictions
binary_preds = (ensemble_proba_df >= 0.5).astype(int)

# Apply weighted voting: multiply predictions by weights
weighted_votes = binary_preds.values * weights

# Sum the weighted votes across models
vote_sums = weighted_votes.sum(axis=1)

# Final prediction: class 1 if vote sum >= half the total weight
weighted_vote_pred = (vote_sums >= (total_weight / 2)).astype(int)

# Evaluate
accuracy = accuracy_score(y_test, weighted_vote_pred)
precision = precision_score(y_test, weighted_vote_pred)
recall = recall_score(y_test, weighted_vote_pred)
f1 = f1_score(y_test, weighted_vote_pred)

# Display
print("Ensemble (Weighted Voting on Binary Predictions) Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")



Ensemble (Weighted Voting on Binary Predictions) Evaluation Metrics:
Accuracy : 0.8855
Precision: 0.8345
Recall   : 0.8033
F1 Score : 0.8186
