# Scale Data

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler 

In [None]:
# Compare how each scaler reshapes the feature distributions.
# The scaled frames keep the original column labels and index so that every
# panel plots the same *named* features as the "Before Scaling" panel;
# selecting scaled columns by integer position could silently pick
# different columns than the ones plotted by name.
scaler = RobustScaler()
robust_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

scaler = StandardScaler()
standard_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

scaler = MinMaxScaler()
minmax_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# KDE plots, one panel per scaler.
# Note: some columns are left out in order to speed up the process.
kde_columns = ['XStart', 'XEnd', 'YStart', 'YEnd', 'Focal',
               'XSize', 'YSize', 'XRadius', 'YRadius']

fig, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=4, figsize=(20, 5))
panels = [
    (ax1, 'Before Scaling', df),
    (ax2, 'After Robust Scaling', robust_df),
    (ax3, 'After Standard Scaling', standard_df),
    (ax4, 'After Min-Max Scaling', minmax_df),
]

for ax, title, frame in panels:
    ax.set_title(title)
    for column in kde_columns:
        sns.kdeplot(frame[column], ax=ax, label=column)
    # Each kdeplot call overwrites the x label with its own column name,
    # so set a neutral label and rely on the legend to identify the curves.
    ax.set_xlabel('Feature value')
    ax.legend(fontsize='small')

plt.show()

In [None]:
# Separate the feature matrix from the "Result" target column.
X = df.drop(columns="Result")
y = df.loc[:, "Result"]

In [None]:
# Hold out 20% of the rows for testing.
# The split is stratified on the target: the notebook treats the classes as
# heavily imbalanced (SMOTE and class weights are used further down), and an
# unstratified split can leave the test set with very few minority samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features with RobustScaler (MinMaxScaler is a drop-in alternative).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the transformation.
scaler = RobustScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier, Perceptron, SGDClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_predict

In [None]:
def evaluate_classification_algorithms(Xtrain, Xtest, ytrain, ytest, algorithms=None):
    """Fit several classifiers and report their test-set performance.

    For every algorithm the function fits on the training data, predicts on
    the test data, prints the classification report and shows the confusion
    matrix as a heatmap.

    Parameters
    ----------
    Xtrain, Xtest : array-like
        Training and testing feature matrices.
    ytrain, ytest : array-like
        Training and testing labels.
    algorithms : dict, optional
        Mapping of display name -> unfitted estimator exposing ``fit`` and
        ``predict``. When omitted, a default set of classifiers with their
        default hyper-parameters is evaluated.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    if algorithms is None:
        # Default set of classification algorithms
        algorithms = {
            'Logistic Regression': LogisticRegression(),
            'Decision Tree': DecisionTreeClassifier(),
            'Random Forest': RandomForestClassifier(),
            'SVM': SVC(),
            'KNN': KNeighborsClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(),
            'AdaBoost': AdaBoostClassifier(),
            'Neural Network': MLPClassifier(),
            'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis(),
            'Extra Trees Classifier': ExtraTreesClassifier(),
            'Bagging Classifier': BaggingClassifier(),
            'Passive Aggressive Classifier': PassiveAggressiveClassifier(),
            'Perceptron': Perceptron(),
            'XGBoost Classifier': XGBClassifier(),
            'LightGBM Classifier': LGBMClassifier()
        }

    for algorithm_name, algorithm in algorithms.items():
        print(f"Evaluating {algorithm_name}...")

        # Fit on the training data, then predict on the held-out test data
        algorithm.fit(Xtrain, ytrain)
        y_pred = algorithm.predict(Xtest)

        # Per-class precision / recall / F1
        print("Classification Report:")
        print(classification_report(ytest, y_pred))

        # Confusion matrix heatmap. The cells are integer counts, so they are
        # annotated with fmt='d' rather than a float format ("12.00").
        confusion_matrix_result = confusion_matrix(ytest, y_pred)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(confusion_matrix_result, fmt='d', annot=True, cmap='Blues', ax=ax)
        ax.set_title(f"{algorithm_name} Confusion Matrix")
        ax.set_xlabel("Predicted Class")
        ax.set_ylabel("True Class")
        plt.show()
        # One figure is created per algorithm; close it so figures do not
        # accumulate in memory across the loop.
        plt.close(fig)

        print("----------------------------------------\n\n")


In [None]:
# Baseline: evaluate every classifier on the scaled training data as-is.
evaluate_classification_algorithms(
    Xtrain=X_train_normalized,
    Xtest=X_test_normalized,
    ytrain=y_train,
    ytest=y_test,
)

# SMOTE Oversampling (ENN cleaning step not applied)

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Resample the scaled training split with SMOTE; the test split is left untouched.
sm = SMOTE(random_state=42)
# Series.ravel() is deprecated in recent pandas releases; to_numpy() yields
# the same 1-D label array.
X_train_res, y_train_res = sm.fit_resample(X_train_normalized, y_train.to_numpy())

In [None]:
# Report the class counts of the training labels before and after resampling.
for stage, labels in (('Before', y_train), ('After', y_train_res)):
    counter = Counter(labels)
    print(stage, counter)

In [None]:
# Side-by-side bar charts of the class counts before and after SMOTE.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 5))

for axis, title, labels in (
    (ax1, 'Before SMOTE', y_train),
    (ax2, 'After SMOTE', y_train_res),
):
    axis.set_title(title)
    class_counts = pd.Series(labels).value_counts()
    class_counts.plot.bar(ax=axis)

plt.show()

In [None]:
# Re-run the same evaluation, this time training on the SMOTE-resampled data.
evaluate_classification_algorithms(
    Xtrain=X_train_res,
    Xtest=X_test_normalized,
    ytrain=y_train_res.ravel(),
    ytest=y_test,
)

# Neural Network

In [None]:
# Model and performance
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import class_weight

In [None]:
# Manually chosen class weights passed to Keras: class 1 is weighted 205x
# relative to class 0 (200 was an earlier setting).
manual_weights = {0: 1, 1: 205}

# Train the neural network on the original (non-resampled) training split,
# using the class weights above instead of resampling.
# Create model
nn_model_mbalanced = Sequential()
# Take the input width from the data instead of hard-coding 9, so the model
# keeps working if the feature set changes.
n_features = X_train_normalized.shape[1]
nn_model_mbalanced.add(Dense(2, input_dim=n_features, activation='relu'))
nn_model_mbalanced.add(Dense(1, activation='sigmoid'))
# Compile model
nn_model_mbalanced.compile(loss='binary_crossentropy', optimizer='adam')
# Fit the model
nn_model_mbalanced.fit(X_train_normalized, y_train, epochs=100, batch_size=100, class_weight=manual_weights)
# Prediction
nn_model_mbalanced_prediction = nn_model_mbalanced.predict(X_test_normalized)
# Flatten the predictions and threshold at 0.5 in one vectorised step; the
# result is a plain list of 0/1 ints.
nn_model_mbalanced_classes = (nn_model_mbalanced_prediction.ravel() > 0.5).astype(int).tolist()
# Check the model performance
print(classification_report(y_test, nn_model_mbalanced_classes))

In [None]:
# Confusion matrix for the class-weighted neural network predictions
confusion_matrix_result = confusion_matrix(y_test, nn_model_mbalanced_classes)
print("Confusion Matrix:")
print(confusion_matrix_result)

# Heatmap of the confusion matrix. The cells are integer counts, so they are
# annotated with fmt='d' rather than a float format ("12.00").
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, fmt='d', annot=True, cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Class")
plt.ylabel("True Class")
plt.show()

# Logistic Regression Balanced Weights

In [1]:
# Import and train the model.
# This cell needs X_train_res / y_train_res from the SMOTE cell and
# X_test_normalized / y_test from the split cell; run those cells first.
# NOTE(review): after SMOTE the classes should be equal-sized, in which case
# class_weight='balanced' has no practical effect. If the intent is to compare
# class weighting against resampling, fit on X_train_normalized / y_train
# instead - confirm.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='newton-cg', class_weight='balanced')
lr.fit(X_train_res, y_train_res)

# Predict on the test data
pred_test = lr.predict(X_test_normalized)

# Print the full classification report (precision, recall, F1 per class)
class_report_result = classification_report(y_test, pred_test)
print("Classification Report:")
print(class_report_result)


# Generate confusion matrix
confusion_matrix_result = confusion_matrix(y_test, pred_test)
print("Confusion Matrix:")
print(confusion_matrix_result)

# Heatmap of the confusion matrix. The cells are integer counts, so they are
# annotated with fmt='d' rather than a float format ("12.00").
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, fmt='d', annot=True, cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Class")
plt.ylabel("True Class")
plt.show()

NameError: name 'X_train_res' is not defined