## Add xgboost (XGBClassifier) to Requirements

pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

# import the functions from the evaluate_models.py

import sys

# import own modules
sys.path.append("..")  # Adds higher directory to python modules path.
from scripts import evaluate_models as em


# Apply a dark matplotlib style sheet to all following plots. The style file is
# referenced by URL, so this line needs network access when the cell is run.
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')


### Load the CSV into a Dataframe for Processing

Easy pipelines with a grid search for the following models:
- Random Forest
- XGBoost Classifier
- Logistic Regression
- SVC Support Vector Classifier

In [None]:
# Seed value intended for the random steps of this notebook
random_state = 42

# Read the prepared modelling data set into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/model_data.csv")

In [None]:
# Split off the target: pop() returns the "asd" column and removes it from df
# in place, so df only holds candidate features afterwards
target = df.pop("asd")

### Checking for highly correlated columns
After running these lines, think about which columns to drop additionally.

In [None]:
# Correlation matrix over all non-object columns. The column selection is done
# here because `num_cols` is only defined further down in the notebook.
numeric_cols = df.columns[df.dtypes != "object"]
corr_matrix = df[numeric_cols].corr()

# Scan only the strict upper triangle (k=1): every unordered column pair shows
# up exactly once and the diagonal is skipped, so perfectly correlated
# (|r| == 1) distinct columns are reported too instead of being filtered out.
corr_values = corr_matrix.to_numpy()
row_idx, col_idx = np.triu_indices_from(corr_values, k=1)
is_strong = np.abs(corr_values[row_idx, col_idx]) >= 0.8  # NaN compares False
high_corr_pairs = np.column_stack((row_idx[is_strong], col_idx[is_strong]))

# Print each strongly correlated pair and collect the involved column names
# (a column appears once per pair it takes part in)
high_corr_cols = []
for i, j in high_corr_pairs:
    col1, col2 = corr_matrix.columns[i], corr_matrix.columns[j]
    print(f"Correlation between {col1} and {col2} is {corr_matrix.iloc[i, j]}")
    high_corr_cols.append(col1)
    high_corr_cols.append(col2)

### Think about whether you want to drop some of the highly correlated column pairs

In [None]:
# Columns excluded from the feature set; extend this list with any further
# columns that should be dropped from the final df.
# NOTE(review): "dummy_feature_name" looks like a placeholder -- confirm it is a
# real column, since drop() raises a KeyError for labels that do not exist.
cols_to_drop = ["id", "img", "sp_idx", "dummy_feature_name"]
df.drop(columns=cols_to_drop, inplace=True)

In [None]:

# Feature matrix (remaining columns of df) and target vector for the models
X, y = df, target

In [None]:
# Hold out 20 % of the rows as test set. The notebook-level seed is reused
# instead of a second hard-coded 42 so the split follows `random_state`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)


In [None]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, all other columns as numeric. More columns can be added to
# either index manually later on.
is_object_col = df.dtypes == "object"
num_cols = df.columns[~is_object_col]
cat_cols = df.columns[is_object_col]

In [None]:
# Hyperparameter grids for GridSearchCV. Every key is prefixed with
# "classifier__" so it is routed to the pipeline step named "classifier".

# Random forest: ensemble size and tree-growth limits
param_grid_rf = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
}

# XGBoost: number of boosting rounds, tree depth and shrinkage
param_grid_xgb = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [3, 5, 7, None],
    "classifier__learning_rate": [0.1, 0.01, 0.001],
}

# Support vector classifier
param_grid_svc = {
    "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],  # Kernel types to try
    "classifier__C": [0.1, 1, 10, 100],  # Regularization parameter values
    "classifier__gamma": ["scale", "auto"],  # Kernel coefficient (rbf, poly, sigmoid)
    "classifier__degree": [2, 3, 4],  # Polynomial degree (only used by the poly kernel)
}

# Logistic regression. The default "lbfgs" solver does not support the l1
# penalty, so every l1 candidate would fail to fit; "liblinear" handles both
# l1 and l2 and is therefore fixed for the whole grid.
param_grid_logreg = {
    "classifier__penalty": ["l1", "l2"],  # Penalty type: l1 (Lasso) or l2 (Ridge)
    "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Inverse regularization strength
    "classifier__solver": ["liblinear"],  # Solver compatible with both penalties
}

### Column Transformers for Data Preprocessing
These are only used for certain models, which we expect to perform better with preprocessed data.

In [None]:
# Column-wise preprocessing steps as (name, transformer, columns) tuples:
# numeric columns go through a MinMaxScaler, object columns are one-hot encoded
# with the first level dropped. Append further steps to the end if needed.
transformer = [
    ("scaler", MinMaxScaler(), num_cols),
    ("ohe", OneHotEncoder(drop="first"), cat_cols),
]

# Columns not claimed by any step are passed through unchanged
preprocessing = ColumnTransformer(transformers=transformer, remainder="passthrough")

##  Single Pipelines for each Model

### Random Forest Pipeline
Random Forest needs no scaling or encoding;
we will run it with the default data and a grid search.

In [None]:
# Random forest on the feature columns as they are (no preprocessing step).
# The notebook seed is passed so repeated runs of the grid search build the
# same forests and give comparable scores.
forrest_pipeline = Pipeline(steps=[
    ("classifier", RandomForestClassifier(random_state=random_state)),
])

### XGBoost Classifier Pipeline
Some hyperparameters profit from scaled data,
so we will apply a scaler here.

In [None]:
# XGBoost classifier behind the shared preprocessing step
xgb_steps = [
    ("preprocessor", preprocessing),
    ("classifier", XGBClassifier()),
]
xgb_pipeline = Pipeline(steps=xgb_steps)

### Logistic Regression Pipeline

This model type profits from scaling and encoding! We apply our preprocessor here.

In [None]:
# Logistic regression on the preprocessed features; the iteration limit is
# raised to 1000
logreg_clf = LogisticRegression(max_iter=1000)
log_pipeline = Pipeline(
    steps=[("preprocessor", preprocessing), ("classifier", logreg_clf)]
)

### Support Vector Classifier Pipeline

We will Scale and Encode here as well, due to the nature of this Algorithm

In [None]:
# Support vector classifier fed with the preprocessed features
svc_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessing),
        ("classifier", SVC()),
    ]
)

# Grid Search CV for the Best Param of each Pipeline

### Random Forest Pipeline - Best Params/Est

In [None]:
# 5-fold grid search over the random forest grid, candidates ranked by F1.
# fit() returns the fitted search object, so the call can be chained.
grid_search_rf = GridSearchCV(
    estimator=forrest_pipeline,
    param_grid=param_grid_rf,
    scoring="f1",
    cv=5,
).fit(X_train, y_train)

# Winning hyperparameter combination and the corresponding fitted pipeline
best_params_rf, best_est_rf = grid_search_rf.best_params_, grid_search_rf.best_estimator_
print("Best params for RF are:", best_params_rf)
print("Best est for RF are:", best_est_rf)



### XGBoost Classifier Pipeline - Best Params/Est

In [None]:
# Search settings: 5 cross-validation folds, F1 as the selection metric
xgb_search_kwargs = {"param_grid": param_grid_xgb, "cv": 5, "scoring": "f1"}

# Run the grid search for the XGBoost pipeline on the training split
grid_search_xgb = GridSearchCV(xgb_pipeline, **xgb_search_kwargs)
grid_search_xgb.fit(X_train, y_train)

# Keep the best parameter set and the best fitted pipeline, then report both
best_params_xgb = grid_search_xgb.best_params_
best_est_xgb = grid_search_xgb.best_estimator_
for label, value in (
    ("Best params for XGB are:", best_params_xgb),
    ("Best est for XGB are:", best_est_xgb),
):
    print(label, value)

### Logistic Regression Pipeline - Best Params/Est

In [None]:
# Grid search for the logistic regression pipeline: 5 folds, scored by F1
grid_search_log = GridSearchCV(
    log_pipeline,
    param_grid_logreg,
    scoring="f1",
    cv=5,
)
grid_search_log.fit(X_train, y_train)

# Best parameter combination found and the pipeline fitted with it
best_params_log = grid_search_log.best_params_
print("Best params for LogReg are:", best_params_log)
best_est_log = grid_search_log.best_estimator_
print("Best est for LogReg are:", best_est_log)

### Support Vector Classifier Pipeline - Best Params/Est

In [None]:
# Grid search for the SVC pipeline: 5-fold cross-validation, scored by F1
grid_search_svc = GridSearchCV(svc_pipeline, param_grid=param_grid_svc, cv=5, scoring="f1")
grid_search_svc.fit(X_train, y_train)

# Get the best parameters and best estimator from the SVC search itself
# (previously these were read from the logistic regression search by mistake)
best_params_svc = grid_search_svc.best_params_
best_est_svc = grid_search_svc.best_estimator_
print("Best params for SVC are:", best_params_svc)
print("Best est for SVC are:", best_est_svc)

# Now everything at Once - Get the best GridSearch CV parameters for each Model

In [None]:
# One pipeline per model, all sharing the same preprocessing step.
# The random forest gets the notebook seed for reproducible searches, and the
# logistic regression uses the same raised iteration limit as `log_pipeline`.
pipelines = {
    "RandomForestClassifier": Pipeline([
        ("preprocessing", preprocessing),
        ("classifier", RandomForestClassifier(random_state=random_state))
    ]),
    "XGBClassifier": Pipeline([
        ("preprocessing", preprocessing),
        ("classifier", XGBClassifier())
    ]),
    "SVC": Pipeline([
        ("preprocessing", preprocessing),
        ("classifier", SVC())
    ]),
    "LogisticRegression": Pipeline([
        ("preprocessing", preprocessing),
        ("classifier", LogisticRegression(max_iter=1000))
    ])
}

# Parameter grid per model, keyed by the same names as `pipelines`
param_grids = {
    "RandomForestClassifier": {
        "classifier__n_estimators": [100, 200, 300],
        "classifier__max_depth": [None, 10, 20],
        "classifier__min_samples_split": [2, 5, 10],
        "classifier__min_samples_leaf": [1, 2, 4]
    },
    "XGBClassifier": {
        "classifier__n_estimators": [100, 200, 300],
        "classifier__max_depth": [3, 5, 7, None],
        "classifier__learning_rate": [0.1, 0.01, 0.001]
    },
    "SVC": {
        "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
        "classifier__C": [0.1, 1, 10, 100],
        "classifier__gamma": ["scale", "auto"],
        "classifier__degree": [2, 3, 4]
    },
    "LogisticRegression": {
        "classifier__penalty": ["l1", "l2"],
        "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        # The default lbfgs solver cannot fit an l1 penalty; liblinear
        # supports both l1 and l2, so no candidate fails during the search.
        "classifier__solver": ["liblinear"]
    }
}

# Run a 5-fold, F1-scored grid search for each pipeline. The fitted search
# objects are kept so the best estimators remain available after the loop
# instead of being overwritten on every iteration.
grid_searches = {}
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='f1')
    grid_search.fit(X_train, y_train)
    grid_searches[model_name] = grid_search

    print(f"Best parameters for {model_name}:")
    print(grid_search.best_params_)

### Predicting and Modelling Section (Maybe do it in other Notebook with Pickle File)

### Do the Predicting on our Models: Don't forget to scale or preprocess X_test too! Have a look at the pipelines and Code above