# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the dataset; it is expected to be stored as a CSV file
df = pd.read_csv(filepath_or_buffer='data/data.csv')

In [None]:
# Pull the label column out of the frame; everything else is a feature
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing, using a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training and Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Initialize models.
# The random forest is seeded with the same value as the train/test split so
# that repeated runs report the same scores. LogisticRegression only consults
# random_state for solvers that shuffle the data, so seeding it is harmless
# with the default solver and keeps results stable if the solver is changed.
rf = RandomForestClassifier(random_state=42)
svc = SVC()
lr = LogisticRegression(random_state=42)

# Fit each model on the scaled training data and report how it performs on
# the held-out test split
models = [rf, svc, lr]
model_names = ['Random Forest', 'SVM', 'Logistic Regression']

for model, name in zip(models, model_names):
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions on test set
    y_pred = model.predict(X_test_scaled)

    # Fraction of test samples whose predicted label matches the true label
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} accuracy: {accuracy:.4f}')

    # Confusion matrix: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_test, y_pred)
    print(f'{name} confusion matrix:')
    print(cm)


# Hyperparameter Tuning

# Define parameter grid for each model
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
}

svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}

# LogisticRegression's default 'lbfgs' solver does not support the 'l1'
# penalty, so every 'l1' candidate would fail to fit and the search would
# silently compare only the 'l2' half of the grid (or raise, depending on
# error_score). The solver is therefore pinned to 'saga', which supports both
# penalties and handles binary as well as multiclass targets.
# NOTE(review): 'saga' may need more than the default max_iter to converge;
# watch for ConvergenceWarning and raise max_iter if it appears.
lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],
}

# Perform an exhaustive 5-fold cross-validated grid search for each model,
# fitted on the scaled training data only
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=5)
rf_grid_search.fit(X_train_scaled, y_train)

svc_grid_search = GridSearchCV(svc, svc_param_grid, cv=5)
svc_grid_search.fit(X_train_scaled, y_train)

lr_grid_search = GridSearchCV(lr, lr_param_grid, cv=5)
lr_grid_search.fit(X_train_scaled, y_train)

# Get best parameters and scores
print('Random Forest - Best Parameters:', rf
