train.csv - The training set.
Id Unique identifier for each observation.
AB-GL Fifty-six anonymized health characteristics. All are numeric except for EJ, which is categorical.
Class A binary target: 1 indicates the subject has been diagnosed with one of the three conditions, 0 indicates they have not.

[Logistic Regression, Random Forest, Gradient Boosting Models (e.g., XGBoost, LightGBM, Support Vector Machines (SVM), Decision Trees, Naive Bayes, k-nearest neighbors]

GridSearchCV
RandomizedSearchCV
Bayesian Optimization
HyperOpt
Optuna

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [8]:
data = pd.read_csv('train.csv')
print(data.shape)
data.head()

(617, 58)


Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      557 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      615 non-null    float64
 17  CC      614 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

In [10]:
data['EJ'].value_counts()

B    395
A    222
Name: EJ, dtype: int64

In [11]:
# converting categorical columns
data = data.replace({'EJ':{'A':0, 'B':1}})

In [12]:
data.isnull().sum()

Id        0
AB        0
AF        0
AH        0
AM        0
AR        0
AX        0
AY        0
AZ        0
BC        0
BD        0
BN        0
BP        0
BQ       60
BR        0
BZ        0
CB        2
CC        3
CD        0
CF        0
CH        0
CL        0
CR        0
CS        0
CU        0
CW        0
DA        0
DE        0
DF        0
DH        0
DI        0
DL        0
DN        0
DU        1
DV        0
DY        0
EB        0
EE        0
EG        0
EH        0
EJ        0
EL       60
EP        0
EU        0
FC        1
FD        0
FE        0
FI        0
FL        1
FR        0
FS        2
GB        0
GE        0
GF        0
GH        0
GI        0
GL        1
Class     0
dtype: int64

In [13]:
# fill missing age values with the median value
data['BQ'] = data['BQ'].fillna(data['BQ'].mean())
data['CB'] = data['CB'].fillna(data['CB'].mean())
data['CC'] = data['CC'].fillna(data['CC'].mean())
data['DU'] = data['DU'].fillna(data['DU'].mean())
data['EL'] = data['EL'].fillna(data['EL'].mean())

data['FC'] = data['FC'].fillna(data['FC'].mean())
data['FL'] = data['FL'].fillna(data['FL'].mean())
data['FS'] = data['FS'].fillna(data['FS'].mean())
data['GL'] = data['GL'].fillna(data['GL'].mean())

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      617 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      617 non-null    float64
 17  CC      617 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

In [15]:
data.columns

Index(['Id', 'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'],
      dtype='object')

In [16]:
data['Class'].value_counts()

0    509
1    108
Name: Class, dtype: int64

In [17]:
data.describe()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
count,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,...,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0
mean,0.477149,3502.013221,118.624513,38.968552,10.128242,5.545576,0.06032,10.566447,8.053012,5350.388655,...,5.433199,3.533905,0.421501,20.724856,131.714987,14679.595398,31.489716,50.584437,8.530961,0.175041
std,0.468388,2300.322717,127.83895,69.728226,10.518877,2.551696,0.416817,4.350645,65.166943,3021.326641,...,11.486922,50.181948,1.303244,9.991907,144.181524,19352.959387,9.864239,36.266251,10.318624,0.38031
min,0.081187,192.59328,85.200147,3.177522,8.138688,0.699861,0.025578,3.396778,1.2299,1693.62432,...,0.173229,0.49706,0.06773,4.102182,72.611063,13.038894,9.432735,0.897628,0.001129,0.0
25%,0.252107,2197.34548,85.200147,12.270314,8.138688,4.128294,0.025578,8.12958,1.2299,4155.70287,...,0.173229,0.49706,0.06773,14.036718,72.611063,2798.992584,25.034888,23.011684,0.124414,0.0
50%,0.354659,3120.31896,85.200147,20.53311,8.138688,5.031912,0.025578,10.46132,1.2299,4997.96073,...,3.036891,1.131,0.257374,18.771436,72.611063,7838.27361,30.608946,41.007968,0.339429,0.0
75%,0.559763,4361.63739,113.73954,39.139886,8.138688,6.431634,0.036845,12.969516,5.081244,6035.8857,...,6.237329,1.51206,0.535067,25.608406,127.591671,19035.70924,36.863947,67.931664,21.978,0.0
max,6.161666,28688.18766,1910.123198,630.51823,178.943634,38.27088,10.315851,38.971568,1463.693448,53060.59924,...,137.932739,1244.22702,31.365763,135.781294,1497.351958,143790.0712,81.210825,191.194764,21.978,1.0


In [18]:
correlation_matrix = data.corr()

correlation_threshold = 0.5

In [19]:
# Find highly correlated features
highly_correlated_features = correlation_matrix[((correlation_matrix > correlation_threshold) | (correlation_matrix < -correlation_threshold)) & (correlation_matrix != 1)]
highly_correlated_features = highly_correlated_features.unstack().dropna().reset_index()

# Print the highly correlated features
print("Highly Correlated Features:")
for index, row in highly_correlated_features.iterrows():
    feature1 = row['level_0']
    feature2 = row['level_1']
    correlation = row[0]
    print(f"{feature1} - {feature2}: {correlation:.2f}")

Highly Correlated Features:
AB - AM: 0.53
AH - AR: 0.75
AH - CH: 0.68
AH - CL: 0.69
AH - CS: 0.63
AH - DV: 0.75
AH - EB: 0.71
AH - EP: 0.68
AM - AB: 0.53
AR - AH: 0.75
AR - CH: 0.66
AR - CL: 0.75
AR - CS: 0.72
AR - DV: 0.82
AR - EB: 0.74
AR - EP: 0.75
AY - BD : 0.51
BC - BD : 0.75
BC - BZ: 0.91
BC - CF: 0.55
BD  - AY: 0.51
BD  - BC: 0.75
BD  - BZ: 0.68
BZ - BC: 0.91
BZ - BD : 0.68
BZ - CC: 0.51
BZ - CF: 0.54
CC - BZ: 0.51
CF - BC: 0.55
CF - BZ: 0.54
CH - AH: 0.68
CH - AR: 0.66
CH - CL: 0.56
CH - CS: 0.60
CH - DV: 0.61
CH - EB: 0.62
CH - EP: 0.57
CL - AH: 0.69
CL - AR: 0.75
CL - CH: 0.56
CL - CS: 0.63
CL - DV: 0.95
CL - EB: 0.62
CL - EP: 0.65
CS - AH: 0.63
CS - AR: 0.72
CS - CH: 0.60
CS - CL: 0.63
CS - DV: 0.69
CS - EB: 0.69
CS - EP: 0.79
DU - EH: 0.85
DU - FD : 0.81
DU - FL: 0.61
DV - AH: 0.75
DV - AR: 0.82
DV - CH: 0.61
DV - CL: 0.95
DV - CS: 0.69
DV - EB: 0.69
DV - EP: 0.72
EB - AH: 0.71
EB - AR: 0.74
EB - CH: 0.62
EB - CL: 0.62
EB - CS: 0.69
EB - DV: 0.69
EB - EP: 0.73
EH - DU: 0.85

In [20]:
# find highly correlated features with diagnosis
highly_correlated_features = np.abs(correlation_matrix['Class']).sort_values(ascending=False)
highly_correlated_features= highly_correlated_features[highly_correlated_features > correlation_threshold]

# print the highly correlated features
print("Highly Correlated Features with Diagnosis: ")
for feature, correlation in highly_correlated_features.iteritems():
    print(f"{feature}: {correlation:.2f}")

Highly Correlated Features with Diagnosis: 
Class: 1.00


In [21]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [22]:
X = data.drop(['Class', 'Id'], axis=1)
y = data['Class']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

###### logistic regression

In [19]:
model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 83.87096774193549


In [20]:
model = LogisticRegression()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 88.70967741935483


In [21]:
# define the parameter grid for hyperparameter tuning
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# create a logistic regression model
model = LogisticRegression()

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)
print()

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'C': 10}

Accuracy: 83.33333333333334


In [22]:
# define the parameter grid for hyperparameter tuning
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# create a logistic regression model
model = LogisticRegression()

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data
y_pred = best_model.predict(X_test_scaled)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)
print()

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'C': 0.1}

Accuracy: 89.24731182795699


###### support vector machines

In [23]:
model = SVC()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 81.72043010752688


In [24]:
model = SVC()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 89.78494623655914


In [25]:
# hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [0.1, 1, 10, 100],
    'kernel': ['lnear', 'rbf']
}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# get the best hyperparameters
best_params = grid_search.best_params_

# support vector machine model
model = SVC(**best_params)

# fit the model on the training data
model.fit(X_train, y_train)

# predict on the test datad
y_pred = model.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 79.56989247311827


In [26]:
# hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [0.1, 1, 10, 100],
    'kernel': ['lnear', 'rbf']
}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# get the best hyperparameters
best_params = grid_search.best_params_

# logistic regression model
model = SVC(**best_params)

# fit the model on the training data
model.fit(X_train_scaled, y_train)

# predict on the test data
y_pred = model.predict(X_test_scaled)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 81.18279569892472


###### random forest 

In [23]:
model = RandomForestClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 91.93548387096774


In [24]:
model = RandomForestClassifier()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 88.70967741935483


###### gradient boosting classifier

In [25]:
model = GradientBoostingClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 93.01075268817203


In [26]:
model = GradientBoostingClassifier()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

Accuracy: 85.48387096774194


In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

###### decision trees

In [28]:
# Create a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 88.70967741935483


In [29]:
# Create a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 66.66666666666666


In [30]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'max_depth': [3, 5, 7, None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['sqrt', 'log2', None]}

# Create a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)
print()

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'max_depth': 7, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}

Accuracy: 90.32258064516128


In [31]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'max_depth': [3, 5, 7, None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['sqrt', 'log2', None]}

# Create a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_scaled)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)
print()

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'max_depth': 7, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}

Accuracy: 61.82795698924731


###### naive bayes

In [32]:
# Create a Naive Bayes classifier
model = GaussianNB()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 84.94623655913979


In [33]:
# Create a Naive Bayes classifier
model = GaussianNB()

# Fit the model on the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 23.118279569892472


In [34]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}

# Create a Naive Bayes classifier
model = GaussianNB()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'var_smoothing': 0.01873817422860384}
Accuracy: 81.72043010752688


In [35]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}

# Create a Naive Bayes classifier
model = GaussianNB()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_scaled)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'var_smoothing': 5.3366992312063123e-05}
Accuracy: 25.268817204301076


###### k-nearest neighbors

In [36]:
# Create a KNN classifier
model = KNeighborsClassifier(n_neighbors=5)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 81.18279569892472


In [37]:
# Create a KNN classifier
model = KNeighborsClassifier(n_neighbors=5)

# Fit the model on the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

Accuracy: 87.63440860215054


In [38]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# Create a KNN classifier
model = KNeighborsClassifier()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'n_neighbors': 9}
Accuracy: 81.72043010752688


In [39]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# Create a KNN classifier
model = KNeighborsClassifier()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_scaled)

# print the best parameters found during hyperparameter tuning
print("\nBest Parameters: ")
print(best_params)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")


Best Parameters: 
{'n_neighbors': 3}
Accuracy: 88.17204301075269


In [26]:
import optuna

###### logistic regression

In [41]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    C = trial.suggest_loguniform("C", 0.01, 10)
    max_iter = trial.suggest_int("max_iter", 100, 1000, step=100)

    model = LogisticRegression(C=C, max_iter=max_iter, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the logistic regression model with the best hyperparameters
best_model = LogisticRegression(
    C=study.best_params["C"], max_iter=study.best_params["max_iter"], random_state=42
)
best_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:01:27,483][0m A new study created in memory with name: no-name-2ce06725-c99d-4c0a-8727-e9176857afaf[0m
[32m[I 2023-05-30 09:01:27,806][0m Trial 0 finished with value: -0.8064516129032258 and parameters: {'C': 4.470897909685265, 'max_iter': 300}. Best is trial 0 with value: -0.8064516129032258.[0m
[32m[I 2023-05-30 09:01:27,938][0m Trial 1 finished with value: -0.8225806451612904 and parameters: {'C': 0.03565299251224623, 'max_iter': 200}. Best is trial 1 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:01:28,249][0m Trial 2 finished with value: -0.8225806451612904 and parameters: {'C': 2.625693405975026, 'max_iter': 700}. Best is trial 1 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:01:28,617][0m Trial 3 finished with value: -0.8440860215053764 and parameters: {'C': 0.5009228837929462, 'max_iter': 800}. Best is trial 3 with value: -0.8440860215053764.[0m
[32m[I 2023-05-30 09:01:28,685][0m Trial 4 finished with value: -0.833333333

[32m[I 2023-05-30 09:01:42,091][0m Trial 41 finished with value: -0.8387096774193549 and parameters: {'C': 0.6449372496293956, 'max_iter': 100}. Best is trial 22 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:01:42,276][0m Trial 42 finished with value: -0.8064516129032258 and parameters: {'C': 0.32750190741607393, 'max_iter': 300}. Best is trial 22 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:01:42,403][0m Trial 43 finished with value: -0.8279569892473119 and parameters: {'C': 0.9737682882782778, 'max_iter': 200}. Best is trial 22 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:01:42,578][0m Trial 44 finished with value: -0.8064516129032258 and parameters: {'C': 0.5704259463657328, 'max_iter': 300}. Best is trial 22 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:01:42,671][0m Trial 45 finished with value: -0.8387096774193549 and parameters: {'C': 1.4722775254644773, 'max_iter': 100}. Best is trial 22 with value: -0.8494623655913979

[32m[I 2023-05-30 09:01:57,429][0m Trial 82 finished with value: -0.8387096774193549 and parameters: {'C': 0.32182632548758594, 'max_iter': 700}. Best is trial 22 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:01:57,832][0m Trial 83 finished with value: -0.8333333333333334 and parameters: {'C': 0.48889803105831847, 'max_iter': 800}. Best is trial 22 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:01:58,159][0m Trial 84 finished with value: -0.8548387096774194 and parameters: {'C': 0.3006039871164258, 'max_iter': 700}. Best is trial 84 with value: -0.8548387096774194.[0m
[32m[I 2023-05-30 09:01:58,516][0m Trial 85 finished with value: -0.8602150537634409 and parameters: {'C': 0.2823688668541671, 'max_iter': 700}. Best is trial 85 with value: -0.8602150537634409.[0m
[32m[I 2023-05-30 09:01:58,881][0m Trial 86 finished with value: -0.8225806451612904 and parameters: {'C': 0.4425075229563935, 'max_iter': 700}. Best is trial 85 with value: -0.860215053763440

Best Hyperparameters: {'C': 0.2823688668541671, 'max_iter': 700}
Best Accuracy: 0.8602150537634409
Accuracy: 86.02150537634408


In [42]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    C = trial.suggest_loguniform("C", 0.01, 10)
    max_iter = trial.suggest_int("max_iter", 100, 1000, step=100)

    model = LogisticRegression(C=C, max_iter=max_iter, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the logistic regression model with the best hyperparameters
best_model = LogisticRegression(
    C=study.best_params["C"], max_iter=study.best_params["max_iter"], random_state=42
)
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:03:04,341][0m A new study created in memory with name: no-name-7b6c30c6-fc38-4556-a3ed-baae4a7c61e2[0m
[32m[I 2023-05-30 09:03:04,366][0m Trial 0 finished with value: -0.8924731182795699 and parameters: {'C': 0.13001539984480753, 'max_iter': 100}. Best is trial 0 with value: -0.8924731182795699.[0m
[32m[I 2023-05-30 09:03:04,393][0m Trial 1 finished with value: -0.8870967741935484 and parameters: {'C': 1.1223182969757548, 'max_iter': 200}. Best is trial 0 with value: -0.8924731182795699.[0m
[32m[I 2023-05-30 09:03:04,421][0m Trial 2 finished with value: -0.8978494623655914 and parameters: {'C': 0.4686368453467886, 'max_iter': 500}. Best is trial 2 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:03:04,443][0m Trial 3 finished with value: -0.8924731182795699 and parameters: {'C': 0.3410523817783667, 'max_iter': 500}. Best is trial 2 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:03:04,477][0m Trial 4 finished with value: -0.8924731

[32m[I 2023-05-30 09:03:06,564][0m Trial 41 finished with value: -0.9086021505376344 and parameters: {'C': 0.05893402197539367, 'max_iter': 100}. Best is trial 41 with value: -0.9086021505376344.[0m
[32m[I 2023-05-30 09:03:06,610][0m Trial 42 finished with value: -0.9032258064516129 and parameters: {'C': 0.047097013379811044, 'max_iter': 100}. Best is trial 41 with value: -0.9086021505376344.[0m
[32m[I 2023-05-30 09:03:06,654][0m Trial 43 finished with value: -0.9086021505376344 and parameters: {'C': 0.05098363556849973, 'max_iter': 100}. Best is trial 41 with value: -0.9086021505376344.[0m
[32m[I 2023-05-30 09:03:06,693][0m Trial 44 finished with value: -0.9086021505376344 and parameters: {'C': 0.05646316976765035, 'max_iter': 100}. Best is trial 41 with value: -0.9086021505376344.[0m
[32m[I 2023-05-30 09:03:06,738][0m Trial 45 finished with value: -0.9086021505376344 and parameters: {'C': 0.05803105329303791, 'max_iter': 100}. Best is trial 41 with value: -0.90860215053

[32m[I 2023-05-30 09:03:08,426][0m Trial 82 finished with value: -0.9139784946236559 and parameters: {'C': 0.07981382600575755, 'max_iter': 200}. Best is trial 47 with value: -0.9139784946236559.[0m
[32m[I 2023-05-30 09:03:08,467][0m Trial 83 finished with value: -0.9086021505376344 and parameters: {'C': 0.05006349112029403, 'max_iter': 100}. Best is trial 47 with value: -0.9139784946236559.[0m
[32m[I 2023-05-30 09:03:08,510][0m Trial 84 finished with value: -0.8924731182795699 and parameters: {'C': 0.1039013879974892, 'max_iter': 200}. Best is trial 47 with value: -0.9139784946236559.[0m
[32m[I 2023-05-30 09:03:08,550][0m Trial 85 finished with value: -0.9086021505376344 and parameters: {'C': 0.06417003192968325, 'max_iter': 300}. Best is trial 47 with value: -0.9139784946236559.[0m
[32m[I 2023-05-30 09:03:08,603][0m Trial 86 finished with value: -0.9032258064516129 and parameters: {'C': 0.09017197602319273, 'max_iter': 100}. Best is trial 47 with value: -0.9139784946236

Best Hyperparameters: {'C': 0.06713197643627092, 'max_iter': 100}
Best Accuracy: 0.9139784946236559
Accuracy: 91.39784946236558


###### decision tree

In [43]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the decision tree model with the best hyperparameters
best_model = DecisionTreeClassifier(
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    min_samples_leaf=study.best_params["min_samples_leaf"],
    random_state=42
)
best_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:03:09,261][0m A new study created in memory with name: no-name-c4901baa-2140-48d9-a993-bb296715ca82[0m
[32m[I 2023-05-30 09:03:09,300][0m Trial 0 finished with value: -0.8870967741935484 and parameters: {'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 5}. Best is trial 0 with value: -0.8870967741935484.[0m
[32m[I 2023-05-30 09:03:09,341][0m Trial 1 finished with value: -0.8709677419354839 and parameters: {'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 0 with value: -0.8870967741935484.[0m
[32m[I 2023-05-30 09:03:09,384][0m Trial 2 finished with value: -0.8870967741935484 and parameters: {'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 0 with value: -0.8870967741935484.[0m
[32m[I 2023-05-30 09:03:09,421][0m Trial 3 finished with value: -0.8924731182795699 and parameters: {'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 3 with value: -0.892473118279569

[32m[I 2023-05-30 09:03:11,815][0m Trial 37 finished with value: -0.8924731182795699 and parameters: {'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:11,898][0m Trial 38 finished with value: -0.8870967741935484 and parameters: {'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 5}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:11,996][0m Trial 39 finished with value: -0.9032258064516129 and parameters: {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:12,072][0m Trial 40 finished with value: -0.8924731182795699 and parameters: {'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:12,149][0m Trial 41 finished with value: -0.9193548387096774 and parameters: {'max_de

[32m[I 2023-05-30 09:03:14,997][0m Trial 74 finished with value: -0.8924731182795699 and parameters: {'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:15,085][0m Trial 75 finished with value: -0.9032258064516129 and parameters: {'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:15,179][0m Trial 76 finished with value: -0.8978494623655914 and parameters: {'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:15,260][0m Trial 77 finished with value: -0.8709677419354839 and parameters: {'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 7}. Best is trial 15 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:03:15,347][0m Trial 78 finished with value: -0.9032258064516129 and parameters: {'max_

Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 3}
Best Accuracy: 0.9193548387096774
Accuracy: 91.93548387096774


In [44]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the decision tree model with the best hyperparameters
best_model = DecisionTreeClassifier(
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    min_samples_leaf=study.best_params["min_samples_leaf"],
    random_state=42
)
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:03:17,357][0m A new study created in memory with name: no-name-fe2db8f5-5e68-4524-b467-8663359b94e4[0m
[32m[I 2023-05-30 09:03:17,391][0m Trial 0 finished with value: -0.8225806451612904 and parameters: {'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 6}. Best is trial 0 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:03:17,423][0m Trial 1 finished with value: -0.8225806451612904 and parameters: {'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 6}. Best is trial 0 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:03:17,458][0m Trial 2 finished with value: -0.7849462365591398 and parameters: {'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 10}. Best is trial 0 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:03:17,489][0m Trial 3 finished with value: -0.6612903225806451 and parameters: {'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 0 with value: -0.8225806451612

[32m[I 2023-05-30 09:03:19,641][0m Trial 37 finished with value: -0.8225806451612904 and parameters: {'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:19,730][0m Trial 38 finished with value: -0.6182795698924731 and parameters: {'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:19,810][0m Trial 39 finished with value: -0.7849462365591398 and parameters: {'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:19,885][0m Trial 40 finished with value: -0.6612903225806451 and parameters: {'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:19,961][0m Trial 41 finished with value: -0.8387096774193549 and parameters: {'max_d

[32m[I 2023-05-30 09:03:22,577][0m Trial 74 finished with value: -0.6559139784946236 and parameters: {'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:22,649][0m Trial 75 finished with value: -0.8225806451612904 and parameters: {'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 6}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:22,734][0m Trial 76 finished with value: -0.8387096774193549 and parameters: {'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 5}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:22,821][0m Trial 77 finished with value: -0.6236559139784946 and parameters: {'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 12 with value: -0.8387096774193549.[0m
[32m[I 2023-05-30 09:03:22,897][0m Trial 78 finished with value: -0.7634408602150538 and parameters: {'max_d

Best Hyperparameters: {'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 5}
Best Accuracy: 0.8387096774193549
Accuracy: 83.87096774193549


###### random forest

In [45]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the random forest model with the best hyperparameters
best_model = RandomForestClassifier(
    n_estimators=study.best_params["n_estimators"],
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    min_samples_leaf=study.best_params["min_samples_leaf"],
    random_state=42
)
best_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:03:24,744][0m A new study created in memory with name: no-name-d3263ba1-1637-4cbc-92e6-0679098561f6[0m
[32m[I 2023-05-30 09:03:25,598][0m Trial 0 finished with value: -0.9139784946236559 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: -0.9139784946236559.[0m
[32m[I 2023-05-30 09:03:33,944][0m Trial 1 finished with value: -0.9247311827956989 and parameters: {'n_estimators': 1000, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 1 with value: -0.9247311827956989.[0m
[32m[I 2023-05-30 09:03:40,772][0m Trial 2 finished with value: -0.8924731182795699 and parameters: {'n_estimators': 900, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 1 with value: -0.9247311827956989.[0m
[32m[I 2023-05-30 09:03:46,264][0m Trial 3 finished with value: -0.9139784946236559 and parameters: {'n_estimators': 700, 'max_depth': 7, 'min_sam

[32m[I 2023-05-30 09:05:08,172][0m Trial 34 finished with value: -0.9408602150537635 and parameters: {'n_estimators': 300, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:05:09,884][0m Trial 35 finished with value: -0.9354838709677419 and parameters: {'n_estimators': 200, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:05:12,528][0m Trial 36 finished with value: -0.9354838709677419 and parameters: {'n_estimators': 300, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:05:16,238][0m Trial 37 finished with value: -0.8763440860215054 and parameters: {'n_estimators': 500, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:05:19,340

[32m[I 2023-05-30 09:06:32,635][0m Trial 68 finished with value: -0.9408602150537635 and parameters: {'n_estimators': 300, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:06:33,632][0m Trial 69 finished with value: -0.956989247311828 and parameters: {'n_estimators': 100, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:06:34,602][0m Trial 70 finished with value: -0.9193548387096774 and parameters: {'n_estimators': 100, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:06:35,626][0m Trial 71 finished with value: -0.956989247311828 and parameters: {'n_estimators': 100, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 17 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:06:36,632][

Best Hyperparameters: {'n_estimators': 200, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 2}
Best Accuracy: 0.956989247311828
Accuracy: 95.6989247311828


In [46]:
95.6989247311828

95.6989247311828

In [47]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the random forest model with the best hyperparameters
best_model = RandomForestClassifier(
    n_estimators=study.best_params["n_estimators"],
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    min_samples_leaf=study.best_params["min_samples_leaf"],
    random_state=42
)
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:07:25,244][0m A new study created in memory with name: no-name-cac2e45b-55d9-4fa6-9e0a-3c7da1f1fc59[0m
[32m[I 2023-05-30 09:07:28,672][0m Trial 0 finished with value: -0.8870967741935484 and parameters: {'n_estimators': 400, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 0 with value: -0.8870967741935484.[0m
[32m[I 2023-05-30 09:07:33,713][0m Trial 1 finished with value: -0.9032258064516129 and parameters: {'n_estimators': 600, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 1 with value: -0.9032258064516129.[0m
[32m[I 2023-05-30 09:07:37,721][0m Trial 2 finished with value: -0.9032258064516129 and parameters: {'n_estimators': 500, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 1 with value: -0.9032258064516129.[0m
[32m[I 2023-05-30 09:07:39,947][0m Trial 3 finished with value: -0.8763440860215054 and parameters: {'n_estimators': 300, 'max_depth': 5, 'min_sam

[32m[I 2023-05-30 09:10:28,934][0m Trial 34 finished with value: -0.8924731182795699 and parameters: {'n_estimators': 600, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:10:32,635][0m Trial 35 finished with value: -0.9032258064516129 and parameters: {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:10:34,726][0m Trial 36 finished with value: -0.9139784946236559 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 6}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:10:39,431][0m Trial 37 finished with value: -0.9086021505376344 and parameters: {'n_estimators': 600, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:10:43

[32m[I 2023-05-30 09:13:21,382][0m Trial 68 finished with value: -0.9193548387096774 and parameters: {'n_estimators': 900, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:13:27,383][0m Trial 69 finished with value: -0.9193548387096774 and parameters: {'n_estimators': 900, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:13:33,558][0m Trial 70 finished with value: -0.8817204301075269 and parameters: {'n_estimators': 1000, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:13:39,559][0m Trial 71 finished with value: -0.9193548387096774 and parameters: {'n_estimators': 900, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 24 with value: -0.9193548387096774.[0m
[32m[I 2023-05-30 09:13:4

Best Hyperparameters: {'n_estimators': 900, 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 5}
Best Accuracy: 0.9193548387096774
Accuracy: 91.93548387096774


###### support vector machines

In [48]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    C = trial.suggest_loguniform("C", 1e-3, 1e3)
    gamma = trial.suggest_loguniform("gamma", 1e-3, 1e3)

    model = SVC(C=C, gamma=gamma, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the support vector machine model with the best hyperparameters
best_model = SVC(
    C=study.best_params["C"],
    gamma=study.best_params["gamma"],
    random_state=42
)
best_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:16:21,417][0m A new study created in memory with name: no-name-6e6b9f91-4831-4928-a746-55b2c761608f[0m
[32m[I 2023-05-30 09:16:21,513][0m Trial 0 finished with value: -0.7956989247311828 and parameters: {'C': 151.2421953738923, 'gamma': 220.5768944054082}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:21,611][0m Trial 1 finished with value: -0.7956989247311828 and parameters: {'C': 55.937305990653925, 'gamma': 54.75582052029073}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:21,703][0m Trial 2 finished with value: -0.7956989247311828 and parameters: {'C': 8.165106203133089, 'gamma': 453.25641569359357}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:21,799][0m Trial 3 finished with value: -0.7956989247311828 and parameters: {'C': 0.03194213804558949, 'gamma': 0.3084681751364566}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:21,893]

[32m[I 2023-05-30 09:16:25,830][0m Trial 39 finished with value: -0.7956989247311828 and parameters: {'C': 15.4692437528405, 'gamma': 39.700812759552186}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:25,951][0m Trial 40 finished with value: -0.7956989247311828 and parameters: {'C': 510.410744587342, 'gamma': 149.88303804887414}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:26,077][0m Trial 41 finished with value: -0.7956989247311828 and parameters: {'C': 111.00304028650304, 'gamma': 119.50344838516814}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:26,282][0m Trial 42 finished with value: -0.7956989247311828 and parameters: {'C': 15.184150677925144, 'gamma': 32.66682827598535}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:26,408][0m Trial 43 finished with value: -0.7956989247311828 and parameters: {'C': 667.5464726448087, 'gamma': 614.2640849316305}. 

[32m[I 2023-05-30 09:16:30,609][0m Trial 78 finished with value: -0.7956989247311828 and parameters: {'C': 1.7731897743987404, 'gamma': 271.48431744843174}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:30,736][0m Trial 79 finished with value: -0.7956989247311828 and parameters: {'C': 141.59275892040316, 'gamma': 664.9261635932362}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:30,849][0m Trial 80 finished with value: -0.7956989247311828 and parameters: {'C': 0.09415149566598836, 'gamma': 44.13460692809864}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:30,964][0m Trial 81 finished with value: -0.7956989247311828 and parameters: {'C': 6.283116447543879, 'gamma': 122.13064405093529}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:31,097][0m Trial 82 finished with value: -0.7956989247311828 and parameters: {'C': 23.70973493764282, 'gamma': 15.69317552220648

Best Hyperparameters: {'C': 151.2421953738923, 'gamma': 220.5768944054082}
Best Accuracy: 0.7956989247311828
Accuracy: 79.56989247311827


In [49]:
def objective(trial):
    C = trial.suggest_loguniform("C", 1e-3, 1e3)
    gamma = trial.suggest_loguniform("gamma", 1e-3, 1e3)

    model = SVC(C=C, gamma=gamma, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the support vector machine model with the best hyperparameters
best_model = SVC(
    C=study.best_params["C"],
    gamma=study.best_params["gamma"],
    random_state=42
)
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:16:33,233][0m A new study created in memory with name: no-name-b44da10e-1039-414f-8478-64e17dd57868[0m
[32m[I 2023-05-30 09:16:33,300][0m Trial 0 finished with value: -0.7956989247311828 and parameters: {'C': 0.0020671055041594825, 'gamma': 246.795438165782}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:33,372][0m Trial 1 finished with value: -0.7956989247311828 and parameters: {'C': 476.7339616918006, 'gamma': 0.9491913212468617}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:33,467][0m Trial 2 finished with value: -0.7956989247311828 and parameters: {'C': 168.0149087827016, 'gamma': 80.57631956815447}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:33,534][0m Trial 3 finished with value: -0.7956989247311828 and parameters: {'C': 9.198094812112187, 'gamma': 0.21202242991516618}. Best is trial 0 with value: -0.7956989247311828.[0m
[32m[I 2023-05-30 09:16:33,575

[32m[I 2023-05-30 09:16:36,665][0m Trial 39 finished with value: -0.8924731182795699 and parameters: {'C': 22.637070692146064, 'gamma': 0.02313727748792173}. Best is trial 37 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:16:36,722][0m Trial 40 finished with value: -0.9086021505376344 and parameters: {'C': 18.358658008186485, 'gamma': 0.017281841926823565}. Best is trial 40 with value: -0.9086021505376344.[0m
[32m[I 2023-05-30 09:16:36,772][0m Trial 41 finished with value: -0.9032258064516129 and parameters: {'C': 22.26254043492283, 'gamma': 0.020382499366019543}. Best is trial 40 with value: -0.9086021505376344.[0m
[32m[I 2023-05-30 09:16:36,827][0m Trial 42 finished with value: -0.9139784946236559 and parameters: {'C': 6.303410510857286, 'gamma': 0.017403054416094126}. Best is trial 42 with value: -0.9139784946236559.[0m
[32m[I 2023-05-30 09:16:36,871][0m Trial 43 finished with value: -0.9193548387096774 and parameters: {'C': 5.40616613023465, 'gamma': 0.01060

[32m[I 2023-05-30 09:16:38,757][0m Trial 78 finished with value: -0.9086021505376344 and parameters: {'C': 14.266847876934802, 'gamma': 0.0097036583721944}. Best is trial 49 with value: -0.9301075268817204.[0m
[32m[I 2023-05-30 09:16:38,843][0m Trial 79 finished with value: -0.8118279569892473 and parameters: {'C': 1.3676186911322152, 'gamma': 0.085881705514487}. Best is trial 49 with value: -0.9301075268817204.[0m
[32m[I 2023-05-30 09:16:38,918][0m Trial 80 finished with value: -0.8817204301075269 and parameters: {'C': 32.774758083095364, 'gamma': 0.0318949243257301}. Best is trial 49 with value: -0.9301075268817204.[0m
[32m[I 2023-05-30 09:16:38,971][0m Trial 81 finished with value: -0.9086021505376344 and parameters: {'C': 5.8728452983282535, 'gamma': 0.008133460275289503}. Best is trial 49 with value: -0.9301075268817204.[0m
[32m[I 2023-05-30 09:16:39,034][0m Trial 82 finished with value: -0.9086021505376344 and parameters: {'C': 3.925532168749192, 'gamma': 0.01597674

Best Hyperparameters: {'C': 4.337493225309348, 'gamma': 0.004428023949986605}
Best Accuracy: 0.9301075268817204
Accuracy: 93.01075268817203


###### naive bayes

In [50]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    var_smoothing = trial.suggest_loguniform("var_smoothing", 1e-9, 1e-3)

    model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the Naive Bayes model with the best hyperparameters
best_model = GaussianNB(var_smoothing=study.best_params["var_smoothing"])
best_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:16:40,022][0m A new study created in memory with name: no-name-2f67a592-c6f2-47e7-a6d3-63c03856505b[0m
[32m[I 2023-05-30 09:16:40,047][0m Trial 0 finished with value: -0.8225806451612904 and parameters: {'var_smoothing': 3.192176149858837e-07}. Best is trial 0 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:16:40,064][0m Trial 1 finished with value: -0.7956989247311828 and parameters: {'var_smoothing': 3.756903056743719e-05}. Best is trial 0 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:16:40,086][0m Trial 2 finished with value: -0.7956989247311828 and parameters: {'var_smoothing': 0.0005371774259767182}. Best is trial 0 with value: -0.8225806451612904.[0m
[32m[I 2023-05-30 09:16:40,110][0m Trial 3 finished with value: -0.8279569892473119 and parameters: {'var_smoothing': 4.323427041841371e-08}. Best is trial 3 with value: -0.8279569892473119.[0m
[32m[I 2023-05-30 09:16:40,122][0m Trial 4 finished with value: -0.7956989247311828

[32m[I 2023-05-30 09:16:41,037][0m Trial 41 finished with value: -0.8494623655913979 and parameters: {'var_smoothing': 1.7093569943156465e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:41,070][0m Trial 42 finished with value: -0.8494623655913979 and parameters: {'var_smoothing': 2.0532735252348487e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:41,095][0m Trial 43 finished with value: -0.8387096774193549 and parameters: {'var_smoothing': 4.540709302914249e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:41,126][0m Trial 44 finished with value: -0.8387096774193549 and parameters: {'var_smoothing': 6.516863884906183e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:41,154][0m Trial 45 finished with value: -0.8494623655913979 and parameters: {'var_smoothing': 1.9876634930029913e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[

[32m[I 2023-05-30 09:16:42,255][0m Trial 83 finished with value: -0.8387096774193549 and parameters: {'var_smoothing': 5.32031765438322e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:42,280][0m Trial 84 finished with value: -0.8494623655913979 and parameters: {'var_smoothing': 1.0735547313831625e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:42,307][0m Trial 85 finished with value: -0.8387096774193549 and parameters: {'var_smoothing': 3.2947172260762947e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:42,329][0m Trial 86 finished with value: -0.8494623655913979 and parameters: {'var_smoothing': 1.7408436775646666e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[32m[I 2023-05-30 09:16:42,353][0m Trial 87 finished with value: -0.8494623655913979 and parameters: {'var_smoothing': 1.0026579627703786e-09}. Best is trial 10 with value: -0.8494623655913979.[0m
[

Best Hyperparameters: {'var_smoothing': 1.175659108031642e-09}
Best Accuracy: 0.8494623655913979
Accuracy: 84.94623655913979


In [51]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    var_smoothing = trial.suggest_loguniform("var_smoothing", 1e-9, 1e-3)

    model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the Naive Bayes model with the best hyperparameters
best_model = GaussianNB(var_smoothing=study.best_params["var_smoothing"])
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:16:42,726][0m A new study created in memory with name: no-name-5504d6dc-a63f-4167-b116-e52e014a4df8[0m
[32m[I 2023-05-30 09:16:42,742][0m Trial 0 finished with value: -0.23655913978494625 and parameters: {'var_smoothing': 2.2463335405604533e-05}. Best is trial 0 with value: -0.23655913978494625.[0m
[32m[I 2023-05-30 09:16:42,756][0m Trial 1 finished with value: -0.23118279569892472 and parameters: {'var_smoothing': 1.2227676644561027e-06}. Best is trial 0 with value: -0.23655913978494625.[0m
[32m[I 2023-05-30 09:16:42,764][0m Trial 2 finished with value: -0.23655913978494625 and parameters: {'var_smoothing': 1.3355084363159967e-05}. Best is trial 0 with value: -0.23655913978494625.[0m
[32m[I 2023-05-30 09:16:42,780][0m Trial 3 finished with value: -0.23118279569892472 and parameters: {'var_smoothing': 2.110627909011757e-09}. Best is trial 0 with value: -0.23655913978494625.[0m
[32m[I 2023-05-30 09:16:42,788][0m Trial 4 finished with value: -0.23118

[32m[I 2023-05-30 09:16:43,436][0m Trial 41 finished with value: -0.2956989247311828 and parameters: {'var_smoothing': 0.0008694673421847314}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:43,453][0m Trial 42 finished with value: -0.2903225806451613 and parameters: {'var_smoothing': 0.00039837221663740796}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:43,476][0m Trial 43 finished with value: -0.3064516129032258 and parameters: {'var_smoothing': 0.0009400733688924175}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:43,495][0m Trial 44 finished with value: -0.26344086021505375 and parameters: {'var_smoothing': 0.00016253368579700298}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:43,512][0m Trial 45 finished with value: -0.2956989247311828 and parameters: {'var_smoothing': 0.0005706232768467147}. Best is trial 15 with value: -0.3064516129032258.[0m
[

[32m[I 2023-05-30 09:16:44,303][0m Trial 83 finished with value: -0.2956989247311828 and parameters: {'var_smoothing': 0.000483324476125053}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:44,320][0m Trial 84 finished with value: -0.2956989247311828 and parameters: {'var_smoothing': 0.0007054534107071393}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:44,344][0m Trial 85 finished with value: -0.27419354838709675 and parameters: {'var_smoothing': 0.0003178523414152558}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:44,363][0m Trial 86 finished with value: -0.26344086021505375 and parameters: {'var_smoothing': 0.00017248160005280998}. Best is trial 15 with value: -0.3064516129032258.[0m
[32m[I 2023-05-30 09:16:44,390][0m Trial 87 finished with value: -0.2956989247311828 and parameters: {'var_smoothing': 0.00047563239293546534}. Best is trial 15 with value: -0.3064516129032258.[0m
[

Best Hyperparameters: {'var_smoothing': 0.0009209123809787445}
Best Accuracy: 0.3064516129032258
Accuracy: 30.64516129032258


###### gradient boosting

In [52]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
    max_depth = trial.suggest_int("max_depth", 2, 8)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the gradient boosting model with the best hyperparameters
best_model = GradientBoostingClassifier(
    n_estimators=study.best_params["n_estimators"],
    learning_rate=study.best_params["learning_rate"],
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    random_state=42
)
best_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:16:44,683][0m A new study created in memory with name: no-name-bd478b31-85b6-4425-a5a4-f1c4c7e1defd[0m
[32m[I 2023-05-30 09:16:51,594][0m Trial 0 finished with value: -0.946236559139785 and parameters: {'n_estimators': 196, 'learning_rate': 0.058306396421710406, 'max_depth': 4, 'min_samples_split': 7}. Best is trial 0 with value: -0.946236559139785.[0m
[32m[I 2023-05-30 09:16:57,883][0m Trial 1 finished with value: -0.9139784946236559 and parameters: {'n_estimators': 333, 'learning_rate': 0.010090011428548191, 'max_depth': 2, 'min_samples_split': 5}. Best is trial 0 with value: -0.946236559139785.[0m
[32m[I 2023-05-30 09:17:00,424][0m Trial 2 finished with value: -0.9032258064516129 and parameters: {'n_estimators': 138, 'learning_rate': 0.020304017386952117, 'max_depth': 2, 'min_samples_split': 3}. Best is trial 0 with value: -0.946236559139785.[0m
[32m[I 2023-05-30 09:17:03,799][0m Trial 3 finished with value: -0.9247311827956989 and parameters: {'n_

[32m[I 2023-05-30 09:21:47,286][0m Trial 32 finished with value: -0.9354838709677419 and parameters: {'n_estimators': 149, 'learning_rate': 0.0726500898129506, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 6 with value: -0.9516129032258065.[0m
[32m[I 2023-05-30 09:21:51,375][0m Trial 33 finished with value: -0.9247311827956989 and parameters: {'n_estimators': 213, 'learning_rate': 0.05443112498409148, 'max_depth': 2, 'min_samples_split': 2}. Best is trial 6 with value: -0.9516129032258065.[0m
[32m[I 2023-05-30 09:21:55,923][0m Trial 34 finished with value: -0.9408602150537635 and parameters: {'n_estimators': 162, 'learning_rate': 0.04610372410037059, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 6 with value: -0.9516129032258065.[0m
[32m[I 2023-05-30 09:22:00,751][0m Trial 35 finished with value: -0.9354838709677419 and parameters: {'n_estimators': 253, 'learning_rate': 0.028572967830642445, 'max_depth': 2, 'min_samples_split': 2}. Best is trial 6 with valu

[32m[I 2023-05-30 09:25:31,794][0m Trial 64 finished with value: -0.9247311827956989 and parameters: {'n_estimators': 320, 'learning_rate': 0.07814428888299738, 'max_depth': 4, 'min_samples_split': 4}. Best is trial 6 with value: -0.9516129032258065.[0m
[32m[I 2023-05-30 09:25:38,683][0m Trial 65 finished with value: -0.9301075268817204 and parameters: {'n_estimators': 246, 'learning_rate': 0.04758556388388633, 'max_depth': 3, 'min_samples_split': 2}. Best is trial 6 with value: -0.9516129032258065.[0m
[32m[I 2023-05-30 09:25:48,466][0m Trial 66 finished with value: -0.9301075268817204 and parameters: {'n_estimators': 272, 'learning_rate': 0.08714619686483802, 'max_depth': 4, 'min_samples_split': 6}. Best is trial 6 with value: -0.9516129032258065.[0m
[32m[I 2023-05-30 09:25:58,215][0m Trial 67 finished with value: -0.9301075268817204 and parameters: {'n_estimators': 222, 'learning_rate': 0.07185003878130701, 'max_depth': 5, 'min_samples_split': 5}. Best is trial 6 with valu

[32m[I 2023-05-30 09:30:05,704][0m Trial 96 finished with value: -0.9408602150537635 and parameters: {'n_estimators': 319, 'learning_rate': 0.06621963929689981, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 73 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:30:11,362][0m Trial 97 finished with value: -0.9354838709677419 and parameters: {'n_estimators': 305, 'learning_rate': 0.0752924780543345, 'max_depth': 2, 'min_samples_split': 9}. Best is trial 73 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:30:20,126][0m Trial 98 finished with value: -0.9301075268817204 and parameters: {'n_estimators': 268, 'learning_rate': 0.061733273788508396, 'max_depth': 4, 'min_samples_split': 10}. Best is trial 73 with value: -0.956989247311828.[0m
[32m[I 2023-05-30 09:30:34,961][0m Trial 99 finished with value: -0.8655913978494624 and parameters: {'n_estimators': 332, 'learning_rate': 0.06900705826498994, 'max_depth': 7, 'min_samples_split': 8}. Best is trial 73 with va

Best Hyperparameters: {'n_estimators': 143, 'learning_rate': 0.06568851623231012, 'max_depth': 5, 'min_samples_split': 8}
Best Accuracy: 0.956989247311828
Accuracy: 95.6989247311828


In [53]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.1)
    max_depth = trial.suggest_int("max_depth", 2, 8)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return -accuracy_score(y_test, y_pred)  # maximize accuracy

# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best accuracy achieved
print("Best Hyperparameters:", study.best_params)
print("Best Accuracy:", -study.best_value)

# Train the gradient boosting model with the best hyperparameters
best_model = GradientBoostingClassifier(
    n_estimators=study.best_params["n_estimators"],
    learning_rate=study.best_params["learning_rate"],
    max_depth=study.best_params["max_depth"],
    min_samples_split=study.best_params["min_samples_split"],
    random_state=42
)
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test_scaled)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100}")

[32m[I 2023-05-30 09:30:40,567][0m A new study created in memory with name: no-name-62a54a0a-a16a-40a4-ba61-821e14d18306[0m
[32m[I 2023-05-30 09:30:45,752][0m Trial 0 finished with value: -0.8817204301075269 and parameters: {'n_estimators': 284, 'learning_rate': 0.05008164712024572, 'max_depth': 2, 'min_samples_split': 6}. Best is trial 0 with value: -0.8817204301075269.[0m
[32m[I 2023-05-30 09:30:57,498][0m Trial 1 finished with value: -0.5591397849462365 and parameters: {'n_estimators': 439, 'learning_rate': 0.08044612866690498, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 0 with value: -0.8817204301075269.[0m
[32m[I 2023-05-30 09:31:12,184][0m Trial 2 finished with value: -0.6021505376344086 and parameters: {'n_estimators': 471, 'learning_rate': 0.06858776810218326, 'max_depth': 7, 'min_samples_split': 9}. Best is trial 0 with value: -0.8817204301075269.[0m
[32m[I 2023-05-30 09:31:22,219][0m Trial 3 finished with value: -0.5913978494623656 and parameters: {'n

[32m[I 2023-05-30 09:34:21,233][0m Trial 32 finished with value: -0.8924731182795699 and parameters: {'n_estimators': 129, 'learning_rate': 0.04436839753253285, 'max_depth': 2, 'min_samples_split': 6}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:34:22,756][0m Trial 33 finished with value: -0.8709677419354839 and parameters: {'n_estimators': 81, 'learning_rate': 0.05091399502856877, 'max_depth': 2, 'min_samples_split': 4}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:34:28,572][0m Trial 34 finished with value: -0.8440860215053764 and parameters: {'n_estimators': 215, 'learning_rate': 0.08324144353399035, 'max_depth': 3, 'min_samples_split': 6}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:34:31,752][0m Trial 35 finished with value: -0.8763440860215054 and parameters: {'n_estimators': 169, 'learning_rate': 0.07141944682139696, 'max_depth': 2, 'min_samples_split': 7}. Best is trial 21 with v

[32m[I 2023-05-30 09:36:45,876][0m Trial 64 finished with value: -0.8655913978494624 and parameters: {'n_estimators': 107, 'learning_rate': 0.047059690782964486, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:36:49,436][0m Trial 65 finished with value: -0.8870967741935484 and parameters: {'n_estimators': 196, 'learning_rate': 0.05474242859447254, 'max_depth': 2, 'min_samples_split': 5}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:36:53,894][0m Trial 66 finished with value: -0.8870967741935484 and parameters: {'n_estimators': 235, 'learning_rate': 0.06716234597067405, 'max_depth': 2, 'min_samples_split': 7}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:36:56,879][0m Trial 67 finished with value: -0.8924731182795699 and parameters: {'n_estimators': 154, 'learning_rate': 0.04366278530992156, 'max_depth': 2, 'min_samples_split': 6}. Best is trial 21 with

[32m[I 2023-05-30 09:38:29,438][0m Trial 96 finished with value: -0.8817204301075269 and parameters: {'n_estimators': 138, 'learning_rate': 0.06129300354828444, 'max_depth': 2, 'min_samples_split': 5}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:38:33,174][0m Trial 97 finished with value: -0.8709677419354839 and parameters: {'n_estimators': 193, 'learning_rate': 0.055857508402025906, 'max_depth': 2, 'min_samples_split': 4}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:38:38,529][0m Trial 98 finished with value: -0.6559139784946236 and parameters: {'n_estimators': 91, 'learning_rate': 0.0418552547191322, 'max_depth': 7, 'min_samples_split': 5}. Best is trial 21 with value: -0.8978494623655914.[0m
[32m[I 2023-05-30 09:38:46,127][0m Trial 99 finished with value: -0.8333333333333334 and parameters: {'n_estimators': 280, 'learning_rate': 0.0526725510591582, 'max_depth': 3, 'min_samples_split': 7}. Best is trial 21 with va

Best Hyperparameters: {'n_estimators': 106, 'learning_rate': 0.053033662597125254, 'max_depth': 2, 'min_samples_split': 5}
Best Accuracy: 0.8978494623655914
Accuracy: 89.78494623655914


In [27]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

### using randomized search

###### gradient boosting

### 1

In [57]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Create a Gradient Boosting classifier
model = GradientBoostingClassifier(random_state=42)

# Perform Randomized Search CV to find the best hyperparameters
random_search = RandomizedSearchCV(model, param_grid, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Print the best parameters found during hyperparameter tuning
print("\nBest Parameters:")
print(best_params)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

KeyboardInterrupt: 

In [None]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Create a Gradient Boosting classifier
model = GradientBoostingClassifier(random_state=42)

# Perform Randomized Search CV to find the best hyperparameters
random_search = RandomizedSearchCV(model, param_grid, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Get the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_scaled)

# Print the best parameters found during hyperparameter tuning
print("\nBest Parameters:")
print(best_params)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}")

### 2

In [58]:
# Define the classifier
clf = GradientBoostingClassifier()

# Define the parameter grid for hyperparameter search
param_dist = {
    "n_estimators": sp_randint(10, 100),
    "learning_rate": [0.1, 0.05, 0.01],
    "max_depth": sp_randint(1, 10),
    "subsample": [0.5, 0.7, 1.0],
    "max_features": ["sqrt", "log2", None]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'learning_rate': 0.05, 'max_depth': 3, 'max_features': None, 'n_estimators': 73, 'subsample': 0.5}
Accuracy on Test Set: 92.47%


In [59]:
# Define the classifier
clf = GradientBoostingClassifier()

# Define the parameter grid for hyperparameter search
param_dist = {
    "n_estimators": sp_randint(10, 100),
    "learning_rate": [0.1, 0.05, 0.01],
    "max_depth": sp_randint(1, 10),
    "subsample": [0.5, 0.7, 1.0],
    "max_features": ["sqrt", "log2", None]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'learning_rate': 0.05, 'max_depth': 3, 'max_features': None, 'n_estimators': 73, 'subsample': 0.5}
Accuracy on Test Set: 83.87%


###### decision trees

In [60]:
model = DecisionTreeClassifier()

# Define the parameter grid for hyperparameter search
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": sp_randint(1, 10),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "max_features": sp_randint(1, X.shape[1] + 1),
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'criterion': 'entropy', 'max_depth': 5, 'max_features': 33, 'min_samples_leaf': 10, 'min_samples_split': 7}
Accuracy on Test Set: 87.10%


In [61]:
model = DecisionTreeClassifier()

# Define the parameter grid for hyperparameter search
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": sp_randint(1, 10),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "max_features": sp_randint(1, X.shape[1] + 1),
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'criterion': 'entropy', 'max_depth': 8, 'max_features': 24, 'min_samples_leaf': 3, 'min_samples_split': 2}
Accuracy on Test Set: 77.42%


###### random forest

In [62]:
# Define the classifier
clf = RandomForestClassifier()

# Define the parameter grid for hyperparameter search
param_dist = {
    "n_estimators": sp_randint(10, 100),
    "max_depth": [None] + list(range(1, 10)),
    "max_features": sp_randint(1, X.shape[1] + 1),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 8, 'max_features': 29, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 80}
Accuracy on Test Set: 94.62%


In [63]:
# Define the classifier
clf = RandomForestClassifier()

# Define the parameter grid for hyperparameter search
param_dist = {
    "n_estimators": sp_randint(10, 100),
    "max_depth": [None] + list(range(1, 10)),
    "max_features": sp_randint(1, X.shape[1] + 1),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 8, 'max_features': 29, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 80}
Accuracy on Test Set: 88.17%


In [28]:
from scipy.stats import uniform

###### logistic regression

In [66]:
# Define the classifier
clf = LogisticRegression()

# Define the parameter grid for hyperparameter search
param_dist = {
    "penalty": ["l1", "l2"],
    "C": uniform(loc=0, scale=4)
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'C': 2.473544037332349, 'penalty': 'l2'}
Accuracy on Test Set: 83.33%


In [67]:
# Define the classifier
clf = LogisticRegression()

# Define the parameter grid for hyperparameter search
param_dist = {
    "penalty": ["l1", "l2"],
    "C": uniform(loc=0, scale=4)
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  {'C': 0.23233444867279784, 'penalty': 'l2'}
Accuracy on Test Set: 89.25%


###### using support vector machines

In [68]:
# Define the classifier
clf = SVC()

# Define the parameter grid for hyperparameter search
param_dist = {
    "C": uniform(loc=0, scale=10),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

KeyboardInterrupt: 

In [None]:
# Define the classifier
clf = SVC()

# Define the parameter grid for hyperparameter search
param_dist = {
    "C": uniform(loc=0, scale=10),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=20, cv=5, random_state=42)
random_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

### using bayesian optimization

In [2]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
     ------------------------------------- 100.3/100.3 kB 52.4 kB/s eta 0:00:00
Collecting pyaml>=16.9
  Downloading pyaml-23.5.9-py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.5.9 scikit-optimize-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [29]:
from skopt import BayesSearchCV

###### logistic regression

In [30]:
# Define the classifier
clf = LogisticRegression()

# Define the parameter search space
param_space = {
    "C": (0.01, 10.0, "log-uniform"),
    "penalty": ["l1", "l2"],
    "fit_intercept": [True, False],
    "solver": ["liblinear", "saga"]
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('C', 9.961646744450592), ('fit_intercept', False), ('penalty', 'l1'), ('solver', 'liblinear')])
Accuracy on Test Set: 86.56%


In [31]:
# Define the classifier
clf = LogisticRegression()

# Define the parameter search space
param_space = {
    "C": (0.01, 10.0, "log-uniform"),
    "penalty": ["l1", "l2"],
    "fit_intercept": [True, False],
    "solver": ["liblinear", "saga"]
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('C', 0.010253943538922507), ('fit_intercept', True), ('penalty', 'l2'), ('solver', 'liblinear')])
Accuracy on Test Set: 90.86%


###### decision trees

In [32]:
# Define the classifier
clf = DecisionTreeClassifier()

# Define the parameter search space
param_space = {
    "criterion": ["gini", "entropy"],
    "max_depth": (1, 10),
    "min_samples_split": (2, 11),
    "min_samples_leaf": (1, 11),
    "max_features": (1, X.shape[1] + 1)
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('criterion', 'gini'), ('max_depth', 7), ('max_features', 50), ('min_samples_leaf', 5), ('min_samples_split', 5)])
Accuracy on Test Set: 85.48%


In [33]:
# Define the classifier
clf = DecisionTreeClassifier()

# Define the parameter search space
param_space = {
    "criterion": ["gini", "entropy"],
    "max_depth": (1, 10),
    "min_samples_split": (2, 11),
    "min_samples_leaf": (1, 11),
    "max_features": (1, X.shape[1] + 1)
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('criterion', 'gini'), ('max_depth', 8), ('max_features', 57), ('min_samples_leaf', 1), ('min_samples_split', 11)])
Accuracy on Test Set: 62.37%


###### random forest

In [34]:
# Define the classifier
clf = RandomForestClassifier()

# Define the parameter search space
param_space = {
    "n_estimators": (10, 100),
    "max_depth": (1, 10),
    "min_samples_split": (2, 11),
    "min_samples_leaf": (1, 11),
    "max_features": (1, X.shape[1] + 1)
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('max_depth', 6), ('max_features', 57), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 100)])
Accuracy on Test Set: 94.62%


In [35]:
# Define the classifier
clf = RandomForestClassifier()

# Define the parameter search space
param_space = {
    "n_estimators": (10, 100),
    "max_depth": (1, 10),
    "min_samples_split": (2, 11),
    "min_samples_leaf": (1, 11),
    "max_features": (1, X.shape[1] + 1)
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('max_depth', 5), ('max_features', 52), ('min_samples_leaf', 2), ('min_samples_split', 6), ('n_estimators', 27)])
Accuracy on Test Set: 79.57%


###### support vector machines

In [36]:
# Define the classifier
clf = SVC()

# Define the parameter search space
param_space = {
    "C": (0.01, 10.0, "log-uniform"),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": (0.01, 10.0, "log-uniform")
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

KeyboardInterrupt: 

###### naive bayes

In [37]:
# Define the classifier
clf = GaussianNB()

# Perform cross-validation to evaluate the classifier
scores = cross_val_score(clf, X, y, cv=5)

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)
print("Average Accuracy: {:.2f}%".format(scores.mean() * 100))

NameError: name 'cross_val_score' is not defined

###### k-nearest neighbors

In [38]:
# Define the classifier
clf = KNeighborsClassifier()

# Define the parameter search space
param_space = {
    "n_neighbors": (1, 10),
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('n_neighbors', 10), ('p', 2), ('weights', 'distance')])
Accuracy on Test Set: 81.18%


In [39]:
# Define the classifier
clf = KNeighborsClassifier()

# Define the parameter search space
param_space = {
    "n_neighbors": (1, 10),
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}

# Perform Bayesian Optimization for hyperparameter tuning
bayes_search = BayesSearchCV(
    clf,
    param_space,
    n_iter=20,
    cv=5,
    random_state=42
)
bayes_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters: ", bayes_search.best_params_)

# Evaluate the best model on the test set
best_model = bayes_search.best_estimator_
accuracy = best_model.score(X_test_scaled, y_test)
print("Accuracy on Test Set: {:.2f}%".format(accuracy * 100))

Best Hyperparameters:  OrderedDict([('n_neighbors', 2), ('p', 1), ('weights', 'distance')])
Accuracy on Test Set: 89.25%
