In [3]:
import pandas as pd

df = pd.read_csv("machine_learning.csv")

In [4]:
df.drop(columns="Unnamed: 0", inplace=True)

In [5]:
df.head()

Unnamed: 0,OHE__job_blue-collar,OHE__job_entrepreneur,OHE__job_housemaid,OHE__job_management,OHE__job_retired,OHE__job_self-employed,OHE__job_services,OHE__job_student,OHE__job_technician,OHE__job_unemployed,...,remainder__balance,remainder__day_of_week,remainder__duration,remainder__campaign,remainder__pdays,remainder__previous,remainder__default,remainder__housing,remainder__loan,remainder__y
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.958876,-1.351873,0.412248,-1.113596,0.0,0.0,1,1,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.631754,-1.351873,-0.264939,-1.113596,0.0,0.0,1,1,1,0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.679929,-1.351873,-0.978769,-1.113596,0.0,0.0,1,1,0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.593647,-1.351873,-0.793941,-1.113596,0.0,0.0,1,1,1,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.682348,-1.351873,0.057359,-1.113596,0.0,0.0,1,0,1,0


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = df.drop(columns="remainder__y")
y = df["remainder__y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
rf = RandomForestClassifier(n_estimators=300)

In [8]:
model = rf.fit(X_train, y_train)

In [9]:
pred = model.predict(X_test)

In [10]:
from sklearn.metrics import classification_report

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95     13671
           1       0.48      0.67      0.56      1249

    accuracy                           0.91     14920
   macro avg       0.72      0.80      0.75     14920
weighted avg       0.93      0.91      0.92     14920



In [11]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# # Dictionary to store models
# models = {
#     "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
#     "Logistic Regression": LogisticRegression(max_iter=200, random_state=42),
#     "Naive Bayes": GaussianNB(),
#     "Support Vector Machine": SVC(kernel="linear", random_state=42),
#     "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
#     "Decision Tree": DecisionTreeClassifier(random_state=42),
# }

# # Train and evaluate each model
# results = {}
# for name, model in models.items():
#     print(f"Training {name}...")
#     # Train the model
#     model.fit(X_train, y_train)
#     # Predict on test data
#     y_pred = model.predict(X_test)
#     # Evaluate accuracy
#     accuracy = accuracy_score(y_test, y_pred)
#     results[name] = accuracy
#     print(f"Accuracy for {name}: {accuracy:.4f}")
#     print("Classification Report:")
#     print(classification_report(y_test, y_pred))
#     print("Confusion Matrix:")
#     print(confusion_matrix(y_test, y_pred))
#     print("-" * 50)

# # Display results summary
# print("\nModel Performance Summary:")
# for name, accuracy in results.items():
#     print(f"{name}: {accuracy:.4f}")

In [12]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score


# Define the objective function
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 10, 300)
    max_depth = trial.suggest_int("max_depth", 2, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    # Create a RandomForest model with suggested hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )

    # Evaluate the model using cross-validation
    score = cross_val_score(rf, X_train, y_train, cv=3, scoring="accuracy").mean()
    return score


# Create and run the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Display the best parameters and accuracy
print("Best Parameters:", study.best_params)
print("Best Cross-Validation Accuracy:", study.best_value)

# Train and evaluate the final model with the best parameters
best_params = study.best_params
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# Test accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy with Best Parameters:", accuracy)

[I 2024-12-18 11:37:30,628] A new study created in memory with name: no-name-993a9cb2-2f1a-4b2f-9880-a360093b7331
[I 2024-12-18 11:37:41,087] Trial 0 finished with value: 0.898913868805916 and parameters: {'n_estimators': 196, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.898913868805916.
[I 2024-12-18 11:37:53,029] Trial 1 finished with value: 0.8995411178237761 and parameters: {'n_estimators': 203, 'max_depth': 11, 'min_samples_split': 12, 'min_samples_leaf': 20, 'max_features': 'log2'}. Best is trial 1 with value: 0.8995411178237761.
[I 2024-12-18 11:38:00,787] Trial 2 finished with value: 0.9054834769403453 and parameters: {'n_estimators': 94, 'max_depth': 29, 'min_samples_split': 16, 'min_samples_leaf': 15, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.9054834769403453.
[I 2024-12-18 11:38:05,657] Trial 3 finished with value: 0.8985177115314779 and parameters: {'n_estimators': 70, 'max_depth': 2, 'min

KeyboardInterrupt: 