In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the preprocessed data from the CSV file
df = pd.read_csv('preprocessed_dataclassification.csv')

# Split the data into features (X) and target variable (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Classifier Accuracy: {accuracy_rf}')

# Train and evaluate Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Classifier Accuracy: {accuracy_dt}')

# Train and evaluate Logistic Regression Classifier
lr_clf = LogisticRegression(random_state=42)
lr_clf.fit(X_train, y_train)
y_pred_lr = lr_clf.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Classifier Accuracy: {accuracy_lr}')

Random Forest Classifier Accuracy: 0.8268156424581006
Decision Tree Classifier Accuracy: 0.8156424581005587
Logistic Regression Classifier Accuracy: 0.8044692737430168


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search for Random Forest three times
for i in range(3):
    rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, scoring='accuracy')
    rf_grid_search.fit(X_train, y_train)

    print(f"Random Forest Iteration {i+1} - Best Parameters:", rf_grid_search.best_params_)
    print(f"Random Forest Iteration {i+1} - Best Score:", rf_grid_search.best_score_)
    print()

# Define hyperparameters grid for Decision Tree
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search for Decision Tree three times
for i in range(3):
    dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5, scoring='accuracy')
    dt_grid_search.fit(X_train, y_train)

    print(f"Decision Tree Iteration {i+1} - Best Parameters:", dt_grid_search.best_params_)
    print(f"Decision Tree Iteration {i+1} - Best Score:", dt_grid_search.best_score_)
    print()

# Define hyperparameters grid for Logistic Regression
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Perform Grid Search for Logistic Regression three times
for i in range(3):
    lr_grid_search = GridSearchCV(LogisticRegression(random_state=42), lr_param_grid, cv=5, scoring='accuracy')
    lr_grid_search.fit(X_train, y_train)

    print(f"Logistic Regression Iteration {i+1} - Best Parameters:", lr_grid_search.best_params_)
    print(f"Logistic Regression Iteration {i+1} - Best Score:", lr_grid_search.best_score_)
    print()

Random Forest Iteration 1 - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest Iteration 1 - Best Score: 0.8314389835516597

Random Forest Iteration 2 - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest Iteration 2 - Best Score: 0.8314389835516597

Random Forest Iteration 3 - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest Iteration 3 - Best Score: 0.8314389835516597

Decision Tree Iteration 1 - Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5}
Decision Tree Iteration 1 - Best Score: 0.8118093174431202

Decision Tree Iteration 2 - Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5}
Decision Tree Iteration 2 - Best Score: 0.8118093174431202

Decision Tree Iteration 3 - Best Parameters: {'max_depth': 30, 'min_samples_leaf'

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Iteration 1 - Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Logistic Regression Iteration 1 - Best Score: 0.7990840145769724



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Iteration 2 - Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Logistic Regression Iteration 2 - Best Score: 0.7990840145769724



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Iteration 3 - Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Logistic Regression Iteration 3 - Best Score: 0.7990840145769724



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\U

In [5]:
from sklearn.metrics import accuracy_score

# Select the best model based on the hyperparameter tuning results
best_model = RandomForestClassifier(**{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}, random_state=42)
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy on Test Set: {accuracy_best}')

# Finalize the best model by retraining it on the entire dataset
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])
best_model.fit(X_full, y_full)

# Save the finalized model using joblib
import joblib
joblib.dump(best_model, 'final_classification_model.pkl')
print('Finalized model saved as final_model.pkl')

Best Model Accuracy on Test Set: 0.7597765363128491
Finalized model saved as final_model.pkl
