## 1. Import libraries

In [75]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

## 2. Load train and test Data

In [20]:
# Read the four pre-processed CSV splits; the unpacking order matches the name tuple.
X_train, y_train, X_test, y_test = [
    pd.read_csv(f"../Data/Processed_Data/{file}.csv")
    for file in ("X_train", "y_train", "X_test", "y_test")
]

In [58]:
# Convert y_train and y_test to 1-dimensional arrays.
# np.asarray() accepts both the DataFrame loaded from CSV and an array that has
# already been flattened, so re-running this cell no longer fails with
# "AttributeError: 'numpy.ndarray' object has no attribute 'values'".
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

## 3. Model Selection

Following the scikit-learn algorithm cheat-sheet below, I trained three candidate models and compared their accuracy to choose the best one.

<img src="https://scikit-learn.org/stable/_static/ml_map.png"/>

Models used:
   * **`K-Nearest Neighbors (KNN)`:**
        KNN is a simple classification algorithm that assigns a data point to the majority class of its k-nearest neighbors based on distance.

------------------
   * **`Logistic Regression`:**
        Logistic Regression is a classification algorithm that estimates probabilities and predicts the most likely class using the logistic (sigmoid) function.
-----------------
   * **`Random Forest`:**
        Random Forest is an ensemble learning method that combines multiple decision trees to make predictions, providing robust and accurate results.


## 4. Model Training

In [59]:
# Baseline candidates, each created with its default hyperparameters.
# Keys are the labels used when reporting scores.
models = dict(
    Logistic_Regression=LogisticRegression(),
    KNN=KNeighborsClassifier(),
    Random_Forest=RandomForestClassifier(),
)

In [60]:
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a label to an object exposing ``fit(X, y)`` and ``score(X, y)``.
        The objects are fitted in place.
    X_train, y_train
        Data passed to each model's ``fit``.
    X_test, y_test
        Data passed to each model's ``score``.

    Returns
    -------
    dict
        Maps each label to the value returned by that model's ``score``,
        in the iteration order of ``models``.
    """
    # Reset NumPy's global RNG so repeated calls start from the same state.
    np.random.seed(42)

    def _train_then_evaluate(estimator):
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {
        label: _train_then_evaluate(estimator)
        for label, estimator in models.items()
    }

In [61]:
# Fit each baseline model on the training split and show its score on the test split
fit_and_score(models, X_train, X_test, y_train, y_test)

{'Logistic_Regression': 0.8360655737704918,
 'KNN': 0.8524590163934426,
 'Random_Forest': 0.8852459016393442}

## 5. Hyperparameter Tuning

### 5-1. Using RandomizedSearchCV

In [62]:
# Create a hyperparameter grid for logistic regression
log_reg_grid = {
    "C": np.logspace(-4,4,20),
    "solver": ['liblinear']
}

np.random.seed(42)

# The grid holds only 20 combinations (20 values of C x 1 solver). Asking for
# more iterations than that makes scikit-learn emit a warning and fall back to
# 20 candidates anyway, so n_iter is set to the real size of the search space.
rs_log_reg=RandomizedSearchCV(LogisticRegression(),
                             param_distributions=log_reg_grid,
                             cv=5,
                             n_iter=20,
                             verbose=True)

rs_log_reg.fit(X_train, y_train)



Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [63]:
# Best hyperparameter combination found by the randomized search
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 0.012742749857031334}

In [64]:
# Score of the tuned logistic regression on the test split
rs_log_reg.score(X_test, y_test)

0.8524590163934426

In [65]:
# Hyperparameter search space for KNN
knn_grid = dict(
    n_neighbors=np.arange(1, 20, 2),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

np.random.seed(42)

# Randomized search: 20 sampled candidates, each evaluated with 5-fold CV
rs_knn = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=knn_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

rs_knn.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [66]:
# Best hyperparameter combination found by the randomized search
rs_knn.best_params_

{'weights': 'uniform', 'p': 2, 'n_neighbors': 17}

In [67]:
# Score of the tuned KNN on the test split
rs_knn.score(X_test, y_test)

0.8524590163934426

In [76]:
# Create a hyperparameter grid for random forest
rf_grid = {
    "n_estimators": np.arange(10,1000,50),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2,20,2),
    "min_samples_leaf": np.arange(1,20,2)
}

# Both the candidate sampling and the forests themselves are stochastic.
# Without a seed, best_params_ and the test score change on every run, so
# random_state is fixed on the estimator and on the search.
rs_rf=RandomizedSearchCV(RandomForestClassifier(random_state=42),
                             param_distributions=rf_grid,
                             cv=5,
                             n_iter=20,
                             random_state=42,
                             verbose=True)

rs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [77]:
# Best hyperparameter combination found by the randomized search
rs_rf.best_params_

{'n_estimators': 260,
 'min_samples_split': 12,
 'min_samples_leaf': 13,
 'max_depth': 5}

In [78]:
# Score of the tuned random forest on the test split
rs_rf.score(X_test, y_test)

0.8360655737704918

### 5-2. Using GridSearchCV

In [81]:
# Hyperparameter grid for logistic regression
log_reg_grid = dict(
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)

np.random.seed(42)

# Exhaustive search: every grid point is evaluated with 5-fold CV
gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)

gs_log_reg.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [82]:
# Best hyperparameter combination found by the grid search
gs_log_reg.best_params_

{'C': 0.012742749857031334, 'solver': 'liblinear'}

In [83]:
# Score of the grid-searched logistic regression on the test split
gs_log_reg.score(X_test, y_test)

0.8524590163934426

In [85]:
# Hyperparameter grid for KNN
knn_grid = dict(
    n_neighbors=np.arange(1, 20, 2),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

np.random.seed(42)

# Exhaustive search: every grid point is evaluated with 5-fold CV
gs_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_grid,
    cv=5,
    verbose=True,
)

gs_knn.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [86]:
# Best hyperparameter combination found by the grid search
gs_knn.best_params_

{'n_neighbors': 17, 'p': 2, 'weights': 'uniform'}

In [87]:
# Score of the grid-searched KNN on the test split
gs_knn.score(X_test, y_test)

0.8524590163934426

In [None]:
# Create a hyperparameter grid for random forest
rf_grid = {
    "n_estimators": np.arange(10,1000,50),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2,20,2),
    "min_samples_leaf": np.arange(1,20,2)
}

# This grid expands to 20 * 4 * 9 * 10 = 7,200 candidates, i.e. 36,000 forest
# fits with 5-fold CV. n_jobs=-1 spreads the fits over all available CPU cores
# so the cell can realistically finish; narrow the grid if it is still too slow.
# random_state makes the forests, and therefore the selected parameters,
# reproducible between runs.
gs_rf=GridSearchCV(RandomForestClassifier(random_state=42),
                             param_grid=rf_grid,
                             cv=5,
                             n_jobs=-1,
                             verbose=True)

gs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 7200 candidates, totalling 36000 fits


In [None]:
# Best hyperparameter combination found by the grid search
gs_rf.best_params_

In [None]:
# Score of the grid-searched random forest on the test split
gs_rf.score(X_test, y_test)

## 6. Save the best model

In [None]:
# NOTE: this cell is disabled. Before enabling it, add `import joblib` -- it is
# not among the imports in the "Import libraries" cell -- and make sure the
# random forest grid search cell has been run so that `gs_rf` exists.
# # Get the best model and its hyperparameters
# best_rf_model = gs_rf.best_estimator_

# # Save the best model to a file
# joblib.dump(best_rf_model, "../Models/Trained_models/best_rf_model.pkl")