In [2]:
#initialize packages 


import numpy as np 
import pandas as pd 
import matplotlib.pylab as plt
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import GridSearchCV


In [3]:
#load data 

data = pd.read_csv("modeldata_for_parameters.csv")



# Split the data into input features (X) and target variable (y)
X = data.drop('Activity', axis=1)  # Input features
y = data['Activity']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Random grid search 

In [4]:
rfc = RandomForestClassifier(random_state=42)
print(rfc.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [5]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 50)]

# Number of features to consider at every split
max_features = ['sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 50)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500], 'max_features': ['sqrt'], 'max_depth': [10, 13, 17, 21, 25, 29, 33, 37, 41, 44, 48, 52, 56, 60, 64, 68, 72, 75, 79, 83, 87, 91, 95, 99, 103, 106, 110, 114, 118, 122, 126, 130, 134, 137, 141, 145, 149, 153, 157, 161, 165, 168, 172, 176, 180, 184, 188, 192, 196, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [6]:

#https://www.kaggle.com/code/prashant111/random-forest-classifier-feature-importance
#https://towardsdatascience.com/understanding-feature-importance-and-how-to-implement-it-in-python-ff0287b20285

#Initialize the model 

rfc = RandomForestClassifier(random_state=42)
k = 5

#Initialize the randomizedSeachCV using the parameter grid and model 
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = k, verbose=0, random_state=42, n_jobs = -1)


# Fit the random search model
rfc_random.fit(X_train, y_train)

best_iteration_indices = []
parameters_for_grid = {'n_estimators': [],
               'max_features': [],
               'max_depth': [],
               'min_samples_split': [],
               'min_samples_leaf': [],
               'bootstrap': []}

#print(rfc_random.cv_results_) 
for i in range(0,k):
    print(f"Fold {i+1}:")
    best_value = np.argmax(rfc_random.cv_results_[f"split{i}_test_score"])
    best_iteration_indices.append(best_value)
    best_accuracy = rfc_random.cv_results_[f"split{i}_test_score"][best_value]
    best_params = rfc_random.cv_results_['params'][best_value]
    for key in best_params:
        if best_params[key] not in parameters_for_grid[key]:
            parameters_for_grid[key].append(best_params[key])
    #mean_accuracy = rfc_random.cv_results_['mean_test_score'][best_value]
    print(f"Best Accuracy in fold {i+1}:", best_accuracy)
    print("Best Parameter:", best_params)
    print("Indices for best accuracy in fold:", best_value)
    #print(f"Mean Accuracy in fold {i+1}:", mean_accuracy)
    print()


for i in best_iteration_indices:
    mean_accuracy = rfc_random.cv_results_["mean_test_score"][i]
    print(f"Mean test score across folds for candidate {i}:", mean_accuracy)
    
print(best_iteration_indices)
#print(rfc_random.cv_results_)    
#Predict and confusion matrix    
    
pred = rfc_random.predict(X_test)

accuracy_score(y_test,pred)

print(confusion_matrix(y_test,pred))



Fold 1:
Best Accuracy in fold 1: 0.5674786043449638
Best Parameter: {'n_estimators': 70, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 168, 'bootstrap': False}
Indices for best accuracy in fold: 37

Fold 2:
Best Accuracy in fold 2: 0.5411454904542462
Best Parameter: {'n_estimators': 460, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 145, 'bootstrap': True}
Indices for best accuracy in fold: 0

Fold 3:
Best Accuracy in fold 3: 0.5589203423304806
Best Parameter: {'n_estimators': 190, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 87, 'bootstrap': True}
Indices for best accuracy in fold: 86

Fold 4:
Best Accuracy in fold 4: 0.5540184453227931
Best Parameter: {'n_estimators': 480, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 33, 'bootstrap': True}
Indices for best accuracy in fold: 27

Fold 5:
Best Accuracy in fold 5: 0.5355731225296443
Be

In [78]:
print(parameters_for_grid['n_estimators'])
n_estimaters_grid = []
n_estimaters_grid.append(round(np.quantile(parameters_for_grid['n_estimators'], 0)))
n_estimaters_grid.append(round(np.quantile(parameters_for_grid['n_estimators'], 0.25)))
n_estimaters_grid.append(round(np.quantile(parameters_for_grid['n_estimators'], 0.5)))
n_estimaters_grid.append(round(np.quantile(parameters_for_grid['n_estimators'], 0.75)))
n_estimaters_grid.append(round(np.quantile(parameters_for_grid['n_estimators'], 1)))


max_depth_grid = []
max_depth_grid.append(round(np.quantile(parameters_for_grid['max_depth'], 0)))
max_depth_grid.append(round(np.quantile(parameters_for_grid['max_depth'], 0.25)))
max_depth_grid.append(round(np.quantile(parameters_for_grid['max_depth'], 0.5)))
max_depth_grid.append(round(np.quantile(parameters_for_grid['max_depth'], 0.75)))
max_depth_grid.append(round(np.quantile(parameters_for_grid['max_depth'], 1)))


print(max_depth_grid)

[480, 40, 110, 130]
[13, 15, 17, 87, 157]


# Grid search 

In [7]:
# Create the parameter grid based on the results of random search 

grid_grid = {
    'bootstrap': parameters_for_grid['bootstrap'],
    'max_depth': parameters_for_grid['max_depth'],
    'max_features': ['sqrt'],
    'min_samples_leaf': parameters_for_grid['min_samples_leaf'],
    'min_samples_split': parameters_for_grid['min_samples_split'],
    'n_estimators': parameters_for_grid['n_estimators']
}

rfc = RandomForestClassifier(random_state=42)
k = 5 

rfc_grid = GridSearchCV(estimator = rfc, param_grid = grid_grid, 
                          cv = k, n_jobs = -1, verbose = 0)

rfc_grid.fit(X_train, y_train)

best_iteration_indices_grid = []

#print(rfc_random.cv_results_) 
for i in range(0,k):
    print(f"Fold {i+1}:")
    best_value = np.argmax(rfc_grid.cv_results_[f"split{i}_test_score"])
    best_iteration_indices.append(best_value)
    best_accuracy = rfc_grid.cv_results_[f"split{i}_test_score"][best_value]
    best_params = rfc_grid.cv_results_['params'][best_value]
    #mean_accuracy = rfc_random.cv_results_['mean_test_score'][best_value]
    print(f"Best Accuracy in fold {i+1}:", best_accuracy)
    print("Best Parameter:", best_params)
    print("Indices for best accuracy in fold:", best_value)
    #print(f"Mean Accuracy in fold {i+1}:", mean_accuracy)



for i in best_iteration_indices_grid:
    mean_accuracy = rfc_grid.cv_results_["mean_test_score"][i]
    print(f"Mean test score across folds for candidate {i}:", mean_accuracy)
    
    
pred_grid = rfc_grid.predict(X_test)

accuracy_score(y_test,pred)

print(confusion_matrix(y_test,pred_grid))


Fold 1:
Best Accuracy in fold 1: 0.5674786043449638
Best Parameter: {'bootstrap': False, 'max_depth': 168, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 70}
Indices for best accuracy in fold: 0
Fold 2:
Best Accuracy in fold 2: 0.5457537853851218
Best Parameter: {'bootstrap': False, 'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 280}
Indices for best accuracy in fold: 149
Fold 3:
Best Accuracy in fold 3: 0.5589203423304806
Best Parameter: {'bootstrap': True, 'max_depth': 168, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 190}
Indices for best accuracy in fold: 162
Fold 4:
Best Accuracy in fold 4: 0.5540184453227931
Best Parameter: {'bootstrap': True, 'max_depth': 33, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 480}
Indices for best accuracy in fold: 263
Fold 5:
Best Accuracy in fold 5: 0.5375494071146245
B