In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import metrics 

In [None]:
# Read the classification table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='kp_classify.csv')
df.head(n=5)

In [None]:
# Encode the categorical `pos` column as integer codes, then separate the
# target (`match`) from the feature columns.
mapping = {'right-side': 0, 'left-side': 1, 'bottom': 2, 'complete': 3}

# Series.map turns every value that is not a key of `mapping` into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Only encode while the column still holds something other than the codes.
if not df['pos'].dropna().isin(list(mapping.values())).all():
    df['pos'] = df['pos'].map(mapping)

y = df['match']
X = df.drop(labels=['match', 'image_tag'], axis=1)


print(y[:2])
print(X[:2])

In [None]:
# Hold out 15% of the rows for testing. The fixed seed keeps the split, and
# therefore every score computed from it, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
def calculate_accuracy(X_train, X_test, y_train, y_test, parameters=None):
    """Fit a random forest and report test accuracy plus feature importances.

    Parameters
    ----------
    X_train, X_test
        Feature tables used for fitting and for evaluation respectively.
    y_train, y_test
        Target labels matching ``X_train`` and ``X_test``.
    parameters : dict, optional
        Keyword arguments forwarded verbatim to ``RandomForestClassifier``.
        When omitted, the default configuration defined below is used.

    Returns
    -------
    accuracy
        Value returned by ``metrics.accuracy_score(y_test, y_pred)``.
    feature_imp : pandas.Series
        The fitted model's ``feature_importances_`` sorted in descending
        order. Indexed by ``X_train.columns`` when that attribute exists,
        otherwise by position.
    """
    if parameters is None:
        # Built per call so no dict object is shared between invocations.
        parameters = {
            'n_estimators': 100,
            'max_depth': None,
            'max_features': 'sqrt',
            'criterion': 'entropy',
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'bootstrap': True,
            'class_weight': "balanced",
            'max_samples': None,
        }

    # Forward the requested configuration instead of a hard-coded estimator.
    clf = RandomForestClassifier(**parameters)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # using metrics module for accuracy calculation
    accuracy = metrics.accuracy_score(y_test, y_pred)

    # Label importances with the columns of the data actually fitted rather
    # than a notebook-level variable that may be stale or undefined.
    feature_names = getattr(X_train, "columns", None)
    feature_imp = pd.Series(clf.feature_importances_, index=feature_names).sort_values(ascending=False)
    return accuracy, feature_imp
    

In [None]:
# Hyper-parameter grid for the random-forest search.
# 'log_loss' is intentionally absent from `criterion`: scikit-learn treats it
# as the same Shannon-information-gain criterion as 'entropy', so listing both
# only repeats identical candidates (and 'log_loss' is rejected by
# scikit-learn releases older than 1.1).
# min_samples_split and min_samples_leaf are not searched; they stay at the
# estimator's defaults.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 15],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample'],
    'max_samples': [None]
}

In [None]:
# Base estimator for the search. A fixed seed makes every candidate's
# cross-validation score reproducible, so differences between candidates
# reflect the hyper-parameters rather than run-to-run randomness.
model = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel, verbose=2 logs progress per fit.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)


In [None]:
# Run the cross-validated search on the training split produced earlier
grid_search.fit(X=X_train, y=y_train)


In [None]:
# Report the winning configuration and its mean cross-validated score
print(f"Best Parameters: {grid_search.best_params_!s}")
print(f"Best Score: {grid_search.best_score_!s}")

In [None]:
# Score the refitted best estimator on the held-out test split
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X=X_test, y=y_test)
print(f"Test Set Accuracy: {test_accuracy!s}")

In [None]:
results = grid_search.cv_results_

# Collect the per-candidate settings, scores and timings into one table,
# keeping the columns in the order listed here
results_df = pd.DataFrame({
    field: results[field]
    for field in (
        'params',
        'mean_test_score',
        'std_test_score',
        'mean_fit_time',
        'mean_score_time',
    )
})

# Show the summary table
print(results_df)