In [2]:
#importing all the packages needed
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, RepeatedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix
from collections import Counter

In [3]:
#this is from part2 I hadn't saved the standardized data that's why it's done again here
#taking the data and standardizing it
df = pd.read_csv('training_data/rice_feature_data.csv')
feats = ['mean_b', 'var_b', 'skew_b', 'kurt_b', 'entr_b', 'mean_g', 'var_g',
       'skew_g', 'kurt_g', 'entr_g', 'mean_r', 'var_r', 'skew_r', 'kurt_r',
       'entr_r', 'major_axis_length', 'minor_axis_length', 'area', 'perimeter',
       'roundness', 'aspect_ratio']
for feat in feats:
    df['{}_Z'.format(feat)] = (df[feat] - df[feat].mean()) / df[feat].std()

feats_Z = [feat + '_Z' for feat in feats]

y = df['class'].values
X = df[feats_Z].values

In [4]:
# Define models and their respective hyperparameter search ranges
models = {
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=20),
    'MLP': MLPClassifier(max_iter=500, early_stopping=True, random_state=20)
}

params = {
    'KNN': {'n_neighbors': range(1, 30)},
    'Random Forest': {'max_depth': [2, 4, 6, 8, 10, 12], 'max_features': [2, 3, 4, 5, 6, 7, 8]},
    'MLP': {'hidden_layer_sizes': range(3,22),
            'activation': ['logistic', 'relu'],
            'solver': ['sgd', 'adam'],
            'validation_fraction': [0.1,0.5]}
}

# Perform nested cross-validation for each model
outer_kf = KFold(n_splits=10, shuffle=True, random_state=10)
inner_kf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=50)

for name, model in models.items():
    # Define parameter grid for this model
    param_grid = params[name]

    # Perform grid search for best hyperparameters
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_kf)
    grid_search.fit(X, y)

    # Use best hyperparameters to evaluate model
    best_model = grid_search.best_estimator_
    accuracy = cross_val_score(best_model, X=X, y=y, cv=outer_kf, scoring='accuracy')
    y_pred = cross_val_predict(best_model, X=X, y=y, cv=outer_kf)
    cm = confusion_matrix(y, y_pred)
    
    # Print results for this model
    print(f"{name} - Best Params: {grid_search.best_params_}")
    print(f"{name} - Accuracy: {accuracy.mean()}")
    print(f"{name} - Confusion Matrix:\n{cm}")
    
    # Determine the most frequent hyperparameters for this model
    param_results = [(tuple(params.items()), mean_test_score) for params, mean_test_score in zip(grid_search.cv_results_['params'], grid_search.cv_results_['mean_test_score'])]
    freq_params = Counter(param_results)
    most_freq_params = freq_params.most_common(1)[0]
    print(f"{name} - Most frequent hyperparameters: {most_freq_params[0]} with score {most_freq_params[1]}")


KNN - Best Params: {'n_neighbors': 9}
KNN - Accuracy: 0.9800000000000001
KNN - Confusion Matrix:
[[99  0  1]
 [ 0 99  1]
 [ 2  2 96]]
KNN - Most frequent hyperparameters: ((('n_neighbors', 1),), 0.9744444444444443) with score 1
Random Forest - Best Params: {'max_depth': 2, 'max_features': 3}
Random Forest - Accuracy: 0.99
Random Forest - Confusion Matrix:
[[100   0   0]
 [  0  99   1]
 [  1   1  98]]
Random Forest - Most frequent hyperparameters: ((('max_depth', 2), ('max_features', 2)), 0.9855555555555554) with score 1
MLP - Best Params: {'activation': 'relu', 'hidden_layer_sizes': 19, 'solver': 'adam', 'validation_fraction': 0.5}
MLP - Accuracy: 0.9733333333333334
MLP - Confusion Matrix:
[[100   0   0]
 [  3  95   2]
 [  3   0  97]]
MLP - Most frequent hyperparameters: ((('activation', 'logistic'), ('hidden_layer_sizes', 3), ('solver', 'sgd'), ('validation_fraction', 0.1)), 0.2733333333333333) with score 1
