# COMP5318 Assignment 1: Rice Classification

##### Group number: A1 group-set2 169
##### Student 1 SID: 520463341
##### Student 2 SID: ...  

In [1]:
# Import all libraries
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score



In [2]:
# Silence FutureWarning messages so they do not clutter the cell outputs below.
from warnings import simplefilter

# Positional form of simplefilter(action, category).
simplefilter('ignore', FutureWarning)

In [3]:
# Load the rice dataset: rice-final2.csv
# Every cell is read as text and the first (header) line is skipped.
raw_data = np.genfromtxt(
    "rice-final2.csv",
    dtype=str,
    delimiter=",",
    skip_header=1,
)

In [4]:
# Separate features and labels (last column holds the class label)
X_raw = raw_data[:, :-1]
y_raw = raw_data[:, -1]

# 1. Replace missing attributes
# X_raw is a slice view of raw_data, so assigning into it would overwrite the
# loaded data. Writing a float NaN into a fixed-width string array can also
# truncate the text "nan" when the column strings are shorter than three
# characters. Build a fresh array instead: '?' becomes the text 'nan', which
# astype(float) parses as NaN for the imputer to fill with the column mean.
X = np.where(X_raw == '?', 'nan', X_raw).astype(float)
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# 2. Normalise using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

# 3. Change class values to 0 and 1
# 'class1' maps to 0; every other label maps to 1.
y = np.where(y_raw == 'class1', 0, 1).astype(int)

In [5]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Print the first ``n_rows`` examples as comma-separated rows.

    Each feature is printed to 4 decimal places followed by a comma, and the
    target value ends the row. The very last example of ``X`` is printed
    without a trailing newline; every other row ends with one.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: number of rows to print; capped at the number of examples in X
    """
    n_examples = len(X)
    # Cap the row count so asking for more rows than exist prints what is
    # available instead of raising IndexError.
    for example_num in range(min(n_rows, n_examples)):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        if example_num == n_examples - 1:
            print(y[example_num], end="")
        else:
            print(y[example_num])

# Show the first ten pre-processed rows (scaled features followed by the 0/1 label).
print_data(X_scaled, y, n_rows=10)


0.4628,0.5406,0.5113,0.4803,0.7380,0.4699,0.1196,1
0.4900,0.5547,0.5266,0.5018,0.7319,0.4926,0.8030,1
0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0
0.6466,0.6930,0.6677,0.5961,0.7601,0.6467,0.2669,0
0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1
0.2634,0.2932,0.2414,0.4127,0.5521,0.2752,0.2825,1
0.8175,0.9501,0.9515,0.5925,0.9245,0.8162,0.0000,0
0.3174,0.3588,0.3601,0.3908,0.6921,0.3261,0.8510,1
0.3130,0.3050,0.2150,0.5189,0.3974,0.3159,0.4570,1
0.5120,0.5237,0.4409,0.6235,0.5460,0.5111,0.3155,1


### Part 1: Cross-validation without parameter tuning

In [6]:
## Setting the 10 fold stratified cross-validation
# Shuffled with a fixed seed so every classifier is scored on the same folds.
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# The stratified folds from cvKFold should be provided to the classifiers

In [7]:
# Logistic Regression
def logregClassifier(X, y):
    """Return the mean cross-validation score of logistic regression.

    The model is scored with ``cross_val_score`` on the folds supplied by the
    module-level ``cvKFold`` splitter.

    Arguments:
        X: feature array
        y: target array
    """
    fold_scores = cross_val_score(LogisticRegression(random_state=0), X, y, cv=cvKFold)
    return np.mean(fold_scores)

In [8]:
#Naïve Bayes
def nbClassifier(X, y):
    """Return the mean cross-validation score of Gaussian Naive Bayes.

    The model is scored with ``cross_val_score`` on the folds supplied by the
    module-level ``cvKFold`` splitter.

    Arguments:
        X: feature array
        y: target array
    """
    fold_scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return np.mean(fold_scores)

In [9]:
# Decision Tree
def dtClassifier(X, y):
    """Return the mean cross-validation score of an entropy-based decision tree.

    The model is scored with ``cross_val_score`` on the folds supplied by the
    module-level ``cvKFold`` splitter.

    Arguments:
        X: feature array
        y: target array
    """
    tree = DecisionTreeClassifier(criterion="entropy", random_state=0)
    fold_scores = cross_val_score(tree, X, y, cv=cvKFold)
    return np.mean(fold_scores)

In [10]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Return the mean cross-validation score of bagged decision trees.

    Arguments:
        X: feature array
        y: target array
        n_estimators: passed to BaggingClassifier as ``n_estimators``
        max_samples: passed to BaggingClassifier as ``max_samples``
        max_depth: ``max_depth`` of each entropy-based base decision tree
    """
    ensemble = BaggingClassifier(
        estimator=DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0),
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=0,
    )
    # Scored on the shared stratified folds from cvKFold.
    return np.mean(cross_val_score(ensemble, X, y, cv=cvKFold))

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Return the mean cross-validation score of AdaBoost over decision trees.

    Arguments:
        X: feature array
        y: target array
        n_estimators: passed to AdaBoostClassifier as ``n_estimators``
        learning_rate: passed to AdaBoostClassifier as ``learning_rate``
        max_depth: ``max_depth`` of each entropy-based base decision tree
    """
    ensemble = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0),
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    # Scored on the shared stratified folds from cvKFold.
    return np.mean(cross_val_score(ensemble, X, y, cv=cvKFold))

def gbClassifier(X, y, n_estimators, learning_rate):
    """Return the mean cross-validation score of gradient boosting.

    Arguments:
        X: feature array
        y: target array
        n_estimators: passed to GradientBoostingClassifier as ``n_estimators``
        learning_rate: passed to GradientBoostingClassifier as ``learning_rate``
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    # Scored on the shared stratified folds from cvKFold.
    return np.mean(cross_val_score(booster, X, y, cv=cvKFold))

### Part 1 Results

In [11]:
# Parameters for Part 1:

# Bagging
bag_n_estimators = 50
bag_max_samples = 100
bag_max_depth = 5

# AdaBoost
ada_n_estimators = 50
ada_learning_rate = 0.5
ada_bag_max_depth = 5

# Gradient Boosting
gb_n_estimators = 50
gb_learning_rate = 0.5

# Print results for each classifier in part 1 to 4 decimal places here.
# Each accuracy is computed and printed before the next classifier runs.
lr_acc = logregClassifier(X_scaled, y)
print(f"LR average cross-validation accuracy: {lr_acc:.4f}")

nb_acc = nbClassifier(X_scaled, y)
print(f"NB average cross-validation accuracy: {nb_acc:.4f}")

dt_acc = dtClassifier(X_scaled, y)
print(f"DT average cross-validation accuracy: {dt_acc:.4f}")

bag_acc = bagDTClassifier(X_scaled, y, bag_n_estimators, bag_max_samples, bag_max_depth)
print(f"Bagging average cross-validation accuracy: {bag_acc:.4f}")

ada_acc = adaDTClassifier(X_scaled, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)
print(f"AdaBoost average cross-validation accuracy: {ada_acc:.4f}")

gb_acc = gbClassifier(X_scaled, y, gb_n_estimators, gb_learning_rate)
print(f"GB average cross-validation accuracy: {gb_acc:.4f}")

LR average cross-validation accuracy: 0.9386
NB average cross-validation accuracy: 0.9264
DT average cross-validation accuracy: 0.9179
Bagging average cross-validation accuracy: 0.9414
AdaBoost average cross-validation accuracy: 0.9407
GB average cross-validation accuracy: 0.9321


### Part 2: Cross-validation with parameter tuning

In [12]:
# KNN
# Candidate values searched by bestKNNClassifier.
k = [1, 3, 5, 7]
p = [1, 2]


def bestKNNClassifier(X, y):
    """Grid-search KNN over ``k`` and ``p`` and score the winner on a held-out split.

    The data is split with ``train_test_split`` (stratified on ``y``,
    ``random_state=0``); the grid search runs on the training part using the
    folds from the module-level ``cvKFold``.

    Arguments:
        X: feature array
        y: target array

    Returns:
        Tuple of (best n_neighbors, best p, best cross-validation score,
        accuracy of the best estimator on the test split).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

    search = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': k, 'p': p}, cv=cvKFold)
    search.fit(X_train, y_train)

    chosen = search.best_params_
    held_out_accuracy = accuracy_score(y_test, search.best_estimator_.predict(X_test))
    return chosen['n_neighbors'], chosen['p'], search.best_score_, held_out_accuracy


In [13]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
# Candidate values searched by bestSVMClassifier.
C = [0.01, 0.1, 1, 5]
gamma = [0.01, 0.1, 1, 10]

def bestSVMClassifier(X, y):
    """Grid-search an RBF-kernel SVC over ``C`` and ``gamma`` and score the winner.

    The data is split with ``train_test_split`` (stratified on ``y``,
    ``random_state=0``); the grid search runs on the training part using the
    folds from the module-level ``cvKFold``.

    Arguments:
        X: feature array
        y: target array

    Returns:
        Tuple of (best C, best gamma, best cross-validation score,
        accuracy of the best estimator on the test split).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

    search = GridSearchCV(SVC(kernel='rbf', random_state=0), {'C': C, 'gamma': gamma}, cv=cvKFold)
    search.fit(X_train, y_train)

    chosen = search.best_params_
    held_out_accuracy = accuracy_score(y_test, search.best_estimator_.predict(X_test))
    return chosen['C'], chosen['gamma'], search.best_score_, held_out_accuracy


In [14]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
# Candidate values searched by bestRFClassifier.
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):
    """Grid-search a random forest and report test accuracy and F1 for the winner.

    The forest uses the entropy criterion with ``max_features='sqrt'``. The
    data is split with ``train_test_split`` (stratified on ``y``,
    ``random_state=0``); the grid search runs on the training part using the
    folds from the module-level ``cvKFold``.

    Arguments:
        X: feature array
        y: target array

    Returns:
        Tuple of (best n_estimators, best max_leaf_nodes, best cross-validation
        score, test accuracy, macro-average F1, weighted-average F1), the last
        three measured on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

    forest = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=0)
    search_space = {'n_estimators': n_estimators, 'max_leaf_nodes': max_leaf_nodes}
    search = GridSearchCV(forest, search_space, cv=cvKFold)
    search.fit(X_train, y_train)

    # Predict once and reuse the predictions for all three test-set metrics.
    test_predictions = search.best_estimator_.predict(X_test)
    held_out_accuracy = accuracy_score(y_test, test_predictions)
    f1_macro = f1_score(y_test, test_predictions, average='macro')
    f1_weighted = f1_score(y_test, test_predictions, average='weighted')

    chosen = search.best_params_
    return (
        chosen['n_estimators'],
        chosen['max_leaf_nodes'],
        search.best_score_,
        held_out_accuracy,
        f1_macro,
        f1_weighted,
    )


### Part 2: Results

In [15]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchCV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", "n_estimators" and "max_leaf_nodes" which should be printed as integers.

# KNN: tune, then report the chosen parameters and both accuracies.
k_best, p_best, knn_cv_acc, knn_test_acc = bestKNNClassifier(X_scaled, y)
print(
    f"KNN best k: {k_best}",
    f"KNN best p: {p_best}",
    f"KNN cross-validation accuracy: {knn_cv_acc:.4f}",
    f"KNN test set accuracy: {knn_test_acc:.4f}",
    sep="\n",
)

print()

# SVM: tune, then report the chosen parameters and both accuracies.
C_best, gamma_best, svm_cv_acc, svm_test_acc = bestSVMClassifier(X_scaled, y)
print(
    f"SVM best C: {C_best:.4f}",
    f"SVM best gamma: {gamma_best:.4f}",
    f"SVM cross-validation accuracy: {svm_cv_acc:.4f}",
    f"SVM test set accuracy: {svm_test_acc:.4f}",
    sep="\n",
)

print()

# Random Forest: tune, then report parameters, accuracies and F1 scores.
n_best, leaf_best, rf_cv_acc, rf_test_acc, macro_f1, weighted_f1 = bestRFClassifier(X_scaled, y)
print(
    f"RF best n_estimators: {n_best}",
    f"RF best max_leaf_nodes: {leaf_best}",
    f"RF cross-validation accuracy: {rf_cv_acc:.4f}",
    f"RF test set accuracy: {rf_test_acc:.4f}",
    f"RF test set macro average F1: {macro_f1:.4f}",
    f"RF test set weighted average F1: {weighted_f1:.4f}",
    sep="\n",
)

KNN best k: 5
KNN best p: 1
KNN cross-validation accuracy: 0.9371
KNN test set accuracy: 0.9257

SVM best C: 5.0000
SVM best gamma: 1.0000
SVM cross-validation accuracy: 0.9457
SVM test set accuracy: 0.9343

RF best n_estimators: 30
RF best max_leaf_nodes: 12
RF cross-validation accuracy: 0.9390
RF test set accuracy: 0.9371
RF test set macro average F1: 0.9355
RF test set weighted average F1: 0.9370


### Part 3: Reflection

##### Write one paragraph describing the most important thing that you have learned throughout this assignment.
##### Student 1: The most important thing I learned throughout this assignment was how to apply and compare different machine learning classifiers in practice, in a real-world context and on real data. While I had previously understood the theory behind various models, this was the first time I had to implement them side by side, evaluate their performance fairly using cross-validation, and interpret the results on a real dataset. It taught me not just how each model works in isolation, but how they behave relative to each other under the same conditions, which helped me understand their relative strengths. I noticed, for example, that ensemble methods like Bagging performed better overall, while simpler models like Naive Bayes worked quickly with minimal tuning. Although I did not learn exactly which model to use in which case, the assignment gave me the practical experience to begin recognising which ones might be more suitable depending on the goal — whether that is speed, simplicity, or accuracy.

##### Student 2: ...