# COMP5318 Assignment 1: Rice Classification

##### Group number: ...
##### Student 1 SID: 540660818
##### Student 2 SID: ...  

In [6]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold

In [7]:
# Ignore future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [8]:
# Load the rice dataset: rice-final2.csv
import pandas as pd

# Load dataset
file_path = "rice-final2.csv"  # Adjust the path if needed
df = pd.read_csv(file_path)

# Display dataset information
print("Dataset Information:")
print(df.info())

# Display summary statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Display first few rows
print("\nFirst few rows of the dataset:")
print(df.head())

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Area               1400 non-null   object
 1   Perimiter          1400 non-null   object
 2   Major_Axis_Length  1400 non-null   object
 3   Minor_Axis_Length  1400 non-null   object
 4   Eccentricity       1400 non-null   object
 5   Convex_Area        1400 non-null   object
 6   Extent             1400 non-null   object
 7   class              1400 non-null   object
dtypes: object(8)
memory usage: 87.6+ KB
None

Descriptive Statistics:
         Area Perimiter Major_Axis_Length Minor_Axis_Length Eccentricity  \
count    1400      1400              1400              1400         1400   
unique   1259      1389              1396              1397         1393   
top     12837         ?                 ?                 ?            ?   
freq        4         4     

In [9]:
# Pre-process dataset
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

# Convert all feature columns to float
for col in df.columns[:-1]:  
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to float, coercing errors to NaN

# Fill missing values with column mean
imputer = SimpleImputer(strategy='mean')
df.iloc[:, :-1] = imputer.fit_transform(df.iloc[:, :-1])  # Apply only to feature columns

# Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

# Convert class labels: "class1" -> 0, "class2" -> 1
df.iloc[:, -1] = df.iloc[:, -1].replace({"class1": 0, "class2": 1}).astype(int)

# Convert DataFrame to NumPy arrays
X = df.iloc[:, :-1].to_numpy()  # Features
y = df.iloc[:, -1].to_numpy()   # Class labels

# Define the modified print_data function
def print_data(X, y, n_rows=10):
    """Takes a numpy data array and target and prints the first n_rows.
    
    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: number of rows to print (default is 10)
    """
    for example_num in range(n_rows):
        # Print feature values formatted to 4 decimal places
        print(",".join("{:.4f}".format(feature) for feature in X[example_num]), end=",")
        # Print class label without decimal places
        print(y[example_num])

# Call print_data function with X and y
print_data(X, y, n_rows=10)

0.4628,0.5406,0.5113,0.4803,0.7380,0.4699,0.1196,1
0.4900,0.5547,0.5266,0.5018,0.7319,0.4926,0.8030,1
0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0
0.6466,0.6930,0.6677,0.5961,0.7601,0.6467,0.2669,0
0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1
0.2634,0.2932,0.2414,0.4127,0.5521,0.2752,0.2825,1
0.8175,0.9501,0.9515,0.5925,0.9245,0.8162,0.0000,0
0.3174,0.3588,0.3601,0.3908,0.6921,0.3261,0.8510,1
0.3130,0.3050,0.2150,0.5189,0.3974,0.3159,0.4570,1
0.5120,0.5237,0.4409,0.6235,0.5460,0.5111,0.3155,1


### Part 1: Cross-validation without parameter tuning

In [10]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Ensure y is an integer array
y = np.array(y, dtype=int)  # Convert y to integer type

# Define StratifiedKFold with 10 splits and random_state=0
cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [11]:
from sklearn.linear_model import LogisticRegression
# Function for Logistic Regression Classifier
def logregClassifier(X, y):
    model = LogisticRegression(random_state=0, max_iter=1000)  # Ensure convergence with more iterations
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

In [12]:
from sklearn.naive_bayes import GaussianNB

# Function for Naïve Bayes Classifier
def nbClassifier(X, y):
    model = GaussianNB()
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Function for Decision Tree Classifier
def dtClassifier(X, y):
    model = DecisionTreeClassifier(criterion="entropy", random_state=0)  # Using Information Gain (Entropy)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

In [14]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Function for Bagging with Decision Trees
def bagDTClassifier(X, y, n_estimators=50, max_samples=1.0, max_depth=None):
    base_tree = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    model = BaggingClassifier(base_tree, n_estimators=n_estimators, max_samples=max_samples, random_state=0)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

# Function for AdaBoost with Decision Trees
def adaDTClassifier(X, y, n_estimators=50, learning_rate=1.0, max_depth=1):
    base_tree = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    model = AdaBoostClassifier(base_tree, n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

# Function for Gradient Boosting
def gbClassifier(X, y, n_estimators=50, learning_rate=0.1):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

### Part 1 Results

In [15]:
# Parameters for Part 1:

# Bagging
bag_n_estimators = 50
bag_max_samples = 100
bag_max_depth = 5

# AdaBoost
ada_n_estimators = 50
ada_learning_rate = 0.5
ada_max_depth = 5

# Gradient Boosting
gb_n_estimators = 50
gb_learning_rate = 0.5

# Run classifiers and store results
logreg_score = logregClassifier(X, y)
nb_score = nbClassifier(X, y)
dt_score = dtClassifier(X, y)
bag_score = bagDTClassifier(X, y, n_estimators=bag_n_estimators, max_samples=0.8, max_depth=bag_max_depth)
ada_score = adaDTClassifier(X, y, n_estimators=ada_n_estimators, learning_rate=ada_learning_rate, max_depth=ada_max_depth)
gb_score = gbClassifier(X, y, n_estimators=gb_n_estimators, learning_rate=gb_learning_rate)

# Print results for each classifier in Part 1 to 4 decimal places
print(f"LogR average cross-validation accuracy: {logreg_score:.4f}")
print(f"NB average cross-validation accuracy: {nb_score:.4f}")
print(f"DT average cross-validation accuracy: {dt_score:.4f}")
print(f"Bagging average cross-validation accuracy: {bag_score:.4f}")
print(f"AdaBoost average cross-validation accuracy: {ada_score:.4f}")
print(f"GB average cross-validation accuracy: {gb_score:.4f}")

LogR average cross-validation accuracy: 0.9386
NB average cross-validation accuracy: 0.9264
DT average cross-validation accuracy: 0.9179
Bagging average cross-validation accuracy: 0.9400
AdaBoost average cross-validation accuracy: 0.9407
GB average cross-validation accuracy: 0.9321


### Part 2: Cross-validation with parameter tuning

In [16]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Define the stratified KFold for cross-validation
cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [17]:
# KNN
k = [1, 3, 5, 7]
p = [1, 2]


def bestKNNClassifier(X, y):
    """
    Finds the best KNN classifier using grid search with 10-fold stratified cross-validation.
    
    Arguments:
        X: numpy array of shape (n_samples, n_features), feature matrix.
        y: numpy array of shape (n_samples,), target labels.
    
    Returns:
        best_k: Best value of k (n_neighbors).
        best_p: Best value of p (distance metric).
        best_cv_accuracy: Best cross-validation accuracy.
        test_accuracy: Test set accuracy.
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    
    # Define the parameter grid for KNN
    param_grid = {'n_neighbors': k, 'p': p}
    
    # Initialize KNN classifier
    knn = KNeighborsClassifier()
    
    # Perform grid search with 10-fold stratified cross-validation
    grid_search = GridSearchCV(knn, param_grid, cv=cvKFold, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Get the best parameters and cross-validation accuracy
    best_k = grid_search.best_params_['n_neighbors']
    best_p = grid_search.best_params_['p']
    best_cv_accuracy = grid_search.best_score_
    
    # Evaluate on the test set
    y_pred = grid_search.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    return best_k, best_p, best_cv_accuracy, test_accuracy


In [18]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
C = [0.01, 0.1, 1, 5] 
gamma = [0.01, 0.1, 1, 10]

def bestSVMClassifier(X, y):
    """
    Finds the best SVM classifier using grid search with 10-fold stratified cross-validation.
    
    Arguments:
        X: numpy array of shape (n_samples, n_features), feature matrix.
        y: numpy array of shape (n_samples,), target labels.
    
    Returns:
        best_C: Best value of C (regularization parameter).
        best_gamma: Best value of gamma (kernel coefficient).
        best_cv_accuracy: Best cross-validation accuracy.
        test_accuracy: Test set accuracy.
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    
    # Define the parameter grid for SVM
    param_grid = {'C': C, 'gamma': gamma}
    
    # Initialize SVM classifier with RBF kernel
    svm = SVC(kernel='rbf', random_state=0)
    
    # Perform grid search with 10-fold stratified cross-validation
    grid_search = GridSearchCV(svm, param_grid, cv=cvKFold, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Get the best parameters and cross-validation accuracy
    best_C = grid_search.best_params_['C']
    best_gamma = grid_search.best_params_['gamma']
    best_cv_accuracy = grid_search.best_score_
    
    # Evaluate on the test set
    y_pred = grid_search.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    return best_C, best_gamma, best_cv_accuracy, test_accuracy

In [19]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):
    """
    Finds the best Random Forest classifier using grid search with 10-fold stratified cross-validation.
    
    Arguments:
        X: numpy array of shape (n_samples, n_features), feature matrix.
        y: numpy array of shape (n_samples,), target labels.
    
    Returns:
        best_n_estimators: Best value of n_estimators (number of trees).
        best_max_leaf_nodes: Best value of max_leaf_nodes (maximum number of leaf nodes).
        best_cv_accuracy: Best cross-validation accuracy.
        test_accuracy: Test set accuracy.
        test_macro_f1: Test set macro average F1 score.
        test_weighted_f1: Test set weighted average F1 score.
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    
    # Define the parameter grid for Random Forest
    param_grid = {'n_estimators': n_estimators, 'max_leaf_nodes': max_leaf_nodes}
    
    # Initialize Random Forest classifier with information gain and max_features='sqrt'
    rf = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=0)
    
    # Perform grid search with 10-fold stratified cross-validation
    grid_search = GridSearchCV(rf, param_grid, cv=cvKFold, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Get the best parameters and cross-validation accuracy
    best_n_estimators = grid_search.best_params_['n_estimators']
    best_max_leaf_nodes = grid_search.best_params_['max_leaf_nodes']
    best_cv_accuracy = grid_search.best_score_
    
    # Evaluate on the test set
    y_pred = grid_search.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_macro_f1 = f1_score(y_test, y_pred, average='macro')
    test_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    
    return best_n_estimators, best_max_leaf_nodes, best_cv_accuracy, test_accuracy, test_macro_f1, test_weighted_f1

### Part 2: Results

In [20]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", n_estimators" and "max_leaf_nodes" which should be printed as integers.

# KNN
best_k, best_p, knn_cv_accuracy, knn_test_accuracy = bestKNNClassifier(X, y)
print("KNN best k:", best_k)
print("KNN best p:", best_p)
print("KNN cross-validation accuracy: {:.4f}".format(knn_cv_accuracy))
print("KNN test set accuracy: {:.4f}".format(knn_test_accuracy))
print()

# SVM
best_C, best_gamma, svm_cv_accuracy, svm_test_accuracy = bestSVMClassifier(X, y)
print("SVM best C:", best_C)
print("SVM best gamma:", best_gamma)
print("SVM cross-validation accuracy: {:.4f}".format(svm_cv_accuracy))
print("SVM test set accuracy: {:.4f}".format(svm_test_accuracy))
print()

# Random Forest
best_n_estimators, best_max_leaf_nodes, rf_cv_accuracy, rf_test_accuracy, rf_macro_f1, rf_weighted_f1 = bestRFClassifier(X, y)
print("RF best n_estimators:", best_n_estimators)
print("RF best max_leaf_nodes:", best_max_leaf_nodes)
print("RF cross-validation accuracy: {:.4f}".format(rf_cv_accuracy))
print("RF test set accuracy: {:.4f}".format(rf_test_accuracy))
print("RF test set macro average F1: {:.4f}".format(rf_macro_f1))
print("RF test set weighted average F1: {:.4f}".format(rf_weighted_f1))

KNN best k: 5
KNN best p: 1
KNN cross-validation accuracy: 0.9371
KNN test set accuracy: 0.9257

SVM best C: 5
SVM best gamma: 1
SVM cross-validation accuracy: 0.9457
SVM test set accuracy: 0.9343

RF best n_estimators: 30
RF best max_leaf_nodes: 12
RF cross-validation accuracy: 0.9390
RF test set accuracy: 0.9371
RF test set macro average F1: 0.9355
RF test set weighted average F1: 0.9370


### Part 3: Reflection

##### Write one paragraph describing the most important thing that you have learned throughout this assignment.
##### Student 1: ...
##### Student 2: ...