# COMP5318 Assignment 1: Rice Classification

##### Group number: A1 group-set1 177
##### Student 1 SID: 540660818
##### Student 2 SID: 550247340  

In [1]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold

In [3]:
# Ignore future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Load the rice dataset: rice-final2.csv
import pandas as pd

# Load dataset
file_path = "rice-final2.csv"  # Adjust the path if needed
df = pd.read_csv(file_path)

# Display dataset information
print("Dataset Information:")
print(df.info())

# Display summary statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Display first few rows
print("\nFirst few rows of the dataset:")
print(df.head())

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Area               1400 non-null   object
 1   Perimiter          1400 non-null   object
 2   Major_Axis_Length  1400 non-null   object
 3   Minor_Axis_Length  1400 non-null   object
 4   Eccentricity       1400 non-null   object
 5   Convex_Area        1400 non-null   object
 6   Extent             1400 non-null   object
 7   class              1400 non-null   object
dtypes: object(8)
memory usage: 87.6+ KB
None

Descriptive Statistics:
         Area Perimiter Major_Axis_Length Minor_Axis_Length Eccentricity  \
count    1400      1400              1400              1400         1400   
unique   1259      1389              1396              1397         1393   
top     12837         ?                 ?                 ?            ?   
freq        4         4     

In [8]:
# Pre-process dataset
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

# Convert all feature columns to float
for col in df.columns[:-1]:  
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to float, coercing errors to NaN

# Fill missing values with column mean
imputer = SimpleImputer(strategy='mean')
df.iloc[:, :-1] = imputer.fit_transform(df.iloc[:, :-1])  # Apply only to feature columns

# Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

# Convert class labels: "class1" -> 0, "class2" -> 1
df.iloc[:, -1] = df.iloc[:, -1].replace({"class1": 0, "class2": 1}).astype(int)

# Convert DataFrame to NumPy arrays
X = df.iloc[:, :-1].to_numpy()  # Features
y = df.iloc[:, -1].to_numpy()   # Class labels

# Define the modified print_data function
def print_data(X, y, n_rows=10):
    for example_num in range(n_rows):
        # Print feature values formatted to 4 decimal places
        print(",".join("{:.4f}".format(feature) for feature in X[example_num]), end=",")
        # Print class label without decimal places
        print(y[example_num])

# Call print_data function with X and y
print_data(X, y, n_rows=10)

0.4628,0.5406,0.5113,0.4803,0.7380,0.4699,0.1196,1
0.4900,0.5547,0.5266,0.5018,0.7319,0.4926,0.8030,1
0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0
0.6466,0.6930,0.6677,0.5961,0.7601,0.6467,0.2669,0
0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1
0.2634,0.2932,0.2414,0.4127,0.5521,0.2752,0.2825,1
0.8175,0.9501,0.9515,0.5925,0.9245,0.8162,0.0000,0
0.3174,0.3588,0.3601,0.3908,0.6921,0.3261,0.8510,1
0.3130,0.3050,0.2150,0.5189,0.3974,0.3159,0.4570,1
0.5120,0.5237,0.4409,0.6235,0.5460,0.5111,0.3155,1


### Part 1: Cross-validation without parameter tuning

In [20]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Ensure y is an integer array
y = np.array(y, dtype=int)  # Convert y to integer type

# Define StratifiedKFold with 10 splits and random_state=0
cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [21]:
from sklearn.linear_model import LogisticRegression
# Function for Logistic Regression Classifier
def logregClassifier(X, y):
    model = LogisticRegression(random_state=0, max_iter=1000)  # Ensure convergence with more iterations
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

In [22]:
from sklearn.naive_bayes import GaussianNB

# Function for Naïve Bayes Classifier
def nbClassifier(X, y):
    model = GaussianNB()
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Function for Decision Tree Classifier
def dtClassifier(X, y):
    model = DecisionTreeClassifier(criterion="entropy", random_state=0)  # Using Information Gain (Entropy)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

In [24]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Function for Bagging with Decision Trees
def bagDTClassifier(X, y, n_estimators=50, max_samples=1.0, max_depth=None):
    base_tree = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    model = BaggingClassifier(base_tree, n_estimators=n_estimators, max_samples=max_samples, random_state=0)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

# Function for AdaBoost with Decision Trees
def adaDTClassifier(X, y, n_estimators=50, learning_rate=1.0, max_depth=1):
    base_tree = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    model = AdaBoostClassifier(base_tree, n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

# Function for Gradient Boosting
def gbClassifier(X, y, n_estimators=50, learning_rate=0.1):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(model, X, y, cv=cvKFold, scoring='accuracy')
    return scores.mean()

### Part 1 Results

In [25]:
# Parameters for Part 1:

# Bagging
bag_n_estimators = 50
bag_max_samples = 100
bag_max_depth = 5

# AdaBoost
ada_n_estimators = 50
ada_learning_rate = 0.5
ada_max_depth = 5

# Gradient Boosting
gb_n_estimators = 50
gb_learning_rate = 0.5

# Run classifiers and store results
logreg_score = logregClassifier(X, y)
nb_score = nbClassifier(X, y)
dt_score = dtClassifier(X, y)
bag_score = bagDTClassifier(X, y, n_estimators=bag_n_estimators, max_samples=0.8, max_depth=bag_max_depth)
ada_score = adaDTClassifier(X, y, n_estimators=ada_n_estimators, learning_rate=ada_learning_rate, max_depth=ada_max_depth)
gb_score = gbClassifier(X, y, n_estimators=gb_n_estimators, learning_rate=gb_learning_rate)

# Print results for each classifier in Part 1 to 4 decimal places
print(f"LR average cross-validation accuracy: {logreg_score:.4f}")
print(f"NB average cross-validation accuracy: {nb_score:.4f}")
print(f"DT average cross-validation accuracy: {dt_score:.4f}")
print(f"Bagging average cross-validation accuracy: {bag_score:.4f}")
print(f"AdaBoost average cross-validation accuracy: {ada_score:.4f}")
print(f"GB average cross-validation accuracy: {gb_score:.4f}")

LogR average cross-validation accuracy: 0.9386
NB average cross-validation accuracy: 0.9264
DT average cross-validation accuracy: 0.9179
Bagging average cross-validation accuracy: 0.9400
AdaBoost average cross-validation accuracy: 0.9407
GB average cross-validation accuracy: 0.9321


### Part 2: Cross-validation with parameter tuning

In [57]:
# KNN
k = [1, 3, 5, 7]
p = [1, 2]


def bestKNNClassifier(X, y):
    
    return #(appropriate values so that the required printing can be done)

In [58]:
# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'
C = [0.01, 0.1, 1, 5] 
gamma = [0.01, 0.1, 1, 10]

def bestSVMClassifier(X, y):

    return  #(appropriate values so that the required printing can be done)

In [59]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):

    return #(appropriate values so that the required printing can be done)

### Part 2: Results

In [60]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", n_estimators" and "max_leaf_nodes" which should be printed as integers.
print("KNN best k: ")
print("KNN best p: ")
print("KNN cross-validation accuracy: ")
print("KNN test set accuracy: ")

print()

print("SVM best C: ")
print("SVM best gamma: ")
print("SVM cross-validation accuracy: ")
print("SVM test set accuracy: ")

print()

print("RF best n_estimators: ")
print("RF best max_leaf_nodes: ")
print("RF cross-validation accuracy: ")
print("RF test set accuracy: ")
print("RF test set macro average F1: ")
print("RF test set weighted average F1: ")

KNN best k: 
KNN best p: 
KNN cross-validation accuracy: 
KNN test set accuracy: 

SVM best C: 
SVM best gamma: 
SVM cross-validation accuracy: 
SVM test set accuracy: 

RF best n_estimators: 
RF best max_leaf_nodes: 
RF cross-validation accuracy: 
RF test set accuracy: 
RF test set macro average F1: 
RF test set weighted average F1: 


### Part 3: Reflection

##### Write one paragraph describing the most important thing that you have learned throughout this assignment.
##### Student 1: ...
##### Student 2: ...