# Mandatory Assignment 3 DAT200 NMBU 2025

In [70]:
# Import necessary libraries
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

## 1. Load the dataset and explore it

In [71]:
# Read the labelled training data and the unlabelled test data from disk,
# then preview the first rows of the training frame.
train_df, test_df = map(pd.read_csv, ('assets/train.csv', 'assets/test.csv'))
train_df.head()

Unnamed: 0,Size,Weight,Sweetness,Softness,HarvestTime,Ripeness,Acidity,Peel Thickness,Banana Density,Quality
0,-1.825734,-0.883754,-2.42353,-1.198136,-4.286523,1.585792,-0.589002,0.164434,65822.758426,0
1,-0.142286,-0.708374,-2.224219,2.22265,1.896814,-4.284821,1.069387,0.704945,99930.329162,0
2,-1.957254,-4.293733,-1.073703,-1.405019,-0.729812,3.930497,-0.39784,0.85281,37081.567982,0
3,-2.168043,3.095472,1.707717,-0.584218,-0.564767,0.01474,-0.103799,0.582909,63931.435551,1
4,-3.149338,3.058402,2.173671,-0.265609,-2.56322,0.376015,1.434252,0.102162,48711.448819,1


In [72]:
# DataFrame.info() prints its summary and returns None, so call it as two
# plain statements; a tuple expression would make the cell echo (None, None).
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Size            2800 non-null   float64
 1   Weight          2800 non-null   float64
 2   Sweetness       2800 non-null   float64
 3   Softness        2800 non-null   float64
 4   HarvestTime     2800 non-null   float64
 5   Ripeness        2800 non-null   float64
 6   Acidity         2800 non-null   float64
 7   Peel Thickness  2800 non-null   float64
 8   Banana Density  2800 non-null   float64
 9   Quality         2800 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 218.9 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Size            1200 non-null   float64
 1   Weight          1200 non-null   float64
 2   Sweetness 

(None, None)

## 2. Preprocess the data

In [73]:
# Separate the target column from the feature columns.
y_train_df = train_df['Quality']
X_train_df = train_df.drop(columns='Quality')

# Hold out 20% of the labelled rows for evaluating the candidate models.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_df,
    y_train_df,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler learns its statistics from the
# training split only and is then applied to both splits.
scalar = StandardScaler()
scalar.fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

## 3. Model selection

We will try:
* 3.1 RandomForest Classifier
* 3.2 Support Vector Classifier
* 3.3 Logistic Regression
* 3.4 K-Neighbors
* 3.5 DecisionTreeClassifier
* 3.6 GridSearchCV for best parameters

First we'll compare the classifiers with their default settings to see which ones perform best, and then use GridSearchCV to find the best parameters for the most promising models.


### 3.1 RandomForest Classifier

For the RandomForest we don't need the scaled data, because tree-based models split on thresholds and are not affected by feature scaling.

In [74]:
# Baseline random forest on the unscaled features.
# A fixed random_state makes the reported scores reproducible on re-run
# (same seed value as the train/hold-out split).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the hold-out split.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy}, F1: {f1}')

Accuracy: 0.9678571428571429, F1: 0.9678522207719291


### 3.2 Support Vector Classifier

In [75]:
# Baseline support vector classifier with default settings, trained on the
# standardized features (fit() returns the fitted estimator itself).
clf = svm.SVC().fit(X_train_scaled, y_train)

# Score the predictions for the standardized hold-out split.
y_pred = clf.predict(X_test_scaled)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}, F1: {f1}')

Accuracy: 0.9696428571428571, F1: 0.9696419856987479


### 3.3 Logistic Regression

In [76]:
# Baseline logistic regression with default settings on the standardized
# features.
clf = LogisticRegression()
clf.fit(X=X_train_scaled, y=y_train)

# Predict the hold-out split and report both metrics.
y_pred = clf.predict(X_test_scaled)
accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)
print(f'Accuracy: {accuracy}, F1: {f1}')

Accuracy: 0.8946428571428572, F1: 0.8946398327191836


### 3.4 K-neighbors Classifier

In [77]:
# Baseline k-nearest-neighbours classifier with default settings, trained on
# the standardized features.
clf = KNeighborsClassifier().fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

# Hold-out metrics.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f'Accuracy: {accuracy}, F1: {f1}')

Accuracy: 0.9571428571428572, F1: 0.9571444972728282


### 3.5 DecisionTree Classifier

In [78]:
# Baseline decision tree on the unscaled features.
# random_state is fixed so that tie-breaking between equally good splits,
# and therefore the reported score, is reproducible on re-run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the hold-out split.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy}, F1: {f1}')

Accuracy: 0.9214285714285714, F1: 0.9214315780266876


### 3.6 GridSearchCV

After experimenting with the different models, RandomForest and SVC seem the most promising, with K-Neighbors close behind. Therefore, we'll use GridSearchCV on these models.

In [79]:
# Grid search for Random Forest
# NOTE: this grid has 5 * 2 * 6 * 3 * 3 * 2 * 2 = 2160 combinations, i.e.
# 10800 fits with cv=5, so this cell is slow to run.
param_grid = {
    'n_estimators': [10, 20, 50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Seed the forest so that the cross-validation scores, and hence the selected
# parameters, are reproducible on re-run.
rf = RandomForestClassifier(random_state=42)
model_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)
model_rf.fit(X_train, y_train)

# Test the refitted best model on the hold-out split
y_pred = model_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy}, F1: {f1}')


Accuracy: 0.9607142857142857, F1: 0.9607037561741395


In [80]:
# Save the best random-forest parameters to a JSON file.
# The fitted search object is `model_rf`; the previously used name
# `grid_search` is not defined anywhere in this notebook.
with open('best_params_random.json', 'w') as f:
    json.dump(model_rf.best_params_, f)

In [81]:
# Grid search for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

clf = svm.SVC()
model_svm = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)
# The SVM is trained on the standardized features.
model_svm.fit(X_train_scaled, y_train)

# Test the refitted best model on the standardized hold-out split
y_pred = model_svm.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy}, F1: {f1}')

# Save best parameters to a JSON file.
# Use this cell's own search object (`model_svm`); `grid_search` is not
# defined anywhere in this notebook.
with open('best_params_svm.json', 'w') as f:
    json.dump(model_svm.best_params_, f)

Accuracy: 0.9696428571428571, F1: 0.9696419856987479


In [82]:
# Grid search for K-Neighbors.
# KNeighborsClassifier, GridSearchCV, json and the metric functions are
# already imported in the import cell at the top of the notebook.

# Define the parameter grid for KNeighborsClassifier
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# Initialize the classifier
clf = KNeighborsClassifier()

# Setup GridSearchCV using the parameter grid; KNN is trained on the
# standardized features.
model_knn = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0)
model_knn.fit(X_train_scaled, y_train)

# Test the refitted best model on the standardized hold-out split
y_pred = model_knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy}, F1: {f1}')

# Save best parameters to a JSON file.
# Use this cell's own search object (`model_knn`); `grid_search` is not
# defined anywhere in this notebook.
with open('best_params_knn.json', 'w') as f:
    json.dump(model_knn.best_params_, f)


Accuracy: 0.9696428571428571, F1: 0.9696419856987479


In [84]:
def save_submission(model, df_test, file_name="submission.csv"):
    """Predict labels for ``df_test`` with ``model`` and write them to a CSV file.

    Parameters
    ----------
    model
        Object exposing a ``predict`` method; it is called once with ``df_test``.
    df_test
        Features passed unchanged to ``model.predict``.
    file_name : str, default "submission.csv"
        Path of the CSV file to write.

    The file has an index column named "ID" and a single "Quality" column
    holding the predictions.
    """
    predictions = model.predict(df_test)
    pd.DataFrame(predictions, columns=["Quality"]).rename_axis("ID").to_csv(file_name)

# The random forest was fitted on the raw features, so it predicts on the raw
# test frame.
save_submission(model_rf, test_df, file_name="submission_rf.csv")

# The SVM and KNN searches were fitted on standardized features, so the test
# frame must go through the same fitted scaler before prediction; feeding the
# raw frame would give predictions on a different feature scale.
test_df_scaled = scalar.transform(test_df)
save_submission(model_svm, test_df_scaled, file_name="submission_svm.csv")
save_submission(model_knn, test_df_scaled, file_name="submission_knn.csv")

