In [100]:
# CML3014N Machine Learning Assignment 2
# Data Source:
#     1. Student Performance https://archive.ics.uci.edu/dataset/320/student+performance
#     2. Wine Quality https://archive.ics.uci.edu/dataset/186/wine+quality

# Dataset Description:
#    1. Student Performance mixed dataset with both numerical and categorical data
#       G1, G2, G3 - numerical from 0 to 20 (the target variable. G3 is the final grade)
#    2. Wine Quality dataset with numerical continuous
#       quality - score between 0 and 10 (the target variable)

# Import Necessary Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pd.set_option("display.max_columns", None)

In [101]:
# Step 1: Load the dataset
wineRed_df = pd.read_csv('data/winequality-red.csv', delimiter=';')
wineWhite_df = pd.read_csv('data/winequality-white.csv', delimiter=';')
stdMath_df = pd.read_csv('data/student-mat.csv', delimiter=';')
stdPor_df = pd.read_csv('data/student-por.csv', delimiter=';')

# Display the first few rows of the dataset
print("Wine Red Dataset")
print("Total number of columns: ", wineRed_df.shape[1], "Total number of rows: ", wineRed_df.shape[0])
display(wineRed_df.head())

print("Wine White Dataset")
print("Total number of columns: ", wineWhite_df.shape[1], "Total number of rows: ", wineWhite_df.shape[0])
display(wineWhite_df.head())

print("Student Math Dataset")
print("Total number of columns: ", stdMath_df.shape[1], "Total number of rows: ", stdMath_df.shape[0])
display(stdMath_df.head())

print("Student Portuguese Dataset")
print("Total number of columns: ", stdPor_df.shape[1], "Total number of rows: ", stdPor_df.shape[0])
display(stdPor_df.head())

Wine Red Dataset
Total number of columns:  12 Total number of rows:  1599


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Wine White Dataset
Total number of columns:  12 Total number of rows:  4898


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Student Math Dataset
Total number of columns:  33 Total number of rows:  395


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


Student Portuguese Dataset
Total number of columns:  33 Total number of rows:  649


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


In [102]:
# Step 2: Preprocess the student dataset
# Handle categorical data using label encoder and one hot encoder
# Column to process School, sex, address, famsize, pstatus, medu, fedu, mjob, fjob, reason, guardian, schoolsup, famsup, paid, activities, nursery, higher, internet, romantic

# First encode the categorical data
columns = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
numerical_columns = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']
# Label encoding for categorical data
labelEncoder = LabelEncoder()
stdMath_df_le = stdMath_df.copy()
stdPor_df_le = stdPor_df.copy()
for column in columns:
    stdMath_df_le[column] = labelEncoder.fit_transform(stdMath_df[column])
    stdPor_df_le[column] = labelEncoder.fit_transform(stdPor_df[column])

oneHotEncoder = OneHotEncoder(sparse_output=False)
stdMath_df_ohe = pd.DataFrame(oneHotEncoder.fit_transform(stdMath_df[columns]), columns=oneHotEncoder.get_feature_names_out(columns))
stdPor_df_ohe = pd.DataFrame(oneHotEncoder.fit_transform(stdPor_df[columns]), columns=oneHotEncoder.get_feature_names_out(columns))
stdMath_df_ohe = pd.concat([stdMath_df_ohe, stdMath_df[numerical_columns]], axis=1)
stdPor_df_ohe = pd.concat([stdPor_df_ohe, stdPor_df[numerical_columns]], axis=1)

In [103]:
# Step 3: Split the dataset into training and testing
# * Use 80% of the data for training and 20% for testing
# * For each dataset, split into input features and target
# * For wine dataset, the target is quality
# * For student dataset, the target is G3

# Step 3.1 Split the wine dataset with train_test_split, x is features, y is target
wineRed_x = wineRed_df.drop(columns=['quality'])
wineRed_y = wineRed_df['quality']  # Target variable
wineRed_x_train, wineRed_x_test, wineRed_y_train, wineRed_y_test = train_test_split(wineRed_x, wineRed_y, test_size=0.2, random_state=42)

wineWhite_x = wineWhite_df.drop(columns=['quality'])
wineWhite_y = wineWhite_df['quality']  # Target variable
wineWhite_x_train, wineWhite_x_test, wineWhite_y_train, wineWhite_y_test = train_test_split(wineWhite_x, wineWhite_y, test_size=0.2, random_state=42)

# Step 3.2 Split the student dataset with train_test_split, x is features, y is target
# * Use the label encoded dataset
stdMath_x_le = stdMath_df_le.drop(columns=['G3'])
stdMath_y_le = stdMath_df_le['G3']
stdMath_x_train_le, stdMath_x_test_le, stdMath_y_train_le, stdMath_y_test_le = train_test_split(stdMath_x_le, stdMath_y_le, test_size=0.2, random_state=42)

stdPor_x_le = stdPor_df_le.drop(columns=['G3'])
stdPor_y_le = stdPor_df_le['G3']
stdPor_x_train_le, stdPor_x_test_le, stdPor_y_train_le, stdPor_y_test_le = train_test_split(stdPor_x_le, stdPor_y_le, test_size=0.2, random_state=42)

# * Use the one hot encoded dataset
stdMath_x_ohe = stdMath_df_ohe.drop(columns=['G3'])
stdMath_y_ohe = stdMath_df_ohe['G3']
stdMath_x_train_ohe, stdMath_x_test_ohe, stdMath_y_train_ohe, stdMath_y_test_ohe = train_test_split(stdMath_x_ohe, stdMath_y_ohe, test_size=0.2, random_state=42)

stdPor_x_ohe = stdPor_df_ohe.drop(columns=['G3'])
stdPor_y_ohe = stdPor_df_ohe['G3']
stdPor_x_train_ohe, stdPor_x_test_ohe, stdPor_y_train_ohe, stdPor_y_test_ohe = train_test_split(stdPor_x_ohe, stdPor_y_ohe, test_size=0.2, random_state=42)

In [104]:
# Step 4: Train, Evaluate, and Visualize with Random Forest
# * Train the Gaussian Naive Bayes model
# * Both dataset target are continuous, use MLPClassifier
# * Evaluate the model with Classification report and Confusion matrix
# * Plot the confusion matrix

result = pd.DataFrame(columns=['Accuracy'])


# A function to train and evaluate Random Forest models
def train_and_evaluate(model, x_train, y_train, x_test, y_test, tittle):
    # Train the Random Forest model
    model.fit(x_train, y_train)

    # * Evaluate the model with error based metrics (Accuracy, Precision, Recall, F1 Score)
    y_pred = model.predict(x_test)
    # Classification report
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: ", accuracy)
    print("Classification Report for ", tittle)
    print(classification_report(y_test, y_pred))
    result.loc[tittle] = [accuracy]

    # * Plot the prediction vs actual target (Confusion Matrix)
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Reds')
    plt.title(tittle)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig('output/2. RandomForest ' + tittle + '.png', dpi=600)
    plt.close()


# Step 4.1 Build Random Forest Regressors
# Define a general-purpose Random Forest model
# For wine dataset
# 11 features with 4898 samples
wineModel = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
    max_features='sqrt',
)

# For student dataset
# 30 features with 649 samples
stdModel = RandomForestClassifier(
    n_estimators=600,
    max_depth=100,
    min_samples_split=24,
    min_samples_leaf=12,
    random_state=42,
    n_jobs=-1,
    max_features=None,
)

# Standard Random Forest model
standardModel = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
    max_features='sqrt',
)

In [105]:
# Step 4.2 Train and evaluate the Random Forest model
# For red Wine dataset


# ! Possible Combination
# ! 1. ori feature - ori target
# 1. ori feature - ori target
# train_and_evaluate(wineModel, wineRed_x_train, wineRed_y_train, wineRed_x_test, wineRed_y_test, 'Wine Red [wineModel]')

# For white Wine dataset
# train_and_evaluate(wineModel, wineWhite_x_train, wineWhite_y_train, wineWhite_x_test, wineWhite_y_test, 'Wine White [wineModel]')

# For student dataset
# ! Possible Combination
# ! 1. le feature - le target
# ! 2. ohe feature - ohe target
# For Math Student dataset
# 1. le feature - le target
train_and_evaluate(stdModel, stdMath_x_train_le, stdMath_y_train_le, stdMath_x_test_le, stdMath_y_test_le, 'Student Math [stdModel] - [le]')
# 2. ohe feature - ohe target
train_and_evaluate(stdModel, stdMath_x_train_ohe, stdMath_y_train_ohe, stdMath_x_test_ohe, stdMath_y_test_ohe, 'Student Math [stdModel] - [ohe]')

# For Portuguese Student dataset
# 1. le feature - le target
train_and_evaluate(stdModel, stdPor_x_train_le, stdPor_y_train_le, stdPor_x_test_le, stdPor_y_test_le, 'Student Portuguese [stdModel] - [le]')
# 2. ohe feature - ohe target
train_and_evaluate(stdModel, stdPor_x_train_ohe, stdPor_y_train_ohe, stdPor_x_test_ohe, stdPor_y_test_ohe, 'Student Portuguese [stdModel] - [ohe]')

Accuracy:  0.4050632911392405
Classification Report for  Student Math [stdModel] - [le]
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         1
           8       0.06      0.17      0.09         6
           9       0.00      0.00      0.00         5
          10       0.38      0.55      0.44        11
          11       0.57      0.80      0.67         5
          12       0.33      0.20      0.25         5
          13       0.57      0.80      0.67         5
          14       0.57      0.67      0.62         6
          15       0.70      0.70      0.70        10
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.14      1.00      0.25         1
          19       0.00      0.00      0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.4177215189873418
Classification Report for  Student Math [stdModel] - [ohe]
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         1
           8       0.06      0.17      0.09         6
           9       0.50      0.40      0.44         5
          10       0.42      0.45      0.43        11
          11       0.57      0.80      0.67         5
          12       0.33      0.20      0.25         5
          13       0.57      0.80      0.67         5
          14       0.57      0.67      0.62         6
          15       0.70      0.70      0.70        10
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.14      1.00      0.25         1
          19       0.00      0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.5153846153846153
Classification Report for  Student Portuguese [stdModel] - [le]
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           7       0.00      0.00      0.00         1
           8       0.56      0.71      0.62         7
           9       0.00      0.00      0.00         5
          10       0.52      0.71      0.60        17
          11       0.67      0.72      0.69        25
          12       0.43      0.19      0.26        16
          13       0.42      0.62      0.50        13
          14       0.38      0.25      0.30        12
          15       0.53      1.00      0.69        10
          16       0.80      0.44      0.57         9
          17       0.27      0.60      0.38         5
          18       0.00      0.00      0.00         7
          19       0.00      0.00      0.00         1

    accuracy                           0.52       130
   macro avg       0.36      0.41      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.5076923076923077
Classification Report for  Student Portuguese [stdModel] - [ohe]
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           7       0.00      0.00      0.00         1
           8       0.56      0.71      0.62         7
           9       0.00      0.00      0.00         5
          10       0.52      0.71      0.60        17
          11       0.67      0.72      0.69        25
          12       0.33      0.12      0.18        16
          13       0.37      0.77      0.50        13
          14       1.00      0.08      0.15        12
          15       0.53      1.00      0.69        10
          16       0.80      0.44      0.57         9
          17       0.27      0.60      0.38         5
          18       0.00      0.00      0.00         7
          19       0.00      0.00      0.00         1

    accuracy                           0.51       130
   macro avg       0.40      0.40      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [106]:
# Step 4.2 Train and evaluate the Random Forest model
# ! Possible Combination
# ! 1. ori feature - ori target

# For red Wine dataset
train_and_evaluate(standardModel, wineRed_x_train, wineRed_y_train, wineRed_x_test, wineRed_y_test, 'Wine Red [standardModel]')

# For white Wine dataset
train_and_evaluate(standardModel, wineWhite_x_train, wineWhite_y_train, wineWhite_x_test, wineWhite_y_test, 'Wine White [standardModel]')


# ! Possible Combination
# ! 1. le feature - le target
# ! 2. ohe feature - ohe target
# For student dataset


# For student dataset
# For Math Student dataset
# 1. le feature - le target
train_and_evaluate(standardModel, stdMath_x_train_le, stdMath_y_train_le, stdMath_x_test_le, stdMath_y_test_le, 'Student Math [standardModel] - [le]')
# 2. ohe feature - ohe target
train_and_evaluate(standardModel, stdMath_x_train_ohe, stdMath_y_train_ohe, stdMath_x_test_ohe, stdMath_y_test_ohe, 'Student Math [standardModel] - [ohe]')

# For Portuguese Student dataset
# 1. le feature - le target
train_and_evaluate(standardModel, stdPor_x_train_le, stdPor_y_train_le, stdPor_x_test_le, stdPor_y_test_le, 'Student Portuguese [standardModel] - [le]')
# 2. ohe feature - ohe target
train_and_evaluate(standardModel, stdPor_x_train_ohe, stdPor_y_train_ohe, stdPor_x_test_ohe, stdPor_y_test_ohe, 'Student Portuguese [standardModel] - [ohe]')

Accuracy:  0.659375
Classification Report for  Wine Red [standardModel]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        10
           5       0.72      0.75      0.73       130
           6       0.63      0.69      0.66       132
           7       0.63      0.52      0.57        42
           8       0.00      0.00      0.00         5

    accuracy                           0.66       320
   macro avg       0.33      0.33      0.33       320
weighted avg       0.63      0.66      0.64       320



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.689795918367347
Classification Report for  Wine White [standardModel]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.60      0.24      0.34        25
           5       0.70      0.69      0.70       291
           6       0.66      0.79      0.72       432
           7       0.76      0.58      0.66       192
           8       0.80      0.46      0.58        35

    accuracy                           0.69       980
   macro avg       0.59      0.46      0.50       980
weighted avg       0.69      0.69      0.68       980



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.35443037974683544
Classification Report for  Student Math [standardModel] - [le]
              precision    recall  f1-score   support

           0       0.43      0.60      0.50         5
           5       0.00      0.00      0.00         4
           6       1.00      0.33      0.50         6
           7       0.00      0.00      0.00         1
           8       0.10      0.17      0.12         6
           9       0.00      0.00      0.00         5
          10       0.39      0.64      0.48        11
          11       0.12      0.20      0.15         5
          12       0.25      0.20      0.22         5
          13       0.40      0.40      0.40         5
          14       0.50      0.50      0.50         6
          15       0.54      0.70      0.61        10
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.25      1.00      0.40         1
          19       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.31645569620253167
Classification Report for  Student Math [standardModel] - [ohe]
              precision    recall  f1-score   support

           0       0.33      0.60      0.43         5
           5       0.00      0.00      0.00         4
           6       1.00      0.17      0.29         6
           7       0.00      0.00      0.00         1
           8       0.18      0.33      0.24         6
           9       0.00      0.00      0.00         5
          10       0.44      0.73      0.55        11
          11       0.25      0.40      0.31         5
          12       0.00      0.00      0.00         5
          13       0.14      0.20      0.17         5
          14       0.33      0.33      0.33         6
          15       0.50      0.50      0.50        10
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.25      1.00      0.40         1
          19       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.4230769230769231
Classification Report for  Student Portuguese [standardModel] - [le]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         1
           8       0.67      0.57      0.62         7
           9       0.00      0.00      0.00         5
          10       0.35      0.47      0.40        17
          11       0.59      0.64      0.62        25
          12       0.00      0.00      0.00        16
          13       0.33      0.69      0.45        13
          14       0.33      0.08      0.13        12
          15       0.42      0.80      0.55        10
          16       0.60      0.33      0.43         9
          17       0.29      0.40      0.33         5
          18       1.00      0.43      0.60         7
          19       0.00      0.00      0.00         1

    accuracy                           0.42       130
   macro avg       0.40      0.35  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.4307692307692308
Classification Report for  Student Portuguese [standardModel] - [ohe]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         1
           8       0.50      0.57      0.53         7
           9       0.00      0.00      0.00         5
          10       0.38      0.47      0.42        17
          11       0.56      0.60      0.58        25
          12       0.23      0.19      0.21        16
          13       0.45      0.69      0.55        13
          14       0.29      0.17      0.21        12
          15       0.50      0.80      0.62        10
          16       0.75      0.33      0.46         9
          17       0.22      0.40      0.29         5
          18       1.00      0.29      0.44         7
          19       0.00      0.00      0.00         1

    accuracy                           0.43       130
   macro avg       0.35      0.32 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [107]:
# Step 4.2 Train and evaluate the Random Forest model
# ! Possible Combination
# ! 1. ori feature - ori target

# For red Wine dataset
train_and_evaluate(standardModel, wineRed_x_train, wineRed_y_train, wineRed_x_test, wineRed_y_test, 'Wine Red [standardModel]')

# For white Wine dataset
train_and_evaluate(standardModel, wineWhite_x_train, wineWhite_y_train, wineWhite_x_test, wineWhite_y_test, 'Wine White [standardModel]')


# ! Possible Combination
# ! 1. le feature - le target
# ! 2. ohe feature - ohe target
# For student dataset


# For student dataset
# For Math Student dataset
# 1. le feature - le target
train_and_evaluate(standardModel, stdMath_x_train_le, stdMath_y_train_le, stdMath_x_test_le, stdMath_y_test_le, 'Student Math [standardModel]')
# 2. ohe feature - ohe target
train_and_evaluate(standardModel, stdMath_x_train_ohe, stdMath_y_train_ohe, stdMath_x_test_ohe, stdMath_y_test_ohe, 'Student Math [standardModel]')


# For Portuguese Student dataset
# 1. le feature - le target
train_and_evaluate(standardModel, stdPor_x_train_le, stdPor_y_train_le, stdPor_x_test_le, stdPor_y_test_le, 'Student Portuguese [standardModel]')
# 2. ohe feature - ohe target
train_and_evaluate(standardModel, stdPor_x_train_ohe, stdPor_y_train_ohe, stdPor_x_test_ohe, stdPor_y_test_ohe, 'Student Portuguese [standardModel]')

Accuracy:  0.659375
Classification Report for  Wine Red [standardModel]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        10
           5       0.72      0.75      0.73       130
           6       0.63      0.69      0.66       132
           7       0.63      0.52      0.57        42
           8       0.00      0.00      0.00         5

    accuracy                           0.66       320
   macro avg       0.33      0.33      0.33       320
weighted avg       0.63      0.66      0.64       320



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.689795918367347
Classification Report for  Wine White [standardModel]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.60      0.24      0.34        25
           5       0.70      0.69      0.70       291
           6       0.66      0.79      0.72       432
           7       0.76      0.58      0.66       192
           8       0.80      0.46      0.58        35

    accuracy                           0.69       980
   macro avg       0.59      0.46      0.50       980
weighted avg       0.69      0.69      0.68       980



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.35443037974683544
Classification Report for  Student Math [standardModel]
              precision    recall  f1-score   support

           0       0.43      0.60      0.50         5
           5       0.00      0.00      0.00         4
           6       1.00      0.33      0.50         6
           7       0.00      0.00      0.00         1
           8       0.10      0.17      0.12         6
           9       0.00      0.00      0.00         5
          10       0.39      0.64      0.48        11
          11       0.12      0.20      0.15         5
          12       0.25      0.20      0.22         5
          13       0.40      0.40      0.40         5
          14       0.50      0.50      0.50         6
          15       0.54      0.70      0.61        10
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.25      1.00      0.40         1
          19       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.31645569620253167
Classification Report for  Student Math [standardModel]
              precision    recall  f1-score   support

           0       0.33      0.60      0.43         5
           5       0.00      0.00      0.00         4
           6       1.00      0.17      0.29         6
           7       0.00      0.00      0.00         1
           8       0.18      0.33      0.24         6
           9       0.00      0.00      0.00         5
          10       0.44      0.73      0.55        11
          11       0.25      0.40      0.31         5
          12       0.00      0.00      0.00         5
          13       0.14      0.20      0.17         5
          14       0.33      0.33      0.33         6
          15       0.50      0.50      0.50        10
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.25      1.00      0.40         1
          19       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.4230769230769231
Classification Report for  Student Portuguese [standardModel]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         1
           8       0.67      0.57      0.62         7
           9       0.00      0.00      0.00         5
          10       0.35      0.47      0.40        17
          11       0.59      0.64      0.62        25
          12       0.00      0.00      0.00        16
          13       0.33      0.69      0.45        13
          14       0.33      0.08      0.13        12
          15       0.42      0.80      0.55        10
          16       0.60      0.33      0.43         9
          17       0.29      0.40      0.33         5
          18       1.00      0.43      0.60         7
          19       0.00      0.00      0.00         1

    accuracy                           0.42       130
   macro avg       0.40      0.35      0.3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  0.4307692307692308
Classification Report for  Student Portuguese [standardModel]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         1
           8       0.50      0.57      0.53         7
           9       0.00      0.00      0.00         5
          10       0.38      0.47      0.42        17
          11       0.56      0.60      0.58        25
          12       0.23      0.19      0.21        16
          13       0.45      0.69      0.55        13
          14       0.29      0.17      0.21        12
          15       0.50      0.80      0.62        10
          16       0.75      0.33      0.46         9
          17       0.22      0.40      0.29         5
          18       1.00      0.29      0.44         7
          19       0.00      0.00      0.00         1

    accuracy                           0.43       130
   macro avg       0.35      0.32      0.3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [108]:
# Step 5: Compare the result
# Compare the result
pd.set_option("display.max_rows", None)
display(result)

# Step 6: Save the result
result.to_csv("output/2. Random Forest Evaluation Result.csv")

Unnamed: 0,Accuracy
Student Math [stdModel] - [le],0.405063
Student Math [stdModel] - [ohe],0.417722
Student Portuguese [stdModel] - [le],0.515385
Student Portuguese [stdModel] - [ohe],0.507692
Wine Red [standardModel],0.659375
Wine White [standardModel],0.689796
Student Math [standardModel] - [le],0.35443
Student Math [standardModel] - [ohe],0.316456
Student Portuguese [standardModel] - [le],0.423077
Student Portuguese [standardModel] - [ohe],0.430769
