# The next block of code installs the packages required for the homework

In [2]:
!pip install numpy pandas scikit-learn mlxtend
#numpy – for numerical operations.
#pandas – for data manipulation and analysis.
#scikit-learn – for machine learning tasks, including model creation and data splitting.
#mlxtend – for advanced machine learning utilities, specifically the SequentialFeatureSelector



#The next block of code imports the necessary libraries

In [3]:
import numpy as np             #library for numerical arrays and operations
import pandas as pd            #library for tabular data loading, manipulation and analysis
from sklearn.ensemble import RandomForestClassifier    #RandomForestClassifier is the model used for selection and evaluation
from sklearn.model_selection import train_test_split   #train_test_split splits the dataset into training and testing sets
from sklearn.metrics import accuracy_score as acc      #accuracy_score (aliased as acc) measures classification accuracy
from mlxtend.feature_selection import SequentialFeatureSelector as sfs   #SequentialFeatureSelector (aliased as sfs) performs step-forward feature selection

In [None]:
#The next block of code reads the wine-quality dataset from the Raw folder within the Data Folder

In [4]:
import os
import pandas as pd

# Location of the raw data, expressed relative to the Codes directory:
# one level up, then Data/Raw/<file>.
file_name = 'winequality-white.csv'
data_folder = os.path.join('..', 'Data', 'Raw')
file_path = os.path.join(data_folder, file_name)

# The file is semicolon-delimited rather than comma-delimited.
df = pd.read_csv(
    file_path,
    sep=';',
)

# Report which file was read
print("Dataset loaded successfully from:", file_path)


Dataset loaded successfully from: ..\Data\Raw\winequality-white.csv


#The next block of code computes descriptive statistics for the wine-quality dataset and saves them to the Results folder

In [5]:
import os          # For handling file paths and directories
import pandas as pd  # For data manipulation and analysis

# `df` is the wine-quality DataFrame loaded by the previous cell; it is reused
# here instead of reading the same CSV from disk a second time.

# Summary statistics (count, mean, standard deviation, min, quartiles, max)
# for each numeric column in the dataset.
description = df.describe()

# Output folder for the results: one directory up from Codes, then Results.
output_folder = os.path.join('..', 'Results')
os.makedirs(output_folder, exist_ok=True)  # Create the Results folder if it doesn't already exist

# File name for the descriptive statistics, following the class naming convention
output_file_name = 'EH_Description_02_07_2025.csv'
output_file_path = os.path.join(output_folder, output_file_name)

# Save the `description` DataFrame to the CSV file.
description.to_csv(output_file_path)

# Confirm where the statistics were written
print("Descriptive statistics saved to:", output_file_path)


Dataset loaded successfully from: ..\Data\Raw\winequality-white.csv
Descriptive statistics saved to: ..\Results\EH_Description_02_07_2025.csv


#The next block of code splits the dataset into training and testing sets

In [6]:
# Pull the underlying array once, then separate features from target.
data_matrix = df.values
feature_matrix = data_matrix[:, :-1]   # every column except the last
target_vector = data_matrix[:, -1]     # last column ('quality')

# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.25,
    random_state=42,
)


#The next block of code flattens the target arrays

In [7]:
# Make sure both target arrays are one-dimensional before fitting.
y_train, y_test = y_train.ravel(), y_test.ravel()


#The next block of code prints the dataset shapes. This helps verify that the dataset has been split correctly

In [8]:
# Show (features shape, target shape) for each split as a quick sanity check.
shape_report = (
    ('Training dataset shape:', X_train, y_train),
    ('Testing dataset shape:', X_test, y_test),
)
for heading, split_features, split_labels in shape_report:
    print(heading, split_features.shape, split_labels.shape)


Training dataset shape: (3673, 11) (3673,)
Testing dataset shape: (1225, 11) (1225,)


#The next block of code runs step-forward feature selection to pick 5 features

In [9]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Explanation:
# - Creates a Random Forest classifier with 100 trees.
# - n_jobs=-1 allows the use of all available CPU cores for parallel processing.
# - random_state is fixed so the selection is reproducible: an unseeded forest
#   yields slightly different cross-validation scores on every run, which can
#   change which features end up being selected.

# Build step forward feature selection
sfs1 = sfs(
    clf,                # Classifier to use for feature evaluation
    k_features=5,       # Selects 5 features
    forward=True,       # Step forward feature selection
    floating=False,     # Disables floating selection
    verbose=2,          # Outputs detailed logs of the selection process
    scoring='accuracy', # Uses accuracy to evaluate subsets
    cv=5                # 5-fold cross-validation for validation
)

# Explanation:
# - The feature selection process aims to find the best 5 features using forward selection.
# - It validates each subset using cross-validation and accuracy scoring.

# Perform SFS (Sequential Forward Selection); floating=False, so this is the
# plain forward variant rather than the floating one (SFFS).
sfs1 = sfs1.fit(X_train, y_train)

# Explanation:
# - Trains the feature selector using the training data only.
# - Adds one feature at a time, keeping the one that gives the best
#   cross-validated accuracy, until 5 features are selected.



[2025-02-07 11:19:27] Features: 1/5 -- score: 0.49577304491278806
[2025-02-07 11:19:43] Features: 2/5 -- score: 0.5428842054532985
[2025-02-07 11:19:57] Features: 3/5 -- score: 0.6044034180429666
[2025-02-07 11:20:10] Features: 4/5 -- score: 0.6272694581919962
[2025-02-07 11:20:21] Features: 5/5 -- score: 0.6387054440304732

#The next block of code extracts and displays the indices of the 5 selected features

In [10]:
# Column indices chosen by the selector, converted to a plain list so they
# can be used to slice the feature arrays.
feat_cols = [col_idx for col_idx in sfs1.k_feature_idx_]
print("Selected feature indices:", feat_cols)


Selected feature indices: [1, 3, 5, 7, 10]


#The next block of code evaluates the Full Model Using the 5 Selected Features

In [11]:
# Final model configuration: 1000 trees, depth capped at 4, seeded for reproducibility
rf_params = dict(n_estimators=1000, random_state=42, max_depth=4)
clf = RandomForestClassifier(**rf_params)

# Restrict both splits to the selected columns once and reuse the slices
X_train_sel = X_train[:, feat_cols]
X_test_sel = X_test[:, feat_cols]

# Fit on the training slice, then predict on both splits
clf.fit(X_train_sel, y_train)
y_train_pred = clf.predict(X_train_sel)
y_test_pred = clf.predict(X_test_sel)

# Report accuracy on the training data first, then on the held-out test data
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))


Training accuracy on selected features: 0.562
Testing accuracy on selected features: 0.519


#The testing accuracy obtained with 5 features is 0.519 as seen from above

#The next block evaluates the full model on ALL features

In [12]:
# Same model configuration as for the selected-feature run, so the two
# results are comparable: 1000 trees, depth capped at 4, fixed seed.
rf_params = dict(n_estimators=1000, random_state=42, max_depth=4)
clf = RandomForestClassifier(**rf_params)

# Fit on every feature column, then predict on both splits
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Report accuracy on the training data first, then on the held-out test data
print('Training accuracy on all features: %.3f' % acc(y_train, y_train_pred))
print('Testing accuracy on all features: %.3f' % acc(y_test, y_test_pred))


Training accuracy on all features: 0.566
Testing accuracy on all features: 0.509


#The next block of code modifies Feature Selection to Use k_features=6

In [13]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Explanation:
# - Creates a Random Forest classifier with 100 trees.
# - n_jobs=-1 allows the use of all available CPU cores for parallel processing.
# - random_state is fixed so the selection is reproducible: an unseeded forest
#   yields slightly different cross-validation scores on every run, which can
#   change which features end up being selected.

# Build step forward feature selection
sfs1 = sfs(
    clf,                   # Classifier to use for feature evaluation
    k_features=6,          # Selects 6 features (modified from 5)
    forward=True,          # Step forward feature selection
    floating=False,        # Disables floating selection
    verbose=2,             # Outputs detailed logs of the selection process
    scoring='accuracy',    # Uses accuracy to evaluate subsets
    cv=5                   # 5-fold cross-validation for validation
)

# Explanation:
# - The feature selection process aims to find the best 6 features using forward selection.
# - It validates each subset using cross-validation and accuracy scoring.

# Perform SFS (Sequential Forward Selection); floating=False, so this is the
# plain forward variant rather than the floating one (SFFS).
sfs1 = sfs1.fit(X_train, y_train)

# Explanation:
# - Trains the feature selector using the training data only.
# - Adds one feature at a time, keeping the one that gives the best
#   cross-validated accuracy, until 6 features are selected.



[2025-02-07 11:23:11] Features: 1/6 -- score: 0.4927798476338764
[2025-02-07 11:23:27] Features: 2/6 -- score: 0.5450629298040742
[2025-02-07 11:23:42] Features: 3/6 -- score: 0.6060394075886485
[2025-02-07 11:23:53] Features: 4/6 -- score: 0.6275489814454392
[2025-02-07 11:24:05] Features: 5/6 -- score: 0.6389786650354965
[2025-02-07 11:24:15] Features: 6/6 -- score: 0.642520157926931

#The next block of code extracts and displays the indices of the 6 selected features

In [14]:
# Column indices chosen by the 6-feature selector, as a plain list for slicing.
feat_cols = [col_idx for col_idx in sfs1.k_feature_idx_]
print("Selected feature indices with 6 features:", feat_cols)


Selected feature indices with 6 features: [0, 1, 3, 6, 7, 10]


In [16]:
#The next block of code evaluates the Full Model Using the 6 Selected Features

In [17]:
# Final model configuration: 1000 trees, depth capped at 4, seeded for reproducibility
rf_params = dict(n_estimators=1000, random_state=42, max_depth=4)
clf = RandomForestClassifier(**rf_params)

# Restrict both splits to the 6 selected columns once and reuse the slices
X_train_sel = X_train[:, feat_cols]
X_test_sel = X_test[:, feat_cols]

# Fit on the training slice, then predict on both splits
clf.fit(X_train_sel, y_train)
y_train_pred = clf.predict(X_train_sel)
y_test_pred = clf.predict(X_test_sel)

# Score both splits, keeping the values available for later cells
train_accuracy = acc(y_train, y_train_pred)
test_accuracy = acc(y_test, y_test_pred)

print('Training accuracy on selected features: %.3f' % train_accuracy)
print('Testing accuracy on selected features: %.3f' % test_accuracy)

Training accuracy on selected features: 0.561
Testing accuracy on selected features: 0.511


#The testing accuracy obtained with 6 features is 0.511 as seen from above