In [None]:
# Naive Bayes and DT Classification on the Soybean Dataset (with missing-value handling)
# ==============================================================================
# This version:
# 1. Loads the Soybean dataset from OpenML
# 2. Replaces missing values with the most frequent value of each column
# 3. Encodes categorical features numerically
# 4. Trains a Categorical Naive Bayes classifier
# 5. Evaluates its performance
# 6. Trains a Decision Tree Classifier
# 7. Evaluates its performance

# Step 1: Import libraries
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Step 2: Load dataset
# as_frame=True returns the features as a DataFrame and the target as a Series.
soybean = fetch_openml(name='soybean', version=1, as_frame=True)
# Work on a copy: the imputation below assigns into X column by column, and
# without the copy it would overwrite the raw frame held by `soybean`, so the
# original missing values could no longer be inspected after this cell runs.
X = soybean.data.copy()
y = soybean.target

print("Original dataset shape:", X.shape)
print("Number of classes:", len(y.unique()))

# Step 3: Replace missing categorical values with the most frequent value in each column
# Series.mode() returns every tied mode in sorted order; [0] takes the first.
for col in X.columns:
    X[col] = X[col].fillna(X[col].mode()[0])

# Sanity check: the encoder and both models below expect a fully populated table.
assert not X.isna().any().any(), "imputation left missing values behind"

# Step 4: Encode categorical features numerically
# Each column's categories are mapped to the codes 0..k-1.
# NOTE(review): the modes above and this encoder are computed on the full
# dataset, i.e. before the train/test split, so test rows influence the
# preprocessing. Confirm this is acceptable for the reported scores.
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

In [None]:
# Step 5: Split into train and test sets
# 20% of the rows are held out; stratifying on y keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

In [None]:
# Step 6: Initialize and train the Naive Bayes model
# By default CategoricalNB sizes each feature's probability table from the
# category codes present in the training rows. A code that only occurs in the
# test split would then index past the end of that table and raise IndexError
# inside predict(). Declaring the full number of categories per feature (as
# learned by the ordinal encoder) makes every encoded value valid, and the
# Laplace smoothing gives unseen categories a small non-zero probability.
categories_per_feature = [len(categories) for categories in encoder.categories_]
nb_model = CategoricalNB(min_categories=categories_per_feature)
nb_model.fit(X_train, y_train)

# Step 7: Evaluate performance
y_pred = nb_model.predict(X_test)

print("\nModel Evaluation Results")
print("========================")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 8: Example prediction
# predict() expects a 2-D array, so the single row is reshaped to (1, n_features).
sample = X_test[0].reshape(1, -1)
predicted_class = nb_model.predict(sample)[0]
print("\nPredicted class for sample 0:", predicted_class)

In [None]:
# Steps 9-10: Build the Decision Tree and train it in one expression
# (fit() returns the estimator itself).
# criterion='entropy' splits on information gain; 'gini' would use Gini impurity.
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)

# Step 11: Predict the held-out rows
y_pred = dt_model.predict(X_test)

# Step 12: Compute the metrics first, then report them
dt_accuracy = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)
dt_confusion = confusion_matrix(y_test, y_pred)

print("\nAccuracy:", dt_accuracy)
print("\nClassification Report:\n", dt_report)
print("\nConfusion Matrix:\n", dt_confusion)

In [None]:
# Step 13: Example prediction
# Indexing with a one-element list keeps the row two-dimensional
# (shape (1, n_features)), which is the layout predict() requires.
sample = X_test[[0]]
predicted_class = dt_model.predict(sample)[0]
print("\nPredicted class for sample 0:", predicted_class)

In [None]:
# Step 14 (optional): Visualize the decision tree
# Draw on an explicit Axes instead of relying on pyplot's "current axes".
fig, ax = plt.subplots(figsize=(15, 10))
plot_tree(
    dt_model,
    filled=True,
    # A plain list is accepted by every scikit-learn release; a pandas Index
    # is rejected by versions that validate this parameter strictly.
    feature_names=list(X.columns),
    # Take the labels from the fitted model so they are guaranteed to be in
    # the same order as the class indices stored in the tree nodes.
    class_names=[str(label) for label in dt_model.classes_],
    rounded=True,
    fontsize=8,
    ax=ax,
)
# The title must be set after plot_tree(): the plotting routine clears the
# axes before drawing, which would wipe a title set beforehand.
ax.set_title("Decision Tree for Soybean Classification")
plt.show()