In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score

# Read the species table from the working directory into the module-level
# frame used by fun_naive_bayes_classification(), then echo it for inspection.
df = pd.read_csv(filepath_or_buffer="species.csv")
print(df)

def fun_naive_bayes_classification():
    """Train a categorical Naive Bayes model on ``df`` and classify one sample.

    Reads the module-level ``df`` DataFrame, treating the ``species`` column
    as the target and every other column as a categorical feature. The
    features are ordinal-encoded, split 80/20 into train and test sets, and
    used to fit a ``CategoricalNB`` model. The function prints the test
    accuracy, then the prediction and per-class probabilities for one
    hard-coded sample.

    Returns:
        None. All results are printed to stdout.

    Raises:
        ValueError: If the hard-coded sample contains a category value that
            does not occur in ``df``.
    """
    # Features and target
    X = df.drop('species', axis=1)
    y = df['species']

    # Unseen categories are encoded as -1 so they can be detected and
    # reported explicitly below; CategoricalNB itself rejects negative
    # inputs, so -1 must never reach the model.
    encoder = OrdinalEncoder(
        handle_unknown='use_encoded_value',
        unknown_value=-1
    )

    # Encode features (fitted on the full frame so that every category has
    # a stable integer code regardless of which split it lands in)
    X_encoded = encoder.fit_transform(X)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, test_size=0.2, random_state=50
    )

    # CategoricalNB infers the number of categories per feature from the
    # training rows only. Tell it the full category count known to the
    # encoder, otherwise a category that appears only in the test split or
    # in new data indexes past the fitted probability table (IndexError).
    n_categories = [len(categories) for categories in encoder.categories_]

    # Train Naive Bayes model
    model = CategoricalNB(alpha=1.0, min_categories=n_categories)
    model.fit(X_train, y_train)

    # Evaluate model
    y_pred = model.predict(X_test)
    print(f"\nModel Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

    # -------------------------------
    # New data (IMPORTANT: same columns as training data)
    # -------------------------------
    new_data = pd.DataFrame(
        [['white', 'three', 'short', 'yes']],
        columns=X.columns   # ensures same names & order
    )

    # Encode new data
    new_encoded = encoder.transform(new_data)

    # Fail with a clear message instead of the opaque "negative values"
    # error CategoricalNB would raise for the -1 placeholder.
    unseen_columns = [
        column
        for column, has_unknown in zip(X.columns, (new_encoded < 0).any(axis=0))
        if has_unknown
    ]
    if unseen_columns:
        raise ValueError(
            f"New data contains categories not present in the dataset "
            f"for column(s): {unseen_columns}"
        )

    # Prediction
    predicted_species = model.predict(new_encoded)[0]
    probabilities = model.predict_proba(new_encoded)[0]

    # Output
    print("\nNew Data:")
    print(new_data.iloc[0].to_dict())

    print("\nPredicted Species:", predicted_species)

    # predict_proba columns follow the order of model.classes_
    print("\nPrediction Probabilities:")
    for species, prob in zip(model.classes_, probabilities):
        print(f"{species}: {prob:.4f}")

# Run the full pipeline: train the model, print the test accuracy, and
# print the prediction for the hard-coded sample.
fun_naive_bayes_classification()

  colour   legs height smelly species
0  white  three  short    yes       M
1  green    two   tall     no       M
2  green  three  short    yes       M
3  white  three  short    yes       M
4  green    two  short     no       H
5  white    two   tall     no       H
6  white    two   tall     no       H
7  white    two  short    yes       H

Model Accuracy: 50.00%

New Data:
{'colour': 'white', 'legs': 'three', 'height': 'short', 'smelly': 'yes'}

Predicted Species: M

Prediction Probabilities:
H: 0.1366
M: 0.8634
