In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

csv_location = "exrc06p04_mushrooms.csv"

# Read the mushroom data set from the CSV file into a DataFrame.
df = pd.read_csv(csv_location)

# Tip: df.info() and df.head() give a quick overview of the loaded data.

# Separate the target from the features: the first column is the label (y),
# every remaining column is a feature (X).
target_name, *feature_names = df.columns
y = df[target_name]
X = df[feature_names]

# All columns are categorical, so one-hot encode them with get_dummies.
# drop_first=False keeps every level as its own indicator column; the
# LogisticRegression model trained below is fitted on this full encoding.
X_encoded = pd.get_dummies(X, drop_first=False)

# Train on 70% of the rows and hold out the remaining 30% for testing.
# - stratify=y keeps the edible/poisonous ratio the same in both subsets.
# - a fixed random_state sends the same rows to train and test on every run.
split_options = {"train_size": 0.7, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

# Build and train the classifier in one step: fit() returns the fitted
# estimator itself, so `model` is the trained LogisticRegression.
# The liblinear solver suits smaller data sets and binary classification.
model = LogisticRegression(solver="liblinear", max_iter=2000).fit(X_train, y_train)

# Predict the class of every held-out test row.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels.
# In the confusion matrix, rows are the true classes and columns are the
# predicted classes, both in sorted label order.
conf_matrix = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

# Report both metrics; sep="\n" puts each message on its own line.
print(
    f"Accuracy Score with Logistic Regression Model: {acc_score:.4f}",
    f"Confusion Matrix with Logistic Regression Model:\n{conf_matrix}",
    sep="\n",
)

Accuracy Score with Logistic Regression Model: 0.9992
Confusion Matrix with Logistic Regression Model:
[[1263    0]
 [   2 1173]]
