In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Location of the mushroom dataset, relative to the notebook's working directory.
csv_location = "exrc06p04_mushrooms.csv"

# Read the CSV into a DataFrame. While exploring, `df.info()` and `df.head()`
# give a quick overview of its structure and first rows.
df = pd.read_csv(csv_location)

# Positional column split with `iloc`: the first column is the target (y),
# every remaining column is a feature (X).
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# One-hot encode the features with `get_dummies()` (assumes every feature
# column is categorical). drop_first=False keeps a dummy column for each
# category level.
X_encoded = pd.get_dummies(X, drop_first=False)

# 70/30 train/test split. Stratifying on y keeps the class ratio the same in
# both partitions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, train_size=0.7, stratify=y, random_state=42
)

# Configure and train the classifier in one expression; fit() returns the
# estimator itself, so `model` is the fitted LogisticRegression.
model = LogisticRegression(
    solver="liblinear",       # solver recommended by scikit-learn for small datasets
    max_iter=2000,            # generous iteration cap so the solver can converge
    class_weight="balanced",  # weight classes inversely to their frequency
).fit(X_train, y_train)

# Predict the held-out rows and score the predictions.
# Confusion matrix layout: rows are true classes, columns are predicted classes.
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print(f"Accuracy Score with Logistic Regression Model: {acc_score:.4f}")
print(f"\nConfusion Matrix with Logistic Regression Model:\n{conf_matrix}")

Accuracy Score with Logistic Regression Model: 0.9996

Confusion Matrix with Logistic Regression Model:
[[1263    0]
 [   1 1174]]
