In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Location of the wine dataset (CSV); the relative path is resolved against
# the current working directory.
csv_location = "exrc06p01_wine.csv"

# Read the raw data about Portuguese wines into a pandas.DataFrame.
# `df` itself is never modified, so the cleaning step below can be re-run safely.
df = pd.read_csv(csv_location)

# Optional first look at the raw data:
# print(df.info())  # concise summary of the DataFrame's structure
# print(df.head())  # first five rows (the default)

# Keep only the rows in which every column holds a value (no NaN entries),
# then renumber the index from 0 so the removed rows leave no gaps.
has_all_values = df.notna().all(axis=1)
df_clean = df.loc[has_all_values].reset_index(drop=True)
# print(df_clean.info())  # sanity check: no column should report null values

# Extract Features (X) and Target (y) positionally with the `iloc` indexer
X = df_clean.iloc[:, 1:]        # Features: all rows, every column except the 1st
y_raw = df_clean.iloc[:, 0]     # Target: all rows, 1st column only (the `type` column)

# Optional inspection of the extracted data (author's notes kept alongside):
# print(X.head())
# print(X.info())               # all Features are numeric
# print(y_raw.head())
# print(y_raw.value_counts())   # values are either "white" or "red"

# Encode the `type` Target as a binary class: white -> 1, red -> 0
label_to_class = {"white": 1, "red": 0}

# Series.map() silently turns any label that is missing from the mapping into
# NaN, which would only surface later as a confusing error while splitting or
# fitting. Fail early, and name the offending labels instead.
unknown_labels = set(y_raw.unique()) - set(label_to_class)
if unknown_labels:
    raise ValueError(
        "Unexpected Target labels (expected only 'white' or 'red'): "
        f"{sorted(map(str, unknown_labels))}"
    )

y = y_raw.map(label_to_class)

# print(y.info())           # confirm the datatype after encoding
# print(y.value_counts())   # confirm the class distribution of the Target
# Author's note: classes are imbalanced, 1 (white) -> 4870 and 0 (red) -> 1593

# Train on 70% of the rows and hold out the remaining 30% for testing.
split_options = {
    "train_size": 0.7,   # fraction of the rows assigned to the training set
    "stratify": y,       # keep the same class ratio in the training and test sets
    "random_state": 42,  # fixed seed: the same rows land in each set on every run
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Create the Logistic Regression model
model = LogisticRegression(
    class_weight="balanced",    # weight classes inversely to their frequency to offset the imbalanced Target
    max_iter=2000,              # generous iteration cap so the solver has room to converge
    solver="liblinear",         # a good choice for small datasets and binary problems
    random_state=42             # seed the solver too (liblinear uses it), matching the seeded split above
)

# Train the Logistic Regression model on the training split
model.fit(X_train, y_train)

# Predict the class of every row in the held-out test set
y_pred = model.predict(X_test)

# Compare the predictions with the true test labels
conf_matrix = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class
acc_score = accuracy_score(y_test, y_pred)      # fraction of test rows predicted correctly

# Report both metrics; sep="\n" reproduces the line break between two separate prints
print(
    f"Accuracy Score with Logistic Regression Model: {acc_score:.4f}",
    f"\nConfusion Matrix with Logistic Regression Model:\n{conf_matrix}",
    sep="\n",
)

Accuracy Score with Logistic Regression Model: 0.9737

Confusion Matrix with Logistic Regression Model:
[[ 466   12]
 [  39 1422]]
