In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Location of the input CSV, which holds data related to Portuguese wines.
csv_location = "exrc06p01_wine.csv"

# Read the raw dataset into a pandas.DataFrame.
df = pd.read_csv(csv_location)

# Optional exploration of the raw data:
# print(df.info())  # concise summary of the DataFrame's structure
# print(df.head())  # first five rows (the default)

# Discard every row that contains at least one missing (NaN) entry, then
# renumber the surviving rows from 0 so the index has no gaps.
df_clean = df.dropna()
df_clean = df_clean.reset_index(drop=True)
# print(df_clean.info())  # verify that no null values remain

# Split the cleaned table into Features (X) and Target (y) by column position.
X = df_clean.iloc[:, 1:]    # Features: all rows, every column except the first
y = df_clean.iloc[:, 0]     # Target: all rows, first column only (wine 'type')

# Optional inspection of Features and Target.
# Regression models work only with numeric data.
# print(X.head())
# print(X.info())   # all Features are numeric
# print(y.head())
# print(y.info())   # Target is non-numeric
# print(y.value_counts())   # expected labels: "white" or "red"

# Encode the Target as binary: white -> 0, red -> 1.
type_codes = {"white": 0, "red": 1}

# Series.map replaces any label missing from the dict with NaN without
# complaining, so check first and fail with the offending labels named
# rather than with an obscure error later during the split or the fit.
unknown_labels = y[~y.isin(list(type_codes))].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected label(s) in target column: {sorted(map(str, unknown_labels))}; "
        f"expected only {list(type_codes)}"
    )

y = y.map(type_codes)

# print(y.info())           # confirm the Target is numeric now
# print(y.value_counts())   # confirm the 0/1 distribution of the Target

# Partition the data into a 70% training part and a 30% test part.
#   - stratify=y     : preserve the class proportions in both parts, because
#                      the Target column is imbalanced.
#   - random_state=42: fixed seed, so the same rows land in train and test on
#                      every run.
#   - train_size=0.7 : 70% of the rows train the model; the rest are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, train_size=0.7
)

# Optional sanity check of the resulting sizes:
# print(X_train.shape)
# print(X_test.shape)

# Build and train the Logistic Regression classifier in one step (fit returns
# the fitted estimator itself). The liblinear solver is a good match for
# smaller datasets and binary classification.
model = LogisticRegression(solver="liblinear", max_iter=2000).fit(X_train, y_train)

# Predict the class of every held-out test row.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels.
conf_matrix = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class
acc_score = accuracy_score(y_test, y_pred)      # fraction of correctly classified rows

# Report both metrics.
print(f"Accuracy Score with Logistic Regression Model: {acc_score:.4f}")
print(f"Confusion Matrix with Logistic Regression Model:\n{conf_matrix}")

(4524, 12)
(1939, 12)
Accuracy Score with Logistic Regression Model: 0.9830
Confusion Matrix with Logistic Regression Model:
[[1444   17]
 [  16  462]]
