In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

csv_location = "exrc06p01_wine.csv"

# Load the input CSV (data about Portuguese wines) into a pandas.DataFrame
df = pd.read_csv(csv_location)

# # Get basic information about the data
# print(df.info())  # concise summary of the DataFrame's structure
# print(df.head())  # first five rows (the default)

# Drop rows with missing values (NaN entries) and renumber the index from 0
df_clean = df.dropna().reset_index(drop=True)
# print(df_clean.info())  # check that no null values remain


# Extract features (X) and target (y)
X = df_clean.iloc[:, 1:]  # features: all rows, every column except the first
y = df_clean.iloc[:, 0]   # target: all rows, first column only

# # Print sample data and dtypes for the features and the target;
# # the model can only be fitted on numeric data
# print(X.head())
# print(X.info())
# print(y.head())
# print(y.info())

# Map the target (wine type) to binary: white -> 0, red -> 1
y = y.map({"white": 0, "red": 1})

# Series.map turns every label missing from the dict into NaN; fail early
# with a clear message instead of letting the model choke on NaN targets.
if y.isna().any():
    raise ValueError(
        f"Unexpected labels in target column of {csv_location!r}; "
        "expected only 'white' or 'red'"
    )

# print("Class distribution:")
# print(y.value_counts())

# random_state=42 is fixed so the same rows go to train and test on every run.
# NOTE(review): train_size=0.3 trains on 30% of the rows and evaluates on the
# remaining 70% - confirm this is intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.3, random_state=42
)

# Train the logistic regression model.
# The liblinear solver is a good fit for smaller datasets and binary
# classification.
model = LogisticRegression(max_iter=2000, solver="liblinear")

model.fit(X_train, y_train)

# Evaluate the model - accuracy score and confusion matrix

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Accuracy score
acc_score = accuracy_score(y_test, y_pred)

# Confusion matrix (rows: true class, columns: predicted class).
# Stored under its own name so the imported confusion_matrix() function is
# not overwritten and the cell can be re-run safely.
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc_score:.4f}")
print(f"\nConfusion matrix:\n{conf_matrix}")



Accuracy: 0.9759

Confusion matrix:
[[3378   27]
 [  82 1038]]
