# George Sorondo - CSCI 164 Final Project
## Applied Machine Learning with scikit-learn

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                              confusion_matrix, ConfusionMatrixDisplay, roc_curve,
                              mean_absolute_error, mean_squared_error, r2_score)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)


In [None]:

from sklearn.datasets import load_iris, load_breast_cancer


def _split_then_scale(X, y, stratify=None, test_size=0.2, random_state=42):
    """Split a dataset, then standardize it using training-set statistics only.

    Splitting happens BEFORE scaling so the scaler's mean/variance are learned
    from the training rows alone; fitting the scaler on the full dataset would
    leak information about the test rows into the training features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values aligned with ``X``.
    stratify : array-like or None
        Passed through to ``train_test_split`` (class labels to stratify on).
    test_size : float
        Fraction of rows held out for testing.
    random_state : int
        Seed for the shuffle performed by ``train_test_split``.

    Returns
    -------
    tuple
        ``(scaler, X_train, X_test, y_train, y_test)`` where ``scaler`` is the
        fitted ``StandardScaler`` and ``X_train`` / ``X_test`` are the scaled
        feature arrays.
    """
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify)
    scaler = StandardScaler().fit(X_train_raw)
    return (scaler,
            scaler.transform(X_train_raw),
            scaler.transform(X_test_raw),
            y_train,
            y_test)


# Iris Dataset
iris = load_iris()
X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
y_iris = pd.Series(iris.target, name="species")
scaler_iris, X_train_iris, X_test_iris, y_train_iris, y_test_iris = _split_then_scale(
    X_iris, y_iris, stratify=y_iris)
# Full dataset transformed with the train-fitted scaler (name kept for any
# later cell that refers to it; do not use it for fitting).
X_iris_scaled = scaler_iris.transform(X_iris)

# Breast Cancer Dataset
cancer = load_breast_cancer()
X_cancer = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y_cancer = pd.Series(cancer.target, name="target")
scaler_cancer, X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = _split_then_scale(
    X_cancer, y_cancer, stratify=y_cancer)
X_cancer_scaled = scaler_cancer.transform(X_cancer)

# Wine Quality Dataset (red wine; semicolon-separated CSV fetched from the UCI repository)
wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
X_wine = wine.drop('quality', axis=1)
y_wine = wine['quality']
# Regression target: no stratification, matching the original split.
scaler_wine, X_train_wine, X_test_wine, y_train_wine, y_test_wine = _split_then_scale(
    X_wine, y_wine)
X_wine_scaled = scaler_wine.transform(X_wine)


In [None]:

def evaluate_classification(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report test-split metrics.

    Prints accuracy and support-weighted precision, recall and F1. For a
    binary problem it also prints ROC AUC, provided the fitted model exposes
    ``predict_proba`` or ``decision_function``. Finally it plots the
    confusion matrix, titled with the model's class name.

    Parameters
    ----------
    model : estimator
        Unfitted classifier with ``fit`` and ``predict``.
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.

    Returns
    -------
    dict
        Mapping of metric name (as printed) to its value, so callers can
        compare models programmatically. Existing callers may ignore it.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 yields the same value as the default for a class that is
    # never predicted, but states the choice explicitly instead of relying on
    # a warning that may be filtered out.
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    }

    # Labels the model was trained on; the test split alone may be missing a class.
    classes = getattr(model, "classes_", np.unique(y_train))

    # ROC AUC needs (a) a binary model, so that column 1 of predict_proba is the
    # positive class, and (b) both classes present in y_test.
    if len(classes) == 2 and len(np.unique(y_test)) == 2:
        if hasattr(model, "predict_proba"):
            scores = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            scores = model.decision_function(X_test)
        else:
            scores = None  # model offers no continuous score; skip ROC AUC
        if scores is not None:
            metrics["ROC AUC"] = roc_auc_score(y_test, scores)

    for metric_name, value in metrics.items():
        print(f"{metric_name}:", value)

    cm = confusion_matrix(y_test, y_pred, labels=classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot()
    disp.ax_.set_title(type(model).__name__)
    plt.show()

    return metrics


In [None]:

# Seed passed explicitly to the stochastic (MLP) estimators so each fit gives
# the same result regardless of which cells ran before it or how often a cell
# is re-run; without it they draw from the shared global NumPy RNG state.
MODEL_SEED = 42


def _make_classifiers():
    """Return fresh, unfitted instances of the three classifiers being compared."""
    return [
        LogisticRegression(max_iter=1000),
        KNeighborsClassifier(),
        MLPClassifier(max_iter=1000, random_state=MODEL_SEED),
    ]


# --- Iris and Breast Cancer Models (classification) ---
classification_tasks = [
    ("Iris", X_train_iris, X_test_iris, y_train_iris, y_test_iris),
    ("Breast Cancer", X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer),
]
for dataset_name, X_tr, X_te, y_tr, y_te in classification_tasks:
    print(f"\n--- {dataset_name} Dataset ---")
    for clf in _make_classifiers():
        # Label each result block so the three models can be told apart.
        print(f"\n[{type(clf).__name__}]")
        evaluate_classification(clf, X_tr, X_te, y_tr, y_te)

# --- Wine Quality Models (regression) ---
print("\n--- Wine Quality Dataset ---")
linreg = LinearRegression()
linreg.fit(X_train_wine, y_train_wine)

mlp_reg = MLPRegressor(max_iter=1000, random_state=MODEL_SEED)
mlp_reg.fit(X_train_wine, y_train_wine)

for reg_label, reg in (("Linear Regression", linreg), ("MLP Regression", mlp_reg)):
    y_pred_wine = reg.predict(X_test_wine)
    print(f"{reg_label} R2:", r2_score(y_test_wine, y_pred_wine))
    # Error metrics in the target's own units (quality points) complement R2.
    print(f"{reg_label} MAE:", mean_absolute_error(y_test_wine, y_pred_wine))
    print(f"{reg_label} RMSE:", np.sqrt(mean_squared_error(y_test_wine, y_pred_wine)))


In [None]:

print("\n--- Hyperparameter Tuning Example (k-NN Iris) ---")

# Candidate neighbourhood sizes: the odd values 3, 5, 7, 9.
param_grid = {'n_neighbors': list(range(3, 10, 2))}

# 5-fold cross-validated search over the grid on the Iris training split;
# fit() returns the search object itself, so the result can be bound directly.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
).fit(X_train_iris, y_train_iris)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Score:", grid.best_score_)



### Reflection
- MLP models performed best across datasets.
- Predicting Wine Quality (a regression task) was harder than the Iris and Breast Cancer classification tasks.
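- Our results are broadly consistent with prior work on these datasets, such as Cortez et al. (2009) for Wine Quality and the dataset documentation in the UCI Machine Learning Repository.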
