In [1]:
# Task 02: Model Training Fundamentals with Wine Quality Dataset
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
import time

# Load dataset
# The CSV is read with ';' as the field separator. The 'quality' column is the
# regression target; every other column is used as a feature.
data = pd.read_csv('winequality-white.csv', sep=';')
y = data['quality']
X = data.drop(columns='quality')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing
# Both transformers are fitted on the training split only and then applied to
# the test split, so no test-set statistics enter the fitted parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Degree-2 polynomial expansion of the standardized features, without the
# constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Train and evaluate models
# Every estimator is fitted on the same polynomial training matrix and scored
# on the same polynomial test matrix, so the reported numbers are comparable.
# NOTE(review): the output recorded with this cell shows the unpenalized SGD
# model diverging (RMSE ~8e10, hugely negative R2). Presumably the default
# step size is too large for the un-rescaled polynomial terms -- consider a
# smaller `eta0` or standardizing after the polynomial expansion; confirm
# experimentally before changing the hyperparameters.
models = {
    'Linear': LinearRegression(),
    'SGD': SGDRegressor(penalty=None, random_state=42),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5)
}
results = []
for name, model in models.items():
    # perf_counter() is a monotonic, high-resolution clock, which makes it the
    # appropriate choice for timing short intervals (time.time() can jump).
    start_time = time.perf_counter()
    model.fit(X_train_poly, y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test_poly)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Count coefficients whose magnitude exceeds a small tolerance. The
    # previous fallback evaluated `len(model.coef_)` precisely when `coef_`
    # was missing, which could only raise AttributeError; report None instead.
    coef = getattr(model, 'coef_', None)
    if coef is None:
        non_zero_coeffs = None
    else:
        non_zero_coeffs = int(np.count_nonzero(np.abs(coef) > 1e-10))

    results.append({'Model': name, 'RMSE': rmse, 'R2': r2, 'Training Time': train_time, 'Non-zero Coefficients': non_zero_coeffs})
    print(f"{name}: RMSE={rmse:.3f}, R2={r2:.3f}, Time={train_time:.3f}s, Non-zero Coeffs={non_zero_coeffs}")

# Hyperparameter tuning (Ridge example)
# 5-fold cross-validated search over four alpha values on the polynomial
# training matrix, scored by negated mean squared error.
param_grid = {'alpha': [0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train_poly, y_train)
print("Best Ridge alpha:", grid_search.best_params_)

# Plot learning curves
def plot_learning_curves(model, X, y, name):
    """Plot training and validation RMSE against training-set size.

    The data is split 80/20 (``random_state=42``) into a training and a
    validation part. An estimator is then refitted on the first ``m`` training
    rows for ``m = 1, 101, 201, ...`` and the RMSE on those ``m`` rows and on
    the whole validation part is recorded for each ``m``.

    Args:
        model: Estimator providing ``fit`` and ``predict``. It is used only as
            a template: a clone is fitted, so the object passed in (and any
            fit it already holds) is left untouched.
        X: Feature matrix; must support ``X[:m]`` row slicing.
        y: Target values aligned with ``X``.
        name: Label used in the plot title and in the output file name.

    Returns:
        None. The figure is written to ``learning_curve_{name}.png`` and then
        closed.
    """
    # Local import so this function stays self-contained; scikit-learn is
    # already a dependency of this file.
    from sklearn.base import clone

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    # Convert targets to a plain array so `[:m]` is unambiguously positional,
    # whatever index a pandas Series carries after shuffling.
    y_train = np.asarray(y_train)

    # Fit a fresh copy: refitting `model` itself on truncated subsets would
    # silently replace the caller's full-data fit (e.g. its `coef_`).
    estimator = clone(model)

    train_errors, val_errors = [], []
    sizes = range(1, len(X_train), 100)
    for m in sizes:
        estimator.fit(X_train[:m], y_train[:m])
        y_train_pred = estimator.predict(X_train[:m])
        y_val_pred = estimator.predict(X_val)
        train_errors.append(np.sqrt(mean_squared_error(y_train[:m], y_train_pred)))
        val_errors.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))

    # Draw on a dedicated figure so nothing left over on a previously active
    # figure can leak into the saved image.
    fig = plt.figure()
    plt.plot(sizes, train_errors, 'r-+', label='Training RMSE')
    plt.plot(sizes, val_errors, 'b-', label='Validation RMSE')
    plt.xlabel('Training Set Size')
    plt.ylabel('RMSE')
    plt.title(f'Learning Curves: {name}')
    plt.legend()
    plt.savefig(f'learning_curve_{name}.png')
    plt.close(fig)

# Produce one learning-curve image per entry of `models`, always using the
# polynomial training matrix and the training targets.
for label, estimator in models.items():
    plot_learning_curves(estimator, X_train_poly, y_train, label)

# Coefficient analysis plot
# One line per model: absolute coefficient value against coefficient position.
# Entries of `models` without a `coef_` attribute are skipped.
fig, ax = plt.subplots(figsize=(10, 6))
for label, estimator in models.items():
    if not hasattr(estimator, 'coef_'):
        continue
    magnitudes = np.abs(estimator.coef_)
    ax.plot(magnitudes, label=label)
ax.set_xlabel('Feature Index')
ax.set_ylabel('Coefficient Magnitude')
ax.set_title('Coefficient Magnitudes Across Models')
ax.legend()
fig.savefig('coefficient_plot.png')
plt.close(fig)

# Save results
# Collect the per-model metric dicts into a table (one row per model) and
# write it as CSV without the row-index column.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf='model_comparison.csv', index=False)

Linear: RMSE=0.787, R2=0.200, Time=0.045s, Non-zero Coeffs=77
SGD: RMSE=79803027662.477, R2=-8223039543245029769216.000, Time=0.012s, Non-zero Coeffs=77
Ridge: RMSE=0.787, R2=0.200, Time=0.012s, Non-zero Coeffs=77
Lasso: RMSE=0.769, R2=0.236, Time=0.008s, Non-zero Coeffs=13
ElasticNet: RMSE=0.780, R2=0.215, Time=0.012s, Non-zero Coeffs=19
Best Ridge alpha: {'alpha': 10}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
