#### Listing 14.1 - import libraries

In [None]:
#Listing 14.1 - import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

#### Listing 14.2 - Load data and preprocess data

In [None]:
#Listing 14.2 - Load data and preprocess data
# Fetch the scikit-learn breast cancer dataset and list the fields it exposes.
breast_cancer = load_breast_cancer()
print(breast_cancer.keys())

# Wrap the raw feature array in a data frame with named columns.
df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
print('Breast cancer data frame shape: ',df.shape)

# 'mean radius' serves as the regression target, so it is held out of the
# feature matrix; the dataset's own labels are the classification target.
y_reg = df['mean radius']
y_class = breast_cancer.target
X = df.drop('mean radius', axis=1)

print("Data matrix X shape:", X.shape)

#### Listing 14.3 - Remove missing values

In [None]:
#Listing 14.3 - Remove missing values
# Attach both targets to a copy of the features so that rows are filtered
# together and the features stay aligned with their targets.
df_new = X.assign(y_class=y_class, y_reg=y_reg)
print('Shape of df_new now is:',df_new.shape)

# Report the total number of missing cells, then discard every row that has one.
print('The number of missing values in dataset:', df_new.isna().sum().sum())
df_new = df_new.dropna(axis=0, how='any')

#### Listing 14.4 - Drop duplicates based on features and the target

In [None]:
#Listing 14.4 - Drop duplicates based on features and the target
# Two rows count as duplicates when every feature column and the class label
# match; the first occurrence is kept.
drdup_class = df_new.drop_duplicates(subset=[*X.columns, 'y_class'])
print('Shape of drdup_class is: ',drdup_class.shape)
print('The number of missing values in dataset drdup_class:', drdup_class.isna().sum().sum())

#### Listing 14.5 - Remove inconsistent rows including same features but different targets

In [None]:
#Listing 14.5 - Remove inconsistent rows including same features but different targets
# Rows that share identical feature values must agree on both targets. Any
# group that disagrees is contradictory, and all of its rows are removed.
grouped = df_new.groupby(X.columns.tolist())
inconsistent_indices = []

for _, group in grouped:
    if group['y_class'].nunique() > 1 or group['y_reg'].nunique() > 1:
        inconsistent_indices.extend(group.index.tolist())

print(f'Found {len(inconsistent_indices)} inconsistent rows.')

# Once the contradictory groups are gone, rows with equal features also have
# equal targets, so a full-row drop_duplicates() removes the exact repeats.
# Without this step the cleaned frame would still contain duplicate rows,
# because it is derived from df_new rather than from a deduplicated frame.
df_cleaned = (
    df_new.drop(index=inconsistent_indices)
    .drop_duplicates()
    .reset_index(drop=True)
)
print('Final cleaned data shape:', df_cleaned.shape)


#### Listing 14.6 - Split data into the training and the test set

In [None]:
#Listing 14.6 - Split data into the training and the test set
# Split the cleaned rows (df_cleaned) rather than the raw X / y_class / y_reg;
# otherwise the cleaning performed in the previous listings never reaches the
# data the models are trained and evaluated on.
X_clean = df_cleaned[X.columns]
y_class_clean = df_cleaned['y_class'].to_numpy()  # keep the labels as a NumPy array
y_reg_clean = df_cleaned['y_reg']

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(
    X_clean, y_class_clean, y_reg_clean, test_size=0.2, random_state=421)
print('Size of the training set before producing the validation set:\n',X_train.shape)

# Carve a validation set (20% of the training rows) out of the training set,
# leaving a smaller training set ("Strain") for model selection.
X_Strain, X_val, y_class_Strain, y_class_val, y_reg_Strain, y_reg_val = train_test_split(
    X_train, y_class_train, y_reg_train, test_size=0.2, random_state=421)

print('The size of the smaller training set:\n', X_Strain.shape)
print('The size of the validation set:\n', X_val.shape)
print('The size of the test set:\n', X_test.shape)
print('The smaller training set class distribution:\n', Counter(y_class_Strain))
print('The validation set class distribution:\n', Counter(y_class_val))
print('The test set class distribution:\n', Counter(y_class_test))

#### Listing 14.7 - Feature selection function

In [None]:
#Listing 14.7 - Feature selection function
def remove_highly_correlated_features(df, threshold):
    """Drop features one at a time until no pair is correlated at or above threshold.

    On each pass the pair of columns with the largest absolute correlation is
    located. Of the two, the column whose mean absolute correlation with all
    other columns is larger is dropped (the second column of the pair is
    dropped on a tie). The loop ends when the largest remaining absolute
    correlation is below ``threshold`` or when no correlation is defined.

    Args:
        df: Data frame whose columns are the candidate features. It is not
            modified; the function works on a copy.
        threshold: Absolute-correlation level at or above which one column of
            a pair is removed.

    Returns:
        A tuple ``(reduced_df, removed_features)``: the copy of ``df`` without
        the dropped columns, and the dropped column names in removal order.
    """
    df = df.copy()
    removed_features = []

    while True:
        corr_matrix = df.corr().abs()
        # Keep only the strict upper triangle so that each pair appears once
        # and the diagonal of ones is ignored.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        max_corr = upper.max().max()
        # NaN means there is no pair left to compare (fewer than two columns,
        # or every remaining correlation is undefined). NaN fails the "<" test,
        # so without this check idxmax() below would raise on an empty series.
        if np.isnan(max_corr) or max_corr < threshold:
            print(f'All correlations below threshold {threshold}.')
            break

        # Labels (row, column) of the most strongly correlated pair.
        A, B = upper.stack().idxmax()

        # Mean absolute correlation of each candidate with every other column.
        avg_corr_A = corr_matrix[A].drop(index=A).mean()
        avg_corr_B = corr_matrix[B].drop(index=B).mean()

        to_remove = A if avg_corr_A > avg_corr_B else B
        df = df.drop(columns=to_remove)
        removed_features.append(to_remove)

    return df, removed_features


#### Listing 14.8 - Apply feature selection to all data splits

In [None]:
#Listing 14.8
# Choose the feature subset using the smaller training set only, then apply
# the same column selection to every other split.
X_Strain_reduced, removed_features = remove_highly_correlated_features(
    X_Strain, threshold=0.50)
kept_features = list(X_Strain_reduced.columns)

X_train_reduced = X_train.loc[:, kept_features]
X_val_reduced = X_val.loc[:, kept_features]
X_test_reduced = X_test.loc[:, kept_features]
print(f'Kept features: {kept_features}')

#### Listing 14.9 - Train linear regression on original and reduced feature sets

In [None]:
#Listing 14.9 - Train linear regression on original and reduced feature sets
# --- Model 1: ordinary least squares on every feature ---
# The scaler is fitted on the smaller training set only and then reused for
# the validation set.
scaler_full = StandardScaler()
X_Strain_scaled = scaler_full.fit_transform(X_Strain)
X_val_scaled = scaler_full.transform(X_val)

model_full = LinearRegression().fit(X_Strain_scaled, y_reg_Strain)
y_val_pred_full = model_full.predict(X_val_scaled)
mse_val_full = mean_squared_error(y_reg_val, y_val_pred_full)

print(f'The MSE on the validation set (full features): {mse_val_full:.6f}')

# --- Model 2: ordinary least squares on the reduced feature set ---
# A separate scaler is needed because the column set differs.
scaler_reduced = StandardScaler()
X_Strain_reduced_scaled = scaler_reduced.fit_transform(X_Strain_reduced)
X_val_reduced_scaled = scaler_reduced.transform(X_val_reduced)

model_reduced = LinearRegression().fit(X_Strain_reduced_scaled, y_reg_Strain)
y_val_pred_reduced = model_reduced.predict(X_val_reduced_scaled)
mse_val_reduced = mean_squared_error(y_reg_val, y_val_pred_reduced)

print(f'The MSE on the validation set (reduced features): {mse_val_reduced:.4f}')

#### Listing 14.10 - Ridge regression with alpha tuning

In [None]:
#Listing 14.10 - Ridge regression with alpha tuning
# Candidate regularisation strengths: 25 values spaced evenly on a log scale
# from 1e-2 to 1e2.
alphas = np.logspace(-2, 2, num=25)
val_errors = []

# Fit one ridge model per candidate and record its validation MSE.
for alpha in alphas:
    ridge = Ridge(alpha=alpha).fit(X_Strain_scaled, y_reg_Strain)
    y_val_pred = ridge.predict(X_val_scaled)
    mse = mean_squared_error(y_reg_val, y_val_pred)
    val_errors.append(mse)

# The chosen alpha is the one with the smallest validation error.
best_alpha_idx = np.argmin(val_errors)
best_alpha, best_mse = alphas[best_alpha_idx], val_errors[best_alpha_idx]

print(f'Best alpha: {best_alpha:.6f}')
print(f'The MSE on the validation set (Ridge): {best_mse:.6f}')

#### Listing 14.11 - Decompose bias^2 and variance for ridge regression

In [None]:
#Listing 14.11 - Decompose bias^2 and variance for ridge regression
# Seed NumPy's global generator so the bootstrap resamples are repeatable.
np.random.seed(421)
n_bootstraps = 100
# One column of validation predictions per bootstrap replicate.
val_preds = np.zeros((len(y_reg_val), n_bootstraps))

for i in range(n_bootstraps):
    # Resample the smaller training set with replacement, keeping its size.
    indices = np.random.choice(len(X_Strain_scaled), size=len(X_Strain_scaled), replace=True)
    X_samp, y_samp = X_Strain_scaled[indices], y_reg_Strain.iloc[indices]

    model = Ridge(alpha=best_alpha).fit(X_samp, y_samp)
    val_preds[:, i] = model.predict(X_val_scaled)

# Average prediction per validation point across all replicates.
mean_preds = val_preds.mean(axis=1)
# Squared bias is measured against the observed validation targets.
bias_squared = np.mean((mean_preds - y_reg_val) ** 2)
# Variance: spread of the replicate predictions per point, averaged over points.
variance = val_preds.var(axis=1).mean()
print(f'Squared bias               : {bias_squared:.6f}')
print(f'Variance                   : {variance:.6f}')

#### Listing 14.12 - Final evaluation on the full test set

In [None]:
#Listing 14.12 - Final evaluation on the full test set
# Refit the scaler on the whole training set (smaller training set plus
# validation rows) and reuse it for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the final ridge model with the alpha selected on the validation set.
final_model = Ridge(alpha=best_alpha).fit(X_train_scaled, y_reg_train)
y_test_pred = final_model.predict(X_test_scaled)

# Test-set error and coefficient of determination.
mse_test = mean_squared_error(y_reg_test, y_test_pred)
r2_test = r2_score(y_reg_test, y_test_pred)

print(f'Final test MSE (Ridge, alpha={best_alpha:.6f}): {mse_test:.6f}')
print(f'Final test coefficient of determination score: {r2_test:.4f}')


In [None]:
#Listing 14.12 - extra
# Take the coefficient count from the fitted model instead of hard-coding it,
# so the message stays correct if the feature set ever changes.
print(f'Coefficients (all {final_model.coef_.size} of them):\n', final_model.coef_)
print('Intercept:', final_model.intercept_)
print('Alpha:', final_model.alpha)

#### Listing 14.13 - Plot residuals against predicted values

In [None]:
#Listing 14.13
# Residual = observed test target minus the final model's prediction.
residuals = y_reg_test - y_test_pred

# Scatter the residuals against the predictions, with a dashed zero line as
# the reference for an unbiased fit.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test_pred, residuals, edgecolor='k')
ax.axhline(0, color='red', linestyle='--', linewidth=1)
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.grid(True)
fig.tight_layout()
fig.savefig('residual_vs_prediction.eps', dpi=600)


#### Listing 14.14 - Classification (I)

In [None]:
#Listing 14.14 - Classification (I)
# Unpenalised logistic regression on all scaled features, trained on the full
# training set and evaluated on the test set.
class_model_full = LogisticRegression(penalty=None, solver='lbfgs', tol=1e-8).fit(
    X_train_scaled, y_class_train)
y_pred_full = class_model_full.predict(X_test_scaled)

# Accuracy (rounded to two decimals), confusion matrix and per-class report.
print('Accuracy at threshold = 0.5 for dataset (a) is:', np.round(accuracy_score(y_class_test, y_pred_full), decimals=2))
conf_mat = confusion_matrix(y_class_test, y_pred_full)
print('Confusion Matrix:\n',conf_mat)
print("Classification Report:\n", classification_report(y_class_test, y_pred_full))

#### Listing 14.15 - Classification (II)

In [None]:
#Listing 14.15 - Classification (II)
# Scale the reduced feature set. NOTE: this rebinds `scaler`, so from here on
# it no longer refers to the full-feature scaler fitted in Listing 14.12.
scaler = StandardScaler()
X_train_reduced_scaled = scaler.fit_transform(X_train_reduced)
X_test_reduced_scaled = scaler.transform(X_test_reduced)

# Same unpenalised logistic regression as before, now on the reduced features.
class_model_reduced = LogisticRegression(penalty=None, solver='lbfgs', tol=1e-8).fit(
    X_train_reduced_scaled, y_class_train)
y_pred_reduced = class_model_reduced.predict(X_test_reduced_scaled)

# Accuracy (rounded to two decimals), confusion matrix and per-class report.
print('Accuracy at threshold = 0.5 for dataset (b) is:', np.round(accuracy_score(y_class_test, y_pred_reduced), decimals=2))
conf_mat = confusion_matrix(y_class_test, y_pred_reduced)
print('Confusion Matrix:\n',conf_mat)
print("Classification Report:\n", classification_report(y_class_test, y_pred_reduced))