#### Exercise 14.1 - Remove missing values

In [None]:
#Exercise 14.1 - Remove missing values
import pandas as pd
import numpy as np

# Toy table: 'Name' and 'English Score' contain gaps, 'Maths Score' is complete.
data = {
    'Name': ['Alice', 'Tim', 'Mary', 'David', None],
    'English Score': [59, np.nan, None, 72, 81],
    'Maths Score': [85, 67, 90, 88, 95],
}
df = pd.DataFrame(data)

print('Original DataFrame:\n', df)
print('The number of missing values in dataset df:', df.isna().values.sum())

# Option 1: discard a named column outright.
df1 = df.drop('English Score', axis='columns')
print('Smaller dataset df1:\n', df1)
print('The number of missing values in dataset df1:', df1.isna().values.sum())

# Option 2: keep only the rows in which every cell is present.
df_row_drop = df.dropna(axis='index', how='any')
print('\nDataFrame after dropping rows with any missing values:\n', df_row_drop)

# Option 3: keep only the columns in which every cell is present.
df_col_drop = df.dropna(axis='columns', how='any')
print('\nDataFrame after dropping columns with any missing values:\n', df_col_drop)


#### Exercise 14.2

In [None]:
#Exercise 14.2
import pandas as pd
import numpy as np

# Rows 5 and 6 repeat names from rows 3 and 0; only row 6 repeats a whole row.
data = {
    'Name': ['Alice', 'Tim', 'Mary', 'David', None, 'David', 'Alice'],
    'English Score': [59, np.nan, None, 72, 81, 72, 59],
    'Maths Score': [85, 67, 90, 88, 95, None, 85],
}
df2 = pd.DataFrame(data)

# De-duplicate on the name only, then on the complete row (first occurrence kept).
df3 = df2.drop_duplicates(subset='Name', keep='first')
df4 = df2.drop_duplicates(keep='first')

# Same three-line report for each frame: contents, shape, missing-value count.
for heading, shape_label, missing_label, frame in (
    ('Original DataFrame:\n', 'Original DataFrame shape:',
     'The number of missing values in dataset df2:', df2),
    ('\n New DataFrame:\n', 'New DataFrame shape:',
     'The number of missing values in dataset df3:', df3),
    ('\n Next New DataFrame:\n', 'Next New DataFrame shape:',
     'The number of missing values in dataset df4:', df4),
):
    print(heading, frame)
    print(shape_label, frame.shape)
    print(missing_label, frame.isna().values.sum())

# Column labels as an Index, as a plain list, and as a list extended by one name.
print('\ndf2 original columns:', df2.columns)
print('As list:', list(df2.columns))
print('With extra column name:', [*df2.columns, 'Physics Score'])

#### Exercise 14.3 - Load data and preprocess data

In [None]:
#Exercise 14.3 - Load data and preprocess data
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the bundled dataset and wrap its feature matrix in a labelled DataFrame.
breast_cancer = load_breast_cancer()
df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)

# Two targets: the dataset's class labels, and the 'mean radius' column
# (which is therefore taken out of the feature matrix X).
y_class = breast_cancer.target
y_reg = df['mean radius']
X = df.drop('mean radius', axis='columns')

for label, obj in (
    ("Data matrix df shape:", df),
    ("Data matrix X shape:", X),
    ("Classification target shape:", y_class),
    ("Regression target shape:", y_reg),
):
    print(label, obj.shape)


In [None]:
#Exercise 14.3 - continued
# New frame = features plus both targets as trailing columns (X itself is untouched).
df_new = X.assign(y_class=y_class, y_reg=y_reg)
print('Shape of df_new now is:', df_new.shape)

# Report how many cells are missing, then drop every row that has any.
missing_total = df_new.isna().values.sum()
print('The number of missing values in dataset:', missing_total)
df_new = df_new.dropna(axis='index', how='any')

#### Drop duplicates based on features and the target

In [None]:
#Exercise 14.3 - continued
# A duplicate is a row that repeats every feature AND the target being considered.
feature_cols = X.columns.tolist()

# Duplicates with respect to the classification target.
drdup_class = df_new.drop_duplicates(subset=feature_cols + ['y_class'])
print('Shape of drdup_class is: ', drdup_class.shape)
print('The number of missing values in dataset drdup_class:', drdup_class.isna().values.sum())

# Duplicates with respect to the regression target.
drdup_reg = df_new.drop_duplicates(subset=feature_cols + ['y_reg'])
print('Shape of drdup_reg is: ', drdup_reg.shape)
print('The number of missing values in dataset drdup_reg:', drdup_reg.isna().values.sum())

# Inner merge on all shared columns keeps only rows present in both results.
drdup_both = drdup_class.merge(drdup_reg, how='inner')
print('The number of missing values in dataset drdup_both:', drdup_both.isna().values.sum())
print(f'After removing duplications (per target), the shape is: {drdup_both.shape}')


#### Exercise 14.4 -  Remove inconsistent rows including same features but different targets

In [None]:
#Exercise 14.4 - Remove inconsistent rows including same features but different targets
import pandas as pd
import numpy as np

# Rows 0 and 2 share the same Length/Width but carry different Class labels.
data = {
    'Length': [59, 43, 59, 72, 81],
    'Width': [85, 67, 85, 88, 95],
    'Class': [1, 1, 0, 0, 0],
}
df = pd.DataFrame(data)
print('Original DataFrame:\n', df)

# Feature-only view: everything except the target column 'Class'.
toy_X = df.drop('Class', axis='columns')
print('Data Frame with target class removed:\n', toy_X)

# Bucket the rows by their complete feature vector.
grouped = df.groupby(list(toy_X.columns))

# A bucket holding more than one distinct 'Class' value is contradictory;
# collect the row labels of every such bucket.
inconsistent_indices = [
    row_label
    for _key, bucket in grouped
    if bucket['Class'].nunique() > 1
    for row_label in bucket.index.tolist()
]

print('Indices of any inconsistent rows', inconsistent_indices)
print(f'Found {len(inconsistent_indices)} inconsistent rows.')

# Remove the contradictory rows and renumber what is left from zero.
df_cleaned = df.drop(index=inconsistent_indices)
df_cleaned = df_cleaned.reset_index(drop=True)
print('Final cleaned data shape:', df_cleaned.shape)
print(df_cleaned)

#### Exercise 14.5 - Split data into the training and the test set - listings needed

In [None]:
#Exercise 14.5 - Listings needed

#From Listing 14.1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

#From Listing 14.2 - Load data and preprocess data
#From Listing 14.2 - Load data and preprocess data
breast_cancer = load_breast_cancer()
print(breast_cancer.keys())

df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
print('Breast cancer data frame shape: ', df.shape)

# 'mean radius' is the regression target, so it is excluded from the features.
y_class = breast_cancer.target
y_reg = df['mean radius']
X = df.drop('mean radius', axis='columns')

print("Data matrix shape:", X.shape)

#From Listing 14.6 - Split data into the training and the test set
from sklearn.model_selection import train_test_split
from collections import Counter

# Both splits hold out 20% and share one seed.
split_options = dict(test_size=0.2, random_state=421)

# First split: full data -> training / test.
(X_train, X_test,
 y_class_train, y_class_test,
 y_reg_train, y_reg_test) = train_test_split(X, y_class, y_reg, **split_options)

# Second split: training -> smaller training / validation.
(X_Strain, X_val,
 y_class_Strain, y_class_val,
 y_reg_Strain, y_reg_val) = train_test_split(
    X_train, y_class_train, y_reg_train, **split_options)

for label, part in (
    ('The size of the smaller training set:\n', X_Strain),
    ('The size of the validation set:\n', X_val),
    ('The size of the test set:\n', X_test),
):
    print(label, part.shape)

for label, labels in (
    ('The smaller training set class distribution:\n', y_class_Strain),
    ('The validation set class distribution:\n', y_class_val),
    ('The test set class distribution:\n', y_class_test),
):
    print(label, Counter(labels))

#### Exercise 14.5 - Feature selection

In [None]:
#Exercise 14.5 - amended Listing 14.7 to print out the values required
import pandas as pd
import numpy as np

def remove_highly_correlated_features(df, threshold):
    """Iteratively drop features until no pair is correlated at or above ``threshold``.

    Verbose variant: every pass prints the absolute correlation matrix, its
    strict upper triangle, the largest pairwise value, the pair that attains it,
    each member's average correlation with the other features, and the feature
    chosen for removal.

    Args:
        df: DataFrame of features; it is copied, so the caller's frame is not
            modified.
        threshold: A pair whose absolute correlation (as computed by
            ``DataFrame.corr()``) is >= this value triggers a removal.

    Returns:
        A tuple ``(reduced_df, removed_features)`` where ``reduced_df`` is the
        copy with the dropped columns removed and ``removed_features`` lists the
        dropped column names in removal order.
    """
    df = df.copy()
    removed_features = []

    while True:
        corr_matrix = df.corr().abs()
        print('\n Original correlation matrix:\n',corr_matrix)
        # Keep the strict upper triangle only: each pair appears once and the
        # diagonal (a feature with itself) is masked out as NaN.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        print('Upper correlation matrix:\n',upper)
        max_corr = upper.max().max()
        print('max_corr:', max_corr)
        # max_corr is NaN when there is no pair left to compare (fewer than two
        # features, or every remaining correlation is undefined). NaN < threshold
        # is False, so without the explicit NaN check the idxmax() below would be
        # asked for the maximum of nothing and raise.
        if np.isnan(max_corr) or max_corr < threshold:
            print(f'All correlations below threshold {threshold}.')
            break

        # Labels (row, column) of the most correlated pair.
        A, B = upper.stack().idxmax()
        print('Features with max correlation:',A,B)
        # Average absolute correlation of each candidate with all other features.
        avg_corr_A = corr_matrix[A].drop(index=A).mean()
        avg_corr_B = corr_matrix[B].drop(index=B).mean()
        print(f'Average correlation with other features for {A} is: {avg_corr_A} ')
        print(f'Average correlation with other features for {B} is: {avg_corr_B}')

        # Drop whichever of the two is more entangled with the rest; B on a tie.
        to_remove = A if avg_corr_A > avg_corr_B else B
        print('Therfore remove:',to_remove)
        df.drop(columns=to_remove, inplace=True)
        removed_features.append(to_remove)

    return df, removed_features


In [None]:
#Exercise 14.5 - test code
import pandas as pd
import numpy as np

# Fixed seed so the synthetic features are reproducible.
np.random.seed(421)
size = 50

# The draw order is significant for reproducibility: A, B, C, then D, then the
# noise term of E.
data = {name: np.random.rand(size) for name in ('A', 'B', 'C')}
data['D'] = np.random.rand(size) * 0.5

# E is B rescaled plus a little uniform noise.
data['E'] = data['B'] * 0.95 + np.random.rand(size) * 0.01

df = pd.DataFrame(data)

X_reduced, removed_features = remove_highly_correlated_features(df, threshold=0.95)

#### Exercise 14.6

In [None]:
#Exercise 14.6 - imports needed = Listing 14.1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

# Load data = Listing 14.2
breast_cancer = load_breast_cancer()
print(breast_cancer.keys())

df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)

# 'mean radius' is the regression target, so it is excluded from the features.
y_class = breast_cancer.target
y_reg = df['mean radius']
X = df.drop('mean radius', axis='columns')

print("Data matrix shape:", X.shape)

#Split data into the training and the test set = Listing 14.6
from sklearn.model_selection import train_test_split
from collections import Counter

# Both splits hold out 20% and share one seed.
split_options = dict(test_size=0.2, random_state=421)

# First split: full data -> training / test.
(X_train, X_test,
 y_class_train, y_class_test,
 y_reg_train, y_reg_test) = train_test_split(X, y_class, y_reg, **split_options)

# Second split: training -> smaller training / validation.
(X_Strain, X_val,
 y_class_Strain, y_class_val,
 y_reg_Strain, y_reg_val) = train_test_split(
    X_train, y_class_train, y_reg_train, **split_options)

for label, part in (
    ('The size of the smaller training set:\n', X_Strain),
    ('The size of the validation set:\n', X_val),
    ('The size of the test set:\n', X_test),
):
    print(label, part.shape)

for label, labels in (
    ('The smaller training set class distribution:\n', y_class_Strain),
    ('The validation set class distribution:\n', y_class_val),
    ('The test set class distribution:\n', y_class_test),
):
    print(label, Counter(labels))


In [None]:
#Exercise 14.6 - remove highly correlated features using Listing 14.7
import pandas as pd
import numpy as np

def remove_highly_correlated_features(df, threshold):
    """Iteratively drop features until no pair is correlated at or above ``threshold``.

    On each pass the most correlated pair (by absolute value of
    ``DataFrame.corr()``) is located, and the member with the larger average
    absolute correlation to all other features is dropped (the second member on
    a tie).

    Args:
        df: DataFrame of features; it is copied, so the caller's frame is not
            modified.
        threshold: A pair whose absolute correlation is >= this value triggers a
            removal.

    Returns:
        A tuple ``(reduced_df, removed_features)`` where ``reduced_df`` is the
        copy with the dropped columns removed and ``removed_features`` lists the
        dropped column names in removal order.
    """
    df = df.copy()
    removed_features = []

    while True:
        corr_matrix = df.corr().abs()
        # Keep the strict upper triangle only: each pair appears once and the
        # diagonal (a feature with itself) is masked out as NaN.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        max_corr = upper.max().max()
        # max_corr is NaN when there is no pair left to compare (fewer than two
        # features, or every remaining correlation is undefined). NaN < threshold
        # is False, so without the explicit NaN check the idxmax() below would be
        # asked for the maximum of nothing and raise.
        if np.isnan(max_corr) or max_corr < threshold:
            print(f'All correlations below threshold {threshold}.')
            break

        # Labels (row, column) of the most correlated pair.
        A, B = upper.stack().idxmax()

        # Average absolute correlation of each candidate with all other features.
        avg_corr_A = corr_matrix[A].drop(index=A).mean()
        avg_corr_B = corr_matrix[B].drop(index=B).mean()

        # Drop whichever of the two is more entangled with the rest; B on a tie.
        to_remove = A if avg_corr_A > avg_corr_B else B
        df.drop(columns=to_remove, inplace=True)
        removed_features.append(to_remove)

    return df, removed_features


In [None]:
#Exercise 14.6 - Feature selection = Listing 14.8
# Choose the features on the smaller training set only, then apply the same
# column selection to the other splits.
X_Strain_reduced, removed_features = remove_highly_correlated_features(X_Strain, threshold=0.50)
kept_features = list(X_Strain_reduced.columns)

X_train_reduced, X_val_reduced, X_test_reduced = (
    split[kept_features] for split in (X_train, X_val, X_test)
)
print(f'Kept features: {kept_features}')



In [None]:
#Exercise 14.6 - PCA - first dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Standardise with statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit PCA on the training split and project both splits onto its components.
pca = PCA()
pca.fit(X_train_scaled)
proj_X_train = pca.transform(X_train_scaled)
proj_X_test = pca.transform(X_test_scaled)

explained_variance = pca.explained_variance_ratio_
print('The first two PCs capture the variance in percentage:', explained_variance[0] + explained_variance[1])
print('The first six PCs capture the variance in percentage:', np.sum(explained_variance[0:6]))

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 0: explained-variance ratio per component.
component_numbers = np.arange(1, len(explained_variance) + 1)
axes[0].plot(component_numbers, explained_variance, marker='o')
axes[0].set_xlabel('Principal Component', fontsize=16)
axes[0].set_ylabel('Explained Variance Ratio', fontsize=16)

# Panels 1 and 2: PC1 vs PC2 for the training and test splits, coloured by class.
for panel, projected, labels in (
    (axes[1], proj_X_train, y_class_train),
    (axes[2], proj_X_test, y_class_test),
):
    for class_value, class_name, colour in ((0, 'Malignant', 'r'), (1, 'Benign', 'b')):
        members = labels == class_value
        panel.scatter(projected[members, 0], projected[members, 1], label=class_name, color=colour)
    panel.set_xlabel('PC1', fontsize=16)
    panel.set_ylabel('PC2', fontsize=16)
    panel.set_xlim([-7, 18])
    panel.set_ylim([-10, 15])
    panel.legend()

plt.tight_layout()

plt.savefig("pca_full.eps", dpi=600)

In [None]:
#Exercise 14.6 - PCA - second dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Standardise the reduced feature set with statistics from the training split.
# (Each split is transformed exactly once; the original repeated the training
# transform a second time with an identical result.)
scaler = StandardScaler().fit(X_train_reduced)
X_train_scaled = scaler.transform(X_train_reduced)
X_test_scaled = scaler.transform(X_test_reduced)

# Fit PCA on the training split and project both splits onto its components.
pca = PCA().fit(X_train_scaled)
proj_X_train = pca.transform(X_train_scaled)
proj_X_test = pca.transform(X_test_scaled)

explained_variance = pca.explained_variance_ratio_

print('The first two PCs capture the variance in percentage:',pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1])
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 0: explained-variance ratio per component.
axes[0].plot(np.arange(1, len(explained_variance) + 1), explained_variance, marker='o')
axes[0].set_xlabel('Principal Component', fontsize=16)
axes[0].set_ylabel('Explained Variance Ratio', fontsize=16)

# Panel 1: training split in the PC1/PC2 plane, coloured by class label.
axes[1].scatter(proj_X_train[y_class_train == 0, 0], proj_X_train[y_class_train == 0, 1], label='Malignant', color='r')
axes[1].scatter(proj_X_train[y_class_train == 1, 0], proj_X_train[y_class_train == 1, 1], label='Benign', color='b')
axes[1].set_xlabel('PC1', fontsize=16)
axes[1].set_ylabel('PC2', fontsize=16)
axes[1].set_xlim([-4, 10])
axes[1].set_ylim([-5, 5])
axes[1].legend()

# Panel 2: test split, same axes limits so the two panels are comparable.
axes[2].scatter(proj_X_test[y_class_test == 0, 0], proj_X_test[y_class_test == 0, 1], label='Malignant', color='r')
axes[2].scatter(proj_X_test[y_class_test == 1, 0], proj_X_test[y_class_test == 1, 1], label='Benign', color='b')
axes[2].set_xlabel('PC1', fontsize=16)
axes[2].set_ylabel('PC2', fontsize=16)
axes[2].set_xlim([-4, 10])
axes[2].set_ylim([-5, 5])
axes[2].legend()

plt.tight_layout()

plt.savefig("pca_reduced.eps", dpi=600)


#### Exercise 14.7

In [None]:
#Exercise 14.7 - Re-run Listing 14.12 - namely, the Ridge Regression with the best alpha
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Standardise using training-split statistics, then apply them to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit ridge regression with the fixed regularisation strength.
best_alpha = 0.215443
final_model = Ridge(alpha=best_alpha).fit(X_train_scaled, y_reg_train)

# Score the refitted model on the held-out test split.
y_test_pred = final_model.predict(X_test_scaled)
mse_test = mean_squared_error(y_reg_test, y_test_pred)
r2_test = r2_score(y_reg_test, y_test_pred)

print(f'Final test MSE (Ridge, alpha={best_alpha:.4f}): {mse_test:.6f}')
print(f'Final test coefficient of determination score : {r2_test:.4f}')


In [None]:
#Exercise 14.7 - Part 1 - Identify outliers
# Residual = actual minus predicted; a test sample is flagged as an outlier when
# its absolute residual exceeds two residual standard deviations.
residuals = y_reg_test - y_test_pred
residual_std = np.std(residuals)
outlier_mask = residuals.abs() > 2 * residual_std

# The flagged rows, as index labels and as 0-based positions.
flagged = outlier_mask.to_numpy()
outlier_indices = outlier_mask.index[flagged]
outlier_pos_indices = np.flatnonzero(flagged)

print('Outlier actual indices:', outlier_indices)
print("Outlier positional indices:", outlier_pos_indices)
print(f'Identified {outlier_mask.sum()} outliers out of {len(y_reg_test)} test samples.')

In [None]:
#Exercise 14.7 Part 2 - Remove outliers and evaluate the original model on the cleaned test set
# Keep only the test samples that were NOT flagged as outliers.
non_outlier_mask = ~outlier_mask
keep_rows = non_outlier_mask.to_numpy()
X_test_clean = X_test_scaled[keep_rows]
y_test_clean = y_reg_test.loc[non_outlier_mask]

# Score the already-fitted model on the cleaned test set (no retraining).
y_test_clean_pred = final_model.predict(X_test_clean)
mse_clean = mean_squared_error(y_test_clean, y_test_clean_pred)
r2_clean = r2_score(y_test_clean, y_test_clean_pred)

print('\n--- Test performance after removing outliers ---')
print(f'MSE (clean test set): {mse_clean:.6f}')
print(f'Coefficient of determination (clean test set): {r2_clean:.4f}')


In [None]:
#Exercise 14.7 - Part 3 - Retrain ridge with augmented training set including test outliers
# Move the flagged test samples into the training data.
# NOTE: the arrays stacked here (X_train_scaled, X_test_scaled) were already
# standardised earlier, so the scalers fitted below standardise them a second
# time; this is applied consistently to every split used in this cell.
X_outliers = X_test_scaled[outlier_mask]
y_outliers = y_reg_test[outlier_mask]

X_train_augmented = np.vstack([X_train_scaled, X_outliers])
y_train_augmented = np.concatenate([y_reg_train, y_outliers])

# Hold out 20% of the augmented training data for choosing alpha.
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_augmented, y_train_augmented, test_size=0.2, random_state=421)

scaler = StandardScaler().fit(X_train_new)
X_Strain_scaled = scaler.transform(X_train_new)
X_val_scaled = scaler.transform(X_val_new)

# Grid of 25 candidate alphas, log-spaced from 1e-2 to 1e2.
alphas = np.logspace(-2, 2, 25)
val_errors = []

for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(X_Strain_scaled, y_train_new)
    y_val_pred = model.predict(X_val_scaled)
    mse = mean_squared_error(y_val_new, y_val_pred)
    val_errors.append(mse)

# The alpha with the lowest validation MSE wins.
best_alpha_augmented = alphas[np.argmin(val_errors)]
print(f'Best alpha: {best_alpha_augmented:.6f}')

# Refit on the whole augmented training set with the chosen alpha.
scaler = StandardScaler().fit(X_train_augmented)
X_train_aug_scaled = scaler.transform(X_train_augmented)
X_test_clean_scaled = scaler.transform(X_test_clean)

augmented_model = Ridge(alpha=best_alpha_augmented)
augmented_model.fit(X_train_aug_scaled, y_train_augmented)

# The outliers are now part of the training data, so the model is evaluated on
# the cleaned test set (X_test_clean / y_test_clean) only.
y_test_aug_pred = augmented_model.predict(X_test_clean_scaled)
mse_aug = mean_squared_error(y_test_clean, y_test_aug_pred)
r2_aug = r2_score(y_test_clean, y_test_aug_pred)

print('\n--- The fitted model ---')
print('Coefficients:\n', augmented_model.coef_)
print('Intercept:', augmented_model.intercept_)
print('Alpha:', augmented_model.alpha)

# Labels say "clean test set" because that is what the metrics are computed on.
print('\n--- Test performance after retraining with outliers ---')
print(f'MSE (clean test set): {mse_aug:.6f}')
print(f'Coefficient of determination (clean test set): {r2_aug:.4f}')


#### Exercise 14.8 - Change the threshold

In [None]:
#Exercise 14.8 - Change the threshold
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score

# Fixed seed so the two synthetic Gaussian classes are reproducible.
np.random.seed(421)

n_samples, n_features = 50, 2

# Class 0 and class 1 are drawn from 2-D Gaussians with different means/covariances.
mean_0, cov_0 = [1, 2], [[1, 0.5], [0.5, 1]]
mean_1, cov_1 = [3, 3], [[1, -0.3], [-0.3, 1]]

data1 = np.random.multivariate_normal(mean_0, cov_0, n_samples)
data2 = np.random.multivariate_normal(mean_1, cov_1, n_samples)
X = np.vstack((data1, data2))
y = np.hstack((np.zeros(n_samples), np.ones(n_samples)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with training statistics only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Unregularised logistic regression; predict() uses the default 0.5 cut-off.
class_model = LogisticRegression(penalty=None, solver='lbfgs', tol=1e-8)
class_model.fit(X_train_scaled, y_train)
y_pred = class_model.predict(X_test_scaled)

print('Sklearn logistic regression accuracy at threshold = 0.5 is:', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so the matrix was never shown.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Probability of class 1 for each test sample, re-thresholded by hand below.
y_proba = class_model.predict_proba(X_test_scaled)[:, 1]

thresholds = [0.3, 0.5, 0.7]

for thresh in thresholds:
    print(f'\n--- Threshold: {thresh} ---')
    y_pred_thresh = (y_proba >= thresh).astype(int)

    cm = confusion_matrix(y_test, y_pred_thresh)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    # Title each figure so the three plots can be told apart.
    disp.ax_.set_title(f'Threshold = {thresh}')

    prec = precision_score(y_test, y_pred_thresh)
    rec = recall_score(y_test, y_pred_thresh)
    print(f'Precision: {prec:.3f}, Recall: {rec:.3f}')


#### Exercise 14.9

In [None]:
#Exercise 14.9 - Early stopping - part 1
import copy

# One pass over the data per fit() call: with warm_start=True each call
# continues from the coefficients left by the previous one.
sgd = SGDRegressor(max_iter=1, learning_rate='constant', eta0=0.01, random_state=42, warm_start=True)

n_epochs = 100
patience = 5          # consecutive non-improving epochs tolerated before stopping
best_mse = float('inf')
best_epoch = 0
# Initialised up front so the loop is well-defined even if the first epoch does
# not count as an improvement (e.g. a NaN validation error); previously these
# names were only bound inside the improvement branch.
best_model = None
no_improvement = 0
val_errors = []
train_errors = []

# Standardise with statistics from the smaller training set only.
scaler = StandardScaler().fit(X_Strain)
X_Strain_scaled = scaler.transform(X_Strain)
X_val_scaled = scaler.transform(X_val)

for epoch in range(n_epochs):
    sgd.fit(X_Strain_scaled, y_reg_Strain)

    y_train_pred = sgd.predict(X_Strain_scaled)
    y_val_pred = sgd.predict(X_val_scaled)

    mse_train = mean_squared_error(y_reg_Strain, y_train_pred)
    mse_val = mean_squared_error(y_reg_val, y_val_pred)

    train_errors.append(mse_train)
    val_errors.append(mse_val)

    # An epoch counts as an improvement only if it beats the best validation
    # MSE by more than 1e-6.
    if mse_val < best_mse - 1e-6:
        best_mse = mse_val
        best_epoch = epoch
        # Snapshot the model, since sgd keeps being updated on later epochs.
        best_model = copy.deepcopy(sgd)
        no_improvement = 0
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print(f'Early stopping triggered at epoch {epoch}')
            break

print(f'Best validation MSE: {best_mse:.4f} at epoch {best_epoch}')

In [None]:
#Exercise 14.9 - Early stopping - part 2

# Training and validation MSE per epoch, with the best epoch marked.
fig, ax = plt.subplots(figsize=(8, 5))

for series, series_label in ((train_errors, 'Train MSE'), (val_errors, 'Validation MSE')):
    ax.plot(series, label=series_label)
ax.axvline(x=best_epoch, linestyle='--', color='red', label='Best Epoch')

ax.set_xlabel('Epoch')
ax.set_ylabel('MSE')
ax.set_title('Early Stopping: MSE vs Epoch')
ax.legend()
ax.grid(True)

fig.tight_layout()
plt.show()

