<a href="https://colab.research.google.com/github/dawnworlds/2024spring/blob/main/template_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Add any additional imports here (however, the task is solvable without using
# any additional imports)
# import ...

 #### Loading data

In [None]:
# Read the training table; the "y" column is split off as the label vector and
# the remaining columns are kept as the feature table.
data = pd.read_csv("train.csv")
y = data.loc[:, "y"].to_numpy()
data = data.drop("y", axis=1)
# Show the first rows of the feature table as a quick sanity check.
print(data.head())

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  


#### Calculating the average RMSE

In [None]:
def calculate_RMSE(w, X, y):
    """Return the empirical RMSE of the linear predictor ``X.dot(w)`` against ``y``.

    Parameters
    ----------
    w: array of floats: dim = (13,), weights of the linear model
    X: matrix of floats, dim = (15,13), test inputs with 13 features
    y: array of floats, dim = (15,), test labels

    Returns
    ----------
    RMSE: float: dim = 1, square root of the mean squared residual
    """
    residuals = y - np.dot(X, w)
    RMSE = np.sqrt(np.mean(np.square(residuals)))

    # Sanity check: the result must be a single number, not an array.
    assert np.isscalar(RMSE)

    return RMSE

#### Fitting the regressor

In [None]:
def fit(X, y, lam):
    """
    Fit ridge regression without an intercept and return the weight vector.

    The weights are the solution of the regularized normal equations
    (X^T X + lam * I) w = X^T y.

    Parameters
    ----------
    X: matrix of floats, dim = (n_samples, n_features), training inputs
       (e.g. (135,13) for one training split of this task)
    y: array of floats, dim = (n_samples,), training labels
    lam: float. lambda parameter, used in regularization term

    Returns
    ----------
    w: array of floats: dim = (n_features,), optimal parameters of ridge regression

    Raises
    ----------
    numpy.linalg.LinAlgError: if X^T X + lam * I is singular
    """
    n_features = X.shape[1]

    regularized_gram = X.T @ X + lam * np.eye(n_features)

    # Solve the linear system directly instead of forming the explicit inverse:
    # this is cheaper and loses less precision when the matrix is ill-conditioned.
    w = np.linalg.solve(regularized_gram, X.T @ y)

    # One weight per input feature, whatever the number of features is.
    assert w.shape == (n_features,)
    return w

#### Performing computation

In [None]:
"""
Main cross-validation loop, implementing 10-fold CV. In every iteration
(for every train-test split), the RMSE for every lambda is calculated,
and then averaged over iterations.

Parameters
----------
X: matrix of floats, dim = (150, 13), inputs with 13 features
y: array of floats, dim = (150, ), input labels
lambdas: list of floats, len = 5, values of lambda for which ridge regression is fitted and RMSE estimated
n_folds: int, number of folds (pieces in which we split the dataset), parameter K in KFold CV

Compute
----------
avg_RMSE: array of floats: dim = (5,), average RMSE value for every lambda
"""
X = data.to_numpy()
# Regularization strengths to compare and the number of CV folds.
lambdas = [0.1, 1, 10, 100, 200]
n_folds = 10

# RMSE_mat[i, j] holds the test RMSE of fold i for lambdas[j].
RMSE_mat = np.zeros((n_folds, len(lambdas)))
# Shuffled folds with a fixed seed so that repeated runs give the same splits.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    # Training part and held-out part of the current fold.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    for lambda_idx, lam in enumerate(lambdas):
        # Fit on the training part, score on the held-out part.
        w = fit(X_train, y_train, lam)
        RMSE_mat[fold_idx, lambda_idx] = calculate_RMSE(w, X_test, y_test)

# Average over the folds (rows) to get one RMSE per lambda.
avg_RMSE = np.mean(RMSE_mat, axis=0)

# One averaged value per lambda is expected.
assert avg_RMSE.shape == (len(lambdas),), "avg_RMSE should have the same length as the number of lambdas."

print(avg_RMSE)

[5.38380157 5.36264675 5.36223749 5.88532073 6.20091603]


In [None]:
# Write the averaged RMSE values to results.csv, formatted with 12 decimals.
np.savetxt(fname="./results.csv", X=avg_RMSE, fmt="%.12f")

In [None]:
from sklearn.model_selection import KFold
import numpy as np

# Define the fit function according to the ridge regression closed-form solution
def fit(X, y, lam):
    """Fit ridge regression with an unpenalized intercept in closed form.

    A column of ones is prepended to X so that the first weight acts as the
    bias term; that weight is excluded from the L2 penalty.

    Parameters
    ----------
    X: matrix of floats, dim = (n_samples, n_features), training inputs
    y: array of floats, dim = (n_samples,), training labels
    lam: float, regularization strength applied to the non-bias weights

    Returns
    ----------
    w: array of floats, dim = (n_features + 1,); w[0] is the bias term and
       w[1:] are the feature weights

    Raises
    ----------
    numpy.linalg.LinAlgError: if the regularized system matrix is singular
    """
    # Add a column of ones for the bias term
    X_with_bias = np.hstack([np.ones((X.shape[0], 1)), X])

    # Penalty mask: identity with a zero in the bias position so that the
    # intercept is not shrunk towards zero.
    I = np.eye(X_with_bias.shape[1])
    I[0, 0] = 0

    # Solve (X^T X + lam * I) w = X^T y directly rather than inverting the
    # matrix: cheaper and numerically more accurate than inv(...) @ X^T @ y.
    w = np.linalg.solve(X_with_bias.T @ X_with_bias + lam * I, X_with_bias.T @ y)

    return w

# Define the calculate_RMSE function
def calculate_RMSE(w, X, y):
    """Return the RMSE of a linear model with bias term on the points (X, y).

    ``w[0]`` is used as the bias: a column of ones is prepended to ``X``
    before the prediction ``[1, X] @ w`` is formed.
    """
    ones_column = np.ones((X.shape[0], 1))
    design = np.hstack([ones_column, X])

    # Residuals between the labels and the model predictions.
    errors = y - design @ w

    return np.sqrt(np.mean(errors ** 2))

# Function to perform 10-fold cross-validation and calculate average RMSE
def cross_validate_ridge_regression(X, y, lambdas, n_folds):
    """Return the fold-averaged test RMSE of ridge regression for each lambda.

    The data is split into ``n_folds`` shuffled folds (fixed seed 42). For
    every fold and every value in ``lambdas`` the model is fitted with ``fit``
    on the training part and scored with ``calculate_RMSE`` on the held-out
    part; the scores are then averaged over the folds.

    Returns an array with one average RMSE per entry of ``lambdas``.
    """
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Rows are folds, columns are lambdas.
    RMSE_scores = np.zeros((n_folds, len(lambdas)))

    for fold_idx, (train_index, test_index) in enumerate(splitter.split(X)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # Fill the whole row for this fold: one fitted model per lambda.
        RMSE_scores[fold_idx] = [
            calculate_RMSE(fit(X_train, y_train, lam), X_test, y_test)
            for lam in lambdas
        ]

    return RMSE_scores.mean(axis=0)

# Load the training table: "y" becomes the label vector, the remaining columns
# become the feature matrix X.
data = pd.read_csv("train.csv")
y = data.loc[:, "y"].to_numpy()
data = data.drop("y", axis=1)
# Preview the first rows of the feature table.
print(data.head())
X = data.to_numpy()


# Number of CV folds and the regularization strengths to compare.
n_folds = 10
lambdas = [0.1, 1, 10, 100, 200]

# Fold-averaged test RMSE, one value per lambda.
avg_RMSE = cross_validate_ridge_regression(X=X, y=y, lambdas=lambdas, n_folds=n_folds)

# Report the result; the line below it can be enabled to write a CSV instead.
print("Average RMSE for each lambda:", avg_RMSE)
# np.savetxt("avg_RMSE.csv", avg_RMSE, delimiter=",", fmt="%.12f")  # Save to CSV file


         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  
Average RMSE for each lambda: [5.36917927 5.37542771 5.36458218 5.52153872 5.61106793]


In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the training table: "y" becomes the label vector, the remaining columns
# become the feature matrix X.
data = pd.read_csv("train.csv")
y = data.loc[:, "y"].to_numpy()
data = data.drop("y", axis=1)
# Preview the first rows of the feature table.
print(data.head())
X = data.to_numpy()

# Regularization strengths to compare.
lambdas = [0.1, 1, 10, 100, 200]

# 10-fold splitter with scikit-learn's default settings.
kf = KFold(n_splits=10)

# Per-lambda list of fold RMSEs, filled in below.
rmse_results = {l: [] for l in lambdas}

for l, fold_rmses in rmse_results.items():
    for train_index, test_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # scikit-learn ridge model with regularization strength l.
        model = Ridge(alpha=l)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # RMSE on the held-out part of this fold.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        fold_rmses.append(rmse)

# Mean over the folds for every lambda.
average_rmse = {key: np.mean(scores) for key, scores in rmse_results.items()}

print("Average RMSE for each lambda:", average_rmse)

# NOTE(review): avg_RMSE is not assigned anywhere in this cell, so this writes
# whatever an earlier cell left in avg_RMSE rather than the average_rmse values
# computed above -- confirm that this is intended.
np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")


         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  
Average RMSE for each lambda: {0.1: 5.501809445057858, 1: 5.499838741278099, 10: 5.483631486072288, 100: 5.636642135414034, 200: 5.721233719861128}
