In [None]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = "./clean_weather.csv"
df = pd.read_csv(file_path)

# Display the first few rows
df.head()

Unnamed: 0.1,Unnamed: 0,tmax,tmin,rain,tmax_tomorrow
0,1970-01-01,60.0,35.0,0.0,52.0
1,1970-01-02,52.0,39.0,0.0,52.0
2,1970-01-03,52.0,35.0,0.0,53.0
3,1970-01-04,53.0,36.0,0.0,52.0
4,1970-01-05,52.0,35.0,0.0,50.0


In [None]:
# Convert categorical to numeric
#df['Extracurricular Activities'] = df['Extracurricular Activities'].map({
                                                                        #'Yes': 1, 'No': 0})

# Extract features and target
X = df.drop(columns=['tmax_tomorrow', 'Unnamed: 0']).astype(float)
y = df['tmax_tomorrow'].astype(float)
# Replace NaN values in X with column means
col_means = np.nanmean(X, axis=0)  # Compute mean without NaNs
X = np.where(np.isnan(X), col_means, X)  # Replace NaNs with mean

# Replace NaNs in y with mean value
y = np.where(np.isnan(y), np.nanmean(y), y)

# # Compute mean and standard deviation of each feature
# X_mean = np.mean(X, axis=0)
# X_std = np.std(X, axis=0)

# # Standardize: (X - mean) / std
# X_standardized = (X - X_mean) / X_std

# # Ensure no division by zero (replace std=0 with 1 to avoid NaNs)
# X_standardized[:, X_std == 0] = 0


# Get the shapes
#X_np.shape, y_np.shape

# Add bias term (intercept) - a column of ones
X = np.c_[np.ones(X.shape[0]), X]

# Split into train/test sets
def train_test_split(X, y, test_ratio=0.2, seed=42):
    np.random.seed(seed)
    indices = np.random.permutation(X.shape[0])
    test_size = int(len(X) * test_ratio)
    test_idx, train_idx = indices[:test_size], indices[test_size:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10808, 4), (2701, 4), (10808,), (2701,))

In [None]:
def loss(beta,X,y):
        """Compute MSE loss."""
        return np.mean((X @ beta - y) ** 2)

In [None]:
def backtracking_line_search(X, y, beta, grad, eta_init=0.5, alpha=0.1, c=1e-4):
    """
    Implements Backtracking Line Search to determine step size.

    Parameters:
    X : np.array -> Feature matrix
    y : np.array -> Target values
    beta : np.array -> Current coefficients
    grad : np.array -> Gradient at beta
    eta_init : float -> Initial step size (typically 1)
    alpha : float -> Contraction factor (typically 0.5)
    c : float -> Armijo condition parameter

    Returns:
    eta : float -> Optimized step size
    """



    eta = eta_init
    loss_old = loss(beta, X,y)
    direction = -grad  # d_k = -∇f(x_k)

    while True:
        beta_new = beta + eta * direction  # Test new beta
        loss_new = loss(beta_new,X,y)
        print(f"Loss Old: {loss_old}, Loss New: {loss_new}")

        # Check Armijo condition
        if loss_new <= loss_old + c * eta * np.dot(grad, direction):
            break  # Stop if sufficient decrease

        eta *= alpha  # Reduce step size if condition is not met

    return eta


In [None]:
def gradient_descent_backtracking(X, y, max_iter=100000, epsilon = 1e-5):
    """
    Gradient Descent with Backtracking Line Search.
    """

    n, d = X.shape
    beta = np.zeros(d)  # Initialize beta

    def compute_gradient(beta, X, y, h=1e-5):
        #"""Compute gradient of MSE loss."""
        return (2 / n) * X.T @ (X @ beta - y)

        # """Compute gradient using finite differences approximation."""
        # d = len(beta)
        # grad = np.zeros(d)

        # for i in range(d):
        #     e_i = np.zeros(d)
        #     e_i[i] = h  # Unit vector along i-th direction

        #     # Compute finite difference approximation of the gradient
        #     grad[i] = (loss(beta + e_i, X, y) - loss(beta, X, y)) / h

        # return grad
    i=1
    while(True):
        print(f"Iteration {i+1}:")
        grad = compute_gradient(beta, X, y)  # Compute gradient
        eta = backtracking_line_search(X, y, beta, grad)  # Adapt step size
        beta_new = beta - eta * grad  # Update beta

        norm = np.linalg.norm(grad)
        print(f"gradient norm: {norm}")
        # 🚨 Stopping Criterion: If gradient norm is very small, stop
        if norm < epsilon:
            print(f"Converged in {i} iterations!")
            break


        beta = beta_new
        i+=1

    return beta


In [None]:
beta_opt = gradient_descent_backtracking(X_train, y_train)
print("Optimized Coefficients:", beta_opt)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loss Old: 21.533408980057526, Loss New: 21.53340898005753
Loss Old: 21.533408980057526, Loss New: 21.533408980057533
Loss Old: 21.533408980057526, Loss New: 21.533408980057526
gradient norm: 8.401209047985338e-06
Iteration 10189790:
Loss Old: 21.533408980057526, Loss New: 21.533408981598217
Loss Old: 21.533408980057526, Loss New: 21.533408980072306
Loss Old: 21.533408980057526, Loss New: 21.533408980057615
Loss Old: 21.533408980057526, Loss New: 21.53340898005753
Loss Old: 21.533408980057526, Loss New: 21.533408980057533
Loss Old: 21.533408980057526, Loss New: 21.53340898005753
Loss Old: 21.533408980057526, Loss New: 21.533408980057533
Loss Old: 21.533408980057526, Loss New: 21.533408980057533
Loss Old: 21.533408980057526, Loss New: 21.533408980057533
Loss Old: 21.533408980057526, Loss New: 21.53340898005753
Loss Old: 21.533408980057526, Loss New: 21.533408980057533
Loss Old: 21.533408980057526, Loss New: 21.5334089800575

KeyboardInterrupt: 

In [None]:


def prediction_accuracy(y_true, y_pred):
    """Computes prediction accuracy based on MSE and variance of y."""
    mse = np.mean((y_true - y_pred) ** 2)
    var_y = np.var(y_true)  # Variance of y

    # Avoid division by zero
    if var_y == 0:
        print("Variance of y is zero! Accuracy is undefined.")
        return None

    accuracy = (1 - (mse / var_y)) * 100  # Convert to percentage

    print(f"MSE: {mse:.4f}")
    print(f"Variance of y: {var_y:.4f}")
    print(f"Prediction Accuracy: {accuracy:.2f}%")

    return accuracy

# Example usage
#beta_opt = [ 8.69529915 , 0.73503273,  0.17648365, -2.00327188]
y_pred = X_test @ beta_opt  # Predicted values using trained beta
prediction_accuracy(y_test, y_pred)


MSE: 24.1406
Variance of y: 70.5146
Prediction Accuracy: 65.77%


np.float64(65.76504419911046)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def backtracking_line_search(X, y, beta, grad, eta_init=1, alpha=0.5, c=1e-4):
    """Backtracking line search with safe loss computation."""

    def loss(beta):
        try:
            return np.mean((X @ beta - y) ** 2)
        except:
            return np.inf

    eta = eta_init
    loss_old = loss(beta)
    direction = -grad / np.linalg.norm(grad)  # Normalize gradient

    while True:
        beta_new = beta + eta * direction
        loss_new = loss(beta_new)

        if np.isnan(loss_new):
            eta *= alpha
            continue
        print(f"Loss Old: {loss_old}, Loss New: {loss_new}")
        if loss_new <= loss_old + c * eta * np.dot(grad, direction):
            break

        eta *= alpha
        if eta < 1e-10:
            break

    return eta

def gradient_descent_visualize(X, y, max_iter=1000, tol=1e-6):
    """Gradient Descent with Backtracking + Visualization."""

    n, d = X.shape
    beta = np.zeros(d)
    loss_history = []
    step_sizes = []
    grad_norms = []

    def compute_gradient(beta, X, y, h=1e-5):
        #"""Compute gradient of MSE loss."""
        #return (2 / n) * X.T @ (X @ beta - y)

        """Compute gradient using finite differences approximation."""
        d = len(beta)
        grad = np.zeros(d)

    def loss(beta):
        return np.mean((X @ beta - y) ** 2)

    for i in range(max_iter):
        grad = compute_gradient(beta, X,y)
        grad_norms.append(np.linalg.norm(grad, ord=2))

        eta = backtracking_line_search(X, y, beta, grad)
        step_sizes.append(eta)

        beta_new = beta - eta * grad
        loss_history.append(loss(beta_new))

        if np.linalg.norm(beta_new - beta, ord=2) < tol:
            print(f"Converged in {i+1} iterations.")
            break

        beta = beta_new

    # 📊 Plot Results
    fig, ax = plt.subplots(1, 3, figsize=(18, 5))

    ax[0].plot(loss_history, label="Loss")
    ax[0].set_title("Loss Over Iterations")
    ax[0].set_xlabel("Iteration")
    ax[0].set_ylabel("Loss")
    ax[0].legend()

    ax[1].plot(step_sizes, label="Step Size (η)", color="orange")
    ax[1].set_title("Step Size Over Iterations")
    ax[1].set_xlabel("Iteration")
    ax[1].set_ylabel("Step Size")
    ax[1].legend()

    ax[2].plot(grad_norms, label="Gradient Norm", color="red")
    ax[2].set_title("Gradient Norm Over Iterations")
    ax[2].set_xlabel("Iteration")
    ax[2].set_ylabel("||∇f(x)||")

    plt.show()

    return beta

# Run gradient descent with visualization
beta_opt = gradient_descent_visualize(X_train, y_train)


ValueError: Improper number of dimensions to norm.

In [None]:
def gradient_descent(X, y, lr=0.01, max_iter=1000, tol=1e-6):
    """
    Performs gradient descent for linear regression.
    X: Feature matrix (with bias term)
    y: Target values
    lr: Learning rate
    max_iter: Max iterations
    tol: Convergence tolerance
    """
    n, d = X.shape  # Number of samples (n) and features (d)
    beta = np.zeros(d)  # Initialize coefficients

    for i in range(max_iter):
        gradient = (1 / n) * X.T @ (X @ beta - y)  # Compute gradient
        beta_new = beta - lr * gradient  # Update beta

        # Check convergence
        if np.linalg.norm(beta_new - beta, ord=2) < tol:
            print(f"Converged in {i+1} iterations.")
            break

        beta = beta_new  # Update beta for next iteration

    return beta


In [None]:
beta_opt = gradient_descent(X_train, y_train, lr=0.01, max_iter=1000)
print("Optimized Coefficients:", beta_opt)


Optimized Coefficients: [nan nan nan nan nan nan]


In [None]:
y_pred = X_test @ beta_opt  # Predictions

# Compute Mean Squared Error (MSE)
mse = np.mean((y_test - y_pred) ** 2)
print("Test MSE:", mse)


Test MSE: nan


In [None]:

# Soft-thresholding operator for L1 regularization
def soft_thresholding(z, alpha):
    """
    Applies the soft-thresholding operator element-wise.

    Parameters:
    - z: input array
    - alpha: threshold parameter

    Returns:
    - Thresholded array
    """
    return np.sign(z) * np.maximum(np.abs(z) - alpha, 0)

# ISTA algorithm for solving the LASSO problem
def ista(X, y, lambda_, step_size, max_iter=1000, tol=1e-6):
    """
    Iterative Soft-Thresholding Algorithm (ISTA) to solve:
        min_beta (1/2n)||Xβ - y||² + λ||β||₁

    Parameters:
    - X: Feature matrix (standardized)
    - y: Target vector
    - lambda_: Regularization parameter controlling sparsity
    - step_size: Learning rate (must be <= 1 / Lipschitz constant of ∇f)
    - max_iter: Maximum number of iterations
    - tol: Convergence tolerance for stopping criterion

    Returns:
    - beta: Estimated coefficient vector
    """
    n, p = X.shape
    beta = np.zeros(p)  # Initialize coefficients to zeros

    for i in range(max_iter):
        # Compute gradient of the smooth part: (1/n) * Xᵗ(Xβ - y)
        gradient = (1 / n) * X.T @ (X @ beta - y)

        # Gradient descent step followed by soft-thresholding
        beta_new = soft_thresholding(beta - step_size * gradient, lambda_ * step_size)

        # Check convergence using L2 norm of change in beta
        if np.linalg.norm(beta_new - beta, ord=2) < tol:
            print(f"Converged in {i} iterations.")
            break

        beta = beta_new  # Update beta for next iteration

    return beta

# Set hyperparameters
#lambda_ = 0.1  # Regularization strength
# Compute safe step size: 1 / Lipschitz constant of gradient
#step_size = 1.0 / np.linalg.norm(X_np.T @ X_np / len(X_np), 2)

# Run ISTA algorithm
#beta_est = ista(X_np, y_np, lambda_=lambda_, step_size=step_size)

# Display the estimated coefficients
#print(beta_est)


In [None]:
# Step 1: Split data into train and test
def train_test_split(X, y, test_ratio=0.2, seed=42):
    """
    Randomly splits data into training and testing sets.
    """
    np.random.seed(seed)
    n = X.shape[0]
    indices = np.random.permutation(n)
    test_size = int(n * test_ratio)
    test_idx = indices[:test_size]
    train_idx = indices[test_size:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_ratio=0.2)

# Step 2: Recompute step size for training set
n_train = X_train.shape[0]
step_size = 1.0 / np.linalg.norm(X_train.T @ X_train / n_train, 2)

# Step 3: Fit ISTA on training set
beta_est = ista(X_train, y_train, lambda_=0.1, step_size=step_size)

# Step 4: Predict on test set
y_pred = X_test @ beta_est

# Step 5: Evaluate performance (Mean Squared Error)
mse = np.mean((y_test - y_pred) ** 2)

print("Estimated Coefficients:", beta_est)
print("Test MSE:", mse)


Converged in 6 iterations.
Estimated Coefficients: [ 7.07397718 17.8005892   0.19128215  1.06695716  1.09442799]
Test MSE: 3051.635133463572
