**ASSIGNMENT-5**

---



Generate a dataset with atleast seven highly correlated columns and a target variable.
Implement Ridge Regression using Gradient Descent Optimization. Take different
values of learning rate (such as 0.0001,0.001,0.01,0.1,1,10) and regularization
parameter (10-15,10-10,10-5,10- 3,0,1,10,20). Choose the best parameters for which ridge
regression cost function is minimum and R2_score is maximum.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# ------------- data example (replace with your data) -------------
np.random.seed(0)
n = 500
z = np.random.randn(n,1)
X = np.hstack([z + 0.01*np.random.randn(n,1) for _ in range(8)])  # 8 correlated features
X = np.hstack([X, np.random.randn(n,2)])  # +2 random features
true_w = np.array([2.5, -1.2, 1.8, 0.0, 0.7, -0.5, 1.0, 0.3, 0.0, 0.0])
y = X.dot(true_w) + 0.5*np.random.randn(n)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# ------------- preprocessing -------------
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# ------------- safe ridge by gradient descent -------------
def ridge_cost_and_grad(w, X, y, alpha):
    m = X.shape[0]
    Xb = np.hstack([np.ones((m,1)), X])
    preds = Xb.dot(w)
    error = preds - y
    cost = (1.0/(2*m)) * np.sum(error**2) + (alpha/(2*m)) * np.sum(w[1:]**2)
    grad = (1.0/m) * Xb.T.dot(error)
    grad[1:] += (alpha/m) * w[1:]
    return cost, grad

def ridge_gd_safe(X, y, alpha=1.0, lr=1e-3, n_iter=20000, grad_clip=1e3, tol=1e-9, verbose=False):
    m, d = X.shape
    w = np.zeros(d+1, dtype=np.float64)
    costs = []
    for i in range(n_iter):
        cost, grad = ridge_cost_and_grad(w, X, y, alpha)
        # detect bad cost
        if not np.isfinite(cost):
            if verbose: print(f"Aborting: cost became non-finite at iter {i}, cost={cost}")
            return None, costs, False
        # gradient clipping to avoid single huge jump
        grad = np.clip(grad, -grad_clip, grad_clip)
        w = w - lr * grad
        costs.append(cost)
        # detect NaN/Inf in weights
        if not np.all(np.isfinite(w)):
            if verbose: print(f"Aborting: weights became non-finite at iter {i}")
            return None, costs, False
        # simple convergence check
        if i>0 and abs(costs[-2] - costs[-1]) < tol:
            return w, costs, True
    return w, costs, True

# ------------- hyperparameter sweep (safe defaults) -------------
learning_rates = [1e-4, 1e-3, 1e-2, 1e-1]            # avoid large lrs like 1, 10
alphas = [1e-5, 1e-3, 0.01, 0.1, 1, 10]             # common ridge alphas
results = []

for lr in learning_rates:
    for alpha in alphas:
        w, costs, ok = ridge_gd_safe(X_train_s, y_train, alpha=alpha, lr=lr,
                                    n_iter=20000, grad_clip=1e4, tol=1e-10, verbose=False)
        if not ok or w is None:
            # mark as failed (didn't converge or exploded)
            results.append({'lr': lr, 'alpha': alpha, 'status': 'failed'})
            continue
        # evaluate
        Xb_test = np.hstack([np.ones((X_test_s.shape[0],1)), X_test_s])
        preds = Xb_test.dot(w)
        if not np.all(np.isfinite(preds)):
            results.append({'lr': lr, 'alpha': alpha, 'status': 'failed_preds'})
            continue
        mse = mean_squared_error(y_test, preds)
        r2  = r2_score(y_test, preds)
        results.append({'lr': lr, 'alpha': alpha, 'status': 'ok', 'final_cost': costs[-1],
                        'mse_test': mse, 'r2_test': r2, 'n_iters': len(costs)})

res_df = pd.DataFrame(results)
print(res_df.sort_values(['status','r2_test'], ascending=[True, False]).head(10))

       lr     alpha status  final_cost  mse_test   r2_test  n_iters
10  0.001   1.00000     ok    0.120167  0.219727  0.989340    20000
9   0.001   0.10000     ok    0.117177  0.219742  0.989339    20000
8   0.001   0.01000     ok    0.116878  0.219744  0.989339    20000
7   0.001   0.00100     ok    0.116848  0.219744  0.989339    20000
6   0.001   0.00001     ok    0.116844  0.219744  0.989339    20000
16  0.010   1.00000     ok    0.120081  0.219756  0.989338    20000
11  0.001  10.00000     ok    0.149976  0.219757  0.989338    20000
17  0.010  10.00000     ok    0.149970  0.219761  0.989338     8772
23  0.100  10.00000     ok    0.149970  0.219762  0.989338     1336
15  0.010   0.10000     ok    0.117040  0.219779  0.989337    20000


Load the Hitters dataset from the following link
https://drive.google.com/file/d/1qzCKF6JKKMB0p7ul_lLy8tdmRk3vE_bG/view?usp=sharing

(a) Pre-process the data (null values, noise, categorical to numerical encoding)

(b) Separate input and output features and perform scaling

(c) Fit a Linear, Ridge (use regularization parameter as 0.5748), and LASSO (use
regularization parameter as 0.5748) regression function on the dataset.

(d) Evaluate the performance of each trained model on test set. Which model
performs the best and Why?

In [2]:
# Q2_hitters.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# --- A) Load data ---
# Option 1: raw gist url (public CSV). Use whichever copy you prefer.
csv_url = "https://gist.githubusercontent.com/keeganhines/59974f1ebef97bbaa44fb19143f90bad/raw/Hitters.csv"
df = pd.read_csv(csv_url, delim_whitespace=False)
# The CSV may have the first column as names item with a leading '-'; adjust as needed
# If salary has 'NA' strings, pandas will parse them as NaN.

# Quick look
print(df.shape)
print(df.head())

# (a) Preprocess: handle nulls, noise, categorical -> numeric
# Replace 'NA' or 'NaN' as appropriate; Salary target might have missing values -> drop those rows
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
df = df.dropna(subset=['Salary']).reset_index(drop=True)

# Identify categorical columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
# In this dataset, common categorical cols: 'League','Division','NewLeague' (plus player names)
# Remove player name column if present (it might be first column)
# If the first column is a player name, drop it
if df.columns[0].lower() not in [c.lower() for c in ['AtBat','Hits','HmRun','Runs','RBI']]:
    df = df.iloc[:,1:]  # drop name column

# Re-evaluate cat columns
cat_cols = [c for c in df.select_dtypes(include=['object']).columns if c not in ['Player','Name']]
print("Categorical columns:", cat_cols)

# (b) Separate input/output and perform scaling
X = df.drop(columns=['Salary'])
y = df['Salary'].values

# Build ColumnTransformer: One-hot encode categorical, passthrough numerical, then scale
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols)
])

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# (c) Fit Linear, Ridge, Lasso (alpha=0.5748 for Ridge and Lasso)
alpha_val = 0.5748

models = {
    'Linear': Pipeline([('pre', preprocessor), ('lr', LinearRegression())]),
    'Ridge' : Pipeline([('pre', preprocessor), ('rg', Ridge(alpha=alpha_val))]),
    'Lasso' : Pipeline([('pre', preprocessor), ('ls', Lasso(alpha=alpha_val, max_iter=10000))])
}

for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, preds)
    print(f"{name}: RMSE={rmse:.3f}, R2={r2:.3f}")

  df = pd.read_csv(csv_url, delim_whitespace=False)


(322, 21)
          Unnamed: 0  AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  \
0     -Andy Allanson    293    66      1    30   29     14      1     293   
1        -Alan Ashby    315    81      7    24   38     39     14    3449   
2       -Alvin Davis    479   130     18    66   72     76      3    1624   
3      -Andre Dawson    496   141     20    65   78     37     11    5628   
4  -Andres Galarraga    321    87     10    39   42     30      2     396   

   CHits  ...  CRuns  CRBI  CWalks  League Division PutOuts  Assists  Errors  \
0     66  ...     30    29      14       A        E     446       33      20   
1    835  ...    321   414     375       N        W     632       43      10   
2    457  ...    224   266     263       A        W     880       82      14   
3   1575  ...    828   838     354       N        E     200       11       3   
4    101  ...     48    46      33       N        E     805       40       4   

   Salary  NewLeague  
0     NaN          A  


Cross Validation for Ridge and Lasso Regression

Explore Ridge Cross Validation (RidgeCV) and Lasso Cross Validation (LassoCV)
function of Python. Implement both on Boston House Prediction Dataset (load_boston
dataset from sklearn.datasets).

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

try:
    from sklearn.datasets import load_boston
    b = load_boston()
    X = b.data
    y = b.target
except Exception:
    url = r"http://lib.stat.cmu.edu/datasets/boston"
    raw = pd.read_csv(url, sep=r"\s+", header=None, skiprows=22)
    data = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
    X = data[:, :-1]
    y = data[:, -1]

scaler = StandardScaler()
X_s = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_s, y, test_size=0.2, random_state=0)

alphas = np.logspace(-6, 6, 200)
ridge_cv = RidgeCV(alphas=alphas, store_cv_results=True)
ridge_cv.fit(X_train, y_train)
preds_ridge = ridge_cv.predict(X_test)
print("RidgeCV best alpha:", ridge_cv.alpha_)
print("RidgeCV R2:", r2_score(y_test, preds_ridge))
print("RidgeCV MSE:", mean_squared_error(y_test, preds_ridge))

lasso_cv = LassoCV(cv=5, max_iter=10000)
lasso_cv.fit(X_train, y_train)
preds_lasso = lasso_cv.predict(X_test)
print("LassoCV best alpha:", lasso_cv.alpha_)
print("LassoCV R2:", r2_score(y_test, preds_lasso))
print("LassoCV MSE:", mean_squared_error(y_test, preds_lasso))

RidgeCV best alpha: 26.126752255633264
RidgeCV R2: 0.5211263118777945
RidgeCV MSE: 20.07762625463206
LassoCV best alpha: 0.2576340575560743
LassoCV R2: 0.5342871059328007
LassoCV MSE: 19.52583668923175


Multiclass Logistic Regression:
Implement Multiclass Logistic Regression (step-by step) on Iris dataset using one vs. rest strategy?

In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

iris = load_iris()
X = iris.data
y = iris.target

scaler = StandardScaler()
X_s = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_s, y, test_size=0.2, random_state=42)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def train_binary_logistic(X, y_bin, lr=0.1, n_iter=5000, tol=1e-6):
    m, d = X.shape
    Xb = np.hstack([np.ones((m,1)), X])
    w = np.zeros(d+1)
    for _ in range(n_iter):
        preds = sigmoid(Xb @ w)
        grad = (1/m) * Xb.T @ (preds - y_bin)
        w -= lr * grad
        if np.linalg.norm(grad) < tol:
            break
    return w

K = len(np.unique(y_train))
weights = []
for k in range(K):
    y_bin = (y_train == k).astype(int)
    w_k = train_binary_logistic(X_train, y_bin, lr=0.3, n_iter=10000)
    weights.append(w_k)
weights = np.vstack(weights)

def predict_ovr(X, weights):
    Xb = np.hstack([np.ones((X.shape[0],1)), X])
    probs = sigmoid(Xb @ weights.T)
    return np.argmax(probs, axis=1)

y_pred = predict_ovr(X_test, weights)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
