1.  **Regularization**:

    - Use the `diabetes` dataset from `sklearn.datasets`.
    - Compare the performance (Mean Squared Error) of `LinearRegression`, `Ridge`, and `Lasso` models.
    - Tune the `alpha` parameter for `Ridge` and `Lasso` using `GridSearchCV` with cross-validation to find the optimal regularization strength.

    ```python
    from sklearn.datasets import load_diabetes

    # Load the diabetes dataset
    diabetes = load_diabetes()
    ```

In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.datasets import make_regression, make_classification
import numpy as np

from sklearn.datasets import load_diabetes

# Load the diabetes dataset
diabetes = load_diabetes()

In [55]:
X = diabetes.data
y = diabetes.target

In [57]:
X.shape, y.shape

((442, 10), (442,))

In [58]:
np.info(X)

class:  ndarray
shape:  (442, 10)
strides:  (80, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x5bdd320
byteorder:  little
byteswap:  False
type: float64


In [59]:
X[:10]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632753, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567042, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286131, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665608,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02268774, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187239,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03198764, -0.04664087],
       [-0.09269548, -0.04464164, -0.04069594, -0.01944183, -0.06899065,
        -0.07928784,  0.04127682, -0.0763945 , -0.04117617, -0.09634616],
       [-0.04547248,  0.05068012, -0.04716281, -0.01599898, -0.04009564,
        -0.02480001,  0.00077881, -0.03949338

In [60]:
y[:10]

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310.])

In [61]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train a Linear Regression model on the training data.

linear_reg = LinearRegression()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
linear_reg.fit(X_train, y_train)
linear_preds = linear_reg.predict(X_test)
linear_mse = mean_squared_error(y_test, linear_preds)

print(f'Linear MSE: {linear_mse}')

Linear MSE: 3424.259334298693


In [63]:
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)
ridge_preds = ridge_reg.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_preds)

print(f'Ridge MSE: {ridge_mse}')

Ridge MSE: 3430.1067624845687


In [64]:
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train, y_train)

lasso_preds = lasso_reg.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_preds)

print(f'Lasso MSE: {lasso_mse}')

Lasso MSE: 3434.536284214465


In [65]:
from sklearn.model_selection import GridSearchCV

# Grid search for Ridge
ridge_params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
ridge_grid = GridSearchCV(Ridge(), ridge_params, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)

print(f'Best Ridge alpha: {ridge_grid.best_params_}')
print(f'Best Ridge CV MSE: {-ridge_grid.best_score_}')

ridge_best = ridge_grid.best_estimator_
ridge_best_preds = ridge_best.predict(X_test)
ridge_best_mse = mean_squared_error(y_test, ridge_best_preds)
print(f'Ridge Best MSE: {ridge_best_mse}')

# Grid search for Lasso
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
lasso_grid = GridSearchCV(Lasso(), lasso_params, cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)

print(f'\nBest Lasso alpha: {lasso_grid.best_params_}')
print(f'Best Lasso CV MSE: {-lasso_grid.best_score_}')

lasso_best = lasso_grid.best_estimator_
lasso_best_preds = lasso_best.predict(X_test)
lasso_best_mse = mean_squared_error(y_test, lasso_best_preds)
print(f'Lasso Best MSE: {lasso_best_mse}')

Best Ridge alpha: {'alpha': 10}
Best Ridge CV MSE: 2873.3262810476854
Ridge Best MSE: 3429.236228387488

Best Lasso alpha: {'alpha': 1}
Best Lasso CV MSE: 2864.383004415219
Lasso Best MSE: 3412.664193575005


2.  **Ensemble Methods**:

    - Use the `breast_cancer` dataset from `sklearn.datasets`.
    - Compare the performance (F1 Score and AUC) of `DecisionTreeClassifier`, `RandomForestClassifier`, and `GradientBoostingClassifier`.
    - Tune the hyperparameters of each classifier using `GridSearchCV` with cross-validation.

    ```python
    from sklearn.datasets import load_breast_cancer

    # Load the breast cancer dataset
    breast_cancer = load_breast_cancer()
    ```

In [78]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
breast_cancer = load_breast_cancer()
 

In [79]:
X = breast_cancer.data
y = breast_cancer.target

In [80]:
X[:5]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e

In [81]:
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [82]:
X.shape, y.shape

((569, 30), (569,))

In [83]:
np.sum(y)

357

In [84]:
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
model = DecisionTreeClassifier(criterion='gini', random_state=0)
param_grid = {
    'max_depth': [None, 2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [2, 4, 6, 8],
}

# Use f1 score as the metrics
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=6; total time=   0.0s
[CV

In [86]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score:.4f}")

preds = grid_search.predict(X_test)

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
fpr, tpr, thresholds = roc_curve(y_test, grid_search.predict_proba(X_test)[:,1])
roc_auc = auc(fpr, tpr)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f"AUC: {roc_auc:.2f}")

Best Parameters: {'max_depth': None, 'min_samples_leaf': 8, 'min_samples_split': 2}
Best Score: 0.9441
Accuracy: 0.94
Precision: 0.97
Recall: 0.93
F1 Score: 0.95
AUC: 0.99


In [88]:
# Grid search for RandomForestClassifier
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=0), rf_params, cv=5, n_jobs=-1, verbose=2, scoring='f1')
rf_grid.fit(X_train, y_train)



Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_

In [89]:

rf_best = rf_grid.best_estimator_
rf_best_preds = rf_best.predict(X_test)
print(f'Best RandomForest params: {rf_grid.best_params_}')
print(f'Best RandomForest CV F1 Score: {rf_grid.best_score_:.4f}')

rf_accuracy = accuracy_score(y_test, rf_best_preds)
rf_precision = precision_score(y_test, rf_best_preds)
rf_recall = recall_score(y_test, rf_best_preds)
rf_f1 = f1_score(y_test, rf_best_preds)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_best.predict_proba(X_test)[:,1])
rf_auc = auc(rf_fpr, rf_tpr)



print(f'RandomForest Accuracy: {rf_accuracy:.2f}')
print(f'RandomForest Precision: {rf_precision:.2f}')
print(f'RandomForest Recall: {rf_recall:.2f}')
print(f'RandomForest F1 Score: {rf_f1:.2f}')
print(f'RandomForest AUC: {rf_auc:.2f}')

Best RandomForest params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best RandomForest CV F1 Score: 0.9663
RandomForest Accuracy: 0.96
RandomForest Precision: 0.98
RandomForest Recall: 0.96
RandomForest F1 Score: 0.97
RandomForest AUC: 1.00


In [90]:
# Grid search for GradientBoostingClassifier
gb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=0), gb_params, cv=5, n_jobs=-1, verbose=2, scoring='f1')
gb_grid.fit(X_train, y_train)



Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; 

In [91]:
print(f'Best GradientBoosting params: {gb_grid.best_params_}')
print(f'Best GradientBoosting CV F1 Score: {gb_grid.best_score_:.4f}')

gb_best = gb_grid.best_estimator_
gb_best_preds = gb_best.predict(X_test)

gb_accuracy = accuracy_score(y_test, gb_best_preds)
gb_precision = precision_score(y_test, gb_best_preds)
gb_recall = recall_score(y_test, gb_best_preds)
gb_f1 = f1_score(y_test, gb_best_preds)
gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_best.predict_proba(X_test)[:,1])
gb_auc = auc(gb_fpr, gb_tpr)

print(f'GradientBoosting Accuracy: {gb_accuracy:.2f}')
print(f'GradientBoosting Precision: {gb_precision:.2f}')
print(f'GradientBoosting Recall: {gb_recall:.2f}')
print(f'GradientBoosting F1 Score: {gb_f1:.2f}')
print(f'GradientBoosting AUC: {gb_auc:.2f}')

Best GradientBoosting params: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best GradientBoosting CV F1 Score: 0.9781
GradientBoosting Accuracy: 0.96
GradientBoosting Precision: 0.96
GradientBoosting Recall: 0.99
GradientBoosting F1 Score: 0.97
GradientBoosting AUC: 1.00
