In [1]:
import numpy as np
import pandas as pd

### Exercise 1: K-Fold

In [2]:
from sklearn.model_selection import KFold


# Toy data: 10 samples with 2 features each (values 1..20) and targets 1..10.
X = np.arange(1, 21).reshape(10, -1)
y = np.arange(1, 11)

# Five folds over the 10 rows, so every test fold holds 2 samples.
kf = KFold(n_splits=5)

# kf.split yields one (train indices, test indices) pair per fold.
for fold_id, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold_id}:")
    print(f"  Train: index={train_idx}")
    print(f"  Test:  index={test_idx}")

Fold 0:
  Train: index=[2 3 4 5 6 7 8 9]
  Test:  index=[0 1]
Fold 1:
  Train: index=[0 1 4 5 6 7 8 9]
  Test:  index=[2 3]
Fold 2:
  Train: index=[0 1 2 3 6 7 8 9]
  Test:  index=[4 5]
Fold 3:
  Train: index=[0 1 2 3 4 5 8 9]
  Test:  index=[6 7]
Fold 4:
  Train: index=[0 1 2 3 4 5 6 7]
  Test:  index=[8 9]


### Exercise 2: Cross validation (k-fold)

Import the California Housing data set and split it into a train set and a test set (10%). Fit a linear regression on the data set.

In [3]:
# imports
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the California Housing data set and pull out features and target.
housing = fetch_california_housing()
X = housing['data']
y = housing['target']

# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=43
)

# Pipeline steps, applied in order: median imputation, standardisation,
# then the linear regression itself.
pipeline = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
]
pipe = Pipeline(pipeline)


##### 1. Cross-validate the Pipeline using `cross_validate` with 10 folds. Print the score on each validation set, the mean score over the validation sets, and the standard deviation over the validation sets. The expected output is:

In [11]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the pipeline on the training split. No
# `scoring` is passed, so scikit-learn's default scorer applies.
scores = cross_validate(pipe, X_train, y_train, cv=10)
val_scores = scores["test_score"]  # one score per validation fold

# Build the report line by line. `pad` reproduces the six-space indent of the
# expected output, including the lines that contain nothing but the indent.
pad = " " * 6
report_lines = [
    "",
    f"{pad}scores:",
    f"{pad}{val_scores}",
    pad,
    f"{pad}mean score:",
    f"{pad}{val_scores.mean()}",
    pad,
    f"{pad}score std:",
    f"{pad}{val_scores.std()}",
    pad,
]
print("\n".join(report_lines))


      scores:
      [0.62433594 0.61648956 0.62486602 0.59891024 0.59284295 0.61307055
 0.54630341 0.60742976 0.60014575 0.59574508]
      
      mean score:
      0.60201392526743
      
      score std:
      0.021498382277346514
      


### Exercise 3: GridSearchCV

Import the California Housing dataset, split it into a train and a test set (10%), and fit a linear regression on the dataset.

In [12]:
# imports
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Reload the California Housing data so this exercise does not depend on
# variables left over from earlier cells.
housing = fetch_california_housing()
X = housing['data']
y = housing['target']

# 90% train / 10% test, shuffled with a fixed seed for a repeatable split.
split = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=43)
X_train, X_test, y_train, y_test = split


##### 1. GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid: 3 tree depths x 3 forest sizes = 9 candidates.
params = {
    "max_depth": [3, 10, 17],
    "n_estimators": [5, 25, 75],
}

# The forest is seeded so that the cross-validated scores, and therefore
# best_params_ / best_score_, are identical on every Restart & Run All.
# The seed matches the one used for the train/test split.
# Scoring is negative MSE: values are <= 0 and closer to 0 is better.
gs = GridSearchCV(
    RandomForestRegressor(random_state=43),
    params,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,  # evaluate candidates on all available cores
)

gs.fit(X_train, y_train)


In [21]:
# Best mean CV score, the parameters that achieved it, and the full results
# dictionary, one per line.
print(gs.best_score_, gs.best_params_, gs.cv_results_, sep="\n")

-0.25651419170139184
{'max_depth': 17, 'n_estimators': 75}
{'mean_fit_time': array([ 0.71401229,  5.15041833, 19.70704246,  3.71968117, 16.94802709,
       54.50932012,  5.8641705 , 25.73737807, 71.52938385]), 'std_fit_time': array([0.05809767, 2.48340725, 2.46942152, 0.44960276, 0.93439149,
       1.89342675, 0.38975466, 1.12371502, 8.31944346]), 'mean_score_time': array([0.00644708, 0.07303758, 0.15892658, 0.0262517 , 0.07668123,
       0.24366021, 0.02862353, 0.12535911, 0.39007583]), 'std_score_time': array([0.0028581 , 0.0578816 , 0.07604941, 0.01163342, 0.01754787,
       0.02667528, 0.00568905, 0.0134795 , 0.11039135]), 'param_max_depth': masked_array(data=[3, 3, 3, 10, 10, 10, 17, 17, 17],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[5, 25, 75, 5, 25, 75, 5, 25, 75],
             mask=[False, False, False, False, False, False, False

### Exercise 4: Validation curve and Learning curve

In [5]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
import numpy as np

# Tree depths explored by the validation curve: 1, 2, ..., 20.
max_depth_range = np.arange(1, 21)

# Synthetic binary classification problem: 100 000 samples, 30 features of
# which 10 are informative, with flip_y=0.2 label noise.
# random_state pins the generated data so the curves built from it are
# reproducible across kernel restarts.
X, y = make_classification(
    n_samples=100000,
    n_features=30,
    n_informative=10,
    flip_y=0.2,
    random_state=43,
)

In [None]:


# Cross-validated accuracy of a random forest for every max_depth in
# max_depth_range (5 folds, all CPU cores). The fold axis of each returned
# array is axis 1, which is averaged away below.
# The classifier is seeded so the curve is reproducible.
train_score, test_score = validation_curve(
    RandomForestClassifier(random_state=43),
    X,
    y,
    param_name="max_depth",
    param_range=max_depth_range,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
)

# Average over the folds to get one point per depth.
plt.plot(max_depth_range, train_score.mean(axis=1), label="Training Score")
plt.plot(max_depth_range, test_score.mean(axis=1), label="Testing Score")

# pyplot exposes title/xlabel/ylabel/ylim; the set_* spellings are Axes
# methods and raise AttributeError when called on the pyplot module.
plt.title("Validation Curve: max_depth")
plt.xlabel("max_depth")
# The label names the metric actually computed (scoring="accuracy").
plt.ylabel("Score: accuracy")
plt.ylim(0.0, 1.1)
# Without legend() the label= arguments above are never displayed.
plt.legend(loc="best")
plt.show()

In [6]:
from sklearn.model_selection import learning_curve
from sklearn.ensemble import RandomForestClassifier

# Fixed-depth forest; seeded so the learning curve is reproducible across
# kernel restarts.
clf = RandomForestClassifier(max_depth=12, random_state=43)

# Score the model on 5 training-set sizes (10% .. 100% of the available
# training folds) with 5-fold cross-validation on all CPU cores.
train_sizes, train_scores, test_scores = learning_curve(
    clf, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))

# Collapse the fold axis: mean and spread of the scores per training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure()
plt.title("Learning Curve")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()

# Shaded bands: +/- one standard deviation around each mean curve.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")

# Mean curves: training score in red, cross-validation score in green.
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross-validation score")

plt.legend(loc="best")
plt.show()

