In [49]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as mpl

from collections import defaultdict
from functools import reduce
from path import Path
from pprint import pprint

%matplotlib inline
mpl.style.use('ggplot')
mpl.rcParams['figure.figsize'] = 16,6

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from scipy.stats import rv_continuous, kstest
from cv import PurgedKFold

# Code from Chapter 9

class TheNewPipe(Pipeline):
    def fit(self, X, y, sample_weight=None, **fit_params):
        if sample_weight is not None:
            fit_params[self.steps[-1][0] + '__sample_weight'] = sample_weight
        return super(TheNewPipe, self).fit(X, y, **fit_params)

def clfHyperFit(feat, lbl, t1, pipe_clf, param_grid, cv=3, bagging=[0, None, 1.0],
                rndSearchIter=0, n_jobs=-1, pctEmbargo=0, **fit_params):
    if set(lbl.values) == {0, 1}:
        scoring = 'f1' # f1 for meta-labeling
    else:
        scoring = 'neg_log_loss' # symmetric towards all classes
    
    # 1) hyperparameter searching, on train data
    inner_cv = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo)
    if rndSearchIter == 0:
        gs = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False)
    else:
        gs = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_grid, scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False, n_iter=rndSearchIter)
    gs = gs.fit(feat, lbl, **fit_params).best_estimator_
    # 2) fit validated model on the entirety of the data
    if bagging[1] > 0:
        gs = BaggingClassifier(bare_estimator=TheNewPipe(gs.steps), n_estimators=int(bagging[0]), max_samples=float(bagging[1]),
                              max_features=float(bagging[2]), n_jobs=n_jobs)
        gs = gs.fit(feat, lbl, sample_weight=fit_params[gs.base_estimator.steps[-1][0] + '__sample_weight'])
        gs = Pipeline([('bag', gs)])
    return gs
        
class logUniform_gen(rv_continuous):
    # random numbers log-uniformly distributed between 1 and e
    def _cdf(self, x):
        return np.log(x / self.a) / np.log(self.b / self.a)
    
def logUniform(a=1, b=np.exp(1)):
    return logUniform_gen(a=a, b=b, name='logUniform')


# 9.1a

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

Use `GridSearchCV` on 10-fold-CV to find the `C, gamma` optimal hyper-parameters on a SVC with RBF kernel, where `param_grid={'C': [1E-2, 1E-1, 1, 10, 100], 'gamma': [1E-2, 1E-1, 1, 10, 100]}` and the scoring function is `neg_log_loss`

In [51]:
from feature_imp import getTestData
from sklearn.svm import SVC

param_grid = {'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [1e-2, 1e-1, 1, 10, 100]}

testing = False
n_samples = 1000 if testing else 10000
n_splits = 3 if testing else 10

trnsX, cont = getTestData(n_features=10, n_informative=5, n_redundant=0, n_samples=n_samples)

In [52]:
pipe_clf = SVC(probability=True)

inner_cv = PurgedKFold(n_splits=n_splits, t1=cont.index.to_series())
gs1 = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring='neg_log_loss', cv=inner_cv,
                   n_jobs=-1, iid=False, return_train_score=True)

gs1 = gs1.fit(X=trnsX, y=cont['bin'])
gs1_results = pd.DataFrame(gs1.cv_results_)


# 9.1b

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

How many nodes are there in the grid?

# 9.1c

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

How many fits did it take to find the optimal solution?

In [54]:
print("It took %s fits" % len(gs1_results))

It took 25 fits


# 9.1d

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

How long did it take to find this solution?

In [71]:
print("It took {:.0f} seconds.".format(gs1_results['mean_fit_time'].sum()))

It took 328 seconds.


# 9.1e

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

How can you access the optimal result?

In [56]:
be1 = gs1.best_estimator_
be1

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# 9.1f

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

What is the CV score of the optimal parameter combination?

In [77]:
best1_idx = gs1_results['mean_test_score'].idxmax()
best1 = gs1_results['mean_test_score'].max()
gs1_results.iloc[best1_idx][['mean_test_score', 'params']].to_frame()

Unnamed: 0,16
mean_test_score,-0.288744
params,"{'C': 10, 'gamma': 0.1}"


In [78]:
print("The CV score is {:.3f}".format(best1))

The CV score is -0.289


# 9.1g

Using the function `getTestData` from Chapter 8, form a synthetic dataset of 10,000 observations with 10 features, where 5 are informative and 5 are noise. 

How can you pass sample weights to the SVC?

# 9.2a

Using the same dataset from exercise 1,

Use `RandomizedSearchCV` on 10-fold-CV to find the `C, gamma` optimal hyper-parameters on a SVC with RBF kernel, where `param_distributions={'C': logUniform(a=1E-2, b=1E2), 'gamma': logUniform(a=1E-2, b=1E2)}, n_iter=25` and the scoring function is `neg_log_loss`

In [58]:
inner_cv = PurgedKFold(n_splits=n_splits, t1=cont.index.to_series())
param_distributions = {'C': logUniform(a=1e-2, b=1e2), 'gamma': logUniform(a=1e-2, b=1e2)}
n_iter = 25
gs2 = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_distributions, scoring='neg_log_loss',
                         cv=inner_cv, n_jobs=-1, iid=False, n_iter=n_iter, return_train_score=True)

gs2 = gs2.fit(X=trnsX, y=cont['bin'])
gs2_results = pd.DataFrame(gs2.cv_results_)


# 9.2b

Using the same dataset from exercise 1,

How long did it take to find this solution?

In [74]:
print("It took {:.0f} seconds.".format(gs1_results['mean_fit_time'].sum()))

It took 328 seconds.


# 9.2c

Using the same dataset from exercise 1,

Is the optimal parameter combination similar to the one found in exercise 1?

In [60]:
be2 = gs2.best_estimator_
be2

SVC(C=10.7875109732391, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.07549547952136182,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

# 9.2d

Using the same dataset from exercise 1,

What is the CV score of the optimal parameter combination? How does it compare to the CV score from exercise 1?

In [79]:
best2 = gs2_results['mean_test_score'].max()
print("The CV score is {:.3f}, and therefore higher than {:.3f} from the first exercise.".format(best2, best1))

The CV score is -0.282, and therefore higher than -0.289 from the first exercise.


# 9.3a

From exercise 1,

Compute the Sharpe ratio of the resulting in-sample forecasts, from point 1.a 

In [80]:
def sharpe(r):
    return r.mean() / r.std()

predictions1 = be1.predict(trnsX)
bin_returns = cont['bin'] * 2 - 1

print("The Sharpe ratio is {:.2f}.".format(sharpe(predictions1 * bin_returns)))

The Sharpe ratio is 0.85.


# 9.3b

From exercise 1,

Repeat point 1.a, this time with `accuracy` as the scoring function. Compute the in-sample forecasts derived from the hyper-tuned parameters.

In [82]:
inner_cv = PurgedKFold(n_splits=10, t1=cont.index.to_series())
gs3 = GridSearchCV(estimator=pipe_clf, param_grid=param_grid, scoring='accuracy', cv=inner_cv,
                   n_jobs=-1, iid=False, return_train_score=True)

gs3 = gs3.fit(X=trnsX, y=cont['bin'])
gs3_results = pd.DataFrame(gs3.cv_results_)
be3 = gs3.best_estimator_

predictions3 = be3.predict(trnsX)

print("The Sharpe ratio is {:.2f}.".format(sharpe(predictions3 * bin_returns)))

The Sharpe ratio is 0.85.


# 9.3c

What scoring method leads to higher (in-sample) Sharpe ratio?

**A: In this instance GridSearchCV with either accuracy or neg_log_loss picks the same set of parameters.**

# 9.4a

From exercise 2,

Compute the Sharpe ratio of the resulting in-sample forecasts, from point 2.a 

In [83]:
predictions2 = be2.predict(trnsX)

print("The Sharpe ratio is {:.2f}.".format(sharpe(predictions2 * bin_returns)))

The Sharpe ratio is 0.81.


# 9.4b

From exercise 2,

Repeat point 2.a, this time with `accuracy` as the scoring function. Compute the in-sample forecasts derived from the hyper-tuned parameters.

In [84]:
gs4 = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_distributions, scoring='accuracy', cv=inner_cv, n_jobs=-1, iid=False, n_iter=n_iter)

gs4 = gs4.fit(X=trnsX, y=cont['bin'])
be4 = gs4.best_estimator_

predictions4 = be4.predict(trnsX)

sharpe(predictions4 * bin_returns)
print("The Sharpe ratio is {:.2f}.".format(sharpe(predictions4 * bin_returns)))

The Sharpe ratio is 0.78.


# 9.4c

From exercise 2,

What scoring method leads to higher (in-sample) Sharpe ratio?

**A: For randomized search, negative log-loss leads to higher in-sample Sharpe ratio.**

# 9.5a

Read the definition of log loss, $L[Y,P]$.

Why is the scoring function `neg_log_loss` defined as the negative log loss, $-L[Y,P]$?

**A: Because for most it's more intuitive to maximize a scoring function.**

# 9.5b

Read the definition of log loss, $L[Y,P]$.

What would be the outcome of maximizing the log loss, rather than the negitive log loss?

**A: I'd expect this to select for the model with the least predictive power.**

# 9.6

Consider an investment strategy that sizes its bets equally, regardless of the forecast's confidence. In this case, what is the more appropriate scoring function for hyper-parameter tuning, accuracy or cross-entropy loss?

**A: Accuracy accounts equally for erronous predictions with high or low probabilities while Log loss computes the log-likelihood of the classifier given the true label, which takes predictions' probabilities into account.**

