### PART3 RollingSignalGenerator

#### 1. Load packages

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 23 10:57:36 2020

@author: Evan
"""
#%% import packages
import pandas as pd 
import numpy as np
import  sys, os

from tqdm import notebook 
from tqdm import tqdm
from tqdm import tqdm_notebook

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline 

ROOT = '../'
# from FeatureEngineering import FeatureEngineering
sys.path.append(os.path.join(ROOT, '03 feature selection'))
from naiveSelection import naiveSelection
from treeSelection import treeSelection
from SVCL1Selection import SVCL1Selection
from varianceThresholdSelection import varianceThresholdSelection
from pcaSelection import pcaSelection

sys.path.append(os.path.join(ROOT, '04 build classifier model'))

from MyDecisionTreeClassifier import MyDecisionTreeClassifier
from MyClassifier import *
from MySVMClassifier import MySVMClassifier
from MyXGBoostClassifier import MyXGBoostClassifier
# <<<<<<< HEAD
# from MyDeepLearningClassifier import MyDeepLearningClassifier
# =======
from MyKNNClassifier import MyKNNClassifier
from sklearn.svm import SVC
import ffn
# >>>>>>> cba755d863f0fcf468ecabcb8370f5eb74e79c83

#### 2. Main prediction engine

In [3]:
class RollingSignalGenerator:
    """
    Walk-forward signal generator.

    The backtest span [startDate, endDate] is cut into consecutive periods of
    `predictWindow` business days. For every period a fresh model is fitted on
    the data before the period and used to predict the period itself.

    Parameters
    ----------
    rawXs : features, indexed by date (assumed to be a pandas object with a
        DatetimeIndex, as the date slicing below requires).
    rawYs : labels, indexed like rawXs.
    startDate, endDate : 'YYYY-mm-dd' strings (or anything pd.Timestamp
        accepts). When omitted they are taken from the first / last label of
        rawYs.index.
    predictWindow : number of business days between two model re-fits.

    Raises
    ------
    ValueError
        If the re-fit dates cannot be built from startDate / endDate /
        predictWindow.
    """

    def __init__(self, rawXs, rawYs, startDate = None, endDate = None, predictWindow = 20):
        # load input data
        self.rawXs = rawXs
        self.rawYs = rawYs

        # if startDate or endDate is not given, determine it from the input data
        self.startDate = str(self.rawYs.index[0].date()) if startDate is None else startDate
        self.endDate = str(self.rawYs.index[-1].date()) if endDate is None else endDate

        # determine the dates on which a new model is trained for future use
        # (one period = predictWindow business days, 20 by default)
        try:
            self.changeHandDates = pd.date_range(self.startDate, self.endDate,
                                                 freq = "{}B".format(predictWindow))
        except Exception as e:
            # Usually a badly formatted startDate / endDate. Fail here instead of
            # leaving the object without changeHandDates and failing later.
            raise ValueError(
                'Please input startDate and endDate as format YYYY-mm-dd ({})'.format(e)
            ) from e

    @staticmethod
    def _sliceHalfOpen(data, start, end):
        """Return the rows of `data` whose index label lies in [start, end)."""
        labels = data.index
        inWindow = (labels >= pd.Timestamp(start)) & (labels < pd.Timestamp(end))
        return data.loc[inWindow]

    def _getTrainStartDate(self, predictStartDate, minTrainDays, trainMode):
        """Return the first training date of the period starting at predictStartDate."""
        if trainMode in ('extention', 'extension'):
            # expanding window: always train from the start of the whole data set
            return self.startDate
        if trainMode == 'rolling':
            # fixed-length window: the past minTrainDays business days.
            # (pd.Timedelta has no business-day unit, a BDay offset is required.)
            return predictStartDate - pd.offsets.BDay(minTrainDays)
        raise ValueError(
            "trainMode must be 'extention' or 'rolling', got {!r}".format(trainMode)
        )

    def generateOnePeriodSignal(self, X_train, y_train, X_test, y_test, featureSelectionFunction, predictModel):
        """
        Fit a model on one training window and predict one test window.

        featureSelectionFunction is defined outside this class and is called as
        featureSelectionFunction(X_train, y_train, X_test, y_test, verbal = False);
        it must return (X_train_selected, X_test_selected). Any preprocessing
        (or none at all, see naiveSelection) can live inside it.

        predictModel is a class (or zero-argument factory) defined outside; the
        object it builds must provide fit(X, y) and predict(X), following the
        MyClassifier template.

        Returns
        -------
        (pd.Series, model) : the predictions, indexed like y_test, and the
        fitted model.
        """
        # preprocess using featureSelectionFunction
        X_train_selected, X_test_selected = featureSelectionFunction(X_train, y_train, X_test, y_test, verbal = False)

        # fit
        model = predictModel()
        model.fit(X_train_selected, y_train)

        # predict; np.asarray drops any index the model may attach so that the
        # predictions are aligned positionally with y_test
        y_pred = pd.Series(np.asarray(model.predict(X_test_selected)), index = y_test.index)
        return(y_pred, model)

    def generateSignal(self, predictModel, featureSelectionFunction, minTrainDays = 1800, trainMode = 'extention', recordModels = True):
        """
        Perform the rolling prediction over the whole backtest span.

        For every period [predictStartDate, predictEndDate) a model is trained
        on [trainStartDate, predictStartDate) and used to predict the period.
        Both windows are half-open, so a row is never used for training and
        testing at once and no date is predicted twice.

        Parameters
        ----------
        predictModel, featureSelectionFunction : see generateOnePeriodSignal.
        minTrainDays : minimum number of business days between startDate and a
            period's start for that period to be predicted; in 'rolling' mode
            also the length of the training window.
        trainMode : 'extention' (expanding window from startDate; the spelling
            'extension' is accepted too) or 'rolling' (last minTrainDays
            business days).
        recordModels : when True, also return per-period models and scores.

        Returns
        -------
        outputPrediction : pd.Series of all predictions, or
        (outputPrediction, modelRecord) when recordModels is True. modelRecord
        maps str(predictStartDate.date()) to a dict with the keys
        'trainStartDate', 'predictStartDate', 'predictEndDate', 'model' and
        'performance' (precision / recall / f1_score).

        Raises
        ------
        ValueError
            If trainMode is unknown (raised at the first period that has
            enough training data).
        """
        modelRecord = {}
        periodPredictions = []

        # day-resolution start, whatever type startDate was given as
        firstDay = np.datetime64(pd.Timestamp(self.startDate).date())

        # materialise the pairs so the progress bar knows its total
        periods = list(zip(self.changeHandDates, self.changeHandDates[1:]))
        for predictStartDate, predictEndDate in tqdm_notebook(periods):
            # check if we have enough data
            print('start predict from {} to {}'.format(predictStartDate, predictEndDate))
            trainDataDays = np.busday_count(firstDay, np.datetime64(predictStartDate.date()))
            if trainDataDays < minTrainDays:
                # we don't have enough data
                print('We only have {} trainDataDays'.format(trainDataDays))
                continue

            trainStartDate = self._getTrainStartDate(predictStartDate, minTrainDays, trainMode)

            # get our train / test data set
            X_train = self._sliceHalfOpen(self.rawXs, trainStartDate, predictStartDate)
            y_train = self._sliceHalfOpen(self.rawYs, trainStartDate, predictStartDate)
            print('train shape (X, y):{} {}'.format(X_train.shape, y_train.shape))
            X_test = self._sliceHalfOpen(self.rawXs, predictStartDate, predictEndDate)
            y_test = self._sliceHalfOpen(self.rawYs, predictStartDate, predictEndDate)
            print('test  shape (X, y):{} {}'.format(X_test.shape, y_test.shape))

            if len(X_train) == 0 or len(X_test) == 0:
                # nothing to fit on / predict for, e.g. a gap in the input data
                print('No data for this period, skipped')
                continue

            # fit and predict
            y_predictSeries, model = self.generateOnePeriodSignal(X_train, y_train, X_test, y_test,
                                                                  featureSelectionFunction, predictModel)
            periodPredictions.append(y_predictSeries)

            # record all models if needed
            if recordModels:
                performance = {
                    'precision':metrics.precision_score(y_test, y_predictSeries),
                    'recall':metrics.recall_score(y_test, y_predictSeries),
                    'f1_score':metrics.f1_score(y_test, y_predictSeries)
                }
                modelRecord[str(predictStartDate.date())] = {
                    'trainStartDate' :trainStartDate,
                    'predictStartDate' :predictStartDate,
                    'predictEndDate' :predictEndDate,
                    'model' :model,
                    'performance':performance
                }

        # concatenate once at the end instead of growing a Series inside the loop
        if periodPredictions:
            outputPrediction = pd.concat(periodPredictions)
        else:
            outputPrediction = pd.Series(dtype = 'float64')

        if recordModels:
            return(outputPrediction, modelRecord)
        else:
            return(outputPrediction)

#### 3. Sample implementation of the MyClassifier template

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

class MySVMClassifier:
    """
    Sample classifier following the fit / predict template used by the
    rolling engine: an SVC whose hyper-parameters are chosen by grid search.

    NOTE: this definition rebinds the name MySVMClassifier that the import
    cell at the top of the notebook already imported from the
    MySVMClassifier module.

    Attributes
    ----------
    parameter : best parameter dict found by fit(), None before fit().
    model : the fitted GridSearchCV object, None before fit().
    """

    def __init__(self):
        self.parameter = None
        self.model = None

    def getPara(self):
        """Return the parameters chosen by fit(), or None if fit() has not run yet."""
        if self.parameter is None:
            print('Hi~ please first use fit function to get parameter :)')
            return None
        print('haha! We already do CV and find the best parameters~')
        return self.parameter

    def fit(self, X, y):
        """
        Grid-search the SVC hyper-parameters on (X, y) and keep the best model.

        Returns the fitted GridSearchCV object, which is also stored in
        self.model; the chosen parameters are stored in self.parameter.
        """
        # Set the parameters by cross-validation
        tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3],
                             'C': [0.01, 0.1, 1]},]
        scores = ['precision']

        for score in scores:
            print("# Tuning hyper-parameters for %s" % score)
            clf = GridSearchCV(
                SVC(), tuned_parameters, scoring='%s_macro' % score
            )
            clf.fit(X, y)
            print("Best parameters set found on development set:")
            self.parameter = clf.best_params_
            self.model = clf
            print(clf.best_params_)
            print("Grid scores on development set:")
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds, clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r"
                      % (mean, std * 2, params))

            # NOTE(review): no classification report is actually printed after
            # these lines; they are kept only to leave the output unchanged.
            print("Detailed classification report:")
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")

        # GridSearchCV (refit=True by default) has already refitted the best
        # estimator on all of (X, y); fitting self.model again would repeat the
        # whole search for no benefit.
        return(self.model)

    def predict(self, X):
        """Predict with the model selected by fit()."""
        return(self.model.predict(X))