# Stock Price Classification
By: Jared Berry

In [None]:
# Set relevant scikit-learn functions/modules

# Standard library
import pickle
import time
import warnings

# Numerical / data-frame tooling
import numpy as np
import pandas as pd

# Tests for stationarity 
from statsmodels.tsa.stattools import adfuller

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# LightGBM
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

# Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Evaluation
from sklearn import metrics

## Set-up

In [None]:
def prepare_model_structures(X, y, holdout, labeled=False, ema_gamma=1):
    """
    Convert modeling inputs to NumPy structures and build binary class targets.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; its column names are returned as the feature names.
    y : array-like
        Target aligned row-for-row with ``X``. When ``labeled`` is False it is
        treated as a numeric series that is smoothed and then thresholded at
        zero. When ``labeled`` is True it is returned as-is (copied).
    holdout : array-like
        Hold-out feature set; converted with ``np.array``.
    labeled : bool, default False
        If True, skip smoothing and class construction.
    ema_gamma : float, default 1
        Weight on the current observation in the exponential moving average
        ``EMA_t = gamma * y_t + (1 - gamma) * EMA_{t-1}`` (seeded with 0).
        A value of 1 leaves the series unsmoothed.

    Returns
    -------
    tuple
        ``(features, feature_names, test_features, targets_smoothed,
        orig_targets)``. In the unlabeled case both target arrays are integer
        NumPy arrays of 0/1 classes (1 where the value is strictly positive).

    Notes
    -----
    ``y`` itself is never modified. Prints how many classes were changed by
    the smoothing when ``labeled`` is False.
    """

    # Convert to NumPy arrays - store feature names
    feature_names = X.columns.tolist()
    features = np.array(X)
    test_features = np.array(holdout)

    if labeled:
        # Targets are already class labels: return two independent copies
        return features, feature_names, test_features, y.copy(), y.copy()

    # Work on a positional float copy. Indexing a pandas Series with [i] is a
    # label lookup (fails on non-default indexes), and writing a float EMA back
    # into an integer container would truncate or upcast depending on version.
    raw = np.asarray(y, dtype=float)
    smoothed = np.empty_like(raw)

    # Compute EMA smoothing of target prior to constructing classes
    ema = 0.0
    for position, value in enumerate(raw):
        ema = ema_gamma * value + (1 - ema_gamma) * ema
        smoothed[position] = ema

    targets_smoothed = np.where(smoothed > 0, 1, 0)
    orig_targets = np.where(raw > 0, 1, 0)
    print("\n{} targets changed by smoothing.".format(np.sum(targets_smoothed != orig_targets)))

    return features, feature_names, test_features, targets_smoothed, orig_targets
    

In [None]:
def benchmark_target(target, groups):
    """
    Print naive benchmark accuracies for a binary (0/1) class target.

    Two baselines are reported:

    * one-class: always predict the positive class (1);
    * random walk: predict that each observation repeats the previous one.

    Parameters
    ----------
    target : array-like
        Class labels, castable to integers.
    groups : array-like or None
        Optional entity identifier per observation. When supplied (and
        non-empty) the random-walk shift is done within each group, so the
        first observation of one entity is never compared with the last
        observation of another. ``None`` or an empty container means a single
        series.

    Returns
    -------
    None
        Benchmark statistics are printed.

    Raises
    ------
    ValueError
        If ``groups`` is supplied with a length different from ``target``.
    """
    labels = np.asarray(target).astype(int)
    n_obs = len(labels)
    if n_obs == 0:
        print("No observations to benchmark.")
        return

    # One-class benchmark: share of observations that are the positive class
    one_class = np.ones(n_obs, dtype='int')
    one_class_acc = np.mean(one_class == labels)
    print("Baseline accuracy is: {}%".format(float(np.round(100 * one_class_acc, 2))))

    # Explicit None/length test: truth-testing an array-like is ambiguous
    series = pd.Series(labels)
    if groups is not None and len(groups) > 0:
        group_keys = np.asarray(groups)
        if len(group_keys) != n_obs:
            raise ValueError("groups must have the same length as target")
        previous = series.groupby(group_keys).shift(1)
    else:
        previous = series.shift(1)

    # The first observation of each series has no predecessor; drop it
    comparable = previous.notna().values
    if comparable.any():
        rw_acc = np.mean(previous.values[comparable] == labels[comparable])
        print("RW accuracy is {}%".format(float(np.round(100 * rw_acc, 2))))
    else:
        print("RW accuracy is undefined (no observation has a predecessor).")
    

In [1]:
# From Wheat Classification notebook 
def fit_sklearn_classifier(X, y, holdout, ticker, ema_gamma, valid_splits, model, label, param_search=None, export=False,
                           valid_method="ts", labeled=False, groups=None, **kwargs):
    """
    Fit, cross-validate and apply a scikit-learn style classifier.

    The target is prepared with ``prepare_model_structures``; naive baselines
    are printed; an optional grid search selects hyperparameters; the model is
    scored fold-by-fold with a per-fold classification threshold chosen to
    maximise F1; finally it is refit on the full sample and applied to the
    hold-out set.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : array-like
        Target aligned with ``X`` (see ``prepare_model_structures``).
    holdout : array-like
        Hold-out features to predict on.
    ticker : str
        Identifier used in the export file name.
    ema_gamma : float
        EMA weight passed to ``prepare_model_structures``.
    valid_splits : int
        Number of validation splits.
    model : callable
        Estimator class (not an instance); must provide ``fit`` and
        ``predict_proba``.
    label : str
        Human-readable model name used in reporting and the export file name.
    param_search : dict or None
        Parameter grid for ``GridSearchCV``. ``None`` or empty skips the search.
    export : bool
        If True, pickle the results dictionary to
        ``"<label>_<ticker>.pickle"`` (lower-cased, spaces replaced by ``_``).
    valid_method : {"ts", "panel", "kfold"}
        Cross-validation scheme.
    labeled : bool
        Passed to ``prepare_model_structures``.
    groups : array-like or None
        Grouping passed to ``PanelSplit`` when ``valid_method == "panel"``.
    **kwargs
        Fixed estimator parameters. They are applied in every fit; parameters
        selected by the grid search take precedence on conflict.

    Returns
    -------
    dict
        Mean validation ``precision``/``recall``/``accuracy``/``f1``, hold-out
        ``probabilities`` and ``predictions``, and sorted ``importances``.

    Raises
    ------
    ValueError
        If ``valid_method`` is not one of the supported schemes.
    """

    warnings.filterwarnings('always')
    start = time.time()

    # Prepare modeling structures
    features, feature_names, test_features, targets_smoothed, _ = \
        prepare_model_structures(X, y, holdout, labeled, ema_gamma)
    # Force a NumPy array so the fold indices below are always positional
    labels_smoothed = np.asarray(targets_smoothed)

    # Compute some baselines (sized by the targets, not the hold-out set)
    all_pos_acc = np.mean(np.ones(len(labels_smoothed), dtype='int') == labels_smoothed)
    # Random walk: previous class predicts the next - inappropriate for panel
    rw_acc = np.mean(labels_smoothed[:-1] == labels_smoothed[1:])
    print("Baseline accuracy is: {}%".format(100*np.round(all_pos_acc, 2)))
    print("RW accuracy is {}%".format(100*np.round(rw_acc, 2)))

    # Set time-series, cross-validation indices. Two independent iterators are
    # needed because the grid search exhausts the one it is given.
    if valid_method == "panel":
        splits = PanelSplit(n_folds=valid_splits, groups=groups)
        search_splits = PanelSplit(n_folds=valid_splits, groups=groups)
    elif valid_method == "ts":
        splits = TimeSeriesSplit(n_splits=valid_splits).split(features)
        search_splits = TimeSeriesSplit(n_splits=valid_splits).split(features)
    elif valid_method == "kfold":
        splits = KFold(n_splits=valid_splits).split(features)
        search_splits = KFold(n_splits=valid_splits).split(features)
    else:
        raise ValueError(
            "valid_method must be 'panel', 'ts' or 'kfold'; got {!r}".format(valid_method))

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Dictionary of lists for recording validation scores
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

    # Perform a grid-search on the provided parameters to determine best options
    best_params = {}
    if param_search:
        print("Performing grid search for hyperparameter tuning")
        gridsearch = GridSearchCV(estimator=model(**kwargs),
                                  cv=search_splits,
                                  param_grid=param_search)
        best_params = gridsearch.fit(features, labels_smoothed).best_params_

    # Fixed kwargs plus tuned parameters; tuned values win on conflict
    model_params = dict(kwargs)
    model_params.update(best_params)

    # Candidate classification thresholds for the dynamic selection
    thresholds = np.arange(0.30, 0.90, 0.05)
    # Fallback only matters if the splitter yields no folds at all
    opt_thresh_ = 0.5

    for split_counter, (train_indices, valid_indices) in enumerate(splits, start=1):
        print("Training model on validation split #{}".format(split_counter))
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels_smoothed[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels_smoothed[valid_indices]

        # Train the estimator on the fold
        estimator = model(**model_params)
        estimator.fit(train_features, train_labels)

        # Positive-class probabilities on the validation fold
        pos_probs = estimator.predict_proba(valid_features)[:, 1]

        # Dynamic threshold selection: first threshold with the highest F1
        f1_by_threshold = [metrics.f1_score(valid_labels, (pos_probs >= t).astype(int))
                           for t in thresholds]
        opt_thresh_ = thresholds[int(np.argmax(f1_by_threshold))]

        # Generate class predictions
        predicted = (pos_probs >= opt_thresh_).astype(int)

        # Append scores to the tracker
        scores['precision'].append(metrics.precision_score(valid_labels, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(valid_labels, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(valid_labels, predicted))
        scores['f1'].append(metrics.f1_score(valid_labels, predicted, average="weighted"))

        # Average variable importances over folds for any estimator exposing them
        if hasattr(estimator, 'feature_importances_'):
            feature_importance_values += estimator.feature_importances_ / valid_splits

    # Properly format feature importances
    named_importances = list(zip(feature_names, feature_importance_values))
    sorted_importances = sorted(named_importances, key=lambda x: x[1], reverse=True)

    # Fit on full sample
    estimator = model(**model_params)
    estimator.fit(features, labels_smoothed)

    # Test on hold out set, using the same array form the model was fit on.
    # NOTE: the threshold applied here is the one selected on the last fold.
    holdout_probs = estimator.predict_proba(test_features)[:, 1]
    holdout_predicted = (holdout_probs >= opt_thresh_).astype(int)

    # Store values for later reporting/use in app
    evals = {'precision':np.mean(scores['precision']),
             'recall':np.mean(scores['recall']),
             'accuracy':np.mean(scores['accuracy']),
             'f1':np.mean(scores['f1']),
             'probabilities':holdout_probs.tolist(),
             'predictions':holdout_predicted.tolist(),
             'importances':sorted_importances
             }

    # Report
    print("Build, hyperparameter selection, and validation of {} took {:0.3f} seconds\n".format(label, time.time()-start))
    print("Hyperparameters are as follows:")
    for key, value in best_params.items():
        print("{}: {}\n".format(key, value))
    print("Validation scores are as follows:")
    print(pd.DataFrame(scores).mean())

    # Output the evals dictionary
    if export:
        outpath = "{}_{}.pickle".format(label.lower().replace(" ", "_"), ticker.lower())
        with open(outpath, 'wb') as f:
            pickle.dump(evals, f)

    return evals

In [None]:
def fit_lgbm_classifier(X, y, holdout, ticker, ema_gamma=1, valid_splits=12, export=False, valid_method="ts", groups=None,
                       labeled=False):
    """
    Cross-validate a LightGBM binary classifier and predict on a hold-out set.

    The target is prepared with ``prepare_model_structures`` (smoothed exactly
    once, and not at all when ``labeled`` is True). A fresh ``LGBMClassifier``
    is trained per fold with early stopping; hold-out probabilities and feature
    importances are averaged over ``valid_splits``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    y : array-like
        Target aligned with ``X`` (see ``prepare_model_structures``).
    holdout : array-like
        Hold-out features to predict on.
    ticker : str
        Identifier used in the export file name (``"lgbc_<ticker>.pickle"``).
    ema_gamma : float, default 1
        EMA weight passed to ``prepare_model_structures``.
    valid_splits : int, default 12
        Number of validation splits.
    export : bool, default False
        If True, pickle the results dictionary.
    valid_method : {"ts", "panel", "kfold"}
        Cross-validation scheme.
    groups : array-like or None
        Grouping passed to ``PanelSplit`` when ``valid_method == "panel"``.
    labeled : bool, default False
        Passed to ``prepare_model_structures``.

    Returns
    -------
    dict
        Per-fold ``train_auc``, ``train_acc``, ``validation_auc`` and
        ``validation_accuracy`` lists (accuracy is ``1 - binary_error``),
        out-of-fold ``valid_preds``, sorted ``feature_importances`` and averaged
        ``test_predictions``.

    Raises
    ------
    ValueError
        If ``valid_method`` is not one of the supported schemes.
    """

    # Prepare modeling structures with the shared helper
    features, feature_names, test_features, targets_smoothed, _ = \
        prepare_model_structures(X, y, holdout, labeled, ema_gamma)
    # Force a NumPy array so the fold indices below are always positional
    labels_smoothed = np.asarray(targets_smoothed)

    # Compute some baselines (sized by the targets, not the hold-out set)
    all_pos_acc = np.mean(np.ones(len(labels_smoothed), dtype='int') == labels_smoothed)
    # Random walk: previous class predicts the next
    rw_acc = np.mean(labels_smoothed[:-1] == labels_smoothed[1:])
    print("Baseline accuracy is: {}%".format(100*np.round(all_pos_acc, 2)))
    print("RW accuracy is {}%".format(100*np.round(rw_acc, 2)))

    # Set time-series, cross-validation indices
    if valid_method == "panel":
        splits = PanelSplit(n_folds=valid_splits, groups=groups)
    elif valid_method == "ts":
        splits = TimeSeriesSplit(n_splits=valid_splits).split(features)
    elif valid_method == "kfold":
        splits = KFold(n_splits=valid_splits).split(features)
    else:
        raise ValueError(
            "valid_method must be 'panel', 'ts' or 'kfold'; got {!r}".format(valid_method))

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Out-of-fold validation predictions; rows never used for validation stay 0
    out_of_fold = np.zeros(features.shape[0])

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Lists for recording per-fold validation and training scores
    valid_scores = []
    train_scores = []
    valid_accs = []
    train_accs = []

    # Iterate through each fold
    for train_indices, valid_indices in splits:

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels_smoothed[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels_smoothed[valid_indices]

        # Create the bst
        bst = LGBMClassifier(n_estimators=10000, objective = 'binary',
                             class_weight = 'balanced', learning_rate = 0.01,
                             max_bin = 50, num_leaves = 50, max_depth = 2,
                             reg_alpha = 0.1, reg_lambda = 0.1,
                             subsample = 0.8, random_state = 101,
                             boosting = 'gbdt'
                            )

        # Train the bst
        # NOTE(review): recent LightGBM releases expect early stopping and
        # verbosity via `callbacks=`; confirm against the installed version.
        bst.fit(train_features, train_labels, eval_metric = ['auc', 'binary_error'],
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 0)

        # Record the best iteration
        best_iteration = bst.best_iteration_

        # Record the feature importances
        feature_importance_values += bst.feature_importances_ / valid_splits

        # Make predictions
        test_predictions += bst.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / valid_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = bst.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best scores; binary_error is converted to accuracy
        valid_scores.append(bst.best_score_['valid']['auc'])
        train_scores.append(bst.best_score_['train']['auc'])
        valid_accs.append(1 - bst.best_score_['valid']['binary_error'])
        train_accs.append(1 - bst.best_score_['train']['binary_error'])

    # Properly format feature importances
    named_importances = list(zip(feature_names, feature_importance_values))
    sorted_importances = sorted(named_importances, key=lambda x: x[1], reverse=True)

    # Set up an exportable dictionary with results from the model
    results = {
        'train_auc':train_scores,
        'train_acc':train_accs,
        'validation_auc':valid_scores,
        'validation_accuracy':valid_accs,
        'valid_preds':out_of_fold,
        'feature_importances':sorted_importances,
        'test_predictions':test_predictions
    }

    # Output the results dictionary
    if export:
        outpath = "lgbc_{}.pickle".format(ticker.lower())
        with open(outpath, 'wb') as f:
            pickle.dump(results, f)

    print("Average AUC across {} splits: {}".format(valid_splits, np.mean(valid_scores)))
    print("Average accuracy across {} splits: {}%".format(valid_splits, 100*np.round(np.mean(valid_accs), 2)))
    # Show up to the six most important features
    for importance in sorted_importances[:6]:
        print(importance)

    return results