# Ensemble PCA

The goal here is to compare two approaches: the best single PCA, where the dimension is selected a posteriori as the one leading to the best test error (which risks overfitting to the test set), and an ensemble of PCAs with fixed subsets (10, 20 and 30).
The results are printed as raw text and still **need to be formatted** to make them easier to read.

In [1]:
import datetime
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import randint as sp_randint
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

import methodsMLinterns

In [2]:
# Symbols of the 30 stocks handed to the portfolio classifier.
stocks = [
    'DNB', 'NRG', 'CL', 'ANTM', 'NEE', 'PAYX', 'VAR', 'NI', 'MNST', 'JNJ',
    'TGNA', 'NOV', 'FIS', 'BLK', 'HBI', 'NVDA', 'DLTR', 'MRO', 'EMN', 'AMT',
    'FLR', 'IBM', 'BK', 'NFX', 'AGN', 'LRCX', 'DIS', 'LH', 'C', 'MNK',
]

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads all imported modules before each
# cell is executed, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
import warnings
# NOTE(review): this silences *every* warning for the rest of the session, which can
# also hide relevant diagnostics from the libraries used below -- consider narrowing
# the filter to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Feature-code lists used to select the input columns.
# NOTE(review): the meaning of the individual codes (and of the 'a*' / 'z*' prefixes)
# is not documented in this notebook -- confirm against the data description.

# Reduced feature set (17 codes).
features_1p2_extra = [
    'acr', 'aeo', 'adl', 'aep', 'acy', 'aez', 'afa', 'aab', 'zkg', 'zmd',
    'zla', 'zme', 'zkn', 'zmo', 'zmp', 'zhq', 'zpe',
]

# Base feature set (119 'a*' codes).
features_1p4 = [
    'aab', 'aac', 'aad', 'aae', 'aaf', 'aag', 'aah', 'abj', 'abm', 'abn',
    'abo', 'abp', 'abq', 'abr', 'abs', 'abt', 'abu', 'abv', 'abw', 'abx',
    'aby', 'abz', 'aca', 'acb', 'acc', 'acd', 'ace', 'acf', 'acr', 'acw',
    'acx', 'acy', 'adi', 'adj', 'adl', 'ado', 'adp', 'adq', 'adr', 'ads',
    'adt', 'adu', 'adv', 'adw', 'adx', 'ady', 'adz', 'aea', 'aeb', 'aec',
    'aed', 'aee', 'aef', 'aeg', 'aeh', 'aei', 'aej', 'aek', 'ael', 'aem',
    'aen', 'aeo', 'aep', 'aeq', 'aer', 'aes', 'aex', 'aey', 'aez', 'afa',
    'afj', 'afl', 'afo', 'afp', 'afq', 'afr', 'afs', 'aft', 'afu', 'afv',
    'afw', 'afx', 'afy', 'afz', 'aga', 'agb', 'agc', 'agd', 'age', 'agf',
    'agg', 'agh', 'agi', 'agj', 'agk', 'agl', 'agm', 'agn', 'ago', 'agp',
    'agq', 'agr', 'ags', 'agt', 'agu', 'agv', 'agw', 'agx', 'agy', 'ahf',
    'ahg', 'ahh', 'ahi', 'ahj', 'ahk', 'ahl', 'ahm', 'ahn', 'aho',
]

# Extended feature set (239 codes): every code of features_1p4, in the same order,
# followed by 120 additional 'z*' codes. Built by concatenation (which creates a new
# list) so the shared 119-code prefix is written only once and cannot drift apart.
features_1p4_extra = features_1p4 + [
    'zhq', 'zhr', 'zhs', 'zht', 'zhu', 'zhv', 'zhw', 'ziy', 'zjb', 'zjc',
    'zjd', 'zje', 'zjf', 'zjg', 'zjh', 'zji', 'zjj', 'zjk', 'zjl', 'zjm',
    'zjn', 'zjo', 'zjp', 'zjq', 'zjr', 'zjs', 'zjt', 'zju', 'zkg', 'zkl',
    'zkm', 'zkn', 'zkx', 'zky', 'zla', 'zld', 'zle', 'zlf', 'zlg', 'zlh',
    'zli', 'zlj', 'zlk', 'zll', 'zlm', 'zln', 'zlo', 'zlp', 'zlq', 'zlr',
    'zls', 'zlt', 'zlu', 'zlv', 'zlw', 'zlx', 'zly', 'zlz', 'zma', 'zmb',
    'zmc', 'zmd', 'zme', 'zmf', 'zmg', 'zmh', 'zmm', 'zmn', 'zmo', 'zmp',
    'zmy', 'zna', 'znd', 'zne', 'znf', 'zng', 'znh', 'zni', 'znj', 'znk',
    'znl', 'znm', 'znn', 'zno', 'znp', 'znq', 'znr', 'zns', 'znt', 'znu',
    'znv', 'znw', 'znx', 'zny', 'znz', 'zoa', 'zob', 'zoc', 'zod', 'zoe',
    'zof', 'zog', 'zoh', 'zoi', 'zoj', 'zok', 'zol', 'zom', 'zon', 'zou',
    'zov', 'zow', 'zox', 'zoy', 'zoz', 'zpa', 'zpb', 'zpc', 'zpd', 'zpe',
]

In [5]:
# Shared experiment settings.
# Cut-off date handed to getTrainTestSetDate when building the train/test sets.
date_test_set = datetime.date(year=2016, month=1, day=1)
# NOTE(review): ratio_threshold and random_state are not referenced by the cells
# visible in this notebook -- confirm whether they are still needed.
ratio_threshold = 0.65
random_state = 0
# Number of cross-validation folds and grid of C values passed to getEnsembleModelPCA;
# np.logspace defaults to 50 points, here spanning 1e-4 .. 1e5.
cv = 5
Cs = np.logspace(start=-4, stop=5)

## Prepare and clean the data with all possible features

In [6]:
# Build the portfolio-level classification helper for all symbols in `stocks`.
# NOTE(review): minutes_forward=30 presumably sets the prediction horizon in
# minutes -- confirm in methodsMLinterns.ClassificationPortfolio.
clf_portfolio_dic = methodsMLinterns.ClassificationPortfolio(stocks=stocks, minutes_forward=30)
clf_portfolio_dic.loadData()
# Clean the loaded data using the full extended feature list.
clf_portfolio_dic.cleanUpData(features_1p4_extra)
# NOTE(review): presumably splits each stock's data into train/test sets at
# date_test_set -- confirm in methodsMLinterns.
clf_portfolio_dic.getTrainTestSetDate(date_test_set)

In [7]:
# Sanity check: number of feature codes that were passed to cleanUpData.
len(features_1p4_extra)

239

## Use PCA
A new class, *LogisticClassifierPCA*, has been added to the *methodsMLinterns* file to enable **Voting Classifier Ensembling**.

In [9]:
# Fit the PCA ensemble using the C grid `Cs` and `cv` folds; the call prints one
# train/test accuracy line per stock and returns the two accuracy collections.
# Original note: "Ensembles PCA 20, 30 and 40 classifiers".
# NOTE(review): the notebook introduction mentions subsets of 10, 20 and 30 instead --
# confirm which dimensions getEnsembleModelPCA actually uses.
acc_train, acc_test = clf_portfolio_dic.getEnsembleModelPCA(Cs, cv, features_1p4_extra)

DNB accuracy train 55.09 accuracy test 53.5
NRG accuracy train 56.51 accuracy test 54.43
CL accuracy train 55.44 accuracy test 51.27
ANTM accuracy train 56.55 accuracy test 48.22
NEE accuracy train 56.8 accuracy test 50.95
done 20%
PAYX accuracy train 54.15 accuracy test 49.84
VAR accuracy train 56.6 accuracy test 54.08
NI accuracy train 56.37 accuracy test 50.51
MNST accuracy train 54.92 accuracy test 51.44
JNJ accuracy train 54.67 accuracy test 49.61
TGNA accuracy train 55.47 accuracy test 48.59
done 40%
NOV accuracy train 56.38 accuracy test 51.54
FIS accuracy train 56.03 accuracy test 51.57
BLK accuracy train 57.84 accuracy test 54.41
HBI accuracy train 57.25 accuracy test 51.55
NVDA accuracy train 56.73 accuracy test 51.53
DLTR accuracy train 54.28 accuracy test 49.88
done 60%
MRO accuracy train 54.43 accuracy test 51.63
EMN accuracy train 54.93 accuracy test 52.25
AMT accuracy train 56.45 accuracy test 53.12
FLR accuracy train 55.73 accuracy test 50.77
IBM accuracy train 56.18 ac

In [10]:
# Results holder for this run, labelled with the method name, the stocks and the
# feature list that was used.
experiment14 = methodsMLinterns.ExperimentPerformance(
    methodName="Ensemble PCA + LogReg - 1.4extra",
    stocks=stocks,
    originalFeatures=features_1p4_extra,
)

In [11]:
# Attach the accuracies returned by getEnsembleModelPCA to the experiment object.
experiment14.setTrainResults(acc_train)
experiment14.setTestResults(acc_test)
# Report the summary score; per the label below this is the mean test accuracy
# minus one standard deviation (NOTE(review): presumably taken across stocks -- confirm).
print("The test accuracy (mean - std) is:")
print(experiment14.getTestAccuracyMinusSigma())

The test accuracy (mean - std) is:
49.9303719933


In [12]:
# Persist the experiment object to disk.
# Create the output directory first: open(..., 'wb') raises if "pickles/" is missing,
# which would otherwise lose the results of the fit on a fresh checkout.
# (isdir + makedirs is used instead of exist_ok=True so this also runs on Python 2.)
if not os.path.isdir("pickles"):
    os.makedirs("pickles")
# NOTE: loading this pickle later requires methodsMLinterns to be importable, and
# pickle files should only be loaded from trusted sources.
with open("pickles/acc_ensemble_pca.p",'wb') as f:
    pickle.dump( experiment14, f, protocol=pickle.HIGHEST_PROTOCOL)