In [1]:
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
from catboost import CatBoostClassifier, Pool
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from statistics import mean
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
modelNames = ['ZScorePredictor','FiveDayPredictor','TmmrwUpPredictor']
verbose = False

In [2]:
def scoreCalculator(predictor, attributes, labels):
    predicted = predictor.predict(attributes)
    errorScore = 0
    for guessIndex in range(len(predicted)):
        guess = predicted[guessIndex]
        actual = labels[guessIndex]
        if guess == 'hold' :
            errorScore += 0
        elif guess == 'long':
            if actual == 'short':
                errorScore += -15
            elif actual == 'hold':
                errorScore += -5
            else :#actual == 'long'
                errorScore += 20
        else: #short
            if actual == 'long':
                errorScore += -20
            elif actual == 'hold':
                errorScore += 0
            else: #actual == short
                errorScore += 20
    if verbose:
        myMatrix = plot_confusion_matrix(predictor, attributes, labels,
                                     cmap=plt.cm.Blues, normalize = 'all')    
        plt.show()
    return  errorScore/len(predicted)

In [3]:
def catBoostMaker(sectorDF,zScoreAnswer,fiveDayChangeAnswer, tmmrwUpAnswer, sector, catLabels, count, toSave):
    catIndicies = []
    counter = -1
    for col in sectorDF.columns: #finding indices for cat variables
        counter += 1
        if col in catLabels:
            catIndicies.append(counter)
    masterList = sectorDF.values
    masterTrainList, masterTestList, zScoreTrainList, zScoreTestList, fiveDayChangeTrainList, fiveDayChangeTestList, tmmrwUpTrainList, tmmrwUpTestList = \
    train_test_split(masterList, zScoreAnswer, fiveDayChangeAnswer, tmmrwUpAnswer, test_size = .25)
    
    answerTrainList = [zScoreTrainList,fiveDayChangeTrainList, tmmrwUpTrainList]
    answerTestList = [zScoreTestList,fiveDayChangeTestList, tmmrwUpTestList]
    trainPools = [Pool(data = masterTrainList, label = x, cat_features = catIndicies) for x in answerTrainList]
    testPools = [Pool(data = masterTestList, label = x, cat_features = catIndicies) for x in answerTestList]
    
    folderRoot = 'catPredictors/'
    accuracyList = []
    scoreList = []
    featureDF = pd.DataFrame(data = 
                         {'featureName': [col for col in sectorDF.columns],
                        'importance': [0] * len([col for col in sectorDF.columns])}).set_index('featureName')
    for name, train, test in zip(modelNames, trainPools, testPools):
        train.set_feature_names([x for x in sectorDF.columns])
        model = CatBoostClassifier(bootstrap_type = 'Bernoulli')
        model.fit(train, eval_set = test, logging_level = 'Silent')
        sector = sector.replace(' ', '_')
        if toSave:
            model.save_model('{}{}-{}-{}.cbm'.format(folderRoot, name, sector, count))
        accuracy =  model.score(test)
        error = scoreCalculator(model, test,test.get_label())
        accuracyList.append(accuracy)
        scoreList.append(error)
        featureList = CatBoostClassifier.get_feature_importance(model, prettified = True)
        featureDFPart = pd.DataFrame(data = 
                     {'featureName': featureList['Feature Id'],
                    'importance': featureList['Importances']}).set_index('featureName')
        featureDF = featureDF + featureDFPart
        if verbose:
            print(name)
            print('Accuracy: ', accuracy)
            print('Error Score:', error)
        
    averageAccuracy = sum([x for x in accuracyList])/len(accuracyList)
    averageScore = sum([x for x in scoreList])/len(scoreList)
    featureList = CatBoostClassifier.get_feature_importance(model, prettified = True)
    return averageAccuracy, averageScore, featureDF

In [4]:
#returns feature to remove, mean error score, # of significant (non answer or date) columns
def buildForest(droppedColumns, toSave):
    basePath = 'data/smallFiltered-{}.csv'
    secretColumns = ['zScoreOfChangeTmmrw','percentChangeInFiveDays', 'sector', 'tmmrwChngAsPerc']
    masterScore = []
    masterAccuracy = []
    masterFeatureDF = None
    colCount = 0
    for x in range(10):###############################################################change to 10
        combinedDF = pd.read_csv(basePath.format(x), parse_dates = True)
        combinedDF.drop(columns = ['Date'], inplace = True)
        if len(droppedColumns) != 0:
            combinedDF.drop(columns = droppedColumns, inplace = True)
        catLabels = ['ticker','industry']
        if x == 0 and verbose:
            for col in combinedDF.columns:
                print(col)
        colCount = len(combinedDF.columns) - len(secretColumns)
        
        if verbose:
            print('-------------------------------------------\n\n')
        sectorList = combinedDF['sector'].copy().unique()
        catAccuracy = []
        catError = []
        for sector in sectorList:
            if verbose:
                print('Sector is: ', sector)
            sectorDF = combinedDF[combinedDF['sector'] == sector].copy()
            zScoreAnswer = sectorDF['zScoreOfChangeTmmrw']
            zScoreAnswer = zScoreAnswer.astype('float')
            zScoreAnswer = ['long' if x > .1 else 'short' if x < -.1 else 'hold' for x in zScoreAnswer ]
            fiveDayChangeAnswer = sectorDF['percentChangeInFiveDays']
            fiveDayChangeAnswer = ['long' if x > .005 else 'short' if x < -.005 else 'hold' for x in fiveDayChangeAnswer]
            tmmrwUpAnswer = sectorDF['tmmrwChngAsPerc']
            tmmrwUpAnswer = ['long' if x > 0 else 'short' for x in tmmrwUpAnswer]

            sectorDF.drop(columns = secretColumns, inplace = True)
            accuracy, error, featureDF = \
            catBoostMaker(sectorDF,zScoreAnswer,fiveDayChangeAnswer, tmmrwUpAnswer, sector, catLabels, x, toSave)
            catAccuracy.append(accuracy)
            catError.append(error)
            if x == 0:
                masterFeatureDF = featureDF
            else:
                masterFeatureDF = masterFeatureDF + featureDF
            if verbose:
                print('-------------------------------------------\n\n')
            break
        if verbose:
            print('Catboost accuracy: ', mean(catAccuracy))
            print('Catboost score: ' ,mean(catError))
        masterScore.append(mean(catError))
        masterAccuracy.append(mean(catAccuracy))
        
    print('-------------------------------------------\n\n')
    print('Final Catboost accuracy: ', mean(catAccuracy))
    print('Final Catboost score: ' ,mean(catError))
    print('Final Feature Importance:', masterFeatureDF)
    featureToRemove = masterFeatureDF['importance'].idxmin()
    print(featureToRemove)
    return featureToRemove, mean(catError), colCount, masterFeatureDF

In [5]:
pd.set_option('display.max_rows', None)
basePath = 'data/smallFiltered-{}.csv'
colCount = 10000
secretColumns = ['zScoreOfChangeTmmrw','percentChangeInFiveDays', 'sector', 'tmmrwChngAsPerc']
countSinceLastIncreaseInScore = 0
droppedColumns = ['DPRIMElastChangeP']
lastScore = 0
lastAccuracy = 0
toContinue = True
newToRemove = None
first = True
oldScore = None
while (toContinue):
    toRemove, newScore, colCount, _ = buildForest(droppedColumns, False)
    if not first:
        print('change in score by removing {} is {}'.format(toRemove, newScore - oldScore))
        if newScore <= oldScore :
            droppedColumns.pop()
            toContinue = False
        else:
            droppedColumns.append(toRemove)
            print(droppedColumns)
    else : #if first
        first = False
        droppedColumns.append(toRemove)
        print(droppedColumns)
    oldScore = newScore
        

buildForest(droppedColumns, True)

-------------------------------------------


Final Catboost accuracy:  0.6009341535657325
Final Catboost score:  6.0740867319814695
Final Feature Importance:                                        importance
featureName                                      
Asia (ex China)                          3.880627
BOPGSTBValue                             7.925693
BOPGSTBbPercent                         10.341071
BOPGSTBfiveVsTenTickAverage              7.846684
BOPGSTBlastChangeP                      14.876947
BUSLOANSValue                            1.437795
BUSLOANSbPercent                         5.639337
BUSLOANSfiveVsTenTickAverage             6.656198
BUSLOANSlastChangeP                      7.654412
Bull-Bear Spread                        21.635949
CASS-CTLI-Index                          3.038574
CCLACBW027SBOGValue                      4.021920
CCLACBW027SBOGbPercent                  17.992506
CCLACBW027SBOGfiveVsTenTickAverage      22.459908
CCLACBW027SBOGlastChangeP               2

-------------------------------------------


Final Catboost accuracy:  0.6073137388926861
Final Catboost score:  6.304587225639857
Final Feature Importance:                                        importance
featureName                                      
Asia (ex China)                          4.736303
BOPGSTBValue                             7.896309
BOPGSTBbPercent                          8.887164
BOPGSTBfiveVsTenTickAverage              7.586103
BOPGSTBlastChangeP                      13.018169
BUSLOANSValue                            1.546891
BUSLOANSbPercent                         5.864158
BUSLOANSfiveVsTenTickAverage             7.091719
BUSLOANSlastChangeP                      7.559929
Bull-Bear Spread                        21.405967
CASS-CTLI-Index                          3.299621
CCLACBW027SBOGValue                      3.118904
CCLACBW027SBOGbPercent                  17.412457
CCLACBW027SBOGfiveVsTenTickAverage      20.078106
CCLACBW027SBOGlastChangeP               24

('PCEC96Value',
 6.304587225639857,
 157,
                                        importance
 featureName                                      
 Asia (ex China)                          4.736303
 BOPGSTBValue                             7.896309
 BOPGSTBbPercent                          8.887164
 BOPGSTBfiveVsTenTickAverage              7.586103
 BOPGSTBlastChangeP                      13.018169
 BUSLOANSValue                            1.546891
 BUSLOANSbPercent                         5.864158
 BUSLOANSfiveVsTenTickAverage             7.091719
 BUSLOANSlastChangeP                      7.559929
 Bull-Bear Spread                        21.405967
 CASS-CTLI-Index                          3.299621
 CCLACBW027SBOGValue                      3.118904
 CCLACBW027SBOGbPercent                  17.412457
 CCLACBW027SBOGfiveVsTenTickAverage      20.078106
 CCLACBW027SBOGlastChangeP               24.313512
 CONSUMERValue                            1.759712
 CONSUMERbPercent                       

In [6]:
folderRoot = 'catPredictors/'
for name in modelNames:
    for sector in ['Real_Estate', 'Basic_Materials', 'Healthcare', 'Consumer_Cyclical', 'Financial_Services',
            'Industrials', 'Technology', 'Communication_Services', 'Energy', 'Consumer_Defensive', 'Utilities', 'Financial']:
        models = [CatBoostClassifier().load_model('{}{}{}-{}.cbm'.format(folderRoot, name, sector, count)) for count in range(10)]
        vclf = VotingClassifier(estimators=[('catboost{}'.format(count), models[count]) for count in range(10)], voting='hard')
        pickle.dump(vclf, open('{}bagPredictor-{}-{}.pkl'.format(folderRoot, name, sector), 'wb'))

CatBoostError: c:/program files (x86)/go agent/pipelines/buildmaster/catboost.git/catboost/libs/model/model_import_interface.h:19: Model file doesn't exist: catPredictors/ZScorePredictorReal_Estate-5.cbm