In [1]:
import pickle
from collections import Counter
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split

def scoreCalculator(predictor, attributes, labels):
    """Score long/short/hold predictions with a trading-style payoff and plot the confusion matrix.

    Points awarded per prediction (guess -> actual):
        hold  -> anything: 0
        long  -> long: +20, hold: -5, short: -15
        short -> short: +20, hold: 0, long: -20
    Despite the local name ``errorScore``, a higher value is better.

    Args:
        predictor: classifier exposing ``predict``; it is also handed to ``plot_confusion_matrix``.
        attributes: feature data accepted by ``predictor.predict``.
        labels: true labels, in the same order as the rows of ``attributes``.

    Returns:
        The average payoff per prediction, or 0.0 when there are no predictions
        (in that case nothing is plotted).
    """
    # Points for the non-winning outcomes, keyed by guess and then by actual label.
    # Any actual label not listed for a guess is scored as a correct call (+20).
    payoff = {
        'long': {'short': -15, 'hold': -5},
        'short': {'long': -20, 'hold': 0},
    }
    # Flatten so predictions returned as an (n, 1) column are handled as scalars.
    predicted = np.ravel(predictor.predict(attributes))
    if len(predicted) == 0:
        # Nothing to score or plot; also avoids dividing by zero below.
        return 0.0
    errorScore = 0
    # Pair positionally so labels carrying a non-default index (e.g. a pandas Series) still line up.
    for guess, actual in zip(predicted, labels):
        if guess == 'hold':
            continue  # holding never gains or loses points
        # Any guess other than 'hold' or 'long' is scored as a short.
        side = 'long' if guess == 'long' else 'short'
        errorScore += payoff[side].get(actual, 20)
    plot_confusion_matrix(predictor, attributes, labels,
                          cmap=plt.cm.Blues, normalize = 'all')
    plt.show()
    return  errorScore/len(predicted)

In [2]:
def catBoostMaker(sectorDF,zScoreAnswer,fiveDayChangeAnswer, tmmrwUpAnswer, sector, catLabels, count,
                  testSize = .25, randomState = None):
    """Train, save and evaluate one CatBoost classifier per answer list for a single sector.

    Args:
        sectorDF: DataFrame holding only the feature columns for the sector.
        zScoreAnswer: labels for the 'ZScorePredictor' model, one per row of sectorDF.
        fiveDayChangeAnswer: labels for the 'FiveDayPredictor' model, one per row of sectorDF.
        tmmrwUpAnswer: labels for the 'TmmrwUpPredictor' model, one per row of sectorDF.
        sector: sector name; spaces are replaced by underscores in the saved file names.
        catLabels: names of the columns CatBoost should treat as categorical.
        count: number embedded in the saved model file names.
        testSize: share of rows held out for evaluation (default .25).
        randomState: optional seed forwarded to train_test_split for a reproducible split.

    Returns:
        (averageAccuracy, averageScore, featureDF): the mean accuracy and the mean
        scoreCalculator result over the three models, and a DataFrame indexed by
        'featureName' with a single 'importance' column.

    Side effects:
        Saves each model as 'catPredictors/<model>-<sector>-<count>.cbm', prints the
        metrics and calls scoreCalculator (which plots a confusion matrix) per model.
    """
    featureNames = list(sectorDF.columns)
    # Pool(cat_features=...) is given the positional indices of the categorical columns.
    catIndicies = [index for index, col in enumerate(featureNames) if col in catLabels]
    masterList = sectorDF.values
    # All three answer lists go through the same call so every target is split
    # with the same row shuffle as the features (eight outputs for four inputs).
    (masterTrainList, masterTestList,
     zScoreTrainList, zScoreTestList,
     fiveDayChangeTrainList, fiveDayChangeTestList,
     tmmrwUpTrainList, tmmrwUpTestList) = train_test_split(
        masterList, zScoreAnswer, fiveDayChangeAnswer, tmmrwUpAnswer,
        test_size = testSize, random_state = randomState)

    answerTrainList = [zScoreTrainList,fiveDayChangeTrainList, tmmrwUpTrainList]
    answerTestList = [zScoreTestList,fiveDayChangeTestList, tmmrwUpTestList]
    trainPools = [Pool(data = masterTrainList, label = x, cat_features = catIndicies) for x in answerTrainList]
    testPools = [Pool(data = masterTestList, label = x, cat_features = catIndicies) for x in answerTestList]
    modelNames = ['ZScorePredictor','FiveDayPredictor','TmmrwUpPredictor']
    folderRoot = 'catPredictors/'
    accuracyList = []
    scoreList = []
    # File names use underscores instead of spaces; computed once, it does not change per model.
    sector = sector.replace(' ', '_')
    for name, train, test in zip(modelNames, trainPools, testPools):
        train.set_feature_names(featureNames)
        print(name)
        model = CatBoostClassifier(bootstrap_type = 'Bernoulli')
        model.fit(train, eval_set = test, logging_level = 'Silent')
        model.save_model('{}{}-{}-{}.cbm'.format(folderRoot, name, sector, count))
        accuracy =  model.score(test)
        print('Accuracy: ', accuracy)
        error = scoreCalculator(model, test,test.get_label())
        print('Error Score:', error)
        accuracyList.append(accuracy)
        scoreList.append(error)

    averageAccuracy = sum(accuracyList)/len(accuracyList)
    averageScore = sum(scoreList)/len(scoreList)
    # NOTE(review): only the last model trained above ('TmmrwUpPredictor') supplies the
    # importances; confirm whether they should be averaged over all three models.
    # The non-prettified call yields one value per feature, in feature order, so it is
    # paired with the column names directly and no feature is dropped.
    featureDF = pd.DataFrame(data =
                             {'featureName': featureNames,
                              'importance': model.get_feature_importance()}).set_index('featureName')
    return averageAccuracy, averageScore, featureDF

In [None]:
def buildForest():
    """Train the per-sector CatBoost models on all ten data files and report the weakest feature.

    Reads the module-level names ``basePath`` (file name pattern), ``droppedColumns``
    (columns to discard before training) and ``secretColumns`` (answer/sector columns
    that must not be used as features). For every file and every sector it builds the
    three label lists, calls catBoostMaker and accumulates the returned feature
    importances. Prints per-file and overall accuracy/score, the accumulated importance
    table and the name of the feature with the lowest total importance.

    Returns:
        None; all results are printed.
    """
    masterScore = []
    masterAccuracy = []
    masterFeatureDF = None
    catLabels = ['ticker','industry']
    for x in range(10):
        combinedDF = pd.read_csv(basePath.format(x), parse_dates = True)
        combinedDF = combinedDF.drop(columns = ['Date'])
        if len(droppedColumns) != 0:
            combinedDF = combinedDF.drop(columns = droppedColumns)
        if masterFeatureDF is None:
            # Start every real feature at zero importance, indexed by name so that the
            # per-sector frames returned by catBoostMaker line up when they are added.
            # The secret columns are excluded because they are never passed to the models.
            featureColumns = [col for col in combinedDF.columns if col not in secretColumns]
            masterFeatureDF = pd.DataFrame(data = {'importance': 0.0},
                                           index = pd.Index(featureColumns, name = 'featureName'))
        if x == 0:
            for col in combinedDF.columns:
                print(col)
        print('-------------------------------------------\n\n')
        sectorList = combinedDF['sector'].unique()
        catAccuracy = []
        catError = []
        for sector in sectorList:
            print('Sector is: ', sector)
            sectorDF = combinedDF[combinedDF['sector'] == sector].copy()
            # Turn the three numeric targets into class labels.
            zScoreAnswer = ['long' if z > .5 else 'short' if z < -.5 else 'hold'
                            for z in sectorDF['zScoreOfChangeTmmrw'].astype('float')]
            fiveDayChangeAnswer = ['long' if change > .015 else 'short' if change < -.015 else 'hold'
                                   for change in sectorDF['percentChangeInFiveDays']]
            tmmrwUpAnswer = ['long' if change > 0 else 'short' for change in sectorDF['tmmrwChngAsPerc']]

            # The answers are extracted, so remove them (and 'sector') from the features.
            sectorDF = sectorDF.drop(columns = secretColumns)
            accuracy, error, featureDF = \
            catBoostMaker(sectorDF,zScoreAnswer,fiveDayChangeAnswer, tmmrwUpAnswer, sector, catLabels, x)
            catAccuracy.append(accuracy)
            catError.append(error)
            # fill_value keeps a feature's running total when one side lacks that row.
            masterFeatureDF = masterFeatureDF.add(featureDF, fill_value = 0)
            print('-------------------------------------------\n\n')
        print('Catboost accuracy: ', mean(catAccuracy))
        print('Catboost score: ' ,mean(catError))
        masterScore.append(mean(catError))
        masterAccuracy.append(mean(catAccuracy))

    print('-------------------------------------------\n\n')
    # The final figures average over all files, not just the last one processed.
    print('Final Catboost accuracy: ', mean(masterAccuracy))
    print('Final Catboost score: ' ,mean(masterScore))
    print('Final Feature Importance:', masterFeatureDF)

    # The index holds the feature names, so idxmin gives the least important feature directly.
    featureToRemove = masterFeatureDF['importance'].idxmin()
    print(featureToRemove)
In [3]:
# Configuration read by the training loop below (and by buildForest).
pd.set_option('display.max_rows', None)
basePath = 'data/smallFiltered-{}.csv'
colCount = 10000  # placeholder; replaced by the real feature count on the first file
secretColumns = ['zScoreOfChangeTmmrw','percentChangeInFiveDays', 'sector', 'tmmrwChngAsPerc']
countSinceLastIncreaseInScore = 0
droppedColumns = []
lastScore = 0
lastAccuracy = 0
# NOTE(review): this loop repeats the body of buildForest and stops after one pass
# (see the break at the end); nothing updates droppedColumns, lastScore, lastAccuracy
# or countSinceLastIncreaseInScore yet.
while (countSinceLastIncreaseInScore < colCount):
    masterScore = []
    masterAccuracy = []
    masterFeatureDF = None
    firstPass = True
    catLabels = ['ticker','industry']
    for x in range(10):
        combinedDF = pd.read_csv(basePath.format(x), parse_dates = True)
        combinedDF = combinedDF.drop(columns = ['Date'])
        if len(droppedColumns) != 0:
            combinedDF = combinedDF.drop(columns = droppedColumns)
        if firstPass:
            # Real features only: the secret columns are never passed to the models.
            featureColumns = [col for col in combinedDF.columns if col not in secretColumns]
            colCount = len(featureColumns)
            # Zero importance per feature, indexed by name so that the per-sector
            # frames returned by catBoostMaker line up when they are added.
            masterFeatureDF = pd.DataFrame(data = {'importance': 0.0},
                                           index = pd.Index(featureColumns, name = 'featureName'))
            firstPass = False
        if x == 0:
            for col in combinedDF.columns:
                print(col)
        print('-------------------------------------------\n\n')
        sectorList = combinedDF['sector'].unique()
        catAccuracy = []
        catError = []
        for sector in sectorList:
            print('Sector is: ', sector)
            sectorDF = combinedDF[combinedDF['sector'] == sector].copy()
            # Turn the three numeric targets into class labels.
            zScoreAnswer = ['long' if z > .5 else 'short' if z < -.5 else 'hold'
                            for z in sectorDF['zScoreOfChangeTmmrw'].astype('float')]
            fiveDayChangeAnswer = ['long' if change > .015 else 'short' if change < -.015 else 'hold'
                                   for change in sectorDF['percentChangeInFiveDays']]
            tmmrwUpAnswer = ['long' if change > 0 else 'short' for change in sectorDF['tmmrwChngAsPerc']]

            # The answers are extracted, so remove them (and 'sector') from the features.
            sectorDF = sectorDF.drop(columns = secretColumns)
            accuracy, error, featureDF = \
            catBoostMaker(sectorDF,zScoreAnswer,fiveDayChangeAnswer, tmmrwUpAnswer, sector, catLabels, x)
            catAccuracy.append(accuracy)
            catError.append(error)
            # fill_value keeps a feature's running total when one side lacks that row.
            masterFeatureDF = masterFeatureDF.add(featureDF, fill_value = 0)
            print('-------------------------------------------\n\n')
        print('Catboost accuracy: ', mean(catAccuracy))
        print('Catboost score: ' ,mean(catError))
        masterScore.append(mean(catError))
        masterAccuracy.append(mean(catAccuracy))

    print('-------------------------------------------\n\n')
    # The final figures average over all files, not just the last one processed.
    print('Final Catboost accuracy: ', mean(masterAccuracy))
    print('Final Catboost score: ' ,mean(masterScore))
    print('Final Feature Importance:', masterFeatureDF)

    # The index holds the feature names, so idxmin gives the least important feature directly.
    featureToRemove = masterFeatureDF['importance'].idxmin()
    print(featureToRemove)
    ###

    break

SyntaxError: invalid syntax (<ipython-input-3-d9c8a7873f3e>, line 66)

In [None]:
# Bundle the first five saved CatBoost models of every (predictor, sector) pair into a
# hard-voting ensemble and pickle it next to the models.
folderRoot = 'catPredictors/'
for name in ['ZScorePredictor','FiveDayPredictor','TmmrwUpPredictor']:
    for sector in ['Real_Estate', 'Basic_Materials', 'Healthcare', 'Consumer_Cyclical', 'Financial_Services',
            'Industrials', 'Technology', 'Communication_Services', 'Energy', 'Consumer_Defensive', 'Utilities', 'Financial']:
        # The pattern has to match the names written by catBoostMaker: '<name>-<sector>-<count>.cbm'.
        models = [CatBoostClassifier().load_model('{}{}-{}-{}.cbm'.format(folderRoot, name, sector, count)) for count in range(5)]
        # NOTE(review): the VotingClassifier is pickled without being fitted; confirm that the
        # consumer of the pickle does not rely on calling predict on it directly.
        vclf = VotingClassifier(estimators=[('catboost{}'.format(count), model) for count, model in enumerate(models)], voting='hard')
        # The context manager closes the file even if pickling fails.
        with open('{}bagPredictor-{}-{}.pkl'.format(folderRoot, name, sector), 'wb') as bagFile:
            pickle.dump(vclf, bagFile)