In [1]:
import os
import numpy as np
import pandas as pd 
from tqdm import tqdm
from glob import glob
from time import time
from word_list.analysis import words
from data_mani.utils import merge_market_and_gtrends
from data_mani.utils import get_ticker_name
from data_mani.utils import target_ret_to_directional_movements
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from feature_selection.mda import mean_decrease_accuracy

In [2]:
# Variables
# ---------
# Number of CV splits.
N_SPLITS = 5
# Number of cores to use.
N_CORES = 2
# Maximum number of lags to create for the google trends features.
MAX_LAG = 20
# Name of the market data folder.
OUT_FOLDER = "nyse"
# Param to debug the script.
DEBUG = True
# Pct of the train/test split.
TEST_SIZE = 0.5
# Threshold to filter merged dataframes (252 = business days in a year).
THRESHOLD = 252 * 2

In [3]:
# Load one ticker's market data merged with the google trends series.
# NOTE(review): the path points at the "nasdaq" folder while OUT_FOLDER is
# "nyse" and is not used here -- confirm this is intentional.
path = "data/crsp/nasdaq/AAPL US Equity.csv"
# Only the first element of the returned pair is kept; the second is discarded
# (presumably the test split implied by test_size -- TODO confirm).
merged, _ = merge_market_and_gtrends(path, test_size=TEST_SIZE)
# The return value is ignored, so this presumably rewrites the "target_return"
# column of `merged` in place -- verify against data_mani.utils.
target_ret_to_directional_movements(merged, y_name="target_return")
# Preview the first two rows.
merged.head(2)

Unnamed: 0_level_0,target_return,BUY AND HOLD,DOW JONES,act,arts,bank,banking,blacklist,bonds,bubble,...,virginia,voters,votes,war,washington,water,william,wisconsin,world,york
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-02,0,0.0,1.0,3.0,0.0,38.0,-1.0,1.0,-4.0,-2.0,...,8.0,0.0,0.0,5.0,20.0,2.0,4.0,0.0,14.0,11.0
2004-01-05,1,-1.0,0.0,3.0,3.0,-7.0,-3.0,0.0,-1.0,-3.0,...,1.0,1.0,1.0,4.0,-1.0,3.0,5.0,1.0,4.0,-15.0


In [4]:
# Build lagged copies (shift 1..max_lag) of every word column.
#
# Outputs of this cell:
#   feature_names -- names of the lagged columns, in creation order
#   merged_df     -- `merged` plus one column per (word, lag) pair
#
# The lagged columns are collected in a dict and attached with a single
# concat. Inserting thousands of columns one at a time fragments the frame's
# internal blocks and makes every further insertion slower.
feature_names = []
max_lag = MAX_LAG

lagged_columns = {}
for word in tqdm(words, desc="add shift"):
    # Column names use underscores instead of spaces, e.g. "DOW JONES" -> "DOW_JONES_7".
    prefix = word.replace(" ", "_")
    source = merged[word]
    for shift in range(1, max_lag + 1):
        new_feature = "{}_{}".format(prefix, shift)
        # shift(k) moves values k rows down, so the first k rows become NaN.
        lagged_columns[new_feature] = source.shift(shift)
        feature_names.append(new_feature)

# concat returns a new frame, so `merged` itself is left untouched.
merged_df = pd.concat(
    [merged, pd.DataFrame(lagged_columns, index=merged.index)],
    axis=1,
)

add shift: 100%|██████████| 182/182 [01:04<00:00,  2.81it/s]


In [5]:
# Features scored in this run.
fn = ["bank_16", "banking_9", "BUY_AND_HOLD_1", "bank_17", "DOW_JONES_7"]

# Keep the target plus every lagged feature, then drop rows containing NaN
# (the shifts leave NaN in the leading rows).
df = merged_df.loc[:, ["target_return", *feature_names]].dropna()

# Score the selected features against "target_return".
imp = mean_decrease_accuracy(
    df=df,
    feature_names=fn,
    target_name="target_return",
    random_state=233,
    n_splits=3,
)
imp

Unnamed: 0,feature,feature_score
0,banking_9,2.504761
1,bank_17,2.143867
2,DOW_JONES_7,1.453184
3,bank_16,0.532886
4,BUY_AND_HOLD_1,0.0


In [6]:
# Same call as the previous cell, with identical arguments and random_state;
# the result overwrites `imp`.
# NOTE(review): presumably a reproducibility check -- confirm, or drop this cell.
imp = mean_decrease_accuracy(
    df=df,
    feature_names=fn,
    target_name="target_return",
    random_state=233,
    n_splits=3,
)
imp

Unnamed: 0,feature,feature_score
0,banking_9,2.504761
1,bank_17,2.143867
2,DOW_JONES_7,1.453184
3,bank_16,0.532886
4,BUY_AND_HOLD_1,0.0
