In [1]:
import os
import numpy as np
import pandas as pd 
from tqdm import tqdm
from glob import glob
from time import time
from word_list.analysis import words
from data_mani.utils import merge_market_and_gtrends
from data_mani.utils import get_ticker_name
from data_mani.utils import target_ret_to_directional_movements
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Variables
N_SPLITS = 5 # number of CV splits
N_CORES = 2 # number of cores to use
MAX_LAG = 20 # maximum number of lags to create
             # google trends features
OUT_FOLDER = "nyse" # name of the marked data folder
DEBUG = True # param to debug the script
TEST_SIZE = 0.5 # pct of the train/test split
THRESHOLD = 252 * 2 # treshold to filted merged datframes
                    # 252 = business days in a year

In [3]:
path = "data/crsp/nasdaq/AAPL US Equity.csv"
merged, _ = merge_market_and_gtrends(path, test_size=TEST_SIZE)
target_ret_to_directional_movements(merged, y_name="target_return")
merged.head(2)

Unnamed: 0_level_0,target_return,BUY AND HOLD,DOW JONES,act,arts,bank,banking,blacklist,bonds,bubble,...,virginia,voters,votes,war,washington,water,william,wisconsin,world,york
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-02,0,0.0,1.0,3.0,0.0,38.0,-1.0,1.0,-4.0,-2.0,...,8.0,0.0,0.0,5.0,20.0,2.0,4.0,0.0,14.0,11.0
2004-01-05,1,-1.0,0.0,3.0,3.0,-7.0,-3.0,0.0,-1.0,-3.0,...,1.0,1.0,1.0,4.0,-1.0,3.0,5.0,1.0,4.0,-15.0


In [4]:
feature_names = []
max_lag = MAX_LAG
merged_df = merged.copy()

for word in tqdm(words, desc="add shift"):
    for shift in range(1, max_lag + 1):
        new_feature = word.replace(" ", "_") + "_{}".format(shift)
        merged_df.loc[:, new_feature] = merged_df[word].shift(shift)
        feature_names.append(new_feature)

add shift: 100%|██████████| 182/182 [00:48<00:00,  3.74it/s]


In [5]:
init = time()

# fn = ["bank_16", "banking_9", "BUY_AND_HOLD_1", "bank_17", "DOW_JONES_7"]
fn = feature_names[:]
scores = {f:[] for f in fn}
# scores = {f:[] for f in feature_names}


df = merged_df[["target_return"] + feature_names].dropna()
n_splits = 2
tscv = TimeSeriesSplit(n_splits=n_splits)
for train_index, test_index in tscv.split(df):
    df_train = df.iloc[train_index]
    df_test = df.iloc[test_index]
    X_train, y_train = df_train[feature_names].values, df_train["target_return"].values
    X_test, y_test = df_test[feature_names].values, df_test["target_return"].values 
    model = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
    pred = model.predict(X_test)
    acc0 = accuracy_score(y_test, pred)
    for feature in tqdm(fn, desc="features"):
        new_test = df_test[feature_names].copy()
        np.random.shuffle(new_test.loc[:, feature])
        new_X_test = new_test.values
        new_pred = model.predict(new_X_test)
        acc = accuracy_score(y_test, new_pred)
        acc_diff =  acc0 - acc
        imp = 1/(1.0 - acc_diff)
        scores[feature].append(imp)
        del new_test
    
result = pd.DataFrame(scores).transpose().mean(1).reset_index()
result.columns = ["feature", "feature_score"]
result = result.sort_values("feature_score", ascending=False).reset_index(drop=True)
total = np.round(time() - init,2)


features: 100%|██████████| 3640/3640 [05:54<00:00, 10.28it/s]
features: 100%|██████████| 3640/3640 [06:33<00:00,  9.26it/s]


In [7]:
print("{} seconds".format(total))

result.head(20)

747.96 seconds


Unnamed: 0,feature,feature_score
0,party_12,1.007364
1,texas_11,1.007364
2,union_9,1.006618
3,pennsylvania_13,1.005874
4,act_13,1.005132
5,firm_17,1.005132
6,police_18,1.005132
7,district_18,1.004392
8,police_20,1.004382
9,money_1,1.003655
