In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from time import time
from word_list.analysis import words
from feature_selection.sfi import get_sfi_scores
from feature_selection.mdi import get_mdi_scores
from data_mani.utils import merge_market_and_gtrends
from data_mani.utils import get_ticker_name

In [3]:
# Variables: notebook-wide configuration constants.
# NOTE(review): OUT_FOLDER is "nyse" but the data path loaded in this notebook
# points at "nasdaq" — confirm which exchange is intended.
N_SPLITS = 5 # number of CV splits
N_CORES = 2 # number of cores to use
MAX_LAG = 20 # maximum number of lags to create
             # for the Google Trends features
OUT_FOLDER = "nyse" # name of the market data folder
DEBUG = True # param to debug the script
TEST_SIZE = 0.5 # pct of the train/test split
THRESHOLD = 252 * 2 # threshold used to filter merged dataframes
                    # 252 = business days in a year

In [9]:
# Load the data for a single ticker (AAPL) through the project helper.
# The helper returns two values; only the first one is kept here.
# NOTE(review): presumably the discarded value is the held-out part of the
# TEST_SIZE split — confirm against data_mani.utils.
path = "data/crsp/nasdaq/AAPL US Equity.csv"
merged, _ = merge_market_and_gtrends(path, test_size=TEST_SIZE)

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
# Create lagged copies (1..MAX_LAG) of every word column.
#
# All shifted columns are collected first and attached with a single concat.
# Inserting thousands of columns one at a time fragments the DataFrame's
# internal blocks, which is slow and triggers pandas' PerformanceWarning.
feature_names = []
max_lag = MAX_LAG
lagged_columns = {}

for word in tqdm(words, desc="add shift"):
    for shift in range(1, max_lag + 1):
        # e.g. "stock market" at lag 3 -> "stock_market_3"
        new_feature = word.replace(" ", "_") + "_{}".format(shift)
        lagged_columns[new_feature] = merged[word].shift(shift)
        feature_names.append(new_feature)

# concat builds a new frame, so `merged` itself is left untouched; passing
# merged.index keeps the row index intact even if `words` is empty.
merged_df = pd.concat(
    [merged, pd.DataFrame(lagged_columns, index=merged.index)],
    axis=1)

add shift: 100%|██████████| 182/182 [00:50<00:00,  3.59it/s]


In [17]:
# (rows, columns) of the frame including the lagged feature columns.
merged_df.shape

(2087, 3823)

In [19]:
# Shape once rows containing NaN are dropped (shifting by up to MAX_LAG leaves
# NaN in the first rows of the lagged columns).
merged_df.dropna().shape

(2067, 3823)

In [20]:
target_name = "target_return"
random_state = 220458

# Keep only complete rows: the lagged columns are NaN at the start of the sample.
df_ = merged_df.dropna()
X = df_[feature_names].values
y = df_[target_name].values

# max_features=1 makes every split consider a single randomly drawn feature.
rf = RandomForestRegressor(n_estimators=100,
                           max_features=1,
                           random_state=random_state)
rf.fit(X, y)

RandomForestRegressor(max_features=1, random_state=220458)

In [32]:
# Rank the lagged features by the fitted forest's feature_importances_.
a = (
    pd.DataFrame({"feature": feature_names,
                  "feature_score": rf.feature_importances_})
    .sort_values("feature_score", ascending=False)
    .reset_index(drop=True)
)
a.head(10)

Unnamed: 0,feature,feature_score
0,federal_18,0.001101
1,stock_market_10,0.001085
2,market_1,0.000996
3,bank_4,0.000975
4,oil_18,0.000889
5,gold_7,0.000852
6,community_5,0.000849
7,money_4,0.000811
8,arts_5,0.000795
9,labor_18,0.000789


In [41]:
# One row per tree of the forest, one column per lagged feature.
fi_estimators = pd.DataFrame.from_dict(
    dict(enumerate(tree.feature_importances_ for tree in rf.estimators_)),
    orient="index",
    columns=feature_names,
)
# Zero importances become NaN so that pandas' mean/std skip them.
fi_estimators = fi_estimators.replace(0, np.nan)

n = fi_estimators.shape[0]  # number of trees
mean = fi_estimators.mean()
std = fi_estimators.std() * np.power(n, -0.5)  # std scaled by 1 / sqrt(n)

imp = pd.concat({"mean": mean, "std": std}, axis=1)
imp /= imp["mean"].sum()  # rescale both columns so the means sum to 1
b = imp.sort_values("mean", ascending=False)
b.head(10)

Unnamed: 0,mean,std
short_selling_16,0.001522,0.000367
short_sell_6,0.001499,
consume_1,0.001211,0.000318
BUY_AND_HOLD_13,0.001206,0.000169
short_selling_4,0.001193,4.3e-05
short_sell_12,0.001041,0.000147
market_1,0.000979,0.000186
short_sell_16,0.000972,0.000181
short_selling_9,0.000917,0.000138
short_sell_15,0.000875,0.000149


In [51]:
# Ten largest per-feature means of fi_estimators (raw values, not rescaled).
fi_estimators.mean().sort_values(ascending=False).head(10)

short_selling_16    0.004992
short_sell_6        0.004915
consume_1           0.003971
BUY_AND_HOLD_13     0.003953
short_selling_4     0.003912
short_sell_12       0.003414
market_1            0.003212
short_sell_16       0.003188
short_selling_9     0.003008
short_sell_15       0.002870
dtype: float64

In [54]:
# Ten smallest per-feature means of fi_estimators; dropna() first removes the
# features whose mean is NaN so they do not fill the tail.
fi_estimators.mean().sort_values(ascending=False).dropna().tail(10)

BUY_AND_HOLD_3      5.241653e-05
BUY_AND_HOLD_1      4.918179e-05
short_selling_12    4.469329e-05
BUY_AND_HOLD_20     3.497783e-05
greed_3             2.935375e-05
short_sell_11       2.541216e-05
BUY_AND_HOLD_2      2.381126e-05
BUY_AND_HOLD_18     2.347404e-05
BUY_AND_HOLD_17     1.611027e-06
rare_earths_15      3.297732e-07
dtype: float64

In [7]:

# Score the features with the project's get_sfi_scores helper, using the
# un-lagged `merged` frame and N_SPLITS cross-validation splits.
# NOTE(review): the helper appears to build the lagged columns itself and to
# score with cross-validated R^2 (its progress bars read "add shift" and
# "cv r2") — confirm in feature_selection.sfi.
result = get_sfi_scores(merged_df=merged,
                        target_name="target_return",
                        words=words,
                        max_lag=MAX_LAG,
                        verbose=True,
                        n_splits=N_SPLITS)

add shift: 100%|██████████| 182/182 [01:23<00:00,  2.18it/s]
cv r2: 100%|██████████| 182/182 [02:57<00:00,  1.03it/s]


In [45]:
# Fixed seed so the get_mdi_scores result is repeatable; the commented line
# below is the unseeded alternative.
random_state = 220458
# random_state = None

# Score the features with the project's get_mdi_scores helper on the
# un-lagged `merged` frame.
result_mdi = get_mdi_scores(merged_df=merged,
                            target_name="target_return",
                            words=words,
                            max_lag=MAX_LAG,
                            verbose=False,
                            random_state=random_state)

# Disabled side-by-side view of the top-5 SFI and MDI features.
# NOTE(review): if re-enabled, pass axis=1 by keyword — pandas >= 2.0 no longer
# accepts the positional `1` in pd.concat.
# df = pd.concat([result.head(5).feature.to_frame().add_prefix("SFI "),
#                 result_mdi.head(5).feature.to_frame().add_prefix("MDI ")],1)
# df

In [40]:
# First ten rows of the get_mdi_scores result (columns: feature, feature_score).
result_mdi.head(10)

Unnamed: 0,feature,feature_score
0,short_selling_16,0.001522
1,short_sell_6,0.001499
2,consume_1,0.001211
3,BUY_AND_HOLD_13,0.001206
4,short_selling_4,0.001193
5,short_sell_12,0.001041
6,market_1,0.000979
7,short_sell_16,0.000972
8,short_selling_9,0.000917
9,short_sell_15,0.000875
