# Single Feature Importance Test

In [7]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from time import time
from word_list.analysis import words
from feature_selection.sfi import get_sfi_scores
import statsmodels.formula.api as smf
from sklearn.model_selection import TimeSeriesSplit

In [8]:
def merge_market_gtrends(market, gtrends):
    """
    Join trends columns onto market rows with a backward as-of merge.

    Both frames are joined on their indexes. With ``pd.merge_asof``'s default
    direction, every market row receives the latest ``gtrends`` row whose
    index is less than or equal to the market row's index. Rows that end up
    with any missing value (for instance market rows that precede the first
    trends observation) are dropped.

    :param market: frame indexed by an orderable key (dates in this notebook)
    :type market: pd.DataFrame
    :param gtrends: frame indexed by the same kind of key
    :type gtrends: pd.DataFrame
    :return: merged frame, ordered by the market index, without NaN rows
    :rtype: pd.DataFrame
    """
    # merge_asof raises ValueError when either key is not sorted, so sort
    # both sides first; sort_index returns new frames, the inputs are untouched.
    market_sorted = market.sort_index()
    gtrends_sorted = gtrends.sort_index()
    merged = pd.merge_asof(market_sorted,
                           gtrends_sorted,
                           left_index=True,
                           right_index=True)
    return merged.dropna()

In [9]:
# ## Loading trends
# The parsed dates are assigned with plain `[]` instead of `.loc[:, col] = ...`:
# on pandas >= 2.0 the `.loc` form writes into the existing column in place, so
# an object column would stay `object` rather than becoming datetime64.
gtrends = pd.read_csv("data/gtrends.csv")
gtrends["date"] = pd.to_datetime(gtrends["date"])
gtrends = gtrends.set_index("date")

# ## Loading and preprocessing market data

path = "data/crsp/nyse/CYN US Equity.csv"
# File name without directory and extension, e.g. "CYN US Equity"
name = path.split("/")[-1].split(".")[0]
target_name = name.replace(" ", "_") + "_return"
market = pd.read_csv(path)
# Drop the rows labelled 0 and 1 (the first two rows read from the file).
# NOTE(review): presumably these are extra header rows of the export - confirm.
# `index=` is used because the positional `axis` argument was removed in pandas 2.0.
market = market.drop(index=[0, 1])
market = market.rename(columns={"ticker": "date",
                                name: target_name})
market["date"] = pd.to_datetime(market["date"])
# NOTE(review): the division by 100 presumably converts percent to a fraction - confirm.
market[target_name] = market[target_name].astype("float") / 100
market = market.set_index("date")

# using only the training sample
market = market.loc["2000":"2010"]

merged = merge_market_gtrends(market, gtrends)

In [10]:
# Wall-clock timing of one full SFI run for a single ticker.
init = time()

# Arguments are collected first so the call itself stays on one line;
# a copy of `merged` is passed, as before.
sfi_kwargs = dict(merged_df=merged.copy(),
                  target_name=target_name,
                  words=words,
                  max_lag=30,
                  verbose=True,
                  n_splits=2)
result = get_sfi_scores(**sfi_kwargs)

# Elapsed seconds converted to minutes; `tot_time` is reused by the estimate cell.
tot_time = (time() - init) / 60
print("total time = {:.3f} (minutes)".format(tot_time))

add shift: 100%|██████████| 182/182 [01:29<00:00,  2.03it/s]
cv r2: 100%|██████████| 182/182 [01:36<00:00,  1.89it/s]

total time = 3.100 (minutes)





### Results Example

In [11]:
# Preview the first ten rows of the table returned by get_sfi_scores.
result.head(10)

Unnamed: 0,feature,mean_r2
0,DOW_JONES_2,0.001852
1,cash_5,0.001845
2,forex_26,0.001737
3,seats_4,0.001706
4,election_12,0.001498
5,dividend_4,0.001485
6,revenue_2,0.001388
7,nyse_4,0.00127
8,minister_21,0.001207
9,nasdaq_4,0.001153


In [17]:
# NOTE(review): looks like a leftover scratch check unrelated to the SFI analysis
# (its execution count is out of order) - candidate for removal before sharing.
type(True)

bool

### Time

In [15]:
# Extrapolate the single-ticker runtime (`tot_time`, in minutes) to every ticker.
# NOTE(review): 2163 and 10185 are presumably per-exchange ticker counts - confirm.
n_tickers = 2163 + 10185
total_minutes = tot_time * n_tickers
final_time = total_minutes / 60  # minutes -> hours

print("Estimated time to analize all ticker = {:.3f} (hours)".format(final_time))

Estimated time to analize all ticker = 637.912 (hours)
