# Single Feature Importance Test

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from time import time
from feature_selection.ml import single_feature_importance_cv
from word_list.analysis import words
from sfi import get_sfi_scores
import statsmodels.formula.api as smf
from sklearn.model_selection import TimeSeriesSplit

In [2]:
def merge_market_gtrends(market, gtrends):
    """Attach Google Trends columns to the market rows and drop incomplete rows.

    Both frames are joined on their indexes with ``pd.merge_asof`` using its
    default settings, then every row that still contains a missing value is
    removed.

    Args:
        market: left-hand frame; its index is the join key.
        gtrends: right-hand frame; its index is the join key.

    Returns:
        The merged frame with all rows containing NaN removed.
    """
    return pd.merge_asof(
        market,
        gtrends,
        left_index=True,
        right_index=True,
    ).dropna()

In [3]:
# ## Loading trends
gtrends = pd.read_csv("data/gtrends.csv")
# Plain column assignment rather than ``.loc[:, "date"] = ...``: under
# pandas >= 2.0 the ``.loc`` form writes into the existing object-dtype column
# in place, so the column would not take on the datetime64 dtype.
gtrends["date"] = pd.to_datetime(gtrends["date"])
gtrends = gtrends.set_index("date")

# ## Loading and preprocessing market data

path = "data/crsp/nyse/CYN US Equity.csv"
# File name without directory or extension, e.g. "CYN US Equity".
name = path.split("/")[-1].split(".")[0]
target_name = name.replace(" ", "_") + "_return"
market = pd.read_csv(path)
# Remove the rows labelled 0 and 1 (presumably extra header rows of the export
# -- confirm against the raw file).  ``axis`` can no longer be passed
# positionally to ``drop`` in pandas >= 2.0, so name the axis explicitly.
market = market.drop(index=[0, 1])
market = market.rename(columns={"ticker": "date",
                                name: target_name})
market["date"] = pd.to_datetime(market["date"])
# Cast to float and divide by 100 (presumably percent -> fraction; confirm).
# Assigned as a whole column so the float dtype replaces the original one.
market[target_name] = market[target_name].astype("float") / 100
market = market.set_index("date")

# using only the training sample
market = market.loc["2000":"2010"]

merged = merge_market_gtrends(market, gtrends)

In [4]:
# Run the single-feature-importance scoring on a copy of the merged frame so
# the call cannot modify ``merged`` itself.  Three words are scored with
# max_lag=3 and n_splits=2.
result = get_sfi_scores(
    merged_df=merged.copy(),
    target_name=target_name,
    words=["act", "bank", "president"],
    max_lag=3,
    n_splits=2,
    verbose=False,
)

In [5]:
# Take the feature name stored under label 0 of the score table and split it
# into the word and its lag.  Only the last underscore is used as separator,
# so a word that itself contains "_" still unpacks into exactly two parts.
word, lag = result.feature[0].rsplit("_", 1)
lag = int(lag)

In [6]:
# Rebuild the selected feature by hand: the word's column shifted by ``lag``
# rows, stored under the "<word>_<lag>" name.
feature_name = "{}_{}".format(word, lag)
merged[feature_name] = merged[word].shift(lag)

In [7]:
# Recompute the mean out-of-sample R^2 of the selected feature independently
# and compare it with the value reported in ``result``.
# NOTE(review): n_splits=2 mirrors the value passed to get_sfi_scores; the two
# must stay in sync for the comparison to be meaningful.
formula = "{} ~ {}".format(target_name, feature_name)  # same for every fold
tscv = TimeSeriesSplit(n_splits=2)
r2_OOS = []
for train_index, test_index in tscv.split(merged):
    df_train = merged.iloc[train_index]
    df_test = merged.iloc[test_index]
    # Fit a one-regressor OLS on the training fold, predict the test fold.
    lr = smf.ols(formula=formula, data=df_train).fit()
    y_pred = lr.predict(df_test).values
    y_true = df_test[target_name].values
    # R^2 against the raw (uncentered) sum of squares of the target.
    num = np.sum((y_true - y_pred) ** 2)
    dem = np.sum(y_true ** 2)
    r2_OOS.append(1 - (num / dem))

calculated = np.mean(r2_OOS)
expected = result["mean_r2"][0]
assert np.isclose(calculated, expected), (
    "manual mean OOS R2 {} differs from get_sfi_scores value {}".format(
        calculated, expected
    )
)