In [15]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
import os
import sys
import inspect
import unittest

# Locate the directory this notebook/script lives in so the repository root
# (its parent) can be put on sys.path for the `src.*` imports that follow.
#
# Inside a Jupyter kernel `inspect.getfile(inspect.currentframe())` yields the
# kernel's synthetic cell filename (on recent ipykernel a path inside a temp
# directory), not the notebook location, so it cannot be used to find the
# repository. Use `__file__` when the code runs as a plain script and fall
# back to the working directory otherwise.
# NOTE(review): the fallback assumes the kernel's working directory is the
# notebook's own folder (the Jupyter default) — confirm for your launcher.
if "__file__" in globals():
    currentdir = os.path.dirname(os.path.abspath(__file__))
else:
    currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Guard the insert so re-running this cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

from src.feature_selection.mdi import mdi_feature_importance  # noqa
from src.feature_selection.sfi import single_feature_importance_cv  # noqa
from src.feature_selection.mda import mean_decrease_accuracy  # noqa

from src.feature_selection.huang import run_huang_methods, run_granger_causality


# Tunable settings, forwarded as keyword arguments to the feature-selection
# helpers called later in this notebook.

# forwarded as `sig_level`
SIG_LEVEL = 0.05

# maximum number of lags to create
MAX_LAG = 20

# correlation threshold to apply filter
CORREL_THRESHOLD = 0.5

# constant threshold to apply filter
CONSTANT_THRESHOLD = 0.9


In [16]:

# Build a synthetic regression problem whose generating coefficients are
# known, so selection methods can be compared against a ground truth.
n_features, n_informative = 4, 3
top_n = n_features

# Column labels: f0..f{n_features-1} followed by the target column.
feature_names = ["f{}".format(i) for i in range(n_features)]
columns = feature_names + ["target_return"]

# coef=True additionally returns the coefficients used to generate y.
X, y, coef = make_regression(
    n_samples=5000,
    n_features=n_features,
    n_informative=n_informative,
    coef=True,
    random_state=1233,
)

# Turn y into a single column so it can be appended to X as the last column.
y = y.reshape(-1, 1)
df = pd.DataFrame(np.concatenate((X, y), axis=1), columns=columns)

# Reference ranking: features ordered by generating coefficient, largest first.
true_imp = pd.DataFrame({"feature": feature_names, "feature_score": coef})
true_imp = true_imp.sort_values("feature_score", ascending=False)
true_imp = true_imp.reset_index(drop=True)


In [22]:

# Run the Huang selection helper on the synthetic frame, using the notebook's
# configured thresholds; every argument is passed by keyword.
result_huang = run_huang_methods(
    merged_df=df,
    target_name='target_return',
    words=feature_names,
    max_lag=MAX_LAG,
    verbose=False,
    sig_level=SIG_LEVEL,
    correl_threshold=CORREL_THRESHOLD,
    constant_threshold=CONSTANT_THRESHOLD,
    asset_name=None,
)

In [23]:
# Show the frame returned by run_huang_methods.
# NOTE(review): the recorded output has an empty feature_score for every row
# and lists target_return itself as a feature — confirm whether that is the
# intended result or a problem in run_huang_methods / its inputs.
result_huang

Unnamed: 0,feature,feature_score
0,f1,
1,f2,
2,f0,
3,f3,
4,target_return,


In [19]:
# Run the Granger-causality selection helper on the same synthetic frame,
# using the notebook's configured thresholds; every argument is passed by
# keyword.
result_granger = run_granger_causality(
    merged_df=df,
    target_name='target_return',
    words=feature_names,
    max_lag=MAX_LAG,
    sig_level=SIG_LEVEL,
    correl_threshold=CORREL_THRESHOLD,
    constant_threshold=CONSTANT_THRESHOLD,
    verbose=False,
)

In [20]:
# Show the frame returned by run_granger_causality.
# NOTE(review): the recorded output is an empty frame (header only, no rows)
# — confirm whether selecting no feature is expected for this dataset.
result_granger

Unnamed: 0,feature,feature_score


In [11]:
# Reference ordering of the features (sorted by generating coefficient,
# largest first) to compare against the selection results.
true_imp["feature"].head()

0    f3
1    f1
2    f2
3    f0
Name: feature, dtype: object