In [1]:
from glob import glob
from time import time
import numpy as np
from multiprocessing import Pool
import os

from word_list.analysis import words
from data_mani.utils import path_filter
from data_mani.utils import merge_market_and_gtrends
from data_mani.utils import get_ticker_name
from feature_selection.huang import run_granger_causality
import random
import statsmodels.api as sm


# ---------------------------------------------------------------------------
# notebook configuration
# ---------------------------------------------------------------------------

# NOTE(review): presumably the p-value cutoff for the causality tests -- confirm
SIG_LEVEL = 0.05

# maximum number of lags to create
MAX_LAG = 20

# number of cores to use
N_CORES = 30

# name of the data folder; substituted into the glob pattern that builds PATHS
OUT_FOLDER = "spx"

# flag used to debug the script
DEBUG = False

# share of the data used in the train/test split
TEST_SIZE = 0.5

# threshold used to filter merged dataframes
# (252 = business days in a year)
THRESHOLD = 2 * 252

# enable running in parallel
PAR = True

# correlation threshold to apply filter
CORREL_THRESHOLD = 0.5

# constant threshold to apply filter
CONSTANT_THRESHOLD = 0.9

# TODO: adjust this path for Windows (translated from the original note)
PATHS = glob("data/index/{}/*.csv".format(OUT_FOLDER))
PATHS.sort()

In [2]:
# Run the project's path_filter over every globbed CSV path, passing THRESHOLD.
# NOTE(review): presumably keeps only files whose data has at least THRESHOLD
# rows -- confirm in data_mani.utils.
# The recorded run processed 1103 files in roughly 12.5 minutes.
paths = path_filter(paths=PATHS,
                    threshold=THRESHOLD)

filter: 100%|██████████| 1103/1103 [12:32<00:00,  1.47it/s]


In [3]:
 merged, _ = merge_market_and_gtrends(PATHS[0], test_size=TEST_SIZE)

In [4]:
# Preview the first rows of the merged frame.
merged.head()

Unnamed: 0_level_0,target_return,BUY AND HOLD,DOW JONES,act,arts,bank,banking,blacklist,bonds,bubble,...,virginia,voters,votes,war,washington,water,william,wisconsin,world,york
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-02,-0.010282,0.0,1.0,3.0,0.0,38.0,-1.0,1.0,-4.0,-2.0,...,8.0,0.0,0.0,5.0,20.0,2.0,4.0,0.0,14.0,11.0
2004-01-05,0.005343,-1.0,0.0,3.0,3.0,-7.0,-3.0,0.0,-1.0,-3.0,...,1.0,1.0,1.0,4.0,-1.0,3.0,5.0,1.0,4.0,-15.0
2004-01-06,0.001476,0.0,1.0,10.0,1.0,37.0,5.0,0.0,-1.0,1.0,...,21.0,0.0,-1.0,-12.0,33.0,9.0,2.0,11.0,-7.0,29.0
2004-01-07,-0.010024,0.0,-1.0,-4.0,-4.0,-8.0,-1.0,1.0,2.0,2.0,...,-7.0,0.0,0.0,1.0,-9.0,-3.0,1.0,-5.0,0.0,-8.0
2004-01-08,-0.006849,0.0,1.0,2.0,2.0,1.0,1.0,0.0,-1.0,-1.0,...,-9.0,0.0,0.0,7.0,-7.0,-2.0,0.0,2.0,-3.0,-1.0


In [6]:
# Column pair for the test. Per the statsmodels documentation,
# grangercausalitytests checks whether the series in the SECOND column
# Granger-causes the series in the FIRST column, so the target goes first.
y_name = 'target_return'
x_name = 'DOW JONES'

# Tests are run for lags up to MAX_LAG; DEBUG doubles as the verbose flag.
test_result = sm.tsa.stattools.grangercausalitytests(
    x=merged[[y_name, x_name]],
    maxlag=MAX_LAG,
    verbose=DEBUG,
)

In [8]:
# Inspect the keys of the result dict; the recorded output lists the lags 1..20.
test_result.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

In [9]:
# Entry for lag 1. In the recorded output this is a tuple of
# (dict of test statistics, list holding two fitted OLS results and an array).
test_result[1]

({'ssr_ftest': (0.07782753307495034, 0.7803212602769055, 996.0, 1),
  'ssr_chi2test': (0.07806195335529657, 0.7799411759009051, 1),
  'lrtest': (0.07805890362942591, 0.7799453638828201, 1),
  'params_ftest': (0.07782753307484022, 0.7803212602770574, 996.0, 1.0)},
 [<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fc4ebae9730>,
  <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7fc4ebae9d00>,
  array([[0., 1., 0.]])])

In [15]:
# Summary of the first OLS result stored for lag 1 (one regressor plus a
# constant in the recorded output).
# NOTE(review): per the statsmodels docs this should be the restricted model
# (lags of the target only) -- confirm.
test_result[1][1][0].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,6.292
Date:,"Sat, 06 Mar 2021",Prob (F-statistic):,0.0123
Time:,16:43:47,Log-Likelihood:,3054.6
No. Observations:,999,AIC:,-6105.0
Df Residuals:,997,BIC:,-6095.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0792,0.032,-2.508,0.012,-0.141,-0.017
const,0.0005,0.000,1.442,0.150,-0.000,0.001

0,1,2,3
Omnibus:,100.358,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,690.191
Skew:,0.078,Prob(JB):,1.34e-150
Kurtosis:,7.069,Cond. No.,87.6


In [16]:
# Summary of the second OLS result stored for lag 1 (two regressors plus a
# constant in the recorded output).
# NOTE(review): per the statsmodels docs this should be the unrestricted model
# (adds the lag of the candidate series) -- confirm.
test_result[1][1][1].summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,3.182
Date:,"Sat, 06 Mar 2021",Prob (F-statistic):,0.0419
Time:,16:44:07,Log-Likelihood:,3054.7
No. Observations:,999,AIC:,-6103.0
Df Residuals:,996,BIC:,-6089.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0792,0.032,-2.509,0.012,-0.141,-0.017
x2,-0.0002,0.001,-0.279,0.780,-0.001,0.001
const,0.0005,0.000,1.468,0.143,-0.000,0.001

0,1,2,3
Omnibus:,100.846,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,697.488
Skew:,0.079,Prob(JB):,3.49e-152
Kurtosis:,7.09,Cond. No.,88.7
