In [1]:
from utils import * 
from config import parameters

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from math import sqrt
from statsmodels.tsa.stattools import grangercausalitytests

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

fred_gdp_quarterly_csv_filepath = parameters.fred_gdp_quarterly_csv_filepath 
fred_unemp_quarterly_csv_filepath = parameters.fred_unemp_quarterly_csv_filepath 
fred_unemp_not_adjusted_quarterly_csv_filepath = os.path.join(parameters.data_dir, 'fred_unemp_not_adjusted_quarterly.csv') 
fred_interest_rate_quarterly_csv_filepath = os.path.join(parameters.data_dir, 'fred_interest_rate_quarterly.csv') 
fred_ted_spread_quarterly_csv_filepath = os.path.join(parameters.data_dir, 'fred_ted_spread_quarterly.csv') 
yahoo_djia_quarterly_csv_filepath = os.path.join(parameters.data_dir, 'yahoo_djia_quarterly.csv')
fred_index_filepath_dict = {'gdp': fred_gdp_quarterly_csv_filepath, 'unemployment': fred_unemp_quarterly_csv_filepath, 'unemployment_not_adjusted': fred_unemp_not_adjusted_quarterly_csv_filepath, 'interest_rate': fred_interest_rate_quarterly_csv_filepath, 'ted_spread': fred_ted_spread_quarterly_csv_filepath, 'djia': yahoo_djia_quarterly_csv_filepath}
fred_index_dict_pkl_filepath = os.path.join(parameters.output_base_dir, 'fred_index_dict.pkl')
fred_index_dict = get_index_dict(fred_index_dict_pkl_filepath, fred_index_filepath_dict)

bigram_uniqueness_strength_pkl_filepath = os.path.join(parameters.output_base_dir, 'bigram-emerging_topic_score-strength_20191123-19-10-46.pkl')
bigram_uniqueness_strength_dict = load_pkl(bigram_uniqueness_strength_pkl_filepath)

base_dir: /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency
Parameters(
  sentence_length_outlier = [0, 1, 2, 181, 252]
  bigram_window_size = 15
  bigram_max_rank = None
  stopword_list = ['financial', 'market', 'federal', 'bank', 'banking', 'bankers', 'speech', 'bi', 'review', 'year', 'reserve', 'policy', 'state', 'central', 'board', 'percent', 'rate', 'mr', 'alan', 'greenspan', 'ben', 'bernanke', 'janet', 'yellen', 'jerome', 'powell', 'vol', 'ha', 'wa', 'ii']
  ws_quarterly = 4
  ws_semiannually = 2
  ws_annually = 1
  strength_alpha = 0.9
  base_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency
  data_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data
  output_base_dir = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/output
  bis_raw_pkl_filepath = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data/bis_w_content_FINAL.pkl
  fred_gdp_quarterly_csv_filepath = /home/dmlab/jihye/GIT/bis_speeches_text_bigram_frequency/data/fred_g

In [178]:
def rescale_rolling(values, rolling_window):
    df = pd.DataFrame(values)
    normalized = (df - df.rolling(rolling_window).mean()) / df.rolling(rolling_window).std()
    return normalized

def show_plt_chart_rolling(x_, y_, x_ticks_list, rolling_window, x_label, y_label, max_lag=5, with_legend=True, show=True):
    x = np.array(x_)
    y = np.array(y_)

    x_normalized = rescale_rolling(x, rolling_window)[rolling_window-1:].fillna(0).reset_index(drop=True)
    y_normalized = rescale_rolling(y, rolling_window)[rolling_window-1:].fillna(0).reset_index(drop=True)
    x_ticks_list = pd.Series(x_ticks_list)[rolling_window-1:].fillna(0).reset_index(drop=True)
    
    if show:
        plt.plot(x_normalized, 'g', label=x_label)
        plt.plot(y_normalized,'r', label=y_label)
        plt.xticks(list(range(len(x_ticks_list))), x_ticks_list, rotation='vertical')    
        plt.grid()
        if with_legend:
            plt.legend()
    
    x_normalized_df = x_normalized
    y_normalized_df = y_normalized
    df = pd.concat([y_normalized_df, x_normalized_df], axis=1)   # the predictor (X) is in the second column.
    return grangercausalitytests(df, maxlag=max_lag)

plt.rcParams["figure.figsize"] = (20,5)

### frequency, emerging_topic_score, strength

In [38]:
# parameter
top_n = 50
rolling_window = 3

In [32]:
indices = ['gdp', 'unemployment', 'unemployment_not_adjusted', 'interest_rate', 'ted_spread', 'djia']
period_dict = dict()
proposed_data_x_dict = {'frequency': dict(), 'emerging_topic_score': dict(), 'strength': dict()}
fred_data_y_dict = dict()
for index_ in indices:
    period_dict[index_] = list()
    proposed_data_x_dict['frequency'][index_] = list()
    proposed_data_x_dict['emerging_topic_score'][index_] = list()
    proposed_data_x_dict['strength'][index_] = list()
    fred_data_y_dict[index_] = list()

# align
for index_name in fred_data_y_dict.keys():
    _period_dict = bigram_uniqueness_strength_dict['quarterly']
    data_of_previous_period = False
    for _period in sorted(_period_dict.keys()): 
        
        # TO TEST ROBUSTNESS
#         if _period[:4] not in ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010']:
#             continue
            
        # in case of fred_index_dict has no recent or past information.
        if _period not in fred_index_dict[index_name]:
            continue
        count = 0
        _sum_freq, _sum_emerging_topic_score, _sum_strength = 0, 0, 0
        for _bigram, (_freq, _emerging_topic_score, _strength) in sorted(_period_dict[_period].items(),
                                                               key=lambda t: t[-1][0],
                                                               reverse=True):  # CONTROL PRINTING ORDER
            count += 1
            _sum_freq += _freq
            _sum_emerging_topic_score += _emerging_topic_score
            _sum_strength += _strength
            
            if count == top_n:
                if not data_of_previous_period:
                    data_of_previous_period_freq = -_sum_freq
                    data_of_previous_period_emerging_topic_score = -_sum_emerging_topic_score
                    data_of_previous_period_strength = -_sum_strength
                    data_of_previous_period = True
                    continue
                    
                # data append
                period_dict[index_name].append(_period)
                proposed_data_x_dict['frequency'][index_name].append(data_of_previous_period_freq)
                proposed_data_x_dict['emerging_topic_score'][index_name].append(data_of_previous_period_emerging_topic_score)
                proposed_data_x_dict['strength'][index_name].append(data_of_previous_period_strength)
                fred_data_y_dict[index_name].append(float(fred_index_dict[index_name][_period]))
                
                data_of_previous_period_freq = -_sum_freq
                data_of_previous_period_emerging_topic_score = -_sum_emerging_topic_score
                data_of_previous_period_strength = -_sum_strength
                
                break

In [218]:
test_periods_dict = dict()
amount = 28
end_ = int(84 / amount)
for i in range(1, end_+1):
    test_periods_dict[i] = (amount*(i-1) , amount*i)  
test_periods_dict

{1: (0, 28), 2: (28, 56), 3: (56, 84)}

In [219]:
for s_, e_ in test_periods_dict.values():
    print(period_dict['unemployment_not_adjusted'][s_:e_])

['1998_Q3', '1998_Q4', '1999_Q1', '1999_Q2', '1999_Q3', '1999_Q4', '2000_Q1', '2000_Q2', '2000_Q3', '2000_Q4', '2001_Q1', '2001_Q2', '2001_Q3', '2001_Q4', '2002_Q1', '2002_Q2', '2002_Q3', '2002_Q4', '2003_Q1', '2003_Q2', '2003_Q3', '2003_Q4', '2004_Q1', '2004_Q2', '2004_Q3', '2004_Q4', '2005_Q1', '2005_Q2']
['2005_Q3', '2005_Q4', '2006_Q1', '2006_Q2', '2006_Q3', '2006_Q4', '2007_Q1', '2007_Q2', '2007_Q3', '2007_Q4', '2008_Q1', '2008_Q2', '2008_Q3', '2008_Q4', '2009_Q1', '2009_Q2', '2009_Q3', '2009_Q4', '2010_Q1', '2010_Q2', '2010_Q3', '2010_Q4', '2011_Q1', '2011_Q2', '2011_Q3', '2011_Q4', '2012_Q1', '2012_Q2']
['2012_Q3', '2012_Q4', '2013_Q1', '2013_Q2', '2013_Q3', '2013_Q4', '2014_Q1', '2014_Q2', '2014_Q3', '2014_Q4', '2015_Q1', '2015_Q2', '2015_Q3', '2015_Q4', '2016_Q1', '2016_Q2', '2016_Q3', '2016_Q4', '2017_Q1', '2017_Q2', '2017_Q3', '2017_Q4', '2018_Q1', '2018_Q2', '2018_Q3', '2018_Q4', '2019_Q1', '2019_Q2']


### Let's check..

In [228]:
max_lag = 5

to_check_list = list()

indices = ['djia', 'unemployment_not_adjusted', 'unemployment', 'gdp', 'interest_rate', 'ted_spread']
for index_name in indices:
    for i in ['frequency', 'strength', 'emerging_topic_score']:
        for test_key, (s_, e_) in test_periods_dict.items():
            x_ = proposed_data_x_dict[i][index_name][s_:e_]
            y_ = fred_data_y_dict[index_name][s_:e_]
            x_ticks_list = period_dict[index_name][s_:e_]
            x_label = 'standardized: -(sum of top5 ' + i +' )'
            y_label = 'standardized: Real '+ index_name +' (Percent Change)'
            df = show_plt_chart_rolling(x_, y_, x_ticks_list, rolling_window, x_label, y_label, max_lag=max_lag, with_legend=False, show=False)

            print('\n\n','='*10)
            title = index_name + ',   ' + str(i) + ',   test_period=' + str(test_key)
            print(title)
            print('='*10)
            print('n_lag \t p-value')
            for n_lag in range(1, max_lag+1):
                p_value_str = '%.4f' % df[n_lag][0]['params_ftest'][1]
                if float(p_value_str) < 0.1:
                    p_value_str += '★'
                    to_check_list.append(title)
                elif float(p_value_str) < 0.05:
                    p_value_str += '★★'
                    to_check_list.append(title)
                print('%d \t %s'% (n_lag, p_value_str))


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.7271  , p=0.4030  , df_denom=22, df_num=1
ssr based chi2 test:   chi2=0.8262  , p=0.3634  , df=1
likelihood ratio test: chi2=0.8128  , p=0.3673  , df=1
parameter F test:         F=0.7271  , p=0.4030  , df_denom=22, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.1247  , p=0.3454  , df_denom=19, df_num=2
ssr based chi2 test:   chi2=2.8413  , p=0.2416  , df=2
likelihood ratio test: chi2=2.6853  , p=0.2611  , df=2
parameter F test:         F=1.1247  , p=0.3454  , df_denom=19, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.4969  , p=0.2534  , df_denom=16, df_num=3
ssr based chi2 test:   chi2=6.4553  , p=0.0914  , df=3
likelihood ratio test: chi2=5.6898  , p=0.1277  , df=3
parameter F test:         F=1.4969  , p=0.2534  , df_denom=16, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.1180  , p=0.3899  , df_d

ValueError: Insufficient observations. Maximum allowable lag is 0

In [221]:
to_check_list

['djia,   frequency,   test_period=3',
 'djia,   frequency,   test_period=3',
 'djia,   strength,   test_period=2',
 'djia,   emerging_topic_score,   test_period=2',
 'djia,   emerging_topic_score,   test_period=3',
 'unemployment_not_adjusted,   strength,   test_period=2',
 'unemployment_not_adjusted,   strength,   test_period=2',
 'unemployment_not_adjusted,   emerging_topic_score,   test_period=1',
 'unemployment_not_adjusted,   emerging_topic_score,   test_period=2',
 'unemployment,   strength,   test_period=1',
 'unemployment,   strength,   test_period=2',
 'unemployment,   strength,   test_period=2',
 'unemployment,   strength,   test_period=2',
 'unemployment,   emerging_topic_score,   test_period=3']

In [None]:
'djia,   emerging_topic_score,   test_period=2',
'djia,   emerging_topic_score,   test_period=3',

'unemployment_not_adjusted,   emerging_topic_score,   test_period=1',
'unemployment_not_adjusted,   emerging_topic_score,   test_period=2',

'unemployment,   strength,   test_period=1',
'unemployment,   strength,   test_period=2',
'unemployment,   strength,   test_period=2',
'unemployment,   strength,   test_period=2',

In [229]:
index_name = 'djia'
i = 'frequency'
s_, e_ = 28, 84

x_ = proposed_data_x_dict[i][index_name][s_:e_]
y_ = fred_data_y_dict[index_name][s_:e_]
x_ticks_list = period_dict[index_name][s_:e_]
x_label = 'standardized: -(sum of top5 ' + i +' )'
y_label = 'standardized: Real '+ index_name +' (Percent Change)'
df = show_plt_chart_rolling(x_, y_, x_ticks_list, rolling_window, x_label, y_label, max_lag=max_lag, with_legend=False, show=False)

print('\n\n','='*10)
title = index_name + ',   ' + str(i) + ',   test_period=' + str(test_key)
print('='*10)
print('n_lag \t p-value')
for n_lag in range(1, max_lag+1):
    p_value_str = '%.4f' % df[n_lag][0]['params_ftest'][1]
    if float(p_value_str) < 0.1:
        p_value_str += '★'
        to_check_list.append(title)
    elif float(p_value_str) < 0.05:
        p_value_str += '★★'
        to_check_list.append(title)
    print('%d \t %s'% (n_lag, p_value_str))


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.9859  , p=0.3255  , df_denom=50, df_num=1
ssr based chi2 test:   chi2=1.0451  , p=0.3066  , df=1
likelihood ratio test: chi2=1.0349  , p=0.3090  , df=1
parameter F test:         F=0.9859  , p=0.3255  , df_denom=50, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.3607  , p=0.2664  , df_denom=47, df_num=2
ssr based chi2 test:   chi2=3.0108  , p=0.2219  , df=2
likelihood ratio test: chi2=2.9269  , p=0.2314  , df=2
parameter F test:         F=1.3607  , p=0.2664  , df_denom=47, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.8923  , p=0.4527  , df_denom=44, df_num=3
ssr based chi2 test:   chi2=3.1027  , p=0.3761  , df=3
likelihood ratio test: chi2=3.0120  , p=0.3898  , df=3
parameter F test:         F=0.8923  , p=0.4527  , df_denom=44, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.9938  , p=0.1135  , df_d