In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
def get_log_return(df_market,
                   close_price_column='PX_LAST',
                   date_column='date'):
    """
    Compute weekly log returns from the close price of an asset dataframe.

    The close prices are indexed by the parsed dates, rows without a close
    price are dropped, and the series is resampled to weekly bins anchored
    on Monday (``W-MON``), keeping the last available close in each bin.
    log returns = log(p_t) - log(p_{t-1})

    The input dataframe is not modified.

    :param df_market: asset dataframe
    :type df_market: pd.DataFrame
    :param close_price_column: column with the close price
    :type close_price_column: str
    :param date_column: column with the date information
    :type date_column: str
    :return: dataframe indexed by date (index named after ``date_column``)
             with a single ``log_pct_change`` column; weeks for which no
             return can be computed (e.g. the first one) are dropped
    :rtype: pd.DataFrame
    """
    # Parse the dates into a fresh index instead of writing them back into the
    # caller's frame: this avoids mutating the input and does not depend on how
    # ``.loc[:, col] = ...`` handles a dtype change.
    dates = pd.DatetimeIndex(pd.to_datetime(df_market[date_column]),
                             name=date_column)
    close = df_market[close_price_column].copy()
    close.index = dates

    # Last known close of every Monday-anchored week.
    weekly_close = close.dropna().resample(rule='W-MON').last()

    # diff() of the log prices is log(p_t) - log(p_{t-1}).
    log_returns = np.log(weekly_close).diff()
    return log_returns.to_frame(name="log_pct_change").dropna()


def merge_market_trend_dfs(market_df, trends_df,
                           date_column='date'):
    """
    Merge market data with the trends dataframe on their date indexes.

    The trends data is expanded to a daily frequency and its values are
    shifted one day forward; only the days that carry a complete set of
    values after the shift are kept. The merge is an inner join on the
    index, and columns that contain only zeros are removed from the result.

    The input ``trends_df`` is not modified.

    :param market_df: asset dataframe, indexed by date
    :type market_df: pd.DataFrame
    :param trends_df: trends dataframe
    :type trends_df: pd.DataFrame
    :param date_column: column of ``trends_df`` with the date information
    :type date_column: str
    :return: merged dataframe
    :rtype: pd.DataFrame
    """
    # Work on a copy so the parsed dates are not written into the caller's frame.
    trends = trends_df.copy()
    trends[date_column] = pd.to_datetime(trends[date_column])
    trends = trends.set_index(date_column)

    # asfreq('D') inserts NaN rows for the missing days; shift(1) then moves
    # each observation onto the following day and dropna() discards the
    # filler rows. The last observation is shifted past the end of the
    # index and is therefore not part of the result.
    trends = trends.asfreq('D').shift(1).dropna()

    merged = pd.merge(market_df, trends, left_index=True, right_index=True)

    # Keep only the columns that have at least one non-zero entry.
    has_nonzero = (merged != 0).any(axis=0)
    return merged.loc[:, has_nonzero]


def test_cointegration(ts1,ts2):
    """
    Two-step cointegration check between the time series ts1 and ts2
    (Engle-Granger style): regress ts1 on ts2 with OLS, then run an
    ADF test on the difference between the fitted values and ts1.

    ts2 is handed to the regression as-is; no constant term is added
    inside this function.

    NOTE(review): relies on ``sm`` (presumably ``statsmodels.api``) and
    ``adfuller`` (presumably ``statsmodels.tsa.stattools.adfuller``) being
    available in the notebook namespace -- confirm they are imported
    before this function is called.
    NOTE(review): the returned p-value is the one reported by the ADF
    test on the residual series; verify whether this matches the
    critical values expected for an Engle-Granger test.

    :param ts1: dependent ts
    :type ts1: pd.Series
    :param ts2: independent ts
    :type ts2: pd.Series
    :return: p-value of the ADF test on the regression residuals
    :rtype: float
    """
    fitted_model = sm.OLS(ts1, ts2).fit()
    spread = fitted_model.fittedvalues - ts1
    # Second entry of the ADF result is the p-value.
    return adfuller(spread)[1]

### Merging market and trends dfs

In [3]:
# Input locations: the trends CSV and the tab-separated price file of the asset.
asset = 'es1'
trends_path = os.path.join('data', 'politics_google.csv')
path_market = os.path.join('data', '{}.txt'.format(asset))

# Load the trends table and turn the raw prices into weekly log returns.
trends_df = pd.read_csv(trends_path)
market_df = get_log_return(pd.read_csv(path_market, sep='\t'))

# Join both sources on the date index and preview the first two rows.
merged_df = merge_market_trend_dfs(market_df, trends_df)
merged_df.head(2)

Unnamed: 0_level_0,log_pct_change,act,return,election,party,earnings,stock,investment,york,movie,...,governor,office,car,culture,security,religion,labor,marriage,finance,banking
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-07-20,0.013045,2.0,2.0,0.0,7.0,1.0,7.0,1.0,12.0,32.0,...,0.0,14.0,22.0,1.0,5.0,1.0,3.0,2.0,3.0,2.0
2015-07-27,-0.027471,2.0,2.0,0.0,7.0,0.0,6.0,1.0,13.0,32.0,...,0.0,14.0,22.0,1.0,5.0,1.0,3.0,2.0,3.0,2.0
