# Trends: data exploration


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
from sklearn.model_selection import TimeSeriesSplit
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score
from IPython.display import display, HTML, Markdown
from vol4life.vol4life.plot import plot_acf, plot_ccf
from vol4life.vol4life.stats import autocorrelation_f
from word_list.basic import politics1, politics2
from word_list.basic import business

## Create trends df from daily information

In [None]:
# Build one wide DataFrame of daily Google Trends series: one CSV per keyword,
# each expected to hold a `date` column and a column named after the file.
trends_path = os.path.join('data', 'daily_trend', "*.csv")
daily_dfs_path = sorted(glob(trends_path))
if not daily_dfs_path:
    # Fail early with a readable message instead of an opaque concat error.
    raise FileNotFoundError("no trend files match {}".format(trends_path))
daily_dfs = [pd.read_csv(path) for path in daily_dfs_path]
# basename/splitext instead of splitting on "/" and ".": works with any
# path separator and with keywords that themselves contain a dot.
daily_dfs_names = [os.path.splitext(os.path.basename(path))[0]
                   for path in daily_dfs_path]

trends_df = []
for name, df in zip(daily_dfs_names, daily_dfs):
    df.index = pd.to_datetime(df.date)
    # Column names use underscores instead of spaces.
    ts = df[name].rename(name.replace(" ", "_"))
    trends_df.append(ts)

final_date = "2020-07-25"
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
trends_df = pd.concat(trends_df, axis=1)
trends_df = trends_df.loc[:final_date]
trends_df = trends_df.fillna(0.0)
word_features = sorted(trends_df.columns)
trends_df = trends_df[word_features]
# Label slices on a DatetimeIndex are inclusive of the whole year, so the
# test set has to start in 2011; starting it at "2010" would share all of
# 2010 with the training set.
trends_df_train = trends_df.loc[:"2010"]
trends_df_test = trends_df.loc["2011":]
display(Markdown("### Google trends data"))
display(HTML(trends_df_train.head(5).to_html()))
display(Markdown("#### data shape = {}".format(trends_df_train.shape)))

In [None]:
# Copy of the training data with blank column labels, used only for the
# faint background lines (presumably so they add no legend entries).
trends_df_p = trends_df_train.copy()
trends_df_p.columns = ["" for _ in range(trends_df_p.shape[1])]

# Row-wise mean signal of each word category.
politics1_ts = trends_df_train[politics1].mean(axis=1).rename("politics1")
politics2_ts = trends_df_train[politics2].mean(axis=1).rename("politics2")
business_ts = trends_df_train[business].mean(axis=1).rename("business")

display(Markdown("### Word Signal"))

# Figure 1: every individual series in the background, overall mean on top.
fig, ax = plt.subplots(figsize=(15, 8))
trends_df_p.plot(ax=ax, legend=False, alpha=0.05, color="mistyrose")
overall_mean = trends_df_train.mean(axis=1)
overall_mean.plot(ax=ax, legend=False, color="k", label="mean signal")
ax.set_title("All Signals")
ax.legend(loc="best")

# Figure 2: the three category means on a shared axis.
fig, ax = plt.subplots(figsize=(15, 8))
for category_ts in (politics1_ts, politics2_ts, business_ts):
    category_ts.plot(ax=ax)
ax.set_title("Word Signal by Category")
ax.legend(loc="best");

### Google Trends Original Data Correlation

In [None]:
non_related_words = ["happy", "garden", "fun",
                     "food", "fine", "color",
                     "arts", "travel", "housing",
                     "legal", "leverage", "lifestyle",
                     "BUY_AND_HOLD", "DOW_JONES"]

# One lower-triangle correlation heatmap per word group:
# (figure title, columns to correlate, x tick label rotation).
heatmap_specs = [
    ("Politics1 correlation", politics1, 90),
    ("Politics2 correlation", politics2, 90),
    ("Bussines correlation", business, 90),
    ("Non related words correlation", non_related_words, 45),
]

for title, words, rotation in heatmap_specs:
    corr = trends_df_train[words].corr()
    # Hide the diagonal and the upper triangle (mirror of the lower one).
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(14, 10))
    ax.set_title(title, fontsize=18)
    sns.heatmap(corr, mask=mask, cmap="coolwarm", center=0,
                linewidths=0.5, annot=True, fmt=".1f", ax=ax)
    plt.xticks(rotation=rotation)

# Distribution of the pairwise correlations over every word pair: the masked
# cells become NaN and are dropped, leaving each pair exactly once.
corr = trends_df_train.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
pairwise_values = corr.mask(cond=mask).to_numpy().ravel()
corr_df = pd.DataFrame({"correlation": pairwise_values}).dropna()
fig, ax = plt.subplots(figsize=(8, 8))
corr_df.boxplot(ax=ax, grid=False)
ax.set_title(r"Correlation distribution for all word pairs in the trends dataset");

In [None]:
# Preview the first five rows of the full trends table (levels).
trends_df.head(n=5)

In [None]:
# Preview the first five rows of the one-step differences of the trends table.
trends_df.diff(periods=1).head(n=5)


## Applying PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the training features and fit a PCA on them.
scaler = StandardScaler()
pca = PCA()
trends_std = scaler.fit_transform(trends_df_train)
pca = pca.fit(trends_std)

# Explained variance ratio of the first five components.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pca.explained_variance_ratio_[:5])
ax.set_xlabel('Principal component index', fontsize=14)
ax.set_ylabel('Explained variance ratio', fontsize=14)
ax.set_title("Selecting the number of PCA dimensions", fontsize=18)

# Keep only the first principal component of the training data.
first_component = pca.transform(trends_std)[:, :1]
trends_df_train_t = pd.DataFrame(first_component, columns=["PCA1"],
                                 index=trends_df_train.index)
fig, ax = plt.subplots(figsize=(10, 5))
trends_df_train_t.plot(ax=ax)
ax.set_title("Transformed data", fontsize=18)

# Normalise the mean signal and PCA1 with throwaway scalers. Refitting
# `scaler` here would overwrite the statistics learned from the training
# features, making it unusable for transforming other data afterwards.
mean_norm = StandardScaler().fit_transform(
    trends_df_train.mean(axis=1).to_frame()).flatten()
pca_norm = StandardScaler().fit_transform(trends_df_train_t).flatten()
trends_resume = pd.DataFrame(np.stack([pca_norm, mean_norm], 1),
                             columns=["PCA1_norm", "mean_norm"],
                             index=trends_df_train.index)

fig, ax = plt.subplots(figsize=(10, 5))
trends_resume.plot(ax=ax)
ax.set_title("Comparing PCA features and mean signal (both normalized)", fontsize=18);

In [None]:
# NOTE(review): this always raises AssertionError, so a top-to-bottom run of
# the notebook stops at this cell. Presumably a deliberate halt - confirm
# before removing.
assert False

## Market data returns

In [None]:
# Load the closing prices of `asset` and turn them into simple returns.
asset = "es1"
market_path = os.path.join('data', 'market', "{}.txt".format(asset))
df_market = pd.read_csv(market_path, sep='\t')
close_price_column = 'PX_LAST'
date_column = 'date'
# Plain column assignment replaces the column together with its dtype.
# `df.loc[:, col] = ...` writes into the existing column in place on
# pandas >= 2.0, which would leave the dates as objects and the index
# would not be a DatetimeIndex.
df_market[date_column] = pd.to_datetime(df_market[date_column])
# Chronological order is needed both for pct_change to yield day-over-day
# returns and for the merge_asof performed later on this index.
df_market = df_market.set_index(date_column).sort_index()
close = df_market[[close_price_column]].dropna()
close = close.pct_change().dropna()
return_column = "{}_returns".format(asset)
close.columns = [return_column]
close.head(5)

## Merging market data and trends data

In [None]:
# As-of join on the date index: with merge_asof's default direction, each
# trends row receives the latest return dated on or before it.
merged_df = pd.merge_asof(left=trends_df, right=close,
                          left_index=True, right_index=True)
# Put the return column first, followed by the word columns in sorted order.
ordered_columns = [return_column, *word_features]
merged_df = merged_df[ordered_columns]
display(Markdown("### Merged data"))
merged_preview = merged_df.head(5)
display(HTML(merged_preview.to_html()))
display(Markdown("#### data shape = {}".format(merged_df.shape)))

### Exploring only one part of the data

In [None]:
# Independent copy of the merged rows dated up to and including 2010.
small_df = merged_df.loc[:"2010"].copy()
small_df.head(n=5)

### Autocorrelation

In [None]:
# Two series to inspect: the asset returns and the "banking" trend.
returns = small_df["es1_returns"]
trend = small_df["banking"]

# Both calls share the same settings of the project's plot_acf helper.
acf_kwargs = dict(lag_range=41, out_path=None, acf_function=autocorrelation_f)

plot_acf(returns, **acf_kwargs)

plot_acf(trend, **acf_kwargs)

In [None]:
lags = 30

# For every word: average the values returned by autocorrelation_f, skipping
# its first entry (presumably lag 0 - confirm against the helper).
mean_acf_per_word = [
    np.mean(autocorrelation_f(small_df[word], lags)[1:])
    for word in word_features
]
auto_correlation = pd.Series(mean_acf_per_word, index=word_features).to_frame()

fig, ax = plt.subplots(figsize=(8, 5))
auto_correlation.boxplot(ax=ax, grid=False)
ax.set_title("Mean auto-correlation distribution for trends time series");


In [None]:
# Pairwise correlation of all word columns, drawn as a lower-triangle heatmap.
corr = small_df.loc[:, word_features].corr()
# True on and above the diagonal: those cells are hidden in the plot.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
ax.set_title("All words correlation", fontsize=18)
heatmap_options = dict(mask=mask, cmap="coolwarm", center=0, linewidths=0.5,
                       annot=False, fmt=".2f", ax=ax, cbar=True)
sns.heatmap(corr, **heatmap_options)
plt.xticks(rotation=90);

## Evaluation Draft

**We use the time-series cross-validation from sklearn to observe the distribution
of the statistics of a simple linear regression. The model is based on a single
trend word and tries to predict the next-day return of the selected asset.**

In [None]:
# Work on a subset only: rows dated up to and including 2010, taken as an
# independent copy so later edits do not touch merged_df.
evaluation_rows = merged_df.loc[:"2010"]
small_df = evaluation_rows.copy()
small_df.head(n=5)

In [None]:
# Shifting returns: the word trend observed on day t is used
# to predict the return on day t+1.
# The column is addressed through `return_column` (not a hard-coded
# "es1_returns") so the cell keeps working when `asset` is changed.
small_df[return_column] = small_df[return_column].shift(-1)
# The last row has no next-day return after the shift; drop it (and any
# other row containing missing values).
small_df = small_df.dropna()
small_df.head(5)

### This function uses `TimeSeriesSplit` from sklearn to obtain different statistics based on a simple linear model

In [None]:
def get_simple_ols_stats(df, return_column, select_word, n_splits):
    """Collect per-fold statistics of the OLS model ``return_column ~ select_word``.

    The rows of ``df`` are split with sklearn's ``TimeSeriesSplit``; on every
    fold the model is fitted on the training rows and scored on the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both ``return_column`` and ``select_word`` columns.
    return_column : str
        Name of the dependent variable (left-hand side of the formula).
    select_word : str
        Name of the single explanatory variable (right-hand side).
    n_splits : int
        Number of splits handed to ``TimeSeriesSplit``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``IS_rsquared``, ``beta``,
        ``t-statistic``, ``p_value`` and ``OOS_rsquared``.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    formula = "{} ~ {}".format(return_column, select_word)  # same for all folds
    is_scores = []
    betas = []
    t_stats = []
    p_values = []
    oos_scores = []

    for train_index, test_index in tscv.split(df):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]
        lr = smf.ols(formula=formula, data=df_train).fit()
        is_scores.append(lr.rsquared)
        # Position 0 is the intercept added by the formula, position 1 the
        # explanatory variable. `.iloc` is required because these Series are
        # indexed by term name; plain `[1]` on a label index is deprecated.
        betas.append(lr.params.iloc[1])
        t_stats.append(lr.tvalues.iloc[1])
        p_values.append(lr.pvalues.iloc[1])
        y_pred = lr.predict(df_test).values
        y_true = df_test[return_column]
        oos_scores.append(r2_score(y_true, y_pred))

    dict_ = {"IS_rsquared": is_scores,
             "beta": betas,
             "t-statistic": t_stats,
             "p_value": p_values,
             "OOS_rsquared": oos_scores}

    return pd.DataFrame(dict_)



## Experiment Results using some random words 

In [None]:
n_splits = 10
example = word_features[4:20]
stats = []
p_values = []
oos_scores = []
is_scores = []

## Getting the results for each word
for word in example:
    stat = get_simple_ols_stats(df=small_df,
                                return_column=return_column,
                                select_word=word,
                                n_splits=n_splits)
    stats.append(stat)

## Combining results
# `rename` returns a relabelled Series, leaving the per-word stats frames
# untouched (assigning `.name` would relabel their columns in place).
for word, stat in zip(example, stats):
    p_values.append(stat["p_value"].rename(word))
    is_scores.append(stat["IS_rsquared"].rename(word))
    oos_scores.append(stat["OOS_rsquared"].rename(word))

# One column per word, one row per cross-validation fold. `axis` is passed by
# keyword because pandas 2.0 no longer accepts it positionally.
p_values = pd.concat(p_values, axis=1)
is_scores = pd.concat(is_scores, axis=1)
oos_scores = pd.concat(oos_scores, axis=1)

## Plotting
display(Markdown("### Simple Linear Model Results"))
display(Markdown(""))

boxplot_specs = [
    (p_values, "P-value distribution for the coeficients of the explanatory variable"),
    (is_scores, r"In-sample $R^2$ distribution for each simple linear model"),
    (oos_scores, r"Out-of-sample $R^2$ distribution for each simple linear model"),
]
for frame, title in boxplot_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    frame.boxplot(ax=ax, grid=False)
    ax.set_xticklabels(example, rotation=45)
    ax.set_title(title)


In [None]:
# Average the out-of-sample R^2 over the folds for every word, best first,
# and lay the result out as a single-row table.
mean_oos_by_word = oos_scores.mean(axis=0).sort_values(ascending=False)
score_table = mean_oos_by_word.to_frame().T
score_table.index = [r"$R^2$"]

display(Markdown("### Out-of-sample mean $R^2$ for each model"))
score_table_html = score_table.to_html()
display(HTML(score_table_html))