# The Effects of Social Media Bots on the Cryptomarket: Market Effects

*By Daniel Deutsch*

In [1]:
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

In [8]:
# Suppresses every warning raised during the session
warnings.filterwarnings('ignore')

# Plot appearance: ggplot base style, then per-group overrides
plt.style.use('ggplot')
plt.rc('figure', figsize=(15, 7))
plt.rc(
    'axes',
    prop_cycle=plt.cycler(color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']),
    labelsize=22,
    titlesize=24,
    labelpad=10,
    facecolor='#EAEAF2',
)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('legend', fontsize=16, title_fontsize=16)

# Analysis window
START_DATE = datetime(year=2019, month=6, day=1)
END_DATE = datetime(year=2022, month=6, day=1)

# Significance level used by the statistical tests
ALPHA = 0.05

# Where and how figures/tables are exported
SAVING_FOLDER = "./latex"
SAVING_FORMAT = 'png'
DPI = 100

In [3]:
def granger_causality_matrix(data, maxlag, test='ssr_chi2test', verbose=False):
    """Build a matrix of pairwise Granger-causality p-values.

    Every ordered pair of columns in `data` (a column paired with itself included)
    is passed to `grangercausalitytests`. For each pair, the p-value of the chosen
    `test` is read at every lag from 1 to `maxlag`, rounded to 4 decimals, and the
    smallest of them is stored in the matrix.

    Rows are the response variables (labels suffixed with ' Y') and columns are the
    predictors (labels suffixed with ' X'). A value below the chosen significance
    level means the hypothesis "X does not Granger-cause Y" can be rejected for at
    least one of the tested lags.

    data    : pandas dataframe containing the time series variables (one per column)
    maxlag  : highest lag order to test; lags 1..maxlag are all evaluated
    test    : key of the test statistic to read from the statsmodels result
    verbose : if True, prints the per-lag p-values of every pair

    Returns a square pandas dataframe with the minimum p-value of each pair.
    """
    names = data.columns
    size = len(names)
    matrix = pd.DataFrame(np.zeros((size, size)), columns=names, index=names)

    # Outer loop over predictors, inner loop over responses
    for predictor in matrix.columns:
        for response in matrix.index:
            outcome = grangercausalitytests(data[[response, predictor]], maxlag=maxlag, verbose=False)
            # Results are keyed by lag order, starting at 1
            p_values = [round(outcome[lag][0][test][1], 4) for lag in range(1, maxlag + 1)]
            if verbose:
                print(f'Y = {response}, X = {predictor}, P Values = {p_values}')
            matrix.loc[response, predictor] = np.min(p_values)

    matrix.columns = [name + ' X' for name in names]
    matrix.index = [name + ' Y' for name in names]
    return matrix

## Individual Effects

### Read Datasets

In [4]:
# Market data: first CSV column is the row index, 'date' is parsed as datetime,
# then the frame is re-indexed by (base_asset, date)
df_ohlcv = pd.read_csv(
    "./datasets/processed/ohlcv.csv.gz",
    index_col=0,
    parse_dates=['date'],
    low_memory=False,
)
df_ohlcv = df_ohlcv.set_index(['base_asset', 'date'])

# Engagement rates: the file has a two-row column header and its first column is the row index
df_er = pd.read_csv("./datasets/engagement_rate.csv.gz", header=[0, 1], index_col=0)

### Data Processing 

In [5]:
# Converts the dtype of the index
df_er.index = pd.to_datetime(df_er.index)

# Adds the asset's price to the engagement rate dataframe:
# the engagement rates are reshaped to a long (base_asset, date) index, outer-joined with
# the prices, and pivoted back so the columns are (base_asset, variable).
# NOTE: df_er is overwritten with the merged frame, so the "Read Datasets" cell
# must be re-run before this cell can be executed again.
df_er = (
    pd.merge(
        df_er.stack(0).swaplevel(axis=0).sort_index(axis=0),
        df_ohlcv[['price']],
        left_index=True,
        right_index=True,
        how='outer',
    )
    .unstack('base_asset')
    .swaplevel(axis=1)
    .sort_index(axis=1)
)

# Defines columns
COLS = ['Bearish Human', 'Bullish Human', 'Bearish Bot', 'Bullish Bot']
COL_MAP = {'er_bear_human': 'Bearish Human', 'er_bull_human': 'Bullish Human', 'er_bear_bot': 'Bearish Bot', 'er_bull_bot': 'Bullish Bot'}


def _build_asset_frame(df_all, asset):
    """Build the analysis frame of a single asset.

    df_all : dataframe whose first column level is the asset symbol
    asset  : symbol of the asset to extract (e.g. 'BTC')

    Returns a dataframe with the columns in COLS plus 'Return' (the percentage
    change of 'price'), restricted to the rows where all of them are available.
    """
    df_asset = df_all[asset].rename(columns=COL_MAP)
    df_asset.index = pd.to_datetime(df_asset.index)
    # The return is computed before the rows with missing values are dropped
    df_asset['Return'] = df_asset['price'].pct_change()
    return df_asset[COLS+['Return']].dropna()


# One frame per analysed asset
df_btc = _build_asset_frame(df_er, 'BTC')
df_eth = _build_asset_frame(df_er, 'ETH')
df_doge = _build_asset_frame(df_er, 'DOGE')

### BTC

#### Correlation Matrix

In [None]:
# Obtains the correlation matrix
corr_matrix = df_btc.corr()

# Masks the upper triangle (diagonal included) by position.
# Building the mask from the correlation values themselves would leave any
# upper-triangle cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Plots the correlation matrix
sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(f"{SAVING_FOLDER}/imgs/btc_corr_matrix.{SAVING_FORMAT}", format=SAVING_FORMAT, dpi=DPI, bbox_inches='tight')
plt.show()

#### Causality Matrix

##### Stationarity Check

In [None]:
# Working copy that gets differenced, and a counter of how many times the diff
# operation was performed on each column to reach stationarity
df_btc_stationay = df_btc.copy().dropna()
summary = { col: 0 for col in COLS+['Return'] }

# Performs the ADF test on each column in COLS; while the series is not stationary,
# replaces it with its first difference and re-tests the *differenced* series.
# (Re-testing the original column would never change the p-value and loop forever.)
# 'Return' is not tested here, so its counter stays at 0.
for col in COLS:
    # adfuller returns (statistic, pvalue, ...); NaNs left by diff() must be removed first
    pvalue = adfuller(df_btc_stationay[col].dropna())[1]
    while pvalue > ALPHA:
        df_btc_stationay[col] = df_btc_stationay[col].diff()
        pvalue = adfuller(df_btc_stationay[col].dropna())[1]
        summary[col] += 1

# Differencing leaves NaNs in the first rows; drops them so every row is a complete observation
df_btc_stationay = df_btc_stationay.dropna()

# Shows how many times the first difference operation was performed in each column
summary

##### Number of Lags

In [None]:
# Instantiates the VAR model on the stationarised series, i.e. the same data
# the Granger causality tests are run on
model = VAR(df_btc_stationay)

# Finds the model with the lowest AIC among lag orders 1..49
best_model_fit = model.fit(1)
for lag in range(2, 50):
    model_fit = model.fit(lag)
    if model_fit.aic < best_model_fit.aic:
        best_model_fit = model_fit

# Saves the lag that provides the lowest AIC
maxlag = best_model_fit.k_ar
maxlag

##### Granger Causality Matrix

In [None]:
# Pairwise Granger-causality p-values, using the lag order selected above
df_tmp = granger_causality_matrix(data=df_btc_stationay, maxlag=maxlag)

# Draws the p-value heatmap, saves it to disk and displays it
sns.heatmap(data=df_tmp, cmap='Blues', annot=True)
plt.xticks(rotation=45)
plt.savefig(
    f"{SAVING_FOLDER}/imgs/causality_btc.{SAVING_FORMAT}",
    dpi=DPI,
    format=SAVING_FORMAT,
    bbox_inches='tight',
)
plt.show()

#### Linear Regression

##### Scaling

In [13]:
# Fits a min-max scaler (default settings) on the engagement-rate columns,
# then overwrites those columns of df_btc with their scaled values
scaler = MinMaxScaler().fit(df_btc[COLS])
df_btc[COLS] = scaler.transform(df_btc[COLS])

##### Fit Regression

In [None]:
# Response variable: the return; regressors: the engagement-rate columns plus an intercept
y = df_btc['Return']
X = sm.add_constant(df_btc[COLS])

# Fits the OLS regression
model = sm.OLS(endog=y, exog=X)
model_fit = model.fit()

# Exports the summary as a LaTeX table, then shows it in the notebook
with open(f"{SAVING_FOLDER}/tables/ols_btc.tex", 'w+') as f:
    print(model_fit.summary().as_latex(), file=f, end='')
print(model_fit.summary())

### ETH

#### Correlation Matrix

In [None]:
# Obtains the correlation matrix
corr_matrix = df_eth.corr()

# Masks the upper triangle (diagonal included) by position.
# Building the mask from the correlation values themselves would leave any
# upper-triangle cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Plots the correlation matrix
sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(f"{SAVING_FOLDER}/imgs/eth_corr_matrix.{SAVING_FORMAT}", format=SAVING_FORMAT, dpi=DPI, bbox_inches='tight')
plt.show()

#### Causality Matrix

##### Stationarity Check

In [None]:
# Working copy that gets differenced, and a counter of how many times the diff
# operation was performed on each column to reach stationarity
df_eth_stationay = df_eth.copy()
summary = { col: 0 for col in COLS+['Return'] }

# Performs the ADF test on each column in COLS; while the series is not stationary,
# replaces it with its first difference and re-tests the *differenced* series.
# (Re-testing the original column would never change the p-value and loop forever.)
# 'Return' is not tested here, so its counter stays at 0.
for col in COLS:
    # adfuller returns (statistic, pvalue, ...); NaNs left by diff() must be removed first
    pvalue = adfuller(df_eth_stationay[col].dropna())[1]
    while pvalue > ALPHA:
        df_eth_stationay[col] = df_eth_stationay[col].diff()
        pvalue = adfuller(df_eth_stationay[col].dropna())[1]
        summary[col] += 1

# Differencing leaves NaNs in the first rows; drops them so every row is a complete observation
df_eth_stationay = df_eth_stationay.dropna()

# Shows how many times the first difference operation was performed in each column
summary

##### Number of Lags

In [None]:
# Instantiates the VAR model on the stationarised series, i.e. the same data
# the Granger causality tests are run on
model = VAR(df_eth_stationay)

# Finds the model with the lowest AIC among lag orders 1..49
best_model_fit = model.fit(1)
for lag in range(2, 50):
    model_fit = model.fit(lag)
    if model_fit.aic < best_model_fit.aic:
        best_model_fit = model_fit

# Saves the lag that provides the lowest AIC
maxlag = best_model_fit.k_ar
maxlag

##### Granger Causality Matrix

In [None]:
# Pairwise Granger-causality p-values, using the lag order selected above
df_tmp = granger_causality_matrix(data=df_eth_stationay, maxlag=maxlag)

# Draws the p-value heatmap, saves it to disk and displays it
sns.heatmap(data=df_tmp, cmap='Blues', annot=True)
plt.xticks(rotation=45)
plt.savefig(
    f"{SAVING_FOLDER}/imgs/causality_eth.{SAVING_FORMAT}",
    dpi=DPI,
    format=SAVING_FORMAT,
    bbox_inches='tight',
)
plt.show()

#### Linear Regression

##### Scaling

In [19]:
# Fits a min-max scaler (default settings) on the engagement-rate columns,
# then overwrites those columns of df_eth with their scaled values
scaler = MinMaxScaler().fit(df_eth[COLS])
df_eth[COLS] = scaler.transform(df_eth[COLS])

##### Fit Regression

In [None]:
# Response variable: the return; regressors: the engagement-rate columns plus an intercept
y = df_eth['Return']
X = sm.add_constant(df_eth[COLS])

# Fits the OLS regression
model = sm.OLS(endog=y, exog=X)
model_fit = model.fit()

# Exports the summary as a LaTeX table, then shows it in the notebook
with open(f"{SAVING_FOLDER}/tables/ols_eth.tex", 'w+') as f:
    print(model_fit.summary().as_latex(), file=f, end='')
print(model_fit.summary())

### DOGE

#### Correlation Matrix

In [None]:
# Obtains the correlation matrix
corr_matrix = df_doge.corr()

# Masks the upper triangle (diagonal included) by position.
# Building the mask from the correlation values themselves would leave any
# upper-triangle cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Plots the correlation matrix
sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(f"{SAVING_FOLDER}/imgs/doge_corr_matrix.{SAVING_FORMAT}", format=SAVING_FORMAT, dpi=DPI, bbox_inches='tight')
plt.show()

#### Causality Matrix

##### Stationarity Check

In [None]:
# Working copy that gets differenced, and a counter of how many times the diff
# operation was performed on each column to reach stationarity
df_doge_stationay = df_doge.copy()
summary = { col: 0 for col in COLS+['Return'] }

# Performs the ADF test on each column in COLS; while the series is not stationary,
# replaces it with its first difference and re-tests the *differenced* series.
# (Re-testing the original column would never change the p-value and loop forever.)
# 'Return' is not tested here, so its counter stays at 0.
for col in COLS:
    # adfuller returns (statistic, pvalue, ...); NaNs left by diff() must be removed first
    pvalue = adfuller(df_doge_stationay[col].dropna())[1]
    while pvalue > ALPHA:
        df_doge_stationay[col] = df_doge_stationay[col].diff()
        pvalue = adfuller(df_doge_stationay[col].dropna())[1]
        summary[col] += 1

# Differencing leaves NaNs in the first rows; drops them so every row is a complete observation
df_doge_stationay = df_doge_stationay.dropna()

# Shows how many times the first difference operation was performed in each column
summary

##### Number of Lags

In [None]:
# Instantiates the VAR model on the stationarised series, i.e. the same data
# the Granger causality tests are run on
model = VAR(df_doge_stationay)

# Finds the model with the lowest AIC among lag orders 1..49
best_model_fit = model.fit(1)
for lag in range(2, 50):
    model_fit = model.fit(lag)
    if model_fit.aic < best_model_fit.aic:
        best_model_fit = model_fit

# Saves the lag that provides the lowest AIC
maxlag = best_model_fit.k_ar
maxlag

##### Granger Causality Matrix

In [None]:
# Pairwise Granger-causality p-values, using the lag order selected above
df_tmp = granger_causality_matrix(data=df_doge_stationay, maxlag=maxlag)

# Draws the p-value heatmap, saves it to disk and displays it
sns.heatmap(data=df_tmp, cmap='Blues', annot=True)
plt.xticks(rotation=45)
plt.savefig(
    f"{SAVING_FOLDER}/imgs/causality_doge.{SAVING_FORMAT}",
    dpi=DPI,
    format=SAVING_FORMAT,
    bbox_inches='tight',
)
plt.show()

#### Linear Regression

##### Scaling

In [25]:
# Fits a min-max scaler (default settings) on the engagement-rate columns,
# then overwrites those columns of df_doge with their scaled values
scaler = MinMaxScaler().fit(df_doge[COLS])
df_doge[COLS] = scaler.transform(df_doge[COLS])

##### Fit Regression

In [None]:
# Response variable: the return; regressors: the engagement-rate columns plus an intercept
y = df_doge['Return']
X = sm.add_constant(df_doge[COLS])

# Fits the OLS regression
model = sm.OLS(endog=y, exog=X)
model_fit = model.fit()

# Exports the summary as a LaTeX table, then shows it in the notebook
with open(f"{SAVING_FOLDER}/tables/ols_doge.tex", 'w+') as f:
    print(model_fit.summary().as_latex(), file=f, end='')
print(model_fit.summary())

## Group Effects