# Master Thesis: Sentiment Metric Creation

*By Daniel Deutsch*

In [1]:
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

In [25]:
# Silences every warning for the rest of the notebook
warnings.filterwarnings('ignore')

# Plot appearance: ggplot base style with a custom size, colour cycle and background
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['axes.prop_cycle'] = plt.cycler(
    color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']
)
plt.rcParams['axes.facecolor'] = '#EAEAF2'

# Date-range constants
START_DATE = datetime(year=2019, month=6, day=1)
END_DATE = datetime(year=2022, month=6, day=1)

# Data Import

In [3]:
# Loads the datasets
df_ohlcv = pd.read_csv("./datasets/processed/ohlcv.csv.gz", index_col=0, parse_dates=['date'], low_memory=False).set_index(['base_asset', 'date'])
df_twits = pd.read_csv("./datasets/classified/twits.csv.gz", index_col=0, parse_dates=['date'], low_memory=False)
# Every user column gets a 'user.' prefix so it can be joined on 'user.id'
df_users = pd.read_csv("./datasets/enhanced/users.csv.gz", index_col=0, parse_dates=['join_date'], low_memory=False).add_prefix('user.')

# Add number of user followers to the twits dataframe
df_twits = pd.merge(df_twits, df_users, on='user.id', how='left')

# Flags rows that have no label, then fills those gaps from `label_pred`
df_twits['is_pred'] = df_twits['label'].isna()
df_twits['label'] = df_twits['label'].combine_first(df_twits['label_pred'])

# Keeps only the columns used downstream. The explicit .copy() makes the subset
# an independent frame, so the column assignments below are not chained
# assignments on a slice (SettingWithCopyWarning, hidden by the warning filter).
df_twits = df_twits[['id', 'date', 'base_asset', 'n_likes', 'n_reshares', 'user.type', 'user.followers', 'label', 'is_pred']].copy()

# Negative follower counts are raised to 0
df_twits['user.followers'] = df_twits['user.followers'].clip(lower=0)

# Strips the timezone information from the timestamps
df_twits['date'] = df_twits['date'].dt.tz_localize(None)

# Sentiment Metric

## Engagement Rate (Modified)

\begin{align*}
    \text{Modified Engagement Rate} & = \frac{\text{likes} + \text{reshares}}{\text{followers} + 1}
\end{align*}

In [4]:
# Modified engagement rate: (likes + reshares) divided by (followers + 1)
df_twits['er'] = (
    df_twits['n_likes']
    .add(df_twits['n_reshares'])
    .div(df_twits['user.followers'].add(1))
)

### Average Engagement Rate Table 

In [None]:
saving_folder = "./latex"

# Mean engagement rate with one row per label and one column per user type;
# both axis names are cleared and the 'User' column is shown as 'Human'
df_tmp = (
    df_twits
    .groupby(['user.type', 'label'])['er']
    .mean()
    .unstack('user.type')
    .rename_axis(None, axis=0)
    .rename_axis(None, axis=1)
    .rename(columns={'User': 'Human'})
)

# Writes the table to a .tex file with three-decimal formatting
df_tmp.style.format(formatter='{:,.3f}').to_latex(
    buf=f"{saving_folder}/tables/avg_er_per_label_adn_user_type.tex",
    label="table:avg_er_per_label_adn_user_type",
    caption="Average engagement rate for each user type and label",
    position='H',
    position_float='centering',
    hrules=True,
)
df_tmp

## Aggregated Engagement Rate

In [18]:
# Row masks per sentiment label, optionally restricted to one account type
label, user_type = df_twits['label'], df_twits['user.type']
mask_bull = label == 'Bullish'
mask_bull_user = mask_bull & (user_type == 'User')
mask_bull_bot = mask_bull & (user_type == 'Bot')
mask_bear = label == 'Bearish'
mask_bear_user = mask_bear & (user_type == 'User')
mask_bear_bot = mask_bear & (user_type == 'Bot')

# (mask, named aggregations of 'er') for every feature table, in merge order
feature_specs = [
    (mask_bull, {'er_bull': 'mean'}),
    (mask_bull_user, {'n_twits_bull_user': 'size', 'er_bull_user': 'mean'}),
    (mask_bull_bot, {'n_twits_bull_bot': 'size', 'er_bull_bot': 'mean'}),
    (mask_bear, {'er_bear': 'mean'}),
    (mask_bear_user, {'n_twits_bear_user': 'size', 'er_bear_user': 'mean'}),
    (mask_bear_bot, {'n_twits_bear_bot': 'size', 'er_bear_bot': 'mean'}),
]

# One table per spec: 'er' aggregated per asset and per hour (timestamps floored to the hour)
feature_frames = []
for mask, named_aggs in feature_specs:
    selected = df_twits[mask]
    hour = selected['date'].dt.floor('h')
    feature_frames.append(selected.groupby(['base_asset', hour])['er'].agg(**named_aggs))

# Outer-joins all feature tables and the price on the (base_asset, date) index,
# then moves the asset into the top level of the column index
df = feature_frames[0]
for other in feature_frames[1:] + [df_ohlcv[['price']]]:
    df = pd.merge(df, other, left_index=True, right_index=True, how='outer')
df = df.unstack('base_asset').swaplevel(axis=1).sort_index(axis=1)

# Releases the inputs and the intermediates that are no longer needed
del df_ohlcv, df_twits, df_users, label, user_type, feature_frames, selected, hour, other

# Persists the final dataframe
df.to_csv("./datasets/metric.csv.gz")

In [None]:
# Output settings for the exported figure
saving_folder = "./latex"
saving_format = 'png'
dpi = 100

# 2x2 grid with one panel per BTC engagement-rate series, in column order
axs = df['BTC'][['er_bear_bot', 'er_bear_user', 'er_bull_bot', 'er_bull_user']].plot(
    subplots=True,
    layout=(2, 2),
    sharex=False,
    legend=False,
    figsize=(20, 15),
)

# Titles follow the row-major panel order returned by flatten()
for ax, title in zip(
    axs.flatten(),
    (
        "Bearish Twits Made by Bots",
        "Bearish Twits Made by Humans",
        "Bullish Twits Made by Bots",
        "Bullish Twits Made by Humans",
    ),
):
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Average Engagement Rate")

plt.savefig(
    f"{saving_folder}/imgs/sentiment_metric_btc.{saving_format}",
    format=saving_format,
    dpi=dpi,
    bbox_inches='tight',
)
plt.show()

## Metric Evaluation

In [None]:
# Reloads the saved metric. header=[0, 1] rebuilds the two-level column index,
# and parse_dates=True parses the index column back into datetimes; without it
# the dates are read as plain strings.
df = pd.read_csv("./datasets/metric.csv.gz", index_col=0, header=[0, 1], parse_dates=True)

In [None]:
def validate_stationarity(data, significance='5%', autolag='AIC'):
    """Run the Augmented Dickey-Fuller test on every column of ``data``.

    The null hypothesis of the test is that the series has a unit root (is
    non-stationary). A column is reported as stationary when its test
    statistic lies below the critical value at the requested level.

    Parameters
    ----------
    data : pandas.DataFrame
        One time series per column. Missing values are dropped per column
        before testing, because ``adfuller`` does not accept them.
    significance : str, default '5%'
        Key of the critical value to compare against, as returned by
        ``adfuller`` ('1%', '5%' or '10%').
    autolag : str or None, default 'AIC'
        Lag-selection method, forwarded to ``adfuller``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns ``variable``,
        ``t_statistic``, ``p_value``, ``used_lags``, ``nobs``,
        ``critical_value`` and ``is_stationary``.

    Raises
    ------
    ValueError
        If ``significance`` is not one of the critical-value keys.
    """
    result_columns = ['variable', 't_statistic', 'p_value', 'used_lags', 'nobs', 'critical_value', 'is_stationary']
    records = []

    for col in data.columns:
        series = data[col].dropna()

        # Only the first five values are used: the sixth (icbest) is omitted
        # by adfuller when autolag is None.
        t_statistic, p_value, used_lags, nobs, critical_values = adfuller(series, autolag=autolag)[:5]

        if significance not in critical_values:
            raise ValueError(
                f"significance must be one of {sorted(critical_values)}, got {significance!r}"
            )
        critical_value = critical_values[significance]

        records.append({
            'variable': col,
            't_statistic': t_statistic,
            'p_value': p_value,
            'used_lags': used_lags,
            'nobs': nobs,
            'critical_value': critical_value,
            'is_stationary': bool(t_statistic < critical_value),
        })

    # Built once from the collected records instead of concatenating inside the loop
    return pd.DataFrame(records, columns=result_columns)



In [None]:
# `df_btc` is only created in a later cell, so on a fresh kernel this cell
# would raise NameError. The same series is rebuilt here directly from `df`:
# add the price percentage change, drop every row that has a missing value in
# any column, and test the resulting column.
adfuller(
    df['BTC']
    .assign(price_pct_change=lambda frame: frame['price'].pct_change())
    .dropna()['price_pct_change'],
    autolag='AIC'
)

## BTC

In [22]:
# Works on an explicit copy so that adding a column is not an assignment on a
# slice of `df` (the source of SettingWithCopyWarning).
df_btc = df['BTC'].copy()
df_btc['price_pct_change'] = df_btc['price'].pct_change()
# Drops every row with a missing value in any column; reassigned rather than
# mutated in place.
df_btc = df_btc.dropna()

### Stationarity

### Causality Test

### Significance

In [None]:
# Regresses the BTC price change on every remaining column of df_btc.
# NOTE(review): no constant column is added, so the model has no intercept
# unless the features already contain one; confirm this is intended.
model = sm.OLS(
    endog=df_btc['price_pct_change'],
    exog=df_btc.drop(columns=['price', 'price_pct_change']),
)
model_fit = model.fit()
print(model_fit.summary())

## ETH

In [None]:
# Works on an explicit copy so that adding a column is not an assignment on a
# slice of `df` (the source of SettingWithCopyWarning).
df_eth = df['ETH'].copy()
df_eth['price_pct_change'] = df_eth['price'].pct_change()
# Drops every row with a missing value in any column; reassigned rather than
# mutated in place.
df_eth = df_eth.dropna()

### Stationarity

### Causality Test

### Significance

In [None]:
# Regresses the ETH price change on every remaining column of df_eth.
# NOTE(review): no constant column is added, so the model has no intercept
# unless the features already contain one; confirm this is intended.
model = sm.OLS(
    endog=df_eth['price_pct_change'],
    exog=df_eth.drop(columns=['price', 'price_pct_change']),
)
model_fit = model.fit()
print(model_fit.summary())

## ADA

In [None]:
# Works on an explicit copy so that adding a column is not an assignment on a
# slice of `df` (the source of SettingWithCopyWarning).
df_ada = df['ADA'].copy()
df_ada['price_pct_change'] = df_ada['price'].pct_change()
# Drops every row with a missing value in any column; reassigned rather than
# mutated in place.
df_ada = df_ada.dropna()

### Stationarity

### Causality Test

### Significance

In [None]:
# Regresses the ADA price change on every remaining column of df_ada.
# NOTE(review): no constant column is added, so the model has no intercept
# unless the features already contain one; confirm this is intended.
model = sm.OLS(
    endog=df_ada['price_pct_change'],
    exog=df_ada.drop(columns=['price', 'price_pct_change']),
)
model_fit = model.fit()
print(model_fit.summary())

## SHIB

In [None]:
# Works on an explicit copy so that adding a column is not an assignment on a
# slice of `df` (the source of SettingWithCopyWarning).
df_shib = df['SHIB'].copy()
df_shib['price_pct_change'] = df_shib['price'].pct_change()
# Drops every row with a missing value in any column; reassigned rather than
# mutated in place.
df_shib = df_shib.dropna()

### Stationarity

### Causality Test

### Significance

In [None]:
# Regresses the SHIB price change on every remaining column of df_shib.
# NOTE(review): no constant column is added, so the model has no intercept
# unless the features already contain one; confirm this is intended.
model = sm.OLS(
    endog=df_shib['price_pct_change'],
    exog=df_shib.drop(columns=['price', 'price_pct_change']),
)
model_fit = model.fit()
print(model_fit.summary())

## DOGE

In [None]:
# Works on an explicit copy so that adding a column is not an assignment on a
# slice of `df` (the source of SettingWithCopyWarning).
df_doge = df['DOGE'].copy()
df_doge['price_pct_change'] = df_doge['price'].pct_change()
# Drops every row with a missing value in any column; reassigned rather than
# mutated in place.
df_doge = df_doge.dropna()

### Stationarity

### Causality Test

### Significance

In [None]:
# Regresses the DOGE price change on every remaining column of df_doge.
# NOTE(review): no constant column is added, so the model has no intercept
# unless the features already contain one; confirm this is intended.
model = sm.OLS(
    endog=df_doge['price_pct_change'],
    exog=df_doge.drop(columns=['price', 'price_pct_change']),
)
model_fit = model.fit()
print(model_fit.summary())