# Master Thesis: Data Visualization

*By Daniel Deutsch*

In [15]:
import collections
import re
import string
import textwrap
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

In [16]:
# Export settings reused by every figure/table written by this notebook
saving_folder, saving_format, dpi = "./latex", 'png', 100

# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Start from the ggplot base style, then layer the notebook's overrides on top
palette = ['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']
style_overrides = {
    'figure.figsize': (15, 7),
    'axes.prop_cycle': plt.cycler(color=palette),
    'axes.labelsize': 22,
    'axes.titlesize': 24,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 16,
    'legend.title_fontsize': 16,
    'axes.labelpad': 10,
    'axes.facecolor': '#EAEAF2',
}
plt.style.use('ggplot')
plt.rcParams.update(style_overrides)

def grangers_causation_matrix(data, maxlag, test='ssr_chi2test', verbose=False):
    """Build a matrix of pairwise Granger-causality p-values.

    For every ordered pair of columns, ``grangercausalitytests`` is run for all
    lags from 1 to ``maxlag`` and the smallest p-value of the selected test is
    stored. Rows are the response variable (suffix `` Y``), columns are the
    predictor (suffix `` X``). A p-value below the significance level (e.g.
    0.05) means the null hypothesis "X does not Granger-cause Y" can be
    rejected.

    Parameters
    ----------
    data : pandas.DataFrame
        One time series per column. Rows with a missing value in either series
        of a pair are dropped before that pair is tested.
    maxlag : int
        Highest lag to test; lags ``1..maxlag`` are all evaluated.
    test : str, default 'ssr_chi2test'
        Key of the test statistic to read from the statsmodels result.
    verbose : bool, default False
        If True, print the per-lag p-values of every pair.

    Returns
    -------
    pandas.DataFrame
        Square frame of minimum p-values, indexed ``'<var> Y'`` by ``'<var> X'``.
    """
    variables = data.columns
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in df.columns:
        for r in df.index:
            # Missing values (e.g. leading NaNs left by differencing) would make
            # the test fail, so only complete rows of this pair are used.
            pair = data[[r, c]].dropna()
            test_result = grangercausalitytests(pair, maxlag=maxlag, verbose=False)
            # Result is keyed by lag (1-based); [0] holds the test dict, [1] the p-value
            p_values = [round(test_result[lag][0][test][1], 4) for lag in range(1, maxlag + 1)]
            if verbose:
                print(f'Y = {r}, X = {c}, P Values = {p_values}')
            df.loc[r, c] = np.min(p_values)
    df.columns = [var + ' X' for var in variables]
    df.index = [var + ' Y' for var in variables]
    return df

In [None]:
# Loss-per-epoch curve of the user classifier, read from its saved history file
df_hist = pd.read_csv("./models/user_cls/history.csv")
out_path = f"{saving_folder}/imgs/user_cls_loss.{saving_format}"

plt.plot(df_hist['epoch'], df_hist['loss'])
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

In [186]:
# Tabulate column name -> dtype of `df_raw` and export it as a LaTeX table.
# NOTE(review): `df_raw` is not defined in any cell visible here - confirm where it is loaded.
df_tmp = pd.DataFrame(df_raw.dtypes, columns=['DType']).reset_index().rename(columns={'index': 'Name'})
# The lines below used to add an anonymised 'Example' column; they are currently disabled.
# df_tmp['Example'] = df_raw[df_raw['id'] == 467121451].T[21].tolist()
# df_tmp['Example'].iloc[3] = "{'id': USER_ID, 'username': 'USERNAME', 'name': 'NAME', 'avatar_url': 'HTTPURL', 'avatar_url_ssl': 'HTTPURL', 'join_date': '2017-09-19', 'official': False, 'identity': 'User', 'classification': ['verified'], 'home_country': 'US', 'search_country': 'US', 'followers': 54, 'following': 132, 'ideas': 3251, 'watchlist_stocks_count': 285, 'like_count': 533, 'plus_tier': 'year', 'premium_room': '', 'trade_app': False, 'portfolio': 'private', 'portfolio_status': 'private', 'trade_status': 'PENDING_MFA'}"
# df_tmp['Example'].iloc[10] = "{'total': 3, 'user_ids': [USER_ID1, USER_ID2, USER_ID3]}"

# Write the table to the LaTeX folder (centered float, position 'H', with rules)
df_tmp.style.to_latex(
    f"{saving_folder}/tables/processed_user_data_structure.tex",
    caption="Structure of processed users",
    label="table:processed_user_data_structure",
    position_float='centering',
    position='H',
    hrules=True
)

In [None]:
# Display the current temporary table
df_tmp

In [None]:
# Show the 11th entry of the 'Example' column.
# NOTE(review): in the visible cells 'Example' is only created by commented-out code,
# so this presumably raises KeyError unless those lines are re-enabled - confirm.
df_tmp['Example'].iloc[10]

# Cryptomap Dataset

In [18]:
# Load the raw cryptomap table, using the first CSV column as the index
df_cryptomap = pd.read_csv("./datasets/raw/cryptomap.csv.gz", index_col=0)

# OHLCV Dataset

In [4]:
# Read the processed OHLCV records (long format) and keep only what the plots need
ohlcv_long = pd.read_csv("./datasets/processed/ohlcv.csv.gz", index_col=0, parse_dates=['date'], low_memory=False)
ohlcv_long = ohlcv_long[['date', 'base_asset', 'price']]

# Pivot to wide format: one row per date, one price column per base asset
ohlcv_wide = ohlcv_long.set_index(['base_asset', 'date']).unstack('base_asset')
df_ohlcv = ohlcv_wide['price']

## BTC and ETH Over Time

In [None]:
# One price line per asset, drawn on the same axes
for asset in ('BTC', 'ETH'):
    plt.plot(df_ohlcv.index, df_ohlcv[asset], label=asset)

plt.xlabel("Date")
plt.ylabel("Hourly Close Price (USD)")
plt.legend()

out_path = f"{saving_folder}/imgs/close_price_btc_eth.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Missing Data Plot

In [None]:
# Heatmap of missing prices: rows are dates, columns are assets
missing_prices = df_ohlcv.isna()
ax = sns.heatmap(missing_prices, cmap=sns.cm.rocket_r, cbar=False)

# Keep only the part of each tick label before the 'T' separator (the date)
date_labels = [tick.get_text().split('T')[0] for tick in ax.get_yticklabels()]
ax.set_yticklabels(date_labels)
ax.set_ylabel("Date")
ax.set_xlabel("Crypto Asset")
plt.xticks(rotation=45)

out_path = f"{saving_folder}/imgs/ohlcv_missing_data.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

# Users Dataset

In [2]:
# Load the enhanced users table (first CSV column as index, 'join_date' parsed as datetime)
df_users = pd.read_csv("./datasets/enhanced/users2.csv.gz", index_col=0, parse_dates=['join_date'])

In [None]:
# List the columns available in the users table
df_users.columns

## Distributions

In [None]:
# 3x2 grid of per-user feature histograms (log-scaled counts)
_, axs = plt.subplots(3, 2, figsize=(25, 18))

# (column, x-axis label) pairs, listed in row-major panel order
panels = [
    ('url_rate', "URL Rate"),
    ('n_words_per_twit', "Number of Words per Twit"),
    ('n_assets_per_twit', "Number of Assets per Twit"),
    ('n_emojis_per_twit', "Number of Emojis per Twit"),
    ('n_stopwords_per_twit', "Number of Stopwords per Twit"),
    ('avg_twit_similarity', "Average Twit Similarity"),
]
for panel_ax, (column, xlabel) in zip(axs.flatten(), panels):
    sns.histplot(df_users[column], bins=100, log_scale=(False, True), kde=False, ax=panel_ax)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel("User Count")

out_path = f"{saving_folder}/imgs/users_distributions.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

In [None]:
# Count users per type ("Bot"/"Human") and append a TOTAL row
user_kind = df_users['is_bot'].replace({True: "Bot", False: "Human"}).rename('Number of Samples')
df_tmp = user_kind.value_counts().to_frame()
df_tmp = df_tmp.pivot_table(index=df_tmp.index, margins=True, margins_name='TOTAL', aggfunc=np.sum)

# Format the counts with thousands separators before exporting
df_tmp = df_tmp.style.format('{:,}')

table_path = f"{saving_folder}/tables/user_types_count.tex"
df_tmp.to_latex(
    table_path,
    caption="Number of users per user type",
    label="table:user_types_count",
    position_float='centering',
    position='H',
    hrules=True,
)
df_tmp

# Twits Dataset

In [None]:
# Display the twits table.
# NOTE(review): the cell that loads `df_twits` comes after this one - confirm execution order.
df_twits

In [None]:
# Load the classified twits (first CSV column as index, 'date' parsed as datetime)
df_twits = pd.read_csv("./datasets/classified/twits.csv.gz", index_col=0, parse_dates=['date'], low_memory=False)
# Strip the timezone information so the dates are timezone-naive
df_twits['date'] = df_twits['date'].dt.tz_localize(None)


In [7]:
# Left-join the user table onto the twits via 'user.id'; user columns get a 'user.' prefix
df_twits = pd.merge(df_twits, df_users.add_prefix('user.'), on='user.id', how='left')

In [12]:
# Keep only the twits whose 'user.type' is not missing
df_twits = df_twits.dropna(subset=['user.type'])

In [None]:
# Number of rows and columns of the twits table
df_twits.shape

## Missing Data

In [None]:
# Bucket twits by hour and asset; a missing 'id' in the pivot marks an (hour, asset) cell without twits
hour_bucket = df_twits['date'].dt.floor('h')
first_per_bucket = df_twits.groupby(['base_asset', hour_bucket]).first()
no_twit = first_per_bucket.unstack('base_asset')['id'].isna()

ax = sns.heatmap(no_twit, cmap=sns.cm.rocket_r, cbar=False)

# Keep only the part of each tick label before the 'T' separator (the date)
date_labels = [tick.get_text().split('T')[0] for tick in ax.get_yticklabels()]
ax.set_yticklabels(date_labels)
ax.set_ylabel("Date")
ax.set_xlabel("Crypto Asset")
plt.xticks(rotation=45)

out_path = f"{saving_folder}/imgs/twit_missing_data.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Number of Twits per Label

In [None]:
# Count twits per label (missing labels kept as their own group), add a TOTAL row
# and format the counts with thousands separators.
# NOTE(review): the pivot index is passed positionally as ['Bearish', 'Bullish', 'NaN'];
# this presumably relies on exactly three groups appearing in that order - confirm.
df_tmp = df_twits.groupby('label', dropna=False).size().to_frame('Number of Samples').rename_axis(None).pivot_table(
    index=pd.Index(['Bearish', 'Bullish', 'NaN']),
    margins=True,
    margins_name='TOTAL',
    aggfunc=np.sum
).style.format('{:,}')

# Export the styled table to LaTeX
df_tmp.to_latex(
    f"{saving_folder}/tables/twit_count_per_label.tex",
    caption="Number of twits collected per label",
    label="table:twit_count_per_label",
    position_float='centering',
    position='H',
    hrules=True
)
df_tmp

## Twit Count Over Time

In [None]:
# Monthly twit counts per label (missing labels kept), drawn as stacked bars
month_start = df_twits['date'].dt.to_period('M').dt.to_timestamp()
monthly_counts = df_twits[['date', 'label']].groupby([month_start, 'label'], dropna=False).size().unstack('label')
ax = monthly_counts.plot(kind='bar', stacked=True, rot=45)

# Re-render the ISO timestamps on the x axis as e.g. "Jan 2021"
month_labels = [datetime.fromisoformat(tick.get_text()).strftime("%b %Y") for tick in ax.get_xticklabels()]
ax.set_xticklabels(month_labels)

plt.xlabel("Date")
plt.ylabel("Number of Twits")
plt.legend(title="Label")

out_path = f"{saving_folder}/imgs/twit_count_over_time.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Twit Count per Crypto Asset

In [None]:
# Twit counts per asset and label (missing labels kept), drawn as stacked bars
asset_counts = df_twits[['base_asset', 'label']].groupby(['base_asset', 'label'], dropna=False).size().unstack('label')
asset_counts.plot(kind='bar', stacked=True, rot=45)

plt.xlabel("Crypto Asset")
plt.ylabel("Number of Twits")
plt.legend(title="Label")

out_path = f"{saving_folder}/imgs/twit_count_per_crypto_asset.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

# Bots

In [None]:
fig = plt.figure(figsize=(24, 24))

# Layout: two panels on the top row, one full-width panel underneath
ax0 = plt.subplot2grid((2, 2), (0, 0), colspan=1, fig=fig)
ax1 = plt.subplot2grid((2, 2), (0, 1), colspan=1, fig=fig)
ax2 = plt.subplot2grid((2, 2), (1, 0), colspan=2, fig=fig)

# Top-left: twits per label, stacked by user type
by_label = df_twits[['label', 'user.type']].groupby(['label', 'user.type'], dropna=False).size().unstack('user.type')
by_label.plot(kind='bar', stacked=True, rot=0, ax=ax0)

# Top-right: twits per month, stacked by user type; tick labels shown as e.g. "Jan 2021"
month_start = df_twits['date'].dt.to_period('M').dt.to_timestamp()
by_month = df_twits[['date', 'user.type']].groupby([month_start, 'user.type'], dropna=False).size().unstack('user.type')
by_month.plot(kind='bar', stacked=True, rot=90, ax=ax1)
month_labels = [datetime.fromisoformat(tick.get_text()).strftime("%b %Y") for tick in ax1.get_xticklabels()]
ax1.set_xticklabels(month_labels)

# Bottom: twits per asset, stacked by user type
by_asset = df_twits[['base_asset', 'user.type']].groupby(['base_asset', 'user.type'], dropna=False).size().unstack('user.type')
by_asset.plot(kind='bar', stacked=True, rot=45, ax=ax2)

# Shared axis labelling and legend title for the three panels
for panel, xlabel in ((ax0, "Label"), (ax1, "Date"), (ax2, "Crypto Asset")):
    panel.set_ylabel("Number of Twits")
    panel.set_xlabel(xlabel)
    panel.legend(title="User Type")

out_path = f"{saving_folder}/imgs/bot_human_comparison.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Average Engagement Rate per User Type

# Twits Corpus

In [19]:
# Generic words excluded from the word-frequency charts
extra_stopwords = [
    "...", "will", "going", "see", "let", "one", "next", "still", "know", "time", "back", "coin",
    "price", "new", "day", "don", "think", "today", "soon", "last", "night",
]

# Full exclusion list: lower-cased asset tickers, lower-cased asset names, then the generic words
stopwords = [
    *df_cryptomap['base_asset'].str.lower().tolist(),
    *df_cryptomap['name'].str.lower().tolist(),
    *extra_stopwords,
]

## Bearish

### Human

In [None]:
# Word-frequency bar chart for Bearish twits written by Humans
mask = (df_twits['label'] == 'Bearish') & (df_twits['user.type'] == 'Human')
stopword_pattern = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, stopwords)))
punctuation_table = str.maketrans('', '', string.punctuation)

# Join all twits into one string and drop every non-ASCII character
txt = df_twits[mask]['text_heavy_clean'].str.cat(sep=' ')
txt = txt.encode('ascii', 'ignore').decode('ascii')
# Remove stopwords, then words of one or two characters, then punctuation
txt = stopword_pattern.sub('', txt)
txt = re.sub(r"\b\w{1,2}\b", "", txt)
txt = txt.translate(punctuation_table)

wfreq = collections.Counter(txt.split()).most_common(30)
# The most frequent word is skipped here.
# NOTE(review): the Bullish/Bot chart keeps it - confirm which behaviour is intended.
words, freqs = zip(*wfreq[1:])

plt.figure(figsize=(20, 10))
plt.bar(words, freqs)
ax = plt.gca()
ax.set_xticklabels([textwrap.fill(word, 10) for word in words], rotation=90)

out_path = f"{saving_folder}/imgs/human_bearish_word_freq.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### Bot

In [None]:
# Word-frequency bar chart for Bearish twits written by Bots
mask = (df_twits['label'] == 'Bearish') & (df_twits['user.type'] == 'Bot')
stopword_pattern = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, stopwords)))
punctuation_table = str.maketrans('', '', string.punctuation)

# Join all twits into one string and drop every non-ASCII character
txt = df_twits[mask]['text_heavy_clean'].str.cat(sep=' ')
txt = txt.encode('ascii', 'ignore').decode('ascii')
# Remove stopwords, then words of one or two characters, then punctuation
txt = stopword_pattern.sub('', txt)
txt = re.sub(r"\b\w{1,2}\b", "", txt)
txt = txt.translate(punctuation_table)

wfreq = collections.Counter(txt.split()).most_common(30)
# The most frequent word is skipped here.
# NOTE(review): the Bullish/Bot chart keeps it - confirm which behaviour is intended.
words, freqs = zip(*wfreq[1:])

plt.figure(figsize=(20, 10))
plt.bar(words, freqs)
ax = plt.gca()
ax.set_xticklabels([textwrap.fill(word, 10) for word in words], rotation=90)

out_path = f"{saving_folder}/imgs/bot_bearish_word_freq.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Bullish

### Human

In [None]:
# Word-frequency bar chart for Bullish twits written by Humans
mask = (df_twits['label'] == 'Bullish') & (df_twits['user.type'] == 'Human')
stopword_pattern = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, stopwords)))
punctuation_table = str.maketrans('', '', string.punctuation)

# Join all twits into one string and drop every non-ASCII character
txt = df_twits[mask]['text_heavy_clean'].str.cat(sep=' ')
txt = txt.encode('ascii', 'ignore').decode('ascii')
# Remove stopwords, then words of one or two characters, then punctuation
txt = stopword_pattern.sub('', txt)
txt = re.sub(r"\b\w{1,2}\b", "", txt)
txt = txt.translate(punctuation_table)

wfreq = collections.Counter(txt.split()).most_common(30)
# The most frequent word is skipped here.
# NOTE(review): the Bullish/Bot chart keeps it - confirm which behaviour is intended.
words, freqs = zip(*wfreq[1:])

plt.figure(figsize=(20, 10))
plt.bar(words, freqs)
ax = plt.gca()
ax.set_xticklabels([textwrap.fill(word, 10) for word in words], rotation=90)

out_path = f"{saving_folder}/imgs/human_bullish_word_freq.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### Bot

In [None]:
# Word-frequency bar chart for Bullish twits written by Bots
mask = (df_twits['label'] == 'Bullish') & (df_twits['user.type'] == 'Bot')
stopword_pattern = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, stopwords)))
punctuation_table = str.maketrans('', '', string.punctuation)

# Join all twits into one string and drop every non-ASCII character
txt = df_twits[mask]['text_heavy_clean'].str.cat(sep=' ')
txt = txt.encode('ascii', 'ignore').decode('ascii')
# Remove stopwords, then words of one or two characters, then punctuation
txt = stopword_pattern.sub('', txt)
txt = re.sub(r"\b\w{1,2}\b", "", txt)
txt = txt.translate(punctuation_table)

wfreq = collections.Counter(txt.split()).most_common(30)
# All 30 entries are plotted here.
# NOTE(review): the other three word-frequency charts skip the top word - confirm which is intended.
words, freqs = zip(*wfreq)

plt.figure(figsize=(20, 10))
plt.bar(words, freqs)
ax = plt.gca()
ax.set_xticklabels([textwrap.fill(word, 10) for word in words], rotation=90)

out_path = f"{saving_folder}/imgs/bot_bullish_word_freq.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Word Count

In [None]:
# Distribution of the number of words per twit, split by user type.
# Work on an explicit copy so the new column is not assigned on a slice of df_twits.
df_tmp = df_twits[['user.type', 'text_heavy_clean']].copy()
df_tmp['wcount'] = df_tmp['text_heavy_clean'].str.split().str.len()

_ = plt.figure(figsize=(15, 7))
ax = plt.gca()

sns.histplot(df_tmp, x='wcount', bins=100, log_scale=(False, True), hue='user.type', ax=ax)
ax.set_xlabel("Twit Word Count")
ax.set_ylabel("Number of Twits")
# Keep the legend seaborn built from the hue levels and only retitle it; replacing it
# with hard-coded labels does not tie each label to the colour of its hue level.
ax.get_legend().set_title("User Type")
plt.savefig(f"{saving_folder}/imgs/wcount_distribution.{saving_format}", format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Example

In [None]:
# Example table: one twit shown raw, lightly cleaned and heavily cleaned
text_cols = ['text', 'text_light_clean', 'text_heavy_clean']
df_tmp = df_twits[df_twits['id'] == 355849225][text_cols]

# Remove one word from every version of the text before publishing it
for text_col in text_cols:
    df_tmp[text_col] = df_tmp[text_col].str.replace(" ass ", " ")
# Collapse the tripled word in the heavily cleaned version
df_tmp['text_heavy_clean'] = df_tmp['text_heavy_clean'].str.replace(" fire fire fire ", " fire ")

# Readable row names, then transpose so each version becomes a row
pretty_names = {'text': "Original Text", 'text_light_clean': "Light Text Cleaning", 'text_heavy_clean': "Heavy Text Cleaning"}
df_tmp = df_tmp.rename(columns=pretty_names).T
# 290542 is the row label of the selected twit, which became the column after transposing
df_tmp = df_tmp.rename(columns={290542: "Twit Corpus"})

df_tmp.style.to_latex(
    f"{saving_folder}/tables/eg_text_cleaning.tex",
    caption="Example of light and heavy text cleaning to a twit corpus",
    label="table:eg_text_cleaning",
    position_float='centering',
    position='H',
    hrules=True,
)
df_tmp

# Model

In [123]:
# NOTE(review): the training-metrics cell reads `history`, which is only loaded by the
# disabled line below - confirm where it is supposed to come from.
# history = pd.read_csv("./models/bert/history.csv", index_col=0)

# Numeric encoding of the two sentiment labels
label_codes = {'Bearish': 0, 'Bullish': 1}

# One row per twit id, labelled twits only, then a random sample of fixed size
labelled_twits = df_twits.drop_duplicates(subset=['id']).dropna(subset=['label'])
df_test = labelled_twits.sample(3086782)

# Ground truth, predicted score and predicted class for the sampled twits
y_test = df_test['label'].replace(label_codes)
y_test_scores = df_test['label_pred_score']
y_test_pred = df_test['label_pred'].replace(label_codes)

In [None]:
# Number of rows and columns of the sampled test set
df_test.shape

## Training Metrics

In [None]:
# Train vs. validation curves for loss, accuracy and AUC, one panel each
_, axs = plt.subplots(1, 3, figsize=(30, 12))

# (history column, panel title / y label); the validation column is 'val_<column>'
metrics = [('loss', "Loss Value"), ('accuracy', "Accuracy"), ('auc', "AUC")]
for metric_ax, (column, title) in zip(axs, metrics):
    metric_ax.plot(history[column], label='Train')
    metric_ax.plot(history[f'val_{column}'], label='Validation')
    metric_ax.set_title(title)
    metric_ax.set_ylabel(title)
    metric_ax.set_xlabel("Epoch")
    metric_ax.legend()

out_path = f"{saving_folder}/imgs/bert_training_metrics.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted labels
cf_matrix = confusion_matrix(y_test, y_test_pred)

# Cell annotations: absolute count on the first line, share of all samples below it
counts = [f"{val:0.0f}" for val in cf_matrix.flatten()]
pcts = [f"{100*val:.2f}" for val in cf_matrix.flatten()/np.sum(cf_matrix)]
cell_texts = [f"{count}\n({pct}%)" for count, pct in zip(counts, pcts)]
annot = np.asarray(cell_texts).reshape(2, 2)

# Heatmap with the pre-formatted annotations (fmt='' keeps them verbatim)
class_names = ['Bearish', 'Bullish']
ax = sns.heatmap(cf_matrix, annot=annot, cmap='Blues', fmt='')
ax.set_xlabel("Predicted", fontsize=20)
ax.set_ylabel("True", fontsize=20)
ax.xaxis.set_ticklabels(class_names, fontsize=10)
ax.yaxis.set_ticklabels(class_names, fontsize=10)

out_path = f"{saving_folder}/imgs/bert_confusion_matrix.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## ROC Curve

In [None]:
# ROC points and the area under the curve for the predicted scores
fpr, tpr, thresholds = roc_curve(y_test, y_test_scores)
auc_value = roc_auc_score(y_test, y_test_scores)

# ROC curve plus the dashed diagonal for reference
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle="--")

# Shade the area under the curve and place the AUC text relative to the shaded extent
filled_part = plt.fill_between(fpr, tpr, color='#8EB9D7')
extent = filled_part.get_paths()[0].get_extents()
(x0, y0), (x1, y1) = extent.get_points()
plt.text(x1/2, y1/3, f"AUC = {auc_value:.3f}", fontsize=16)

plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")

out_path = f"{saving_folder}/imgs/bert_roc.{saving_format}"
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

## Classification Report

In [None]:
# Classification report as a frame: one row per class/average, one column per metric
report = classification_report(y_test, y_test_pred, output_dict=True)
df_clf_report = pd.DataFrame(report).rename(columns={'0': 'Bearish', '1': 'Bullish'}).transpose()

# Export with three decimals; the unformatted frame is what the cell displays
styled_report = df_clf_report.style.format(formatter='{:,.3f}')
styled_report.to_latex(
    f"{saving_folder}/tables/bert_classification_report.tex",
    caption="Classification report of the BERT based neural network",
    label="table:bert_classification_report",
    position_float='centering',
    position='H',
    hrules=True,
)
df_clf_report

## Dataset Split

In [114]:
# Selects only useful columns and drops every row with a missing value in any of them
df_tmp = pd.read_csv("./datasets/enhanced/twits.csv.gz", low_memory=False)[['id', 'user.type', 'base_asset', 'text', 'label']].dropna()

# Drop duplicates on id (same twit can tag multiple base_asset)
df_tmp.drop_duplicates('id', ignore_index=True, inplace=True)

# Gets a small sample of the dataset for training and testing (balanced labels and base_assets):
# within each base_asset, every label is sampled down to the size of that asset's rarest label.
# The sampling is random and no seed is set here.
df_train = df_tmp.groupby('base_asset', group_keys=False).apply(lambda x: x.groupby('label', group_keys=False).apply(lambda y: y.sample(x['label'].value_counts().min())))
# Validation/test construction and the 'set' column below are currently disabled.
# df_val = df_tmp[~df_tmp['id'].isin(df_train['id'])].groupby('base_asset', group_keys=False).apply(lambda x: x.groupby('label', group_keys=False).apply(lambda y: y.sample(x['label'].value_counts().min())))
# df_test = df_tmp[(~df_tmp['id'].isin(df_train['id'])) & (~df_tmp['id'].isin(df_val['id']))]

# df_tmp.loc[df_tmp['id'].isin(df_train['id']), 'set'] = 'Train'
# df_tmp.loc[df_tmp['id'].isin(df_val['id']), 'set'] = 'Validation'
# df_tmp.loc[df_tmp['id'].isin(df_test['id']), 'set'] = 'Test'

# Sentiment Metric

In [40]:
import pandas as pd

# Raw metric column -> display name; dict order defines the column order used in plots/models
col_map = {
    'er_bear_human': 'Bearish Human',
    'er_bull_human': 'Bullish Human',
    'er_bear_bot': 'Bearish Bot',
    'er_bull_bot': 'Bullish Bot',
}
cols = list(col_map.values())
base_assets = ['BTC', 'ETH', 'DOGE', 'SHIB']

# Engagement-rate file with a two-level column header
df_metric = pd.read_csv("./datasets/engagement_rate.csv.gz", index_col=0, header=[0, 1])

## Example

In [None]:
# BTC engagement-rate series with readable column names and a datetime index
df_btc = df_metric['BTC'].rename(columns=col_map)
df_btc.index = pd.to_datetime(df_btc.index)

# One subplot per sentiment/user-type series on a 2x2 grid
axs = df_btc[cols].plot(subplots=True, layout=(2, 2), sharex=False, legend=False, figsize=(30, 25))

# The subplots are filled in column order, so pair each axis with the column it shows
# and derive the title from that column name ("<Label> <UserType>") instead of
# hard-coding titles that can drift out of sync with `cols`.
for ax, col in zip(axs.flatten(), cols):
    label, user_type = col.split()
    ax.set_title(f"{label} Twits Made by {user_type}s")
    ax.set_xlabel("Date")
    ax.set_ylabel("Average Engagement Rate")

plt.savefig(f"{saving_folder}/imgs/sentiment_metric_btc.{saving_format}", format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

In [None]:
# Summary statistics of the BTC metric frame
df_btc.describe()

## BTC

In [61]:
# BTC frame: readable column names, datetime index, period-over-period return
df_btc = df_metric['BTC'].rename(columns=col_map)
df_btc.index = pd.to_datetime(df_btc.index)
df_btc = df_btc.assign(Return=df_btc['price'].pct_change())

# Keep the four engagement series plus the return; drop rows with any missing value
df_btc = df_btc[[*cols, 'Return']].dropna()

### Correlation Matrix

In [None]:
# Pairwise correlations; the upper triangle (diagonal included) is masked out of the heatmap
corr_matrix = df_btc.corr()
mask = np.triu(corr_matrix)
out_path = f"{saving_folder}/imgs/btc_corr_matrix.{saving_format}"

sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### Causality Matrix

#### Check Stationarity

In [None]:
# Difference each engagement series until the ADF test rejects a unit root (p <= 0.05).
# `summary` records how many times each column was differenced ('Return' is not tested).
df_btc_stationay = df_btc.copy()
summary = { col: 0 for col in cols+['Return'] }
for col in cols:
    # Always test the working (possibly differenced) series; re-testing the original
    # column would never change the p-value and the loop could not terminate.
    # dropna() removes the leading NaN that each diff() introduces.
    pvalue = adfuller(df_btc_stationay[col].dropna())[1]

    while pvalue > 0.05:
        df_btc_stationay[col] = df_btc_stationay[col].diff()
        pvalue = adfuller(df_btc_stationay[col].dropna())[1]
        summary[col] += 1

# Rows made incomplete by differencing are removed so every column stays aligned
df_btc_stationay = df_btc_stationay.dropna()

summary

#### Obtains the Number of Lags

In [None]:
# Fit a VAR for every lag order from 1 to 49 and keep the fit with the lowest AIC
model = VAR(df_btc)

best_model_fit = None
for maxlag in range(1, 50):
    model_fit = model.fit(maxlag)
    # `is None` is the identity check for the "nothing selected yet" sentinel
    if best_model_fit is None or model_fit.aic < best_model_fit.aic:
        best_model_fit = model_fit

# Lag order of the selected fit, reused by the Granger causality cell
maxlag = best_model_fit.k_ar
maxlag

#### Obtains the Granger Causality Matrix

In [None]:
# Minimum Granger-causality p-values between all BTC series, shown as a heatmap
df_tmp = grangers_causation_matrix(df_btc_stationay, maxlag=maxlag)
out_path = f"{saving_folder}/imgs/causality_btc.{saving_format}"

sns.heatmap(df_tmp, annot=True, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### OLS

In [6]:
# Min-max scale the four engagement columns in place; 'Return' is left unscaled
scaler = MinMaxScaler()
df_btc[cols] = scaler.fit_transform(df_btc[cols])

In [None]:
# Dependent variable: the return. Regressors: the four engagement series plus an intercept
y = df_btc['Return']
X = sm.add_constant(df_btc[cols])

# Fit the OLS regression, save its summary as LaTeX and print it
model = sm.OLS(y, X)
model_fit = model.fit()
ols_summary = model_fit.summary()
with open(f"{saving_folder}/tables/ols_btc.tex", 'w+') as f:
    f.write(ols_summary.as_latex())
print(ols_summary)

## ETH

In [70]:
# ETH frame: readable column names, datetime index, period-over-period return
df_eth = df_metric['ETH'].rename(columns=col_map)
df_eth.index = pd.to_datetime(df_eth.index)
df_eth = df_eth.assign(Return=df_eth['price'].pct_change())

# Keep the four engagement series plus the return; drop rows with any missing value
df_eth = df_eth[[*cols, 'Return']].dropna()

### Correlation Matrix

In [None]:
# Pairwise correlations; the upper triangle (diagonal included) is masked out of the heatmap
corr_matrix = df_eth.corr()
mask = np.triu(corr_matrix)
out_path = f"{saving_folder}/imgs/eth_corr_matrix.{saving_format}"

sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### Causality Matrix

#### Check Stationarity

In [None]:
# Difference each engagement series until the ADF test rejects a unit root (p <= 0.05).
# `summary` records how many times each column was differenced ('Return' is not tested).
df_eth_stationay = df_eth.copy()
summary = { col: 0 for col in cols+['Return'] }
for col in cols:
    # Always test the working (possibly differenced) series; re-testing the original
    # column would never change the p-value and the loop could not terminate.
    # dropna() removes the leading NaN that each diff() introduces.
    pvalue = adfuller(df_eth_stationay[col].dropna())[1]

    while pvalue > 0.05:
        df_eth_stationay[col] = df_eth_stationay[col].diff()
        pvalue = adfuller(df_eth_stationay[col].dropna())[1]
        summary[col] += 1

# Rows made incomplete by differencing are removed so every column stays aligned
df_eth_stationay = df_eth_stationay.dropna()

summary

#### Obtains the Number of Lags

In [None]:
# Fit a VAR for every lag order from 1 to 49 and keep the fit with the lowest AIC
model = VAR(df_eth)

best_model_fit = None
for maxlag in range(1, 50):
    model_fit = model.fit(maxlag)
    # `is None` is the identity check for the "nothing selected yet" sentinel
    if best_model_fit is None or model_fit.aic < best_model_fit.aic:
        best_model_fit = model_fit

# Lag order of the selected fit, reused by the Granger causality cell
maxlag = best_model_fit.k_ar
maxlag

#### Obtains the Granger Causality Matrix

In [None]:
# Minimum Granger-causality p-values between all ETH series, shown as a heatmap
df_tmp = grangers_causation_matrix(df_eth_stationay, maxlag=maxlag)
out_path = f"{saving_folder}/imgs/causality_eth.{saving_format}"

sns.heatmap(df_tmp, annot=True, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### OLS

In [39]:
# Min-max scale the four engagement columns in place; 'Return' is left unscaled
scaler = MinMaxScaler()
df_eth[cols] = scaler.fit_transform(df_eth[cols])

In [None]:
# Dependent variable: the return. Regressors: the four engagement series plus an intercept
y = df_eth['Return']
X = sm.add_constant(df_eth[cols])

# Fit the OLS regression, save its summary as LaTeX and print it
model = sm.OLS(y, X)
model_fit = model.fit()
ols_summary = model_fit.summary()
with open(f"{saving_folder}/tables/ols_eth.tex", 'w+') as f:
    f.write(ols_summary.as_latex())
print(ols_summary)

## DOGE

In [75]:
# DOGE frame: readable column names, datetime index, period-over-period return
df_doge = df_metric['DOGE'].rename(columns=col_map)
df_doge.index = pd.to_datetime(df_doge.index)
df_doge = df_doge.assign(Return=df_doge['price'].pct_change())

# Keep the four engagement series plus the return; drop rows with any missing value
df_doge = df_doge[[*cols, 'Return']].dropna()

### Correlation Matrix

In [None]:
# Pairwise correlations; the upper triangle (diagonal included) is masked out of the heatmap
corr_matrix = df_doge.corr()
mask = np.triu(corr_matrix)
out_path = f"{saving_folder}/imgs/doge_corr_matrix.{saving_format}"

sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### Causality Matrix

#### Check Stationarity

In [None]:
# Difference each engagement series until the ADF test rejects a unit root (p <= 0.05).
# `summary` records how many times each column was differenced ('Return' is not tested).
df_doge_stationay = df_doge.copy()
summary = { col: 0 for col in cols+['Return'] }
for col in cols:
    # Always test the working (possibly differenced) series; re-testing the original
    # column would never change the p-value and the loop could not terminate.
    # dropna() removes the leading NaN that each diff() introduces.
    pvalue = adfuller(df_doge_stationay[col].dropna())[1]

    while pvalue > 0.05:
        df_doge_stationay[col] = df_doge_stationay[col].diff()
        pvalue = adfuller(df_doge_stationay[col].dropna())[1]
        summary[col] += 1

# Rows made incomplete by differencing are removed so every column stays aligned
df_doge_stationay = df_doge_stationay.dropna()

summary

#### Obtains the Number of Lags

In [None]:
# Fit a VAR for every lag order from 1 to 49 and keep the fit with the lowest AIC
model = VAR(df_doge)

best_model_fit = None
for maxlag in range(1, 50):
    model_fit = model.fit(maxlag)
    # `is None` is the identity check for the "nothing selected yet" sentinel
    if best_model_fit is None or model_fit.aic < best_model_fit.aic:
        best_model_fit = model_fit

# Lag order of the selected fit, reused by the Granger causality cell
maxlag = best_model_fit.k_ar
maxlag

#### Obtains the Granger Causality Matrix

In [None]:
# Minimum Granger-causality p-values between all DOGE series, shown as a heatmap
df_tmp = grangers_causation_matrix(df_doge_stationay, maxlag=maxlag)
out_path = f"{saving_folder}/imgs/causality_doge.{saving_format}"

sns.heatmap(df_tmp, annot=True, cmap='Blues')
plt.xticks(rotation=45)
plt.savefig(out_path, format=saving_format, dpi=dpi, bbox_inches='tight')
plt.show()

### OLS

In [51]:
# Min-max scale the four engagement columns in place; 'Return' is left unscaled
scaler = MinMaxScaler()
df_doge[cols] = scaler.fit_transform(df_doge[cols])

In [None]:
# Dependent variable: the return. Regressors: the four engagement series plus an intercept
y = df_doge['Return']
X = sm.add_constant(df_doge[cols])

# Fit the OLS regression, save its summary as LaTeX and print it
model = sm.OLS(y, X)
model_fit = model.fit()
ols_summary = model_fit.summary()
with open(f"{saving_folder}/tables/ols_doge.tex", 'w+') as f:
    f.write(ols_summary.as_latex())
print(ols_summary)