# Master Thesis: Sentiment Classification Models

#### The Influence of Bots in the Crypto Market

*By Daniel Jorge Deutsch*

In [1]:
import collections
import re
import ssl
import string
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from IPython.display import clear_output
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# Setup

In [2]:
# Silence every warning raised from here on
warnings.filterwarnings('ignore')

# Replace the default HTTPS context factory with the unverified one,
# i.e. TLS certificates are no longer checked for contexts created via the ssl defaults
# NOTE(review): presumably needed for the remote model downloads below - confirm it is still required
ssl._create_default_https_context = ssl._create_unverified_context

# Matplotlib styles: ggplot base, wider default figure, custom colour cycle and axes background
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 6)
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD'])
plt.rcParams['axes.facecolor'] = '#EAEAF2'

# Constants
START_DATE = datetime(year=2019, month=6, day=1)
END_DATE = datetime(year=2022, month=6, day=1)

## Auxiliary Functions

In [3]:
def render_df_table(data, col_width=3.0, row_height=0.625, font_size=14, header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w', bbox=[0, 0, 1, 1], header_columns=0, ax=None, **kwargs):
    """Draw a DataFrame as a styled matplotlib table.

    Parameters
    ----------
    data : DataFrame whose values (rounded to 3 decimals) fill the table body
        and whose column names become the header row.
    col_width, row_height : size in inches given to every column / row when a
        new figure has to be created (ignored when ``ax`` is passed).
    font_size : font size applied to every cell.
    header_color : face colour of the header row and of the first
        ``header_columns`` columns.
    row_colors : face colours cycled over the remaining cells by row index.
        The sequence is only read, never modified.
    edge_color : edge colour of every cell.
    bbox : bounding box forwarded to ``ax.table``.
    header_columns : number of leading columns styled like the header row.
    ax : existing axes to draw on; when ``None`` a new figure sized after the
        table is created and its axis decorations are switched off.
    **kwargs : forwarded to ``ax.table``.

    Returns
    -------
    (figure, axes) holding the rendered table.
    """
    if ax is None:
        # One extra row is reserved for the column-label header
        n_rows, n_cols = data.shape
        size = (n_cols * col_width, (n_rows + 1) * row_height)
        _, ax = plt.subplots(figsize=size)
        ax.axis('off')
    mpl_table = ax.table(cellText=data.round(3).values, bbox=bbox, colLabels=data.columns, **kwargs)
    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    # get_celld() is the public accessor for the {(row, col): cell} mapping
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            # Header row / header columns: bold white text on the header colour
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            # Body cells alternate through row_colors by row index
            cell.set_facecolor(row_colors[row % len(row_colors)])
    return ax.get_figure(), ax

# Data Import

## Twits Dataset

Here we load the pandas dataframe with the processed dataset containing more than 7 million twits (posted between 2019-06-01 and 2022-06-01) about the top 50 crypto assets listed in the Binance exchange. The dataset has the following structure:

| Index &nbsp; | Name                      | Description                                                   |
|--------------|:--------------------------|--------------------------------------------------------------:|
|  0           | `id`                      | Id of the twit post                                           |
|  1           | `date`                    | Date of twit creation                                         |
|  2           | `base_asset`              | Crypto asset related to the twit                              |
|  3           | `user_id`                 | Id of the user who posted the twit                            |
|  4           | `text`                    | Twit raw corpus                                               |
|  5           | `text_light_clean`        | Twit corpus with light text cleaning                          |
|  6           | `text_heavy_clean`        | Twit corpus with heavy text cleaning                          |
|  7           | `n_likes`                 | Number of twit post likes                                     |
|  8           | `n_reshares`              | Number of times the twit post was reshared                    |
|  9           | `label`                   | Whether the twit has a bearish or a bullish sentiment         |

In [12]:
# Load the processed twits: first CSV column becomes the index, 'date' is parsed as datetime
df = pd.read_csv(
    "./datasets/enhanced/twits.csv.gz",
    index_col=0,
    parse_dates=['date'],
    low_memory=False,
)

## Users Dataset

From the twits dataframe, we are able to derive a dataset with unique user information. This dataset contains over 160 thousand users and has the following features:

| Index &nbsp; | Name &nbsp; &nbsp; &nbsp; | Description                                                   |
|--------------|:--------------------------|--------------------------------------------------------------:|
|  0           | `user_id`                 | Id of the user who posted the twit                            |
|  1           | `n_user_followers`        | Number of users that follow the user that posted the twit     |
|  2           | `n_user_following`        | Number of users that the user that posted the twit follows    |
|  3           | `user_join_date`          | Date when the user created his account                        |
|  4           | `n_user_ideas`            | Total number of twits made by the user since his joining date |
|  5           | `is_user_official`        | =True if the user is official                                 |
|  6           | `n_user_watchlist_stocks` | Number of stocks in the user's watchlist                      |
|  7           | `n_user_likes`            | Total number of twits that the user has liked                 |
|  8           | `n_user_twits`            | Number of twits of the user that are in the twits dataset     |
|  9           | `n_active_days`           | Number of days since user's joining date                      |
| 10           | `n_active_days_clipped`   | Number of days since max(START_DATE, user's joining date)     |
| 11           | `user_twit_freq`          | Daily frequency of user twits                                 |
| 12           | `user_idea_freq`          | Daily frequency of user ideas                                 |

In [None]:
# Load the per-user dataset: first CSV column becomes the index, 'join_date' is parsed as datetime
# NOTE(review): the feature table above names this column `user_join_date` - confirm the CSV header really is 'join_date'
df_users = pd.read_csv(
    "./datasets/enhanced/users.csv.gz",
    index_col=0,
    parse_dates=['join_date'],
)

# Data Visualization

Before we dive into the sentiment analysis models, it is important to properly understand the data that we want to classify. A good way to do this is to plot different visualizations of it.

## User Dataset

### Ideas per User Distribution

An idea is what StockTwits calls a post (or what I call a twit). Here we can see that most users tend to post a couple of thousand twits, but there are outliers that have posted over 200,000 twits.

Considering that StockTwits was launched in 2009, by 2022 we would have 2022 - 2009 = 13 years of usage, i.e., 13 x 365 = 4745 days. This means that a user who posted 200,000 twits (assuming the account was created in 2009) would have had to post 200000 / 4745 ≈ 42 twits a day to reach this mark. According to a study made by ___, the average user on Twitter (a platform with far more frequent users) tends to post on average 2.6 twits a day. Looking at these two pieces of information, it is clear that there are many bot accounts in the dataset.

In [None]:
# Histogram of the total ideas posted per user; only the count (y) axis is log-scaled
ideas_per_user = df_users['n_user_ideas']
sns.histplot(ideas_per_user, bins=300, kde=False, log_scale=(False, True))
plt.xlabel("Number of Ideas Posted by the User")
plt.title("Log-Scaled Distribution of the Number of Ideas Posted by each User")
plt.savefig("./imgs/idea_count_per_user_distribution.png")
plt.show()

### Ideas Frequency Per User

In the following plot it is easier to see what was explained above. There are several users with posting frequency way above expected from a human.

In [None]:
# Histogram of each user's idea-posting frequency; only the count (y) axis is log-scaled
idea_freq_per_user = df_users['user_idea_freq']
sns.histplot(idea_freq_per_user, bins=300, kde=False, log_scale=(False, True))
plt.xlabel("Frequency of Ideas Posted by the User")
plt.title("Log-Scaled Distribution of the Frequency of Ideas Posted by each User")
plt.savefig("./imgs/idea_freq_per_user_distribution.png")
plt.show()

### Twit Count per User Distribution

Here, we focus on the dataset itself, i.e., on the number of twits from each user that are actually present in the twits dataset.

In [None]:
# Histogram of the number of dataset twits per user; only the count (y) axis is log-scaled
twits_per_user = df_users['n_user_twits']
sns.histplot(twits_per_user, bins=300, kde=False, log_scale=(False, True))
plt.xlabel("Number of Twits Posted by the User")
plt.title("Log-Scaled Distribution of the Number of Twits Posted by each User")
plt.savefig("./imgs/twit_count_per_user_distribution.png")
plt.show()

### Daily Twit Frequency Distribution

In [None]:
# Histogram of each user's daily twit frequency; only the count (y) axis is log-scaled
twit_freq_per_user = df_users['user_twit_freq']
sns.histplot(twit_freq_per_user, bins=300, kde=False, log_scale=(False, True))
plt.xlabel("Number of Twits Posted by the User per day")
plt.title("Log-Scaled Distribution of the Daily Posting Frequency of each User")
plt.savefig("./imgs/twit_daily_freq_per_user_distribution.png")
plt.show()

### User Followers Distribution

In [None]:
# Histogram of the follower count per user; only the count (y) axis is log-scaled.
# This section is about followers, so the `n_user_followers` column is plotted
# (the cell previously plotted `n_user_following` with the idea-frequency labels).
sns.histplot(df_users['n_user_followers'], bins=300, log_scale=(False, True), kde=False)
plt.title("Log-Scaled Distribution of the Number of Followers of each User")
plt.xlabel("Number of Followers of the User")
# Dedicated file name so the idea-frequency figure is no longer overwritten
plt.savefig("./imgs/followers_per_user_distribution.png")
plt.show()

## Twits Dataset

### General Data Insights

#### Label Count

From this plot we can clearly observe that we are dealing with a very unbalanced dataset. Almost 4 million twits are marked as having a bullish sentiment, while no more than 500 thousand are marked as having a bearish sentiment. We can also observe that a very considerable number of twits (over 2 million of them) don't have a label indicating their sentiment.

In [None]:
# Twit counts for every (label, user_type) pair; dropna=False keeps missing values as their own group
label_user_counts = (
    df[['label', 'user_type']]
    .groupby(['label', 'user_type'], dropna=False)
    .size()
    .unstack('user_type')
)
label_user_counts.plot(kind='bar', stacked=True, rot=45)
plt.title("Twit Count per Label")
plt.xlabel("Label")
plt.ylabel("Number of Twits")
plt.savefig("./imgs/twit_count_per_label.png")
plt.show()

#### Twit Count Over Time

This plot shows the evolution of twit volume over time. We can observe that there was a huge increase in twits about crypto assets in 2021 that held until 2022. It is worth noticing that, throughout the period considered, there is almost always a higher volume of bullish twits than bearish ones.

In [None]:
# Daily twit counts per label (one line per label; missing labels kept as their own group)
posting_day = df['date'].dt.date
daily_label_counts = (
    df[['date', 'label']]
    .groupby([posting_day, 'label'], dropna=False)
    .size()
    .unstack('label')
)
daily_label_counts.plot(lw=0.7, rot=45)
plt.title("Twit Count Over Time")
plt.xlabel("Time")
plt.ylabel("Number of Twits")
plt.savefig("./imgs/twit_count_over_time.png")
plt.show()

#### Twit Count per Month

In [None]:
# Calendar order used to sort the month-name index
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Twit counts per (month name, label), aggregated over all years
posting_month = df['date'].dt.month_name()
monthly_label_counts = (
    df[['date', 'label']]
    .groupby([posting_month, 'label'], dropna=False)
    .size()
    .unstack('label')
    .reindex(month_order)
)
monthly_label_counts.plot(kind='bar', stacked=True, rot=45)
plt.title("Twit Count per Month")
plt.xlabel("Month")
plt.ylabel("Number of Twits")
plt.savefig("./imgs/twit_count_per_month.png")
plt.show()

#### Twit Count per Year

In [None]:
# Twit counts per (year, label)
posting_year = df['date'].dt.year
yearly_label_counts = (
    df[['date', 'label']]
    .groupby([posting_year, 'label'], dropna=False)
    .size()
    .unstack('label')
)
yearly_label_counts.plot(kind='bar', stacked=True, rot=45)
plt.title("Twit Count per Year")
plt.xlabel("Year")
plt.ylabel("Number of Twits")
plt.savefig("./imgs/twit_count_per_year.png")
plt.show()

#### Twit Count per Crypto Asset

From the following plot, it is clear that a few crypto assets have a bigger representation in the dataset than the other ones. We can see that most of the crypto assets with higher twit volume are "hyped" ones that receive more attention from the media, such as BTC, ETH, DOGE and SHIB.

In [None]:
# Twit counts per (crypto asset, label)
asset_label_counts = (
    df[['base_asset', 'label']]
    .groupby(['base_asset', 'label'], dropna=False)
    .size()
    .unstack('label')
)
asset_label_counts.plot(kind='bar', stacked=True, rot=45)
plt.title("Twit Count per Crypto Asset")
plt.xlabel("Crypto Asset")
plt.ylabel("Number of Twits")
plt.savefig("./imgs/twit_count_per_crypto_asset.png")
plt.show()

### Word Frequency

#### Word Clouds

Considering now the corpus of each twit, we can see the most used words in each category.

In [None]:
fig = plt.figure(figsize=(21, 10))

# Layout: one full-width panel on top, three single-column panels below
ax0 = plt.subplot2grid((2, 3), (0, 0), colspan=3, fig=fig)
ax1, ax2, ax3 = (plt.subplot2grid((2, 3), (1, col), colspan=1, fig=fig) for col in range(3))

# One space-joined corpus per category, built from the heavily cleaned text
heavy_clean = df['text_heavy_clean']
txt_all = heavy_clean.str.cat(sep=' ')
txt_bearish = heavy_clean[df['label'] == 'Bearish'].str.cat(sep=' ')
txt_bullish = heavy_clean[df['label'] == 'Bullish'].str.cat(sep=' ')
txt_nan = heavy_clean[df['label'].isna()].str.cat(sep=' ')

# (axes, corpus, cloud height, title) for: all, bearish, bullish and non-labeled twits
cloud_panels = [
    (ax0, txt_all, 175, "All Twits WordCloud"),
    (ax1, txt_bearish, 525, "Bearish Twits WordCloud"),
    (ax2, txt_bullish, 525, "Bullish Twits WordCloud"),
    (ax3, txt_nan, 525, "Non-Labeled Twits WordCloud"),
]

clouds = []
for panel, corpus, cloud_height, panel_title in cloud_panels:
    cloud = WordCloud(width=735, height=cloud_height, collocations=False, background_color='white').generate(corpus)
    panel.set_title(panel_title, fontsize=20)
    panel.set_axis_off()
    panel.imshow(cloud)
    clouds.append(cloud)

# Keep the individual cloud objects available under their usual names
wc0, wc1, wc2, wc3 = clouds

plt.savefig("./imgs/wordclouds.png")
plt.show()

#### Top Most Frequent Words

In [None]:
fig = plt.figure(figsize=(21, 15))

# Sets the axis: one full-width panel on top, three single-column panels below
ax0 = plt.subplot2grid((2, 3), (0, 0), colspan=3, fig=fig)
ax1 = plt.subplot2grid((2, 3), (1, 0), colspan=1, fig=fig)
ax2 = plt.subplot2grid((2, 3), (1, 1), colspan=1, fig=fig)
ax3 = plt.subplot2grid((2, 3), (1, 2), colspan=1, fig=fig)

# (axes, corpus, number of words shown, title) for: all, bearish, bullish and non-labeled twits.
# The txt_* corpora are the space-joined strings built in the word-cloud cell.
word_panels = [
    (ax0, txt_all, 30, "All Twits Top 30 Frequent Words"),
    (ax1, txt_bearish, 10, "Bearish Twits Top 10 Frequent Words"),
    (ax2, txt_bullish, 10, "Bullish Twits Top 10 Frequent Words"),
    (ax3, txt_nan, 10, "Non-Labeled Twits Top 10 Frequent Words"),
]

top_words_per_panel = []
for panel, corpus, top_n, panel_title in word_panels:
    # Most common whitespace-separated tokens and their counts
    top_words = collections.Counter(corpus.split()).most_common(top_n)
    words, freqs = zip(*top_words)
    panel.bar(words, freqs)
    # Rotate the existing tick labels instead of overwriting them with set_xticklabels,
    # which is discouraged when the tick positions have not been fixed first
    panel.tick_params(axis='x', labelrotation=45)
    panel.set_title(panel_title, fontsize=20)
    top_words_per_panel.append(top_words)

# Keep the per-category frequency lists available under their usual names
wfreq_all, wfreq_bearish, wfreq_bullish, wfreq_nan = top_words_per_panel

plt.savefig("./imgs/top_frequent_words.png")
plt.show()

### Word Count

#### Number of Words per Twit Distribution

In [None]:
fig = plt.figure(figsize=(21, 15))

# Layout: one full-width panel on top, three single-column panels below
ax0 = plt.subplot2grid((2, 3), (0, 0), colspan=3, fig=fig)
ax1, ax2, ax3 = (plt.subplot2grid((2, 3), (1, col), colspan=1, fig=fig) for col in range(3))

# Number of whitespace-separated tokens in the heavily cleaned text of every twit
heavy_clean = df['text_heavy_clean']
wcount_all = heavy_clean.str.split().str.len()
wcount_bearish = heavy_clean[df['label'] == 'Bearish'].str.split().str.len()
wcount_bullish = heavy_clean[df['label'] == 'Bullish'].str.split().str.len()
wcount_nan = heavy_clean[df['label'].isna()].str.split().str.len()

# (axes, word counts, title) for: all, bearish, bullish and non-labeled twits
count_panels = [
    (ax0, wcount_all, "All Twits Word Count Distribution"),
    (ax1, wcount_bearish, "Bearish Twits Word Count Distribution"),
    (ax2, wcount_bullish, "Bullish Twits Word Count Distribution"),
    (ax3, wcount_nan, "Non-Labeled Twits Word Count Distribution"),
]

for panel, word_counts, panel_title in count_panels:
    sns.histplot(word_counts, bins=100, kde=True, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel("Twit Word Count")

plt.savefig("./imgs/number_of_words_per_tweet_distribution.png")
plt.show()

# Sentiment Analysis

## Data Selection

### Balanced Data Sampling

In [26]:
# Selects only useful columns and drops rows with a missing value in any of them
# NOTE(review): the label-count plot reads this column as 'user_type' - confirm which spelling the CSV uses
df_small = df[['id', 'user.type', 'base_asset', 'text', 'label']].dropna()

# Drop duplicates on text (same twit can tag multiple base_asset)
df_small = df_small.drop_duplicates('text', ignore_index=True)

# Bots removal
df_small = df_small[df_small['user.type'] == 'User']


def _sample_balanced_labels(asset_twits):
    """Sample the same number of twits from every label of one asset.

    The sample size is the count of the least frequent label within the asset.
    A fixed random_state makes the selection reproducible across kernel restarts
    (same seed value as the train/test split).
    """
    n_per_label = asset_twits['label'].value_counts().min()
    return asset_twits.groupby('label', group_keys=False).apply(lambda label_twits: label_twits.sample(n_per_label, random_state=1))


# Gets a small sample of the dataset for training and testing (balanced labels within each base_asset)
df_small = df_small.groupby('base_asset', group_keys=False).apply(_sample_balanced_labels)

# Resets the index
df_small = df_small.reset_index(drop=True)

### Train Test Validation Split

In [6]:
# Extracts the explanatory and explained variables.
# Only the text columns actually present in df_small are selected: as built in the
# data-selection cell it carries 'text' but not the cleaned variants, and indexing
# missing columns raises a KeyError. Only 'text' is fed to the model below.
text_columns = [col for col in ['text', 'text_light_clean', 'text_heavy_clean'] if col in df_small.columns]
X = df_small[text_columns]
y = df_small['label'].replace({'Bearish': 0, 'Bullish': 1})

# Splits the data into train (72%), validation (8%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=1)

## BERT Based Neural Network

### Model Building

In [89]:
# Network input: one raw string per example
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# TF-Hub preprocessing followed by a trainable small BERT encoder
encoder_inputs = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3", name='preprocessing')(text_input)
encoder_outputs = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2", trainable=True, name='BERT_encoder')(encoder_inputs)

# Classification head applied to the encoder's 'sequence_output':
# dropout -> BiLSTM -> self-attention -> 1D convolution -> average pooling -> dropout -> sigmoid unit
token_states = tf.keras.layers.Dropout(0.6)(encoder_outputs['sequence_output'])
recurrent_states = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(token_states)
attended_states = tf.keras.layers.Attention(name='attention')([recurrent_states, recurrent_states])
conv_features = tf.keras.layers.Conv1D(128, 9, activation='relu', padding='same', name='convolutional')(attended_states)
pooled_features = tf.keras.layers.GlobalAveragePooling1D(name='average_pooling')(conv_features)
regularized_features = tf.keras.layers.Dropout(0.4)(pooled_features)
output = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(regularized_features)

# Optimizer of the Neural Network
learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Loss function of the Neural Network
loss = 'binary_crossentropy'

# Builds the model and compiles it, tracking accuracy and AUC
tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
model = tf.keras.Model(inputs=text_input, outputs=output)
model.compile(optimizer=optimizer, loss=loss, metrics=tracked_metrics)

### Model Training

In [None]:
# Training hyper-parameters
epochs = 30
batch_size = 512

# Saves a checkpoint at the end of an epoch only when val_accuracy improves
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("./models/bert/weights.h5", monitor='val_accuracy', save_freq='epoch', save_best_only=True)
# Stops after 5 epochs without a val_loss improvement of at least 0.0005 and restores the best weights
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0005, patience=5, restore_best_weights=True)
# Logs the per-epoch metrics to CSV
# NOTE(review): append=True keeps rows from earlier runs in the same file - confirm this is intended when re-running the cell
csv_logger_cb = tf.keras.callbacks.CSVLogger("./models/bert/history.csv", separator=',', append=True)
callbacks = [checkpoint_cb, early_stopping_cb, csv_logger_cb]

# Trains the model on the raw text, validating on the held-out validation split
history = model.fit(
    x=X_train['text'],
    y=y_train,
    validation_data=(X_val['text'], y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
)

# Persists the trained model
model.save("./models/bert")

# Reloads the model and the logged history from disk (history becomes a DataFrame indexed by the first CSV column)
model = tf.keras.models.load_model("./models/bert")
history = pd.read_csv("./models/bert/history.csv", index_col=0)

# Clears the training log output and displays the history table
clear_output()
history

### Model Evaluation

In [None]:
# Sigmoid scores predicted for the test set, flattened to one value per twit
raw_test_scores = model.predict(X_test['text'])
y_test_scores = pd.Series(raw_test_scores.flatten())

# Hard labels obtained by rounding the scores
y_test_pred = y_test_scores.round()

#### Metrics plot

In [None]:
_, axs = plt.subplots(1, 3, figsize=(21, 7))

# (train column, validation column, caption used as both title and y-label) for each panel
metric_panels = [
    ('loss', 'val_loss', "Loss Value"),
    ('accuracy', 'val_accuracy', "Accuracy"),
    ('auc', 'val_auc', "AUC"),
]

for panel, (train_col, val_col, caption) in zip(axs, metric_panels):
    panel.plot(history[train_col], label='Train')
    panel.plot(history[val_col], label='Validation')
    panel.set_title(caption)
    panel.set_ylabel(caption)
    panel.set_xlabel("Epoch")
    panel.legend()

plt.savefig("./imgs/bert_training_metrics_plot.png")
plt.show()

#### Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions
cf_matrix = confusion_matrix(y_test, y_test_pred)

# Cell annotations: absolute count plus its share of all samples
flat_cells = cf_matrix.flatten()
counts = [format(val, "0.0f") for val in flat_cells]
pcts = [format(100 * val, ".2f") for val in flat_cells / np.sum(cf_matrix)]
annot = np.asarray([f"{count}\n({pct}%)" for count, pct in zip(counts, pcts)]).reshape(2, 2)

# Heatmap with the custom annotations (fmt='' because they are already strings)
ax = sns.heatmap(cf_matrix, annot=annot, cmap='Blues', fmt='')
ax.set_title("BERT Model Confusion Matrix")
ax.set_xlabel("Predicted", fontsize=20)
ax.set_ylabel("True", fontsize=20)
ax.xaxis.set_ticklabels(['Bearish', 'Bullish'], fontsize=10)
ax.yaxis.set_ticklabels(['Bearish', 'Bullish'], fontsize=10)
plt.savefig("./imgs/bert_confusion_matrix.png")
plt.show()

#### ROC Curve

In [None]:
# Get true positive rates and false positive rates for every score threshold
fpr, tpr, thresholds = roc_curve(y_test, y_test_scores)

# Plots the ROC curve together with the diagonal of a random classifier
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle="--")

# Shades the area under the curve and prints the AUC inside it
# (text anchored at 1/2 of the shaded extent's width and 1/3 of its height)
filled_part = plt.fill_between(fpr, tpr, color='#8EB9D7')
(x0, y0), (x1, y1) = filled_part.get_paths()[0].get_extents().get_points()
plt.text(x1/2, y1/3, f"AUC = {roc_auc_score(y_test, y_test_scores):.3f}", fontsize=16)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("BERT Receiver Operating Characteristic (ROC)")
# Saved like every other evaluation figure (the whole-dataset ROC uses the '_full' suffix)
plt.savefig("./imgs/bert_roc.png")
plt.show()

#### Classification Report

In [None]:
# Classification report as a dataframe: one row per class / average, class keys renamed to sentiment names
report_dict = classification_report(y_test, y_test_pred, output_dict=True)
df_clf_report = pd.DataFrame(report_dict)
df_clf_report = df_clf_report.rename(columns={'0': 'Bearish', '1': 'Bullish'})
df_clf_report = df_clf_report.transpose().reset_index().rename(columns={'index': ''})

# Renders the report as a table image (first column styled as a header) and saves it
fig, ax = render_df_table(df_clf_report, header_columns=1)
fig.savefig("./imgs/bert_classification_report.png")
plt.show()

## BERTweet Based Neural Network

In [None]:
# Sets the input of the Neural Network (one raw string per example)
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# Obtains the output of the Neural Network
# NOTE(review): `preprocessing` and `encoder` are not defined in any cell visible in this
# notebook - they must be created/loaded before this cell or it raises NameError on a fresh kernel.
# Presumably `preprocessing` returns a dict with 'input_word_ids', 'input_mask' and
# 'input_type_ids', and `encoder` returns a mapping with 'last_hidden_state' - confirm.
output = preprocessing(text_input)
output = encoder(output['input_word_ids'], attention_mask=output['input_mask'], token_type_ids=output['input_type_ids'])
# Same head as the BERT model: dropout -> BiLSTM -> self-attention -> Conv1D -> average pooling -> dropout -> sigmoid
output = tf.keras.layers.Dropout(0.6)(output['last_hidden_state'])
output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(output)
output = tf.keras.layers.Attention(name='attention')([output, output])
output = tf.keras.layers.Conv1D(128, 9, activation='relu', padding='same', name='convolutional')(output)
output = tf.keras.layers.GlobalAveragePooling1D(name='average_pooling')(output)
output = tf.keras.layers.Dropout(0.4)(output)
output = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(output)

# Defines the optimizer of the Neural Network
learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Defines the loss function of the Neural Network
loss = 'binary_crossentropy'

# Builds and compiles the model
# NOTE(review): this rebinds `model`, replacing the BERT model built earlier in the notebook
model = tf.keras.Model(inputs=text_input, outputs=output)
model.compile(optimizer, loss=loss, metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

## Whole Dataset Evaluation

In [4]:
# Load the classified twits (first CSV column as index, 'date' parsed as datetime)
df_twits = pd.read_csv(
    "./datasets/classified/twits.csv.gz",
    index_col=0,
    parse_dates=['date'],
    low_memory=False,
)

# Keep only the twits that carry a ground-truth label
df_twits = df_twits.dropna(subset=['label'])

In [10]:
# Integer encoding shared by the true and the predicted labels
label_to_int = {'Bearish': 0, 'Bullish': 1}

# True labels, predicted scores and predicted labels for the whole labeled dataset
# (these rebind the y_test* names used for the held-out test set above)
y_test = df_twits['label'].replace(label_to_int)
y_test_scores = df_twits['label_pred_score']
y_test_pred = df_twits['label_pred'].replace(label_to_int)

#### Confusion Matrix

In [None]:
# Confusion matrix of the whole-dataset predictions
cf_matrix = confusion_matrix(y_test, y_test_pred)

# Cell annotations: absolute count plus its share of all samples
flat_cells = cf_matrix.flatten()
counts = [format(val, "0.0f") for val in flat_cells]
pcts = [format(100 * val, ".2f") for val in flat_cells / np.sum(cf_matrix)]
annot = np.asarray([f"{count}\n({pct}%)" for count, pct in zip(counts, pcts)]).reshape(2, 2)

# Heatmap with the custom annotations (fmt='' because they are already strings)
ax = sns.heatmap(cf_matrix, annot=annot, cmap='Blues', fmt='')
ax.set_title("BERT Model Confusion Matrix")
ax.set_xlabel("Predicted", fontsize=20)
ax.set_ylabel("True", fontsize=20)
ax.xaxis.set_ticklabels(['Bearish', 'Bullish'], fontsize=10)
ax.yaxis.set_ticklabels(['Bearish', 'Bullish'], fontsize=10)
plt.savefig("./imgs/bert_confusion_matrix_full.png")
plt.show()

#### ROC Curve

In [None]:
# False/true positive rates for every score threshold, plus the resulting AUC
fpr, tpr, thresholds = roc_curve(y_test, y_test_scores)
auc_label = f"AUC = {roc_auc_score(y_test, y_test_scores):.3f}"

# ROC curve together with the diagonal of a random classifier
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle="--")

# Shade the area under the curve and print the AUC inside it
# (text anchored at 1/2 of the shaded extent's width and 1/3 of its height)
filled_part = plt.fill_between(fpr, tpr, color='#8EB9D7')
shaded_extent = filled_part.get_paths()[0].get_extents()
(x0, y0), (x1, y1) = shaded_extent.get_points()
plt.text(x1/2, y1/3, auc_label, fontsize=16)

plt.title("BERT Receiver Operating Characteristic (ROC)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.savefig("./imgs/bert_roc_full.png")
plt.show()

#### Classification Report

In [None]:
# Classification report as a dataframe: one row per class / average, class keys renamed to sentiment names
report_dict = classification_report(y_test, y_test_pred, output_dict=True)
df_clf_report = pd.DataFrame(report_dict)
df_clf_report = df_clf_report.rename(columns={'0': 'Bearish', '1': 'Bullish'})
df_clf_report = df_clf_report.transpose().reset_index().rename(columns={'index': ''})

# Renders the report as a table image (first column styled as a header) and saves it
fig, ax = render_df_table(df_clf_report, header_columns=1)
fig.savefig("./imgs/bert_classification_report_full.png")
plt.show()