# The Effects of Social Media Bots on the Cryptomarket: Sentiment Classification

*By Daniel Deutsch*

In [1]:
import collections
import re
import ssl
import string
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from IPython.display import clear_output
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

In [None]:
# Silences every warning so library deprecation notices do not flood the outputs
warnings.filterwarnings('ignore')

# SECURITY NOTE: replaces the stdlib default HTTPS context factory with the
# unverified one, i.e. TLS certificates are no longer validated for code that
# relies on that default context. Presumably a workaround for certificate
# errors on remote downloads -- TODO confirm it is still needed and remove it
# once the local certificate store is fixed.
ssl._create_default_https_context = ssl._create_unverified_context

# Matplotlib styles
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (15, 6),
    'axes.prop_cycle': plt.cycler(color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']),
    'axes.facecolor': '#EAEAF2'
})

# Constants
START_DATE = datetime(2019, 6, 1)
END_DATE = datetime(2022, 6, 1)
SEED = 1  # same value the train/test split cells pass as random_state

# Seeds the global NumPy and TensorFlow generators so that a
# "Restart Kernel -> Run All" produces the same samples and initial weights
np.random.seed(SEED)
tf.random.set_seed(SEED)

## BERT Based Neural Network

### Read Data

#### Read

In [None]:
# Twits table: the first CSV column is used as the index and 'date' is parsed
# as datetime; low_memory=False makes pandas read the file in a single pass
df_twits = pd.read_csv(
    "./datasets/enhanced/twits.csv.gz",
    index_col=0,
    parse_dates=['date'],
    low_memory=False,
)

# Users table: the first CSV column is used as the index and 'join_date' is
# parsed as datetime
df_users = pd.read_csv(
    "./datasets/enhanced/users.csv.gz",
    index_col=0,
    parse_dates=['join_date'],
)

#### Balanced Data Sampling

In [None]:
# Selects only useful columns and drops rows with a missing value in any of them
df_small = df_twits[['id', 'user.type', 'base_asset', 'text', 'label']].dropna()

# Drop duplicates on text (same twit can tag multiple base_asset)
df_small = df_small.drop_duplicates('text', ignore_index=True)

# Bots removal
df_small = df_small[df_small['user.type'] == 'User']


def _balanced_label_sample(asset_rows, random_state=1):
    """Return an equal-sized random sample of every label within one asset.

    Args:
        asset_rows: Rows of a single ``base_asset`` group; must have a
            ``label`` column.
        random_state: Seed passed to ``DataFrame.sample`` so the draw is
            reproducible across runs.

    Returns:
        DataFrame with ``n`` rows per label, where ``n`` is the size of the
        least frequent label in ``asset_rows``.
    """
    # Computed once per asset instead of once per (asset, label) pair
    n_per_label = asset_rows['label'].value_counts().min()
    return asset_rows.groupby('label', group_keys=False).apply(
        lambda label_rows: label_rows.sample(n_per_label, random_state=random_state)
    )


# Gets a small sample of the dataset for training and testing (balanced labels and base_assets)
df_small = df_small.groupby('base_asset', group_keys=False).apply(_balanced_label_sample)

# Resets the index
df_small = df_small.reset_index(drop=True)

#### Train Test Validation Split

In [None]:
# Extracts the explanatory and explained variables.
# Only the text columns actually present in df_small are selected: asking for
# a missing column raises a KeyError, and the sampling cell keeps just
# 'id', 'user.type', 'base_asset', 'text' and 'label'.
# NOTE(review): if the cleaned variants are needed later, they must be added
# to df_small before this cell runs -- confirm against the rest of the notebook.
feature_columns = [
    column
    for column in ['text', 'text_light_clean', 'text_heavy_clean']
    if column in df_small.columns
]
X = df_small[feature_columns]

# Encodes the labels as integers. `map` turns any label outside the mapping
# into NaN, which is reported explicitly instead of reaching the model.
y = df_small['label'].map({'Bearish': 0, 'Bullish': 1})
if y.isna().any():
    raise ValueError("Unexpected values in 'label': only 'Bearish' and 'Bullish' are supported")
y = y.astype('int64')

# Splits the data into train, test and validation sets:
# 80% / 20% train+validation vs. test, then 90% / 10% train vs. validation
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=1)

### Model Architecture

In [2]:
# Input layer: one string per example, fed to the network under the name 'text'
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# TF Hub preprocessing followed by the trainable small-BERT encoder
preprocessed = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    name='preprocessing',
)(text_input)
encoded = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/2",
    trainable=True,
    name='BERT_encoder',
)(preprocessed)

# Classification head built on the encoder's 'sequence_output' entry:
# dropout -> BiLSTM -> attention of the sequence over itself -> Conv1D
# -> average pooling -> dropout -> single sigmoid unit
sequence = tf.keras.layers.Dropout(0.6)(encoded['sequence_output'])
recurrent = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(128, return_sequences=True)
)(sequence)
attended = tf.keras.layers.Attention(name='attention')([recurrent, recurrent])
convolved = tf.keras.layers.Conv1D(
    128, 9, activation='relu', padding='same', name='convolutional'
)(attended)
pooled = tf.keras.layers.GlobalAveragePooling1D(name='average_pooling')(convolved)
regularized = tf.keras.layers.Dropout(0.4)(pooled)
output = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(regularized)

# Optimizer and loss used to compile the network
learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = 'binary_crossentropy'

# Assembles the functional model and compiles it, tracking accuracy and AUC
model = tf.keras.Model(inputs=text_input, outputs=output)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

### Model Training

In [None]:
# Sets training params
epochs = 30
batch_size = 512
model_dir = "./models/bert"

# Makes sure the output directory exists before any callback writes to it;
# makedirs also creates parent directories and succeeds if the path already exists
tf.io.gfile.makedirs(model_dir)

callbacks = [
    # Keeps on disk only the model from the epoch with the best validation accuracy
    tf.keras.callbacks.ModelCheckpoint(f"{model_dir}/weights.h5", monitor='val_accuracy', save_freq='epoch', save_best_only=True),
    # Stops after 5 epochs without a val_loss improvement of at least 0.0005
    # and rolls the model back to its best weights
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0005, patience=5, restore_best_weights=True),
    # NOTE(review): append=True keeps the rows of earlier runs, so re-running
    # this cell mixes several trainings in history.csv -- delete the file (or
    # switch to append=False) when a clean history is wanted.
    tf.keras.callbacks.CSVLogger(f"{model_dir}/history.csv", separator=',', append=True)
]

# Trains the model on the raw text column only
history = model.fit(
    x=X_train['text'], y=y_train,
    validation_data=(X_val['text'], y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
)

# Saves the model
model.save(model_dir)

# Loads the saved variables; `history` is rebound from the Keras History
# object to the logged CSV, indexed by its first column
model = tf.keras.models.load_model(model_dir)
history = pd.read_csv(f"{model_dir}/history.csv", index_col=0)

# Display the history (training logs are cleared first)
clear_output()
history