# The Effects of Social Media Bots on the Cryptomarket: User Classification

*By Daniel Deutsch*

In [3]:
import warnings
from datetime import datetime

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from wordcloud import STOPWORDS

In [4]:
# Suppress all warnings for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot appearance: ggplot base style plus a custom figure size, colour cycle and background
plt.style.use('ggplot')
_palette = ['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']
plt.rcParams['figure.figsize'] = (15, 6)
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=_palette)
plt.rcParams['axes.facecolor'] = '#EAEAF2'

# Boundaries of the analysed date window (used when computing per-user activity days)
START_DATE = datetime(year=2019, month=6, day=1)
END_DATE = datetime(year=2022, month=6, day=1)

## Enhance Users Dataset

### Read Data

In [None]:
# Load the processed users table; the first CSV column is the index and
# join_date is parsed as a datetime.
df_users = pd.read_csv(
    "./datasets/processed/users.csv.gz",
    index_col=0,
    parse_dates=['join_date'],
)

# Load the processed twits table the same way, parsing the date column.
df_twits = pd.read_csv(
    "./datasets/processed/twits.csv.gz",
    index_col=0,
    parse_dates=['date'],
    low_memory=False,
)

### Existing Features

In [None]:
# Columns kept exactly as loaded (no transformation is applied to them):
# id, username, name, join_date, following, ideas, watchlist_stocks_count,
# like_count, n_twits.

# avatar_url -> 1 when the avatar differs from the platform default, else 0
_DEFAULT_AVATAR_URL = "http://avatars.stocktwits.com/images/default_avatar_thumb.jpg"
df_users['avatar_url'] = (df_users['avatar_url'] != _DEFAULT_AVATAR_URL).astype(int)

# official -> 0/1 integer
df_users['official'] = df_users['official'].astype(int)

# classification -> one 0/1 flag per label of interest.
# na=False makes a missing classification count as "label absent" instead of
# producing NaN, which astype(int) cannot convert.
df_users['suggested'] = df_users['classification'].str.contains('suggested', na=False).astype(int)
df_users['verified'] = df_users['classification'].str.contains('verified', na=False).astype(int)

# home_country -> one 0/1 flag for each of US, CA and IN
df_users['from_us'] = (df_users['home_country'] == "US").astype(int)
df_users['from_ca'] = (df_users['home_country'] == "CA").astype(int)
df_users['from_in'] = (df_users['home_country'] == "IN").astype(int)

# followers -> negative counts are clipped to zero
df_users['followers'] = df_users['followers'].clip(lower=0)

# plus_tier, premium_room, trade_status, portfolio_status -> 1 when a value is present, else 0
for _col in ('plus_tier', 'premium_room', 'trade_status', 'portfolio_status'):
    df_users[_col] = df_users[_col].notna().astype(int)

# trade_app -> 0/1 integer
df_users['trade_app'] = df_users['trade_app'].astype(int)

# Drop the raw columns that are either unused or already encoded by the flags above
df_users = df_users.drop(columns=[
    'avatar_url_ssl',
    'identity',
    'classification',
    'home_country',
    'search_country',
    'portfolio_waitlist',
    'portfolio',
])

### New Features

In [None]:
# n_active_days: days between the join date and the end of the analysed window
df_users['n_active_days'] = df_users['join_date'].apply(lambda x: (END_DATE - x).days)

# n_active_days_clipped: same, but accounts created before START_DATE are counted from START_DATE
df_users['n_active_days_clipped'] = df_users['join_date'].apply(lambda x: (END_DATE - max(START_DATE, x)).days)

# NOTE(review): a join_date on or after END_DATE makes the denominators below zero or
# negative, producing inf/negative rates that a plain dropna() will not remove.
# Confirm the users table contains no such rows.

# twit_freq: twits per day inside the analysed window
df_users['twit_freq'] = df_users['n_twits'] / df_users['n_active_days_clipped']

# idea_freq: ideas per day since the account was created
# (previously divided n_twits, which merely duplicated twit_freq on another denominator)
df_users['idea_freq'] = df_users['ideas'] / df_users['n_active_days']

# Per-twit text features. They are created as floats so that users without any
# twit keep 0.0 and the columns never change dtype when rates are written.
_TWIT_FEATURE_COLUMNS = [
    'url_rate',
    'n_words_per_twit',
    'n_assets_per_twit',
    'n_emojis_per_twit',
    'n_stopwords_per_twit',
    'avg_twit_similarity',
    'n_commas_per_twit',
    'n_points_per_twit',
    'n_semicolons_per_twit',
    'n_exclamations_per_twit',
    'n_quotes_per_twit',
    'n_oparentheses_per_twit',
    'n_cparentheses_per_twit',
]
for _col in _TWIT_FEATURE_COLUMNS:
    df_users[_col] = 0.0

# Regex used to count URLs (pattern kept unchanged from the original notebook)
_URL_PATTERN = r"[A-Za-z0-9]+://[A-Za-z0-9%-_]+(/[A-Za-z0-9%-_])*(#|\\?)[A-Za-z0-9%-_&=]*"

# Regex matching a crypto cashtag such as "$BTC.x"
_ASSET_PATTERN = r"\$[a-zA-Z]+\.x"

# Punctuation features -> regex counted in each twit. Series.str.count treats its
# argument as a regular expression, so ".", "(" and ")" must be escaped.
_PUNCTUATION_PATTERNS = {
    'n_commas_per_twit': r",",
    'n_points_per_twit': r"\.",
    'n_semicolons_per_twit': r";",
    'n_exclamations_per_twit': r"!",
    'n_quotes_per_twit': r'"',
    'n_oparentheses_per_twit': r"\(",
    'n_cparentheses_per_twit': r"\)",
}

# NOTE(review): emoji.UNICODE_EMOJI is presumably only available in emoji < 2.0 —
# verify the pinned version before upgrading the package.
_EMOJI_CHARS = emoji.UNICODE_EMOJI['en']


def _avg_twit_similarity(text):
    """Return the mean pairwise TF-IDF cosine similarity between a user's twits.

    Returns NaN when fewer than two twits are available, when the vectorizer
    cannot build a vocabulary (ValueError), or when the dense similarity matrix
    does not fit in memory (MemoryError).
    """
    if len(text) < 2:
        return np.nan
    try:
        tfidf = TfidfVectorizer(stop_words='english').fit_transform(text)
        similarity = cosine_similarity(tfidf, tfidf)
        # Upper triangle without the diagonal: every unordered pair exactly once
        return similarity[np.triu_indices_from(similarity, k=1)].mean()
    except (ValueError, MemoryError):
        return np.nan


def _twit_features(twits):
    """Compute the per-twit text features of a single user.

    `twits` is the slice of df_twits belonging to one user. Returns a dict that
    maps each name in _TWIT_FEATURE_COLUMNS to its value.
    """
    # Drop duplicated twits and twits without text
    twits = twits.drop_duplicates(subset=['id'], ignore_index=True)
    text = twits.dropna(subset=['text'])['text']

    # Avoid dividing by zero when no usable twit is left
    n_twits = max(len(text), 1)

    totals = {
        'url_rate': text.str.count(_URL_PATTERN).sum(),
        'n_words_per_twit': text.str.split(" ").apply(len).sum(),
        'n_assets_per_twit': text.str.count(_ASSET_PATTERN).sum(),
        'n_emojis_per_twit': text.apply(lambda x: sum(1 for c in x if c in _EMOJI_CHARS)).sum(),
        # Counts the distinct whitespace-separated tokens of each twit found in STOPWORDS (case-sensitive)
        'n_stopwords_per_twit': text.apply(lambda x: len(set(x.split()) & STOPWORDS)).sum(),
    }
    for name, pattern in _PUNCTUATION_PATTERNS.items():
        totals[name] = text.str.count(pattern).sum()

    features = {name: total / n_twits for name, total in totals.items()}
    features['avg_twit_similarity'] = _avg_twit_similarity(text)
    return features


# Compute the features once per user, collecting them keyed by user id
_grouped = df_twits.groupby('user.id')
n = _grouped.ngroups
_rows = {}
for i, (user_id, twits) in enumerate(_grouped):
    print(f"\r{i}/{n-1}", end="")
    _rows[user_id] = _twit_features(twits)

# Write everything back in one pass per column instead of one full-table
# boolean scan per user and per feature.
if _rows:
    _df_features = pd.DataFrame.from_dict(_rows, orient='index')
    _has_twits = df_users['id'].isin(_df_features.index)
    _ids = df_users.loc[_has_twits, 'id']
    for _col in _TWIT_FEATURE_COLUMNS:
        df_users.loc[_has_twits, _col] = _ids.map(_df_features[_col]).to_numpy()

### Save Enhanced Dataset

In [None]:
# Write the users table, including the engineered features, to disk
df_users.to_csv(path_or_buf="./datasets/enhanced/users1.csv.gz")

## Variational Autoencoder Bot Detection Model

Another popular approach to anomaly detection, which has gained a lot of traction as deep learning became more widely available, is based on reconstruction methods. The underlying assumption is that if a model learns a function that compresses and reconstructs normal data, it will fail to do so when it encounters anomalous data, because that function was trained only on normal data. The failure to reconstruct data or, more accurately, the magnitude of the reconstruction error that it entails, can therefore signal the presence of anomalous data.

An autoencoder is a deep learning model that is usually based on two main components: an encoder that learns a lower-dimensional representation of the input data, and a decoder that tries to reproduce the input data in its original dimension using the lower-dimensional representation generated by the encoder. The idea underlying this architecture is quite similar to that of image compression: a well-trained encoder learns to encode the input data in a way that captures the most important information it contains, which is therefore sufficient (or as close to sufficient as possible) for the decoder to reproduce it.

In a VAE, the encoder similarly learns a function that takes a vector of size n as its input. However, instead of learning to generate a single latent vector that the decoder function can reproduce, as traditional AEs do, a VAE learns to generate two vectors (of size m) that represent the parameters (mean and variance) of a distribution. The latent vector is sampled from that distribution, and the decoder function transforms it back to the original input vector. Simply put, while the AE’s learning task is to learn a function that will transform data into a latent vector that a decoder can easily reproduce, the VAE’s learning task is to learn a function that will generate the parameters of a distribution from which a latent vector that a decoder can easily reproduce can be sampled.

### Read Dataset

#### Read Dataset

In [None]:
# Reload the enhanced users table and display its (rows, columns) shape
df_users = pd.read_csv(
    "./datasets/enhanced/users1.csv.gz",
    index_col=0,
    parse_dates=['join_date'],
)
df_users.shape

In [None]:
# How many users have (True) / do not have (False) a missing avg_twit_similarity
df_users['avg_twit_similarity'].isna().value_counts()

In [5]:
# Load the enhanced dataset and discard every row that has a missing value
df_users = pd.read_csv(
    "./datasets/enhanced/users1.csv.gz",
    index_col=0,
    parse_dates=['join_date'],
).dropna()

# Feature matrix: everything except the identifying columns and the join date
X = df_users.drop(columns=['id', 'username', 'name', 'join_date'])

#### Scaling

In [6]:
# Fit a min-max scaler on the features, then replace X with its scaled version
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

### Model Architecture

#### Encoder

In [8]:
def sample(args):
    """Sample a latent vector from the distribution described by `args`.

    `args` is a pair (z_mean, z_log_var). The result is
    z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is drawn from a
    normal distribution with the same (batch, dim) shape as z_mean.
    """
    mean, log_var = args
    backend = tf.keras.backend

    # Batch size is read dynamically; the latent dimension is read from the static shape
    n_rows = backend.shape(mean)[0]
    n_dims = backend.int_shape(mean)[1]

    noise = backend.random_normal(shape=(n_rows, n_dims))
    std = backend.exp(0.5 * log_var)
    return mean + std * noise

In [9]:
# Layer widths derived from the number of input features
_n_features = X.shape[1]
_hidden_units = _n_features // 2
_latent_units = _n_features // 3

# Encoder input: one value per feature
encoder_input = tf.keras.layers.Input(shape=(_n_features,), name="encoder_input")

# Hidden layer, followed by the two heads that parameterise the latent distribution
encoder_output = tf.keras.layers.Dense(_hidden_units, activation='relu')(encoder_input)
z_mean = tf.keras.layers.Dense(_latent_units, name="z_mean")(encoder_output)
z_log_var = tf.keras.layers.Dense(_latent_units, name="z_log_var")(encoder_output)

# The encoder's output is a latent vector sampled from (z_mean, z_log_var)
_sampling_layer = tf.keras.layers.Lambda(sample, output_shape=(_latent_units,), name="latent_space")
encoder_output = _sampling_layer([z_mean, z_log_var])

# Encoder model: features -> sampled latent vector
encoder = tf.keras.Model(inputs=encoder_input, outputs=encoder_output, name="encoder")

#### Decoder

In [10]:
# Layer widths derived from the number of input features
_n_features = X.shape[1]

# Decoder input: a latent vector
decoder_input = tf.keras.layers.Input(shape=(_n_features // 3,), name="decoder_input")

# Expand back to the hidden width, then to the original feature width (sigmoid output)
_decoder_hidden = tf.keras.layers.Dense(_n_features // 2, activation='relu')
_decoder_final = tf.keras.layers.Dense(_n_features, activation='sigmoid')
decoder_output = _decoder_final(_decoder_hidden(decoder_input))

# Decoder model: latent vector -> reconstructed features
decoder = tf.keras.Model(inputs=decoder_input, outputs=decoder_output, name="decoder")

#### Variational Autoencoder

In [11]:
# The VAE shares the encoder's input tensor
vae_input = encoder_input

# Chain encoder and decoder: features -> latent vector -> reconstruction
vae_output = decoder(encoder(vae_input))

# End-to-end model
model = tf.keras.Model(inputs=vae_input, outputs=vae_output, name="vae")

In [12]:
# Defines the vae's optimizer
learning_rate = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipvalue=0.5)


# Defines the vae's loss function
def vae_loss(x, x_decoded):
    """Return the reconstruction loss: the squared error summed over the whole batch.

    Only the reconstruction term lives here. The KL term depends on the encoder's
    z_mean / z_log_var tensors rather than on (x, x_decoded), so it is attached to
    the model with add_loss below.
    """
    return tf.keras.backend.sum(tf.keras.backend.square(x - x_decoded))


# KL divergence between the encoder's distribution and a standard normal: summed
# over the latent dimensions, averaged over the batch. The original loss function
# computed this term but returned only the reconstruction loss, so it never
# influenced training; registering it here makes the model an actual VAE.
# NOTE: re-running this cell on the same `model` registers the term again;
# rebuild the model first.
kl_loss = -0.5 * (1 + z_log_var - tf.keras.backend.square(z_mean) - tf.keras.backend.exp(z_log_var))
kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=-1))
model.add_loss(kl_loss)

# Compiles the vae model (total loss = reconstruction loss + registered KL term)
model.compile(optimizer, loss=vae_loss)

### Model Training

In [None]:
# Training hyper-parameters
epochs = 500
batch_size = 1024

# Keep only the best checkpoint (lowest training loss), evaluated at the end of every epoch
_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="./models/user_cls/weights.h5",
    monitor='loss',
    save_freq='epoch',
    save_best_only=True,
)

# Stop once the loss has not improved by at least 0.05 for 5 epochs, restoring the best weights
_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.05,
    patience=5,
    restore_best_weights=True,
)

# Append the per-epoch metrics to a CSV file
_csv_logger = tf.keras.callbacks.CSVLogger(
    filename="./models/user_cls/history.csv",
    separator=',',
    append=True,
)

callbacks = [_checkpoint, _early_stopping, _csv_logger]

# Train the autoencoder to reproduce its own input
history = model.fit(
    x=X,
    y=X,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks,
)

# Persist the trained model
model.save("./models/user_cls")

### Model Predicting

In [None]:
# Restore the checkpointed weights
model.load_weights("./models/user_cls/weights.h5")

# Reconstruct every user's feature vector with the model
X_decoded = model.predict(X)

# Place original and reconstructed features side by side (column-wise)
X_full = np.concatenate((X, X_decoded), axis=1)

## K Nearest Neighbors (KNN)

### Calculating Neighbors

In [15]:
# Build a 6-nearest-neighbours index and fit it on the stacked features
nbrs = NearestNeighbors(n_neighbors=6).fit(X_full)

# Query the index with the very same points: distances and indices of each point's 6 neighbours.
# NOTE(review): because the query set equals the fitted set, each point is presumably
# returned as its own nearest neighbour (distance 0) — keep this in mind when reading averages.
dists, idx = nbrs.kneighbors(X_full)

### Defining Threshold

In [None]:
import seaborn as sns

# Plot appearance for the exported figure
_figure_style = {
    'figure.figsize': (15, 7),
    'axes.prop_cycle': plt.cycler(color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']),
    'axes.labelsize': 22,
    'axes.titlesize': 24,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 16,
    'legend.title_fontsize': 16,
    'axes.labelpad': 10,
    'axes.facecolor': '#EAEAF2',
}
plt.style.use('ggplot')
plt.rcParams.update(_figure_style)

# Export settings
saving_folder = "./latex"
saving_format = 'png'
dpi = 100

# Histogram of each user's mean neighbour distance; linear x axis, logarithmic y axis
_mean_dists = dists.mean(axis=1)
ax = sns.histplot(_mean_dists, bins=500, log_scale=(False, True), kde=False)
ax.set_xlabel("Average Distance")
ax.set_ylabel("User Count")

# Save the figure to disk, then display it
plt.savefig(
    f"{saving_folder}/imgs/distance_distribution.{saving_format}",
    format=saving_format,
    dpi=dpi,
    bbox_inches='tight',
)
plt.show()


In [None]:
# Mean-distance cut-off above which a user is treated as an anomaly
thold = 0.09

# Mean neighbour distance of every user
_mean_dists = dists.mean(axis=1)

# Share of users whose mean distance exceeds the cut-off
n_anomalies = (_mean_dists > thold).sum()
pct_anomalies = n_anomalies / dists.shape[0]

# Scatter of the mean distances with the anomaly band shaded in red
plt.scatter(range(dists.shape[0]), _mean_dists, s=3)
plt.axhspan(thold, max(_mean_dists), alpha=0.2, color='r')
plt.title(f"Anomaly Region ({100*pct_anomalies:.2f}% anomalies)")
plt.show()

### Classifying Users

In [23]:
# Users whose mean neighbour distance exceeds the threshold are labelled "Bot", all others "Human"
_is_bot = dists.mean(axis=1) > thold
df_users['type'] = np.where(_is_bot, "Bot", "Human")

### Saving Results

In [18]:
# Write the classified users table to disk
df_users.to_csv(path_or_buf="./datasets/enhanced/users2.csv.gz")