In [None]:
# dependencies
import re
import pandas as pd
import numpy as np
import torch

import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt

from zipfile import ZipFile
from matplotlib.pyplot import figure
from collections import Counter, defaultdict
from itertools import combinations

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics.pairwise import euclidean_distances as euclidean

In [None]:
# installing Character BERT
!git clone https://github.com/helboukkouri/character-bert.git

In [None]:
# installing Transformers for CharacterBert
! pip install transformers

## Loading the Data

In [None]:
# loading the data (both paths are relative to the current working directory)
userData = pd.read_csv("anonymized_disqus_user_data.csv")
botComments = pd.read_csv("botnet_comments.csv")
# the two loads below are currently disabled
# potentialBot = pd.read_csv("potential_social_botnets.csv")
# metaData = pd.read_csv("user_meta_data.csv")

In [None]:
# cast every message to str -- the column also holds non-string values (floats, etc.)
# NOTE(review): missing values (NaN) presumably end up as the literal text "nan" -- confirm this is intended
botComments["message"] = botComments["message"].astype(str)

In [None]:
# strip HTML tags: every shortest "<...>" run is deleted, the text around it is kept
botComments["message"] = botComments["message"].map(
    lambda markup: re.sub('<.*?>', '', markup)
)

## Bot Detection and Metrics

### Clustering UserNames

In [None]:
# switch into the cloned repo: the imports, the download script and the model
# path used in the following cells are all relative to it
%cd ./character-bert/

In [None]:
# print the working directory to confirm the switch
!pwd

In [None]:
from modeling.character_bert import CharacterBertModel
from utils.character_cnn import CharacterIndexer

In [None]:
# run the repo's download script for the 'general_character_bert' model
# (presumably saved under ./pretrained-models/, which the next cell loads from -- confirm)
!python download.py --model='general_character_bert'

In [None]:
# Load the downloaded 'general_character_bert' checkpoint from the local checkout
model = CharacterBertModel.from_pretrained('./pretrained-models/general_character_bert/')

In [None]:
# np.array of all unique usernames
x = botComments.username.drop_duplicates().to_numpy()

# pack the usernames into rows of 512 names (at most 60 rows); names that do
# not fill a complete row are not part of this batch
# NOTE(review): 512 is presumably the model's maximum sequence length -- confirm
n_batch = min(x.shape[0]//512, 60)
batch = x[:n_batch*512].reshape((n_batch, 512))


# Convert token sequence into character indices
indexer = CharacterIndexer()
batch_ids = indexer.as_padded_tensor(batch)

# Feed batch to CharacterBERT & get the embeddings.
# This is inference only: no_grad() keeps autograd from retaining every
# intermediate activation for a backward pass that never happens.
with torch.no_grad():
    embeddings_for_batch, _ = model(batch_ids)
embeddings_for_x = embeddings_for_batch

In [None]:
# collapse all leading axes so the result is 2-D: one row per embedded token
# (presumably one per username -- confirm), last axis = embedding width
userNameEmbeddings = (
    embeddings_for_x
    .detach()
    .reshape(-1, embeddings_for_x.size(-1))
    .numpy()
)

In [None]:
# sanity check: (number of embedding rows, embedding width)
userNameEmbeddings.shape

In [None]:
# Agglomerative (hierarchical) clustering: the number of clusters is not fixed
# up front (n_clusters=None); it is determined by the distance threshold
clusteringModel = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=11.9,
).fit(userNameEmbeddings)
clusterAssignment = clusteringModel.labels_

In [None]:
# regroup the flat label array into {cluster label -> set of usernames};
# position i in clusterAssignment belongs to username x[i]
clusteredUsers = defaultdict(set)
for userId in range(len(clusterAssignment)):
    clusterId = clusterAssignment[userId]
    clusteredUsers[clusterId].add(x[userId])

In [None]:
# number of clusters found
len(clusteredUsers)

In [None]:
# full {cluster label -> usernames} mapping
clusteredUsers

In [None]:
# spot check: Euclidean distance between two adjacent embedding rows (991 and 992)
euclidean([userNameEmbeddings[991]], [userNameEmbeddings[992]])

In [None]:
# same pair of rows, cosine similarity
cosine([userNameEmbeddings[991]], [userNameEmbeddings[992]])

### Metrics

In [None]:
# cluster based on same password
def clusterByPassword(botDF, userData):
  """Group bot usernames that share a password.

  Args:
    botDF: DataFrame with a 'username' column. Repeated usernames (e.g. one
      row per comment) are fine; they are de-duplicated before the join.
    userData: DataFrame with 'username' and 'password_id' columns.

  Returns:
    dict mapping a running integer id (0, 1, ...) to the set of usernames
    that share one password_id, in sorted password_id order. Usernames
    without a match in userData are left out (inner join).
  """
  # One row per bot is enough: joining the full comment table would only
  # repeat the same (username, password_id) pairs and push every unrelated
  # column through set().
  bots = botDF[['username']].drop_duplicates()
  botsWithDetails = pd.merge(bots, userData, on='username')
  usersPerPassword = botsWithDetails.groupby(by='password_id')['username'].agg(set)
  return dict(enumerate(usersPerPassword.values))

In [None]:
# defining function to get true/predicted edges
def getEdges(clusters):
  """Return every within-cluster pair as an edge.

  Args:
    clusters: mapping whose values are collections of members.

  Returns:
    set of 2-tuples, one per unordered pair of members that share a cluster.
    Clusters with fewer than two members contribute nothing.
  """
  return {
    pair
    for members in clusters.values()
    for pair in combinations(members, 2)
  }

In [None]:
# metric definition
def getMetric(n, trueEdges, predictedEdges):
  """Score predicted co-cluster edges against the true ones.

  Edges are undirected: each pair is sorted before comparison, so (a, b) and
  (b, a) count as the same edge.

  Args:
    n: number of nodes; n*(n-1)/2 is used as the number of possible edges.
    trueEdges: iterable of 2-item pairs that really belong together.
    predictedEdges: iterable of 2-item pairs produced by the clustering.

  Returns:
    (TP / |trueEdges|) * (1 - FP*TP / totalEdges), where TP is the number of
    predicted edges that are true and FP the number that are not.
    Returns 0.0 when there are no true edges or n < 2, where the formula
    would otherwise divide by zero.
  """
  totalEdges = n*((n - 1)/2)
  trueEdges = set(map(tuple, map(sorted, trueEdges)))
  predictedEdges = set(map(tuple, map(sorted, predictedEdges)))
  truePositives = trueEdges & predictedEdges
  falsePositives = predictedEdges - trueEdges
  print("TE/PE:", len(trueEdges), len(predictedEdges))
  print("TP/FP:", len(truePositives), len(falsePositives))
  # degenerate input: nothing to recover, or no possible edges at all
  if not trueEdges or totalEdges <= 0:
    return 0.0
  # share of the true edges that were recovered
  accuracy = len(truePositives)/len(trueEdges)
  penalizingFactor = 1 - ((len(falsePositives)*len(truePositives))/totalEdges)
  return accuracy*penalizingFactor

In [None]:
# reference clusters: bots grouped by shared password
expectedCluster = clusterByPassword(botComments, userData)

In [None]:
# every pair of users that share a reference cluster
trueEdges = getEdges(expectedCluster)

In [None]:
# every pair of users placed in the same cluster by the username clustering
predictedEdges = getEdges(clusteredUsers)

In [None]:
# node count for the metric: rows obtained by joining the unique bot usernames with userData
n = pd.merge(botComments[['username']].drop_duplicates(), userData, on='username').shape[0]

In [None]:
# score the predicted edges against the reference edges
getMetric(n, trueEdges, predictedEdges)

In [None]:
# username -> embedding row; pairing stops at the shorter of the two sequences
userNameToEmbedding = dict(zip(x, userNameEmbeddings))

In [None]:
def getEmbedding(username):
    """Return the embedding stored for `username` in the global userNameToEmbedding.

    Raises:
        KeyError: if the username has no entry in userNameToEmbedding.
    """
    return userNameToEmbedding[username]

In [None]:
# embedding of the author of every comment (one entry per row of botComments)
# NOTE(review): userNameToEmbedding only receives the leftover usernames in a
# later cell; on a top-to-bottom run those names presumably raise KeyError here -- confirm cell order
embeddings = botComments.username.apply(getEmbedding)

In [None]:
# Series of per-row vectors -> NumPy array holding those vectors
embeddings = embeddings.to_numpy()

In [None]:
# stack the per-row vectors along a new first axis into one array
embeddings = np.stack(embeddings)

In [None]:
# presumably (number of comments, embedding width) -- TODO confirm
embeddings.shape

In [None]:
# number of unique usernames
x.shape

In [None]:
# usernames left over after the complete 512-name rows: the first
# n_batch*512 names were embedded earlier, so start right after them
# (derived from n_batch instead of a hard-coded offset)
rem_batch = [x[n_batch*512:]]

batch_ids_ = indexer.as_padded_tensor(rem_batch)

# Feed batch to CharacterBERT & get the embeddings
# (inference only, so no autograd graph is needed)
with torch.no_grad():
    embeddings_for_batch_, _ = model(batch_ids_)
# take the single row of THIS batch's output (trailing underscore);
# embeddings_for_batch without the underscore is the earlier batch
embeddings_for_x_ = embeddings_for_batch_[0]

In [None]:
# detach from the autograd graph and convert to a NumPy array
# (rebinds the same name, so this cell is meant to run once per forward pass)
embeddings_for_x_ = embeddings_for_x_.detach().numpy()

In [None]:
# rem_batch wraps the leftover usernames in a one-element list, so pair the
# embeddings with the names inside it; zipping the wrapper itself would use
# the whole (unhashable) array as a single dict key.
# Must run before the cell that flattens rem_batch into a plain list.
for k, v in zip(rem_batch[0], embeddings_for_x_):
    userNameToEmbedding[k] = v

In [None]:
# unwrap the one-element list into a plain Python list of usernames
rem_batch = rem_batch[0].tolist()

In [None]:
# (rows, columns) of the comment table
botComments.shape

In [None]:
# step back up one directory, out of the character-bert checkout entered earlier
%cd ..

In [None]:
# persist the stacked embeddings array to disk
np.save("userEmbeddings.npy", embeddings)