In [None]:
# dependencies
import re
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt

from zipfile import ZipFile
from matplotlib.pyplot import figure
from collections import Counter, defaultdict
from itertools import combinations

In [None]:
# path to the zip archive that a later cell extracts (relative to the working directory)
path = "./archive.zip"

In [None]:
# extracting the compressed data into the current working directory
# (the handle is called `archive` so the built-in `zip` is not shadowed for later cells)
with ZipFile(path, 'r') as archive:
  archive.extractall()

In [None]:
# loading the data: four CSV files read from the working directory
# (presumably the files extracted from the archive — verify against its contents)
userData = pd.read_csv("./anonymized_disqus_user_data.csv")
botComments = pd.read_csv("./botnet_comments.csv")
# NOTE(review): potentialBot and metaData are not referenced in the cells visible here
potentialBot = pd.read_csv("./potential_social_botnets.csv")
metaData = pd.read_csv("./user_meta_data.csv")

In [None]:
# converting message to string as some of them are float, etc.
# NOTE(review): missing values presumably end up as literal text such as 'nan' — confirm this is intended
botComments.message = botComments.message.astype(str)

In [None]:
# removing the html tags: every shortest "<...>" span in a message is deleted
def _stripTags(text):
  """Return `text` with each non-greedy '<...>' match removed."""
  return re.sub('<.*?>', '', text)

botComments["message"] = botComments["message"].apply(_stripTags)

In [None]:
# installing sentence transformers (https://github.com/UKPLab/sentence-transformers)
!pip install -U sentence-transformers

In [None]:
# dependencies
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [None]:
# loading the precomputed corpusEmbeddings from disk
# NOTE(review): no cell visible here creates this .npy file; later cells index botComments by
# embedding row number, so row i presumably corresponds to comment i — TODO confirm
corpusEmbeddings = np.load("./corpusEmbeddings.npy")

In [None]:
# dimensions of the loaded embedding array
corpusEmbeddings.shape

In [None]:
# Agglomerative (hierarchical) clustering: the number of clusters is not fixed up front
# (n_clusters=None); a distance threshold of 5.9 decides where merging stops
clusteringModel = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=5.9,
).fit(corpusEmbeddings)
# one cluster label per embedding row
clusterAssignment = clusteringModel.labels_

In [None]:
# processing clustered data: for every cluster label collect the row numbers of its
# comments (clusteredSentences) and the set of their authors (clusteredUsers)
clusteredSentences = defaultdict(list)
clusteredUsers = defaultdict(set)
# pull the username column out once; a per-row Series lookup inside the loop is slow
_commentAuthors = botComments.username.to_numpy()
for sentenceId, clusterId in enumerate(clusterAssignment):
    clusteredSentences[clusterId].append(sentenceId)
    # sentenceId is a row position produced by enumerate, so index positionally
    clusteredUsers[clusterId].add(_commentAuthors[sentenceId])

In [None]:
# number of cluster labels that have at least one author recorded
len(clusteredUsers)

In [None]:
# messages of the comments assigned to cluster 2
botComments.message[clusteredSentences[2]]

In [None]:
# messages of the comments assigned to cluster 3
botComments.message[clusteredSentences[3]]

In [None]:
# messages of the comments assigned to cluster 100
# (clusteredSentences is a defaultdict(list): asking for a label that was never assigned
# silently inserts an empty list for it)
botComments.message[clusteredSentences[100]]

In [None]:
# print each cluster's set of usernames followed by a blank line; a plain loop avoids
# building a throwaway list of None values just to trigger the prints
for users in clusteredUsers.values():
    print(users, "\n")

In [None]:
# cluster based on same password
def clusterByPassword(botDF, userData):
  """Group bot usernames by the password id they share.

  Parameters
  ----------
  botDF : pandas.DataFrame
      Needs a 'username' column; all other columns are ignored.
  userData : pandas.DataFrame
      Needs 'username' and 'password_id' columns.

  Returns
  -------
  collections.defaultdict
      Maps a running integer (0, 1, ...), one per distinct 'password_id' in the
      order the groupby yields them, to the set of usernames that appear in both
      frames and have that 'password_id'.
  """
  # one row per bot is enough: merging every comment row only repeats usernames
  bots = botDF[['username']].drop_duplicates()
  botsWithDetails = pd.merge(bots, userData, on='username')
  # aggregate only the column that is returned instead of building sets for every column
  usernamesByPassword = botsWithDetails.groupby(by='password_id')['username'].agg(set)
  # still a defaultdict(list), as before, so callers that look up an unknown id keep getting []
  return defaultdict(list, enumerate(usernamesByPassword.values))

In [None]:
# defining function to get true/predicted edges
def getEdges(clusters):
  """Return every undirected pair of members that share a cluster.

  Parameters
  ----------
  clusters : mapping
      Maps a cluster id to an iterable of members (e.g. usernames).

  Returns
  -------
  set of 2-tuples
      One tuple per pair, with the two members in sorted order, so the same pair
      found in several clusters is stored only once.
  """
  allEdges = set()
  for cluster in clusters.values():
    # de-duplicate and sort the members first: without this the tuple order follows
    # iteration order, so one pair could show up as both (a, b) and (b, a), and a
    # repeated member would produce a self-pair. Members must be mutually orderable,
    # which getMetric already requires when it sorts each edge.
    allEdges.update(combinations(sorted(set(cluster)), 2))
  return allEdges

In [None]:
# metric definition
def getMetric(n, trueEdges, predictedEdges):
  """Score predicted edges against true edges.

  Both edge collections are first normalised so that each edge is a sorted tuple.
  The score is (share of true edges that were predicted) multiplied by
  1 - (false positives * true positives) / (n * (n - 1) / 2).
  The edge counts are printed as a side effect.

  Parameters
  ----------
  n : number
      Node count used to compute the number of possible pairs.
  trueEdges, predictedEdges : iterable of 2-element iterables

  Returns
  -------
  float

  Raises
  ------
  ZeroDivisionError
      If there are no true edges, or if n * (n - 1) / 2 is zero.
  """
  def canonical(edges):
    # order inside an edge does not matter: store each one as a sorted tuple
    return {tuple(sorted(edge)) for edge in edges}

  possiblePairs = n*((n - 1)/2)
  actual = canonical(trueEdges)
  predicted = canonical(predictedEdges)
  hits = actual & predicted
  misses = predicted - actual
  print("TE/PE:", len(actual), len(predicted))
  print("TP/FP:", len(hits), len(misses))
  hitShare = len(hits)/len(actual)
  penalty = (len(misses)*len(hits))/possiblePairs
  return hitShare*(1 - penalty)

In [None]:
# reference clusters: commenting usernames grouped by the password_id they share
expectedCluster = clusterByPassword(botComments, userData)

In [None]:
# every pair of users that share a password-based cluster
trueEdges = getEdges(expectedCluster)

In [None]:
# every pair of users with comments in the same embedding cluster
predictedEdges = getEdges(clusteredUsers)

In [None]:
# score the embedding-based edges against the password-based ones
# NOTE(review): 15163 is passed as n, the node count behind the "possible pairs" term;
# presumably the number of distinct users — TODO confirm and derive it from the data
getMetric(15163, trueEdges, predictedEdges)

In [None]:
 # shape of the merge (on 'username') between the distinct comment authors and userData
 pd.merge(botComments[['username']].drop_duplicates(), userData, on='username').shape

In [None]:
# embedding array dimensions again, for comparison with the comment counts in the next cells
corpusEmbeddings.shape

In [None]:
# shape of the message column after dropping duplicate texts
botComments[['message']].drop_duplicates().shape

In [None]:
# rows and columns of the full comment table
botComments.shape

In [None]:
# shape of the embedding array after removing duplicate rows
np.unique(corpusEmbeddings, axis=0).shape