# PySpark for Social Bot Detection (Twibot-22 benchmark)

In [1]:
from pyspark.sql import SparkSession

# Create SparkSession from builder
# If the sample data you work with is small, you can remove the `.config` call
spark = SparkSession.builder.appName('Spark') \
            .config("spark.driver.memory", "10g") \
            .config("spark.driver.maxResultSize", "3g") \
            .getOrCreate()
sc = spark.sparkContext

23/04/14 17:09:39 WARN Utils: Your hostname, Mufins-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.16.3.56 instead (on interface en0)
23/04/14 17:09:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/14 17:09:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
small_tweet_df = spark.read.json('partitioned/tweet_1_par_1.json')
tweet_schema_small = small_tweet_df.schema

                                                                                

In [3]:
small_tweet_df.printSchema()

root
 |-- attachments: struct (nullable = true)
 |    |-- media_keys: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- poll_ids: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- author_id: long (nullable = true)
 |-- context_annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- domain: struct (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- entity: struct (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |-- conversation_id: long (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- annotations: array (nullable = true)
 |    |    |-- element: struct (c

In [4]:
full_tweet_df = spark.read.json('partitioned/tweet_1_par_1.json', schema=tweet_schema_small)
full_tweet_df.na.drop("all")

DataFrame[attachments: struct<media_keys:array<string>,poll_ids:array<string>>, author_id: bigint, context_annotations: array<struct<domain:struct<description:string,id:string,name:string>,entity:struct<description:string,id:string,name:string>>>, conversation_id: bigint, created_at: string, entities: struct<annotations:array<struct<end:bigint,normalized_text:string,probability:string,start:bigint,type:string>>,cashtags:array<struct<end:bigint,start:bigint,tag:string>>,hashtags:array<struct<end:bigint,indices:array<bigint>,start:bigint,tag:string,text:string>>,media:array<struct<display_url:string,expanded_url:string,id:bigint,id_str:string,indices:array<bigint>,media_url:string,media_url_https:string,sizes:struct<large:struct<h:bigint,resize:string,w:bigint>,medium:struct<h:bigint,resize:string,w:bigint>,small:struct<h:bigint,resize:string,w:bigint>,thumb:struct<h:bigint,resize:string,w:bigint>>,source_status_id:bigint,source_status_id_str:string,source_user_id:bigint,source_user_id_s

In [5]:
edge_df = spark.read.csv('edge.csv', header=True)
edge_df.show(5)

+--------------------+---------+--------------------+
|           source_id| relation|           target_id|
+--------------------+---------+--------------------+
| u980749991491682304|followers|u1480979504696864775|
|          u105387876|following|          u402576793|
|          u148520716|following|           u59653593|
|u1276438425457967110|following|u1389155636693381120|
|u1445432327367237638|following| u848348952084828160|
+--------------------+---------+--------------------+
only showing top 5 rows



In [6]:
edge_df.select('relation').distinct().show()



+----------+
|  relation|
+----------+
| followers|
| following|
|       own|
|    pinned|
|      post|
|   contain|
|   discuss|
| mentioned|
|  followed|
|      like|
|    quoted|
| retweeted|
|replied_to|
|membership|
+----------+



                                                                                

In [7]:
from pyspark.sql.types import StringType, LongType
import pyspark.sql.functions as F
import re

def preprocessing(row):
    URL_PATTERN = r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    rowlist = str(row).split()
    rowlist = [word.strip() for word in rowlist]
    rowlist = [word if not word.strip().startswith(
        '#') else "hashtagtag" for word in rowlist]
    rowlist = [word if not word.strip().startswith(
        '@') else "usertag" for word in rowlist]
    rowlist = [word.lower() for word in rowlist]
    rowlist = [re.sub(URL_PATTERN, "urltag", word) for word in rowlist]
    return " ".join(rowlist)

udfPreprocessing = F.udf(preprocessing, StringType())

In [8]:
preprocessed_tweets = full_tweet_df.withColumn("preprocessed_text", udfPreprocessing(F.col("text"))).select(["preprocessed_text", "author_id", "id"])

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

tokenizer = Tokenizer(inputCol="preprocessed_text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=50000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])
tfidfData = pipeline.fit(preprocessed_tweets).transform(preprocessed_tweets)

                                                                                

In [10]:
label_df = spark.read.csv("label.csv", header=True)
idProcessing = F.udf(lambda x: int(x[1:]), LongType())
label_df = label_df.withColumn("author_id", idProcessing(F.col("id")))
label_df = label_df.withColumn("label_bool", F.col("label") == "bot")
label_df = label_df.withColumn("label_int", F.col("label_bool").cast("integer"))
label_df = label_df.select(F.col("author_id"), F.col("label_int").alias("label"))

In [11]:
tfidfData_ = tfidfData.join(label_df, on="author_id", how='left')

In [12]:
label_df.show(5)

+-------------------+-----+
|          author_id|label|
+-------------------+-----+
|1217628182611927040|    0|
|         2664730894|    0|
|1266703520205549568|    0|
|1089159225148882949|    0|
|           36741729|    1|
+-------------------+-----+
only showing top 5 rows



In [13]:
tfidfData_.printSchema()

root
 |-- author_id: long (nullable = true)
 |-- preprocessed_text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)



In [14]:
# join_1_df = tfidfData.join(edge_df, tfidfData["id"] == edge_df["source_id"], how='left')
# join_full_df = join_1_df.join(tfidfData.withColumnRenamed("features", "referenced_features").withColumnRenamed("id", "referenced_id").withColumnRenamed("author_id", "referenced_author_id").alias("b"), F.col("a.target_id") == F.col("b.referenced_id"), how='left')
temp_df = tfidfData_.join(edge_df, tfidfData_["id"] == edge_df["source_id"], how="left")
output_df = temp_df.alias("tfidf1").join(tfidfData_.alias("tfidf2"), F.col("tfidf1.target_id") == F.col("tfidf2.id"), how='left')

In [15]:
output_df.printSchema()

root
 |-- author_id: long (nullable = true)
 |-- preprocessed_text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)
 |-- source_id: string (nullable = true)
 |-- relation: string (nullable = true)
 |-- target_id: string (nullable = true)
 |-- author_id: long (nullable = true)
 |-- preprocessed_text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)



In [16]:
output_df = output_df.select(
    F.col("tfidf1.author_id").alias("source_author_id"),
    F.col("tfidf1.label").alias("source_label"),
    F.col("tfidf1.features").alias("source_features"),
    F.col("relation"),
    F.col("tfidf2.author_id").alias("target_author_id"),
    F.col("tfidf2.label").alias("target_label"),
    F.col("tfidf2.features").alias("target_features")
)

In [17]:
output_df.cache()

DataFrame[source_author_id: bigint, source_label: int, source_features: vector, relation: string, target_author_id: bigint, target_label: int, target_features: vector]

In [18]:
output_df.is_cached

True

In [19]:
distinct_user_id = output_df.select("source_author_id").distinct().rdd.flatMap(lambda x: x).collect()
len(distinct_user_id)

                                                                                

1293

In [20]:
import numpy as np
from scipy.sparse import coo_matrix

def convert_sparse_vectors_to_tensor(sparse_vectors_list):
    # get the number of rows and columns of the sparse vectors
    num_rows = sparse_vectors_list[0].size
    num_cols = len(sparse_vectors_list)

    # create a list of tuples (i, j, v) containing the non-zero entries of the sparse vectors
    non_zero_entries = []
    for j, sparse_vector in enumerate(sparse_vectors_list):
        for i, v in zip(sparse_vector.indices, sparse_vector.values):
            non_zero_entries.append((i, j, v))

    # create a SciPy COO sparse matrix from the non-zero entries
    coo_mat = coo_matrix((np.array([entry[2] for entry in non_zero_entries]),
                          (np.array([entry[0] for entry in non_zero_entries]),
                           np.array([entry[1] for entry in non_zero_entries]))),
                          shape=(num_rows, num_cols))

    # create a PyTorch sparse tensor from the SciPy COO sparse matrix
    torch_sparse_tensor = torch.sparse_coo_tensor(torch.LongTensor([coo_mat.row.tolist(), coo_mat.col.tolist()]),
                                                  torch.FloatTensor(coo_mat.data.astype(np.float32)),
                                                  torch.Size(coo_mat.shape))

    # convert the PyTorch sparse tensor to a dense tensor
    torch_dense_tensor = torch_sparse_tensor.to_dense().T
    
    return torch_dense_tensor

In [25]:
# Retrieve a torch tweet tensor for every user
import pandas as pd
import torch

from tqdm import tqdm

tweets_torch = []
tweet_labels_torch = []
relationships = []
owned_tweets_indicator = []

users = pd.read_csv('label.csv')
# for user_id in tqdm(users['id']):
for user_id in tqdm(distinct_user_id):
# if True:
    # user_id = int(user_id[1:])
    tweets_from_user = output_df.filter(output_df["source_author_id"] == user_id)
    # print(user_id)
    # tweets_from_user = output_df.filter(output_df["source_author_id"] == 3078703231)
    tweets_from_user_df = tweets_from_user.toPandas()
    tweet_count = len(tweets_from_user_df)

    owned_tweets = tweets_from_user_df["source_features"]
    referenced_tweets = tweets_from_user_df["target_features"].dropna()

    owned_labels = tweets_from_user_df["source_label"]
    referenced_labels = tweets_from_user_df["target_label"].dropna()

    ref_tweet_count = len(referenced_tweets)
    ref_label_count = len(referenced_labels)
    owned_tweet_count = len(owned_tweets)
    assert ref_tweet_count == ref_label_count

    # tfidf_array_owned = owned_tweets.rdd.flatMap(lambda x: x).collect()
    if ref_tweet_count != 0:
        # tfidf_array_referenced = referenced_tweets.rdd.flatMap(lambda x: x).collect()
        # tweets_torch.append(torch.tensor(tfidf_array_owned + tfidf_array_referenced))
        tweets_torch.append(convert_sparse_vectors_to_tensor(owned_tweets.tolist() + referenced_tweets.tolist()))
    elif tweet_count != 0:
        # tweets_torch.append(torch.tensor(tfidf_array_owned))
        tweets_torch.append(convert_sparse_vectors_to_tensor(owned_tweets.tolist()))
    else:
        tweets_torch.append(torch.empty([0, 50000]))

    # label_array_owned = owned_labels.rdd.flatMap(lambda x: x).collect()
    if ref_label_count != 0:
        # label_array_referenced = referenced_labels.rdd.flatMap(lambda x: x).collect()
        tweet_labels_torch.append(torch.tensor(owned_labels.tolist() + referenced_labels.tolist()))
    elif tweet_count != 0:
        tweet_labels_torch.append(torch.tensor(owned_labels.tolist()))
    else:
        tweet_labels_torch.append(torch.tensor([]))

    if ref_tweet_count != 0:
        # tweets_from_user = tweets_from_user.withColumn("origin_tweet_new_id", F.monotonically_increasing_id())
        # max_tweet_id = tweets_from_user.agg({"x": "max"}).collect()[0]
        # tweets_from_user = tweets_from_user.dropna(subset="target_features")
        # tweets_from_user = tweets_from_user.withColumn("referenced_tweet_new_id", max_tweet_id + F.monotonically_increasing_id())
        # tweets_from_user = tweets_from_user.select(["origin_tweet_new_id"])
        # rel_array = owned_tweets.rdd.flatMap(lambda x: x).collect()

        tweets_from_user_df["origin_tweet_new_id"] = tweets_from_user_df.index
        max_tweet_id = max(tweets_from_user_df.index)
        tweets_from_user_df = tweets_from_user_df.dropna(subset="target_features").reset_index()
        tweets_from_user_df["referenced_tweet_new_id"] = tweets_from_user_df.index + max_tweet_id + 1
        rel_array = [(i, j) for _, (i, j) in tweets_from_user_df[["origin_tweet_new_id", "referenced_tweet_new_id"]].iterrows()]

        relationships.append(torch.tensor(rel_array))
        
        yes_temp = [1] * (max_tweet_id + 1)
        no_temp = [0] * len(tweets_from_user_df)
        full_temp = yes_temp + no_temp
        owned_tweets_indicator.append(torch.tensor(full_temp))
    else:
        relationships.append(torch.empty([0, 2]))
        owned_tweets_indicator.append(torch.tensor([1] * owned_tweet_count))

100%|████████████████████████████████████████████████████████████████████████████████████| 1293/1293 [02:34<00:00,  8.39it/s]


In [26]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [27]:
class VariedDataset(torch.utils.data.Dataset):
    
    def __init__(self, torch_list):
        self.torch_list = torch_list
        
    def __getitem__(self, idx):
        if idx >= len(self):
            raise ValueError(f"Index exceeds length of the dataset of {len(self)}")
        return self.torch_list[idx]
    
    def __len__(self):
        return len(self.torch_list)

In [28]:
def collate_fn_padd(batch):
    """
    Padds batch of variable length

    Note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    """
    ## Get sequence lengths
    lengths = [t.shape[0] for t in batch]
    print(lengths)
    print(batch[0].shape)
    try:
        n_features = batch[0].shape[1]
        print(n_features)
    except:
        n_features = 1
    max_length = max(lengths)
    if max_length == 0:
        max_length += 1
    batch_size = len(lengths)

    padded_tensor = torch.zeros(batch_size, max_length, n_features, dtype=torch.float32)
    for i, val in enumerate(batch):
        l = lengths[i]
        padded_tensor[i, :l] = val
    
    return padded_tensor

In [29]:
# Missing max_id_user_tweet_rela
dataset = VariedDataset(owned_tweets_indicator)
dataloader = DataLoader(dataset, batch_size=8, collate_fn=collate_fn_padd)
torch.save(dataloader, "dataloaders/sample/user_tweet_ind.pt")

In [None]:
dataset = VariedDataset(tweets_torch)
dataloader = DataLoader(dataset, batch_size=8, collate_fn=collate_fn_padd)
torch.save(dataloader, "dataloaders/sample/tweet.pt")

In [None]:
dataset = VariedDataset(tweet_labels_torch)
dataloader = DataLoader(dataset, batch_size=8, collate_fn=collate_fn_padd)
torch.save(dataloader, "dataloaders/sample/tweet_labels.pt")

In [None]:
dataset = VariedDataset(relationships)
dataloader = DataLoader(relationships, batch_size=8, collate_fn=collate_fn_padd)
torch.save(dataloader, "dataloaders/sample/relationships.pt")

In [None]:
import pandas as pd

user_sample_df = pd.DataFrame(distinct_user_id, columns=['id'])
user_sample_df["id"] = "u" + user_sample_df["id"].astype('str')
user_sample_df.to_csv("sample_user.csv", index=False)

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
