## Preprocessing script

This notebook chooses a (seeded) random subset of human and bot accounts from the TwiBot user list and filters tweets down to those that are written by, or mention, accounts in that subset.

In [1]:
import pandas, numpy
import getpass

# The username must be known before the dataset path is built, because the
# default path below embeds it.
user = getpass.getuser()

# Change dirpath to the location of the TwiBot22 dataset on your device.
# This has to be an f-string: a plain string would keep the literal text
# "{user}" in the path instead of substituting the username.
dirpath = f'/scratch/{user}/datasets/TwiBot22/'

# One row per account; the cells below use the `id` and `label` columns.
labels = pandas.read_csv(f'{dirpath}/label.csv')

In [3]:
# Count accounts labelled 'human'; every remaining row is counted as a bot.
n_humans = int((labels['label'] == 'human').sum())
n_bots = len(labels) - n_humans
(n_humans, n_bots, len(labels))

(860057, 139943, 1000000)

In [4]:
# Replace the full-dataset counts with the per-class sample size used below.
n_humans = n_bots = 10000

In [5]:
# Fixed seed so the same accounts are drawn on every run.
seed = 42

# Draw each class separately with the same seed.
human_nodes = labels.loc[labels['label'] == 'human'].sample(
    n=n_humans, random_state=seed
)
bot_nodes = labels.loc[labels['label'] == 'bot'].sample(
    n=n_bots, random_state=seed
)

# Humans first, then bots; the original row index is kept.
new_labels = pandas.concat([human_nodes, bot_nodes])
len(new_labels)

20000

In [6]:
# Drop the first character of each id string and convert the rest to an integer.
# NOTE(review): presumably the dropped character is a 'u' prefix on TwiBot-22
# user ids - confirm against label.csv.
#
# The dtype guard makes this cell safe to re-run: once the column is integer,
# the `.str` accessor would raise. 'int64' is spelled out because some ids
# exceed the 32-bit range and plain `int` is 32-bit on some platforms.
if not pandas.api.types.is_integer_dtype(new_labels['id']):
    new_labels['id'] = new_labels['id'].str[1:].astype('int64')

In [7]:
# Preview the first five sampled accounts.
new_labels.head(5)

Unnamed: 0,id,label
978570,17140361,human
211001,1165940942106312704,human
826938,133482732,human
433434,284933167,human
300833,757907959989080064,human


In [8]:
# old dataframe is called tweet_0: 
# each row is a tweet:
# columns: author_id (int), entities (dict), public_metrics (dict)
# structure of entities: {'hashtags': <list of hashtags>, 'symbols': <list of symbols>, 'user_mentions': <list of dicts>}
#      structure of each dict in user_mentions: {'id': <int>, 'name': <str>}
# structure of public_metrics: {'retweet_count': <int>, 'reply_count': <int>, 'like_count': <int>, 'quote_count': <int>}

# I have a data frame called new_labels where every row is a user:
# columns: id (int), label (str)

# new data frame called new_tweets: 
# each row is a tweet:
# this data frame should only include tweets whose author_id is in new_labels or whose mentioned_ids contain at least one id in new_labels:
# columns: author_id, mentioned_ids, mentions count, hashtag count, symbols count, urls count, in reply to user id, retweet count, reply count, like count, quote count

In [9]:
def is_valid_tweet(row, valid_ids=None):
    """Return True if a tweet was written by, or mentions, a selected user.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'author_id'`` and ``'entities'``. ``entities`` is
        expected to be a dict that may hold a ``'user_mentions'`` list of
        dicts with an ``'id'`` key. Any non-dict value (None, NaN, ...) is
        treated as "no mentions".
    valid_ids : container of user ids, optional
        Ids to match against. A set is recommended for fast membership
        tests. When omitted, the notebook-level ``valid_authors`` is used,
        so ``tweets.apply(is_valid_tweet, axis=1)`` keeps working.

    Returns
    -------
    bool
    """
    if valid_ids is None:
        # Looked up at call time, so the set may be defined in a later cell.
        valid_ids = valid_authors

    # Cheap check first: the author is one of the selected users.
    if row['author_id'] in valid_ids:
        return True

    # Missing entities can arrive as None or as a truthy non-dict such as
    # NaN; a plain truthiness check would let NaN through and crash on .get.
    entities = row['entities']
    if not isinstance(entities, dict):
        return False

    # The key may be absent or explicitly null.
    mentions = entities.get('user_mentions')
    if not isinstance(mentions, (list, tuple)):
        return False

    return any(mention['id'] in valid_ids for mention in mentions)

In [10]:
import pandas as pd

# Step 1: Collect the ids of the sampled users into a set for fast lookups
valid_authors = {user_id for user_id in new_labels['id']}

In [12]:
# Step 2: Process tweets to extract required data
def _entity_list(entities, key):
    """Return ``entities[key]`` if it is a list/tuple, otherwise an empty list."""
    value = entities.get(key)
    return value if isinstance(value, (list, tuple)) else []


def process_tweet(row):
    """Flatten one raw tweet row into the columns kept for analysis.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'author_id'``, ``'entities'``, ``'public_metrics'``,
        ``'text'``, ``'in_reply_to_user_id'``, ``'created_at'``,
        ``'conversation_id'`` and ``'id'``. ``entities`` may be a dict or a
        missing value (None, NaN, ...); ``public_metrics`` must be a dict
        with the four ``*_count`` keys read below.

    Returns
    -------
    pandas.Series
        Author id, the list of mentioned user ids, entity counts
        (mentions / hashtags / symbols / urls), the four public-metric
        counts, and the pass-through fields ``text``,
        ``in_reply_to_user_id``, ``created_at``, ``conversation_id``, ``id``.

    Raises
    ------
    KeyError, TypeError
        If ``public_metrics`` is missing or lacks an expected key. This is
        deliberately left strict rather than silently filled with zeros.
    """
    # Missing entities may be None or a truthy non-dict such as NaN;
    # normalise both to an empty dict so every count below becomes 0.
    entities = row['entities']
    if not isinstance(entities, dict):
        entities = {}
    public_metrics = row['public_metrics']

    # Entity lists may be absent or explicitly null; _entity_list maps both
    # to [] so len() never sees None.
    mentioned_ids = [
        mention['id'] for mention in _entity_list(entities, 'user_mentions')
    ]

    return pd.Series({
        'author_id': row['author_id'],
        'mentioned_ids': mentioned_ids,
        'mention_count': len(mentioned_ids),
        'hashtag_count': len(_entity_list(entities, 'hashtags')),
        'symbols_count': len(_entity_list(entities, 'symbols')),
        'urls_count': len(_entity_list(entities, 'urls')),
        'retweet_count': public_metrics['retweet_count'],
        'reply_count': public_metrics['reply_count'],
        'like_count': public_metrics['like_count'],
        'quote_count': public_metrics['quote_count'],
        'text': row['text'],
        'in_reply_to_user_id': row['in_reply_to_user_id'],
        'created_at': row['created_at'],
        'conversation_id': row['conversation_id'],
        'id': row['id'],
    })



## The following four cells can be repeated for all tweet files.

In [13]:
# Pick which tweet shard to load (tweet_<n>.json) and read it in full.
tweet_file = 1
tweet_path = f'/scratch/{user}/datasets/TwiBot22/tweet_{tweet_file}.json'
tweets = pandas.read_json(tweet_path)

In [14]:
# Row-wise boolean mask: True where the tweet is by, or mentions, a sampled user.
keep_mask = tweets.apply(is_valid_tweet, axis=1)
filtered_tweet = tweets[keep_mask]

In [15]:
# Flatten every kept tweet; each returned Series becomes one output row.
processed_tweet = filtered_tweet.apply(process_tweet, axis='columns')

In [16]:
# Write this shard's processed tweets next to the raw dataset files.
output_path = f'{dirpath}/processed_tweet{tweet_file}.csv'
processed_tweet.to_csv(output_path)