# Rehydrate and enrich a random set of user accounts with Botometer and BotometerLite scores

1. Make an output directory to write API results to and load libraries
2. Randomly sample n tweets from the [GWU tweet sets](https://tweetsets.library.gwu.edu/datasets)
3. Rehydrate the tweet IDs in the Python terminal using `twarc`
4. Sample n unique, English-speaking accounts
5. Enrich with Botometer scores
6. Enrich with BotometerLite scores
7. Merge results and output to csv

## 1. Make an output directory to write API results to and load libraries

In [1]:
# Load libraries
from twarc import Twarc
import pandas as pd
import time
import os
import random 
import twitter_col
import botometer

# Make an output directory to write API responses to.
# The nested 'botometer' folder is created unconditionally: os.makedirs builds
# any missing parent folders, and exist_ok=True makes the cell safe to re-run
# when some or all of the folders already exist. Without this, a pre-existing
# top-level folder that lacks 'botometer/' would never get the subfolder.
collection_name = 'covid_bot_scores'
os.makedirs(collection_name + '/botometer/', exist_ok=True)

## 2. Randomly sample n tweets from the [GWU tweet sets](https://tweetsets.library.gwu.edu/datasets)

In [2]:
# Randomly select up to 500 tweet IDs from each of the first 5 files.
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# before slicing; otherwise the seed below would not make the sample
# reproducible. Hidden entries (names starting with '.') are skipped because
# they are not tweet-ID files.
files = sorted(name for name in os.listdir('COVID_version7/')
               if not name.startswith('.'))
files = files[0:5]

# set seed = 1 for reproducibility and randomly sample tweet IDs from each file
random.seed(1)
tweet_ids = []
for file in files:
    with open('COVID_version7/' + file) as f:
        # one tweet ID per line; blank lines are not IDs and are dropped
        temp = [line.rstrip() for line in f if line.strip()]
    # random.sample raises ValueError if asked for more items than exist,
    # so cap the request at the number of IDs actually in the file
    temp = random.sample(temp, min(500, len(temp)))
    tweet_ids.extend(temp)
    print(len(tweet_ids))

500
1000
1500
2000
2500


In [3]:
# Save the sampled tweet IDs to a text file, one ID per line
ids_path = collection_name + '/sample_tweet_ids.txt'
with open(ids_path, 'w') as out:
    out.writelines("%s\n" % tweet_id for tweet_id in tweet_ids)

##  3. Rehydrate the tweet IDs in the Python terminal using `twarc`

1. `twarc configure`
2. `twarc hydrate covid_bot_scores/sample_tweet_ids.txt > COVID_sample_rehydrated.jsonl`

## 4. Sample n unique, English-speaking accounts

In [4]:
# Parse the rehydrated tweets produced by `twarc hydrate`.
# The path is relative to the notebook's working directory (like the other
# paths in this notebook) rather than a user-specific absolute path, so the
# notebook can run on another machine.
rehydrated_path = "COVID_sample_rehydrated.jsonl"
tweets = twitter_col.parse_twitter_json(rehydrated_path,
                                    to_csv = False, sentiment = False)

# keep only rows whose 'status_lang' is English
tweets = tweets[tweets['status_lang'] == 'en']

# NOTE(review): 'id_str' is treated here as the account identifier - confirm
# against twitter_col's output schema.
random.seed(1)
user_ids = tweets['id_str'].unique().tolist()
# cap the sample size so random.sample does not raise ValueError when fewer
# than 10 unique IDs are available
user_ids = random.sample(user_ids, min(10, len(user_ids)))
len(user_ids)

| |                   #                           | 20068 Elapsed Time: 0:00:01


10

## 5. Enrich with Botometer scores

To run this code you will need a Twitter developer account and a RapidAPI key. Each API result is written to an individual CSV file so that results are not lost if the kernel dies. Botometer allows users to check up to 17,280 Twitter accounts per day. However, I have never come close to reaching that limit because of the latency of the API; it generally takes me about 2 days to pull 10,000 accounts.

In [5]:
import botometer
import time

# Credentials are read from environment variables so that secrets never have
# to be pasted into (and accidentally shared with) the notebook. Each value
# falls back to '' when its variable is unset, which is the same placeholder
# value the notebook used before.
#   RAPIDAPI_KEY                 - RapidAPI key for Botometer
#   TWITTER_CONSUMER_KEY         - Twitter app consumer key
#   TWITTER_CONSUMER_SECRET      - Twitter app consumer secret
#   TWITTER_ACCESS_TOKEN         - Twitter access token
#   TWITTER_ACCESS_TOKEN_SECRET  - Twitter access token secret
rapidapi_key = os.environ.get('RAPIDAPI_KEY', '')
twitter_app_auth = {
    'consumer_key': os.environ.get('TWITTER_CONSUMER_KEY', ''),
    'consumer_secret': os.environ.get('TWITTER_CONSUMER_SECRET', ''),
    'access_token': os.environ.get('TWITTER_ACCESS_TOKEN', ''),
    'access_token_secret': os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', ''),
  }
# Botometer client used by the per-account scoring loop
bom = botometer.Botometer(wait_on_ratelimit=True,
                          rapidapi_key=rapidapi_key,
                          **twitter_app_auth)

In [6]:
# Score categories read from every raw/display score group of a response.
_SCORE_CATEGORIES = ('astroturf', 'fake_follower', 'financial', 'other',
                     'overall', 'self_declared', 'spammer')


def _flatten_botometer_result(result):
    """Flatten one Botometer response dict into a single-row DataFrame.

    Reads result['user']['user_data'] (id_str, screen_name), result['cap']
    (english, universal) and, for both 'raw_scores' and 'display_scores',
    every category in _SCORE_CATEGORIES under 'english' and 'universal'.

    Columns are emitted in this order: id_str, screen_name, cap_en, cap_un,
    then <category>_raw_en, <category>_display_en, <category>_raw_un,
    <category>_display_un.

    Raises KeyError if an expected key is missing from the response.
    """
    user_data = result['user']['user_data']
    row = {
        'id_str': [user_data['id_str']],
        'screen_name': [user_data['screen_name']],
        'cap_en': [result['cap']['english']],
        'cap_un': [result['cap']['universal']],
    }
    for language, suffix in (('english', 'en'), ('universal', 'un')):
        for kind in ('raw', 'display'):
            scores = result[kind + '_scores'][language]
            for category in _SCORE_CATEGORIES:
                row['%s_%s_%s' % (category, kind, suffix)] = [scores[category]]
    return pd.DataFrame(row)


botometer_full = []   # one single-row DataFrame per successfully scored account
failed_users = []     # accounts that could not be scored, kept for inspection/retry
for i, user in enumerate(user_ids, start=1):
    try:
        result = bom.check_account(user)
        temp = _flatten_botometer_result(result)
        print(i)
        # each result is written to its own file so progress survives a dead kernel
        timestr = time.strftime("%m%d%Y_%H%M")
        temp.to_csv(collection_name + '/botometer/' + str(user) + timestr + ".csv")
        botometer_full.append(temp)
    except Exception as exc:
        # Best effort: skip accounts that fail, but report why instead of
        # hiding the error. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop a long-running pull.
        failed_users.append(user)
        print('%s: skipped user %s (%s: %s)' % (i, user, type(exc).__name__, exc))

1
2
3
4
5
6
7
8
9
10


## 6. Enrich with BotometerLite scores

BotometerLite allows for batch queries of up to 20,000 accounts per day and completes in minutes.

In [8]:
# Helper that splits the user ids into batches.
# BotometerLite accepts up to 100 user IDs per query.
def batch(lst, n):
    """Yield consecutive slices of lst, each holding at most n items."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Read in the Botometer scores written to disk during the Botometer enrichment step.
botometer_dir = collection_name + '/botometer/'
# Only *.csv entries are result files; anything else in the folder is ignored.
# Sorting makes the read order deterministic.
files = sorted(name for name in os.listdir(botometer_dir) if name.endswith('.csv'))
df_list = []
for file in files:
    temp = pd.read_csv(botometer_dir + file,
                        dtype={'id_str': 'str'})
    df_list.append(temp)
if not df_list:
    # pd.concat would fail with a less helpful message on an empty list
    raise FileNotFoundError('No Botometer result files (*.csv) found in ' + botometer_dir)
df = pd.concat(df_list, ignore_index=True)
# An account scored in more than one run has several files; keep one row per
# account (the last in sorted filename order) so later merges on id_str do
# not multiply rows.
df = df.drop_duplicates(subset='id_str', keep='last')

user_ids = df['id_str'].unique().tolist()
batches = list(batch(user_ids, 100))
len(batches)

1

In [9]:
# Query BotometerLite once per batch of user IDs and stack the responses.
blt_twitter = botometer.BotometerLite(rapidapi_key=rapidapi_key, **twitter_app_auth)
# The loop variable is deliberately not called `batch`: that name belongs to
# the batching helper function, and rebinding it here would break any later
# call to batch(...).
blt_frames = []
for id_batch in batches:
    temp = blt_twitter.check_accounts_from_user_ids(id_batch)
    blt_frames.append(pd.DataFrame(temp))
# blt_scores is only ever a DataFrame; the per-batch list has its own name
blt_scores = pd.concat(blt_frames)

## 7. Combine Botometer and BotometerLite scores into a single dataframe

In [10]:
# Rename the BotometerLite columns to match the Botometer frame and make the
# join key a string on both sides
blt_scores = (
    blt_scores
    .rename(columns={'user_id': 'id_str', 'botscore': 'bot_lite'})
    .astype({'id_str': 'str'})
)

# columns kept in the merged score table: id, CAP scores, then one line per
# raw/display x english/universal score group, then the BotometerLite score
keep_cols = [
    'id_str', 'cap_en', 'cap_un',
    'astroturf_raw_en', 'fake_follower_raw_en', 'financial_raw_en', 'other_raw_en',
    'overall_raw_en', 'self_declared_raw_en', 'spammer_raw_en',
    'astroturf_display_en', 'fake_follower_display_en', 'financial_display_en', 'other_display_en',
    'overall_display_en', 'self_declared_display_en', 'spammer_display_en',
    'astroturf_raw_un', 'fake_follower_raw_un', 'financial_raw_un', 'other_raw_un',
    'overall_raw_un', 'self_declared_raw_un', 'spammer_raw_un',
    'astroturf_display_un', 'fake_follower_display_un', 'financial_display_un', 'other_display_un',
    'overall_display_un', 'self_declared_display_un', 'spammer_display_un',
    'bot_lite',
]

# merge bot scores: left-join BotometerLite onto Botometer, keep the selected
# columns, and drop accounts that have no BotometerLite score
merged_bot_scores = (
    df
    .merge(blt_scores[['id_str', 'bot_lite']], how='left', on='id_str')
    .loc[:, keep_cols]
    .dropna(subset=['bot_lite'])
)

# merge with the user profile info
has_scores = tweets['id_str'].isin(merged_bot_scores['id_str'])
tweets_filtered = tweets[has_scores]
final_df = tweets_filtered.merge(merged_bot_scores, how='left', on='id_str')
final_df.to_csv(collection_name + '/enriched_accounts.csv')