# Rehydrate and enrich a random set of user accounts with Botometer and BotometerLite scores

1. Make an output directory to write API results to and load libraries
2. Randomly sample n tweets from the [GWU tweet sets](https://tweetsets.library.gwu.edu/datasets)
3. Rehydrate the tweet IDs in the Python terminal using `twarc`
4. Sample n unique, English-speaking accounts
5. Enrich with Botometer scores
6. Enrich with BotometerLite scores
7. Merge results and output to csv

## 1. Rehydrate tweet IDs from Bot-wiki

In [None]:
import pandas as pd
import twitter_col
import os
# Load the Bot-wiki tweet dump and flatten each tweet's nested `user` object
# into its own columns.
bots = pd.json_normalize(pd.read_json('data/botwiki-2019_tweets.json')['user'])

# Write the `id` column as a bare list of IDs (no header row, no index column)
# so the file can be passed to the rehydration command line tool.
bots.loc[:, 'id'].to_csv("data/bot_wiki_ids.txt", index=False, header=None)

1. `twarc configure`
2. `twarc hydrate data/bot_wiki_ids.txt > data/bot_wiki_rehydrated.jsonl`

## 2. Read in rehydrated data

In [None]:
# Parse the rehydrated JSONL file in memory; CSV export and sentiment scoring
# are both switched off.
tweets = twitter_col.parse_twitter_json(
    "/Users/dankoban/Documents/bot_detection/data/bot_wiki_rehydrated.jsonl",
    to_csv=False,
    sentiment=False,
)

# Keep only the rows whose status language is English.
tweets = tweets.loc[tweets['status_lang'] == 'en']

# Distinct id_str values, in order of first appearance.
user_ids = tweets['id_str'].drop_duplicates().tolist()
len(user_ids)

## 3. Enrich with Botometer scores

To run this code you will need a Twitter developer account and a RapidAPI key. Each API result is written to an individual CSV file so that results are not lost if the kernel dies. Botometer allows users to check up to 17,280 Twitter accounts per day. However, I have never come close to reaching that limit due to the latency of the API. It generally takes me about 2 days to pull 10,000 accounts.

In [None]:
import botometer
import time

# Credentials are read from environment variables so that secrets never have
# to be typed into (and accidentally committed with) the notebook. Each value
# falls back to an empty string, which matches the original placeholders; you
# can still paste a key in place of the '' default if you prefer.
rapidapi_key = os.environ.get('RAPIDAPI_KEY', '')
twitter_app_auth = {
    'consumer_key': os.environ.get('TWITTER_CONSUMER_KEY', ''),
    'consumer_secret': os.environ.get('TWITTER_CONSUMER_SECRET', ''),
    'access_token': os.environ.get('TWITTER_ACCESS_TOKEN', ''),
    'access_token_secret': os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', ''),
}

# Botometer client used for the per-account checks below.
# wait_on_ratelimit=True is passed through to the client unchanged.
bom = botometer.Botometer(wait_on_ratelimit=True,
                          rapidapi_key=rapidapi_key,
                          **twitter_app_auth)

In [None]:
def _flatten_botometer_result(result):
    """Flatten one ``bom.check_account`` response into a single-row dict.

    The response is indexed as a nested dict: ``user.user_data`` for the
    account identity, ``cap`` for the two CAP values, and ``raw_scores`` /
    ``display_scores`` keyed by language and then by score category.

    Columns are emitted in a fixed order: ``id_str``, ``screen_name``,
    ``cap_en``, ``cap_un``, then ``<category>_<raw|display>_<en|un>`` grouped
    as raw_en, display_en, raw_un, display_un.

    Raises:
        KeyError: if the response is missing any expected key (for example
            when the API returned an error payload instead of scores).
    """
    user_data = result['user']['user_data']
    row = {
        'id_str': user_data['id_str'],
        'screen_name': user_data['screen_name'],
        'cap_en': result['cap']['english'],
        'cap_un': result['cap']['universal'],
    }
    categories = ('astroturf', 'fake_follower', 'financial', 'other',
                  'overall', 'self_declared', 'spammer')
    for suffix, language in (('en', 'english'), ('un', 'universal')):
        for kind in ('raw', 'display'):
            scores = result[kind + '_scores'][language]
            for category in categories:
                row[f'{category}_{kind}_{suffix}'] = scores[category]
    return row


# Score every account with Botometer. Each successful result is written to its
# own CSV immediately, so work already done survives a kernel crash.
botometer_full = []
i = 0
for i, user in enumerate(user_ids, start=1):
    try:
        result = bom.check_account(user)
        temp = pd.DataFrame([_flatten_botometer_result(result)])
        print(i)
        timestr = time.strftime("%m%d%Y_%H%M")
        temp.to_csv('/Users/dankoban/Documents/bot_detection/data/' + str(user) + timestr + ".csv")
        botometer_full.append(temp)
    except Exception as err:
        # Best effort: one failing account must not stop the whole run, but
        # the failure is reported instead of being silently discarded.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this very long loop.
        print(f"{i}: skipped user {user}: {type(err).__name__}: {err}")

## 4. Enrich with BotometerLite scores 

BotometerLite allows for batch queries of up to 20,000 accounts per day and completes in minutes.

In [None]:
# Chunking helper used to split the user IDs into query-sized groups.
# BotometerLite accepts up to 100 user IDs per query.
def batch(lst, n):
    """Lazily yield consecutive slices of ``lst``, each at most ``n`` long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Read in the Botometer scores gathered in step 3.
data_dir = '/Users/dankoban/Documents/bot_detection/data/'

# The data directory also holds files that are not per-account Botometer
# results (the rehydrated .jsonl, a previously exported enriched_accounts.csv,
# OS metadata files). Feeding those to read_csv either fails or pollutes the
# scores, so only the result CSVs are kept. Sorting makes the load order
# deterministic, since os.listdir returns names in arbitrary order.
files = sorted(
    name for name in os.listdir(data_dir)
    if name.endswith('.csv') and name != 'enriched_accounts.csv'
)

# id_str is read as a string so large IDs are not coerced to numbers.
df_list = [pd.read_csv(data_dir + name, dtype={'id_str': 'str'}) for name in files]
if not df_list:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError('No Botometer result CSV files found in ' + data_dir)
df = pd.concat(df_list, ignore_index=True)

# Unique account IDs, split into groups of 100 for BotometerLite.
user_ids = df['id_str'].unique().tolist()
batches = list(batch(user_ids, 100))
len(batches)

In [None]:
# Query BotometerLite once per batch of user IDs and stack the per-batch
# results into a single dataframe.
blt_twitter = botometer.BotometerLite(rapidapi_key=rapidapi_key, **twitter_app_auth)
blt_scores = []
# The loop variable is deliberately not called `batch`: that name belongs to
# the chunking helper, and rebinding it here would replace the function with
# a list for the rest of the session.
for id_batch in batches:
    temp = blt_twitter.check_accounts_from_user_ids(id_batch)
    blt_scores.append(pd.DataFrame(temp))
blt_scores = pd.concat(blt_scores)

## 5. Combine Botometer and BotometerLite scores into a single dataframe

In [None]:
# Align the BotometerLite frame with the Botometer frame: same key column
# name, same string dtype for the key.
blt_scores = (
    blt_scores
    .rename(columns={'user_id': 'id_str', 'botscore': 'bot_lite'})
    .assign(id_str=lambda frame: frame['id_str'].astype('str'))
)

# Columns retained in the combined score table, in output order.
keep_cols = [
    'id_str', 'cap_en', 'cap_un',
    'astroturf_raw_en', 'fake_follower_raw_en', 'financial_raw_en',
    'other_raw_en', 'overall_raw_en', 'self_declared_raw_en',
    'spammer_raw_en',
    'astroturf_display_en', 'fake_follower_display_en',
    'financial_display_en', 'other_display_en', 'overall_display_en',
    'self_declared_display_en', 'spammer_display_en',
    'astroturf_raw_un', 'fake_follower_raw_un', 'financial_raw_un',
    'other_raw_un', 'overall_raw_un', 'self_declared_raw_un',
    'spammer_raw_un',
    'astroturf_display_un', 'fake_follower_display_un',
    'financial_display_un', 'other_display_un', 'overall_display_un',
    'self_declared_display_un', 'spammer_display_un',
    'bot_lite',
]

# Attach the BotometerLite score to every Botometer row, then keep only the
# selected columns and the rows that actually received a BotometerLite score.
merged_bot_scores = pd.merge(df, blt_scores[['id_str', 'bot_lite']],
                             how='left', on='id_str')
merged_bot_scores = merged_bot_scores.loc[:, keep_cols]
merged_bot_scores = merged_bot_scores.dropna(subset=['bot_lite'])
merged_bot_scores

# Join the scores onto the profile rows of the scored accounts and export.
tweets_filtered = tweets.loc[tweets['id_str'].isin(merged_bot_scores['id_str'])]
final_df = pd.merge(tweets_filtered, merged_bot_scores, how='left', on='id_str')
final_df.to_csv('/Users/dankoban/Documents/bot_detection/data/' + 'enriched_accounts.csv')