In [1]:

import pandas as pd
import datetime
import time
import json
import functions_twitter

# Load the per-user Twitter summary produced earlier in the pipeline. Only the
# handle, the numeric user id and the follower count are needed here.
users_data_df = pd.read_csv(
    'output_0/twitter_summary.tsv',
    sep='\t',
    usecols=['username', 'id', 'public_metrics.followers_count'],
)

# Read the pool of API secrets. Because of rate limits, several Twitter
# developer accounts (one bearer token each) are recommended.
secrets_pool = pd.read_csv('twitter_dev_secrets.csv', index_col=False)
bearer_tokens = list(secrets_pool['bearer_token'])

# Keep only users with a non-zero follower count, ordered by follower count
# (smallest first), with a fresh 0..n-1 index.
# The filter/sort/reset is written as one chained expression that returns a new
# frame: calling sort_values(..., inplace=True) on the result of a boolean-mask
# selection raises pandas' SettingWithCopyWarning.
# NOTE(review): ascending order is the existing runtime behaviour; pass
# ascending=False if the largest accounts are meant to be processed first.
users_data_df_nonzero = (
    users_data_df[users_data_df['public_metrics.followers_count'] > 0]
    .sort_values('public_metrics.followers_count', ascending=True)
    .reset_index(drop=True)
)

# Totals used for the run-time estimate: number of users kept and the sum of
# their follower counts.
total_users_count = len(users_data_df_nonzero)
total_followers = users_data_df_nonzero['public_metrics.followers_count'].sum()

# --- Estimate how long the follower download will take ----------------------
total_bearer_token_count = min(20, len(bearer_tokens))  # Max cap is at 20 bearer_tokens
# Request limit for GET /2/users/:id/followers is 15 requests per 15-minute
# window (app auth). This name is also the live "requests remaining" counter
# read by the download loop further down.
request_left = 15
# Followers counted per request in the estimate.
# NOTE(review): presumably the page size requested by functions_twitter - confirm.
followers_per_batch = 1000
windows_per_hour = 4  # four 15-minute rate-limit windows in one hour
rate_of_req_hrly = (
    total_bearer_token_count * request_left * followers_per_batch * windows_per_hour
)

# With an empty token pool the hourly rate is zero; report an infinite estimate
# instead of dividing by zero.
est_time_req_hr = total_followers / rate_of_req_hrly if rate_of_req_hrly else float('inf')

text = (
    f"Total number of Twitter Developer Accounts = {total_bearer_token_count}\n"
    f"Total number of Twitter Usernames = {total_users_count}\n"
    f"Total number of Followers = {total_followers}\n"
    f"Estimated time required for analysis = {est_time_req_hr:.2f} hrs"
)

# Write the notice; the context manager closes the file even if write() raises.
with open("output_0/analytics_notice.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)

# Accumulators filled by the download loop: one dict per follower, and one dict
# per user whose followers could not be (fully) retrieved.
users_followers = []
error_log = []

# Working copy of the token pool ("cartridge"); the first token is taken out of
# it and becomes the active one, the rest stay queued in the copy.
bearer_tokens_catridge = list(bearer_tokens)
bearer_token = bearer_tokens_catridge.pop(0)

# request_left is not reassigned here; it keeps whatever value it already holds.

# Initial reset timestamp: current local time as whole unix seconds, plus 1000
# seconds.
rate_limit_reset_time = 1000 + time.mktime(datetime.datetime.now().timetuple())

# Download the follower list of every user in users_data_df_nonzero, page by
# page, switching bearer tokens through functions_twitter.catridge_reset when
# the active token has no requests left.
for index, row in users_data_df_nonzero.iterrows():
    origin_id = row['id']
    origin_username = row['username']
    collected = []   # followers gathered for this user across all pages
    page_param = ''  # empty for the first page, pagination query string afterwards

    while True:
        print(f'{index} - {origin_username} | {len(bearer_tokens_catridge)} - {request_left}')

        if not request_left > 0:
            # Active token is exhausted. Keep the earlier of the reset time
            # reported by the last response and the one tracked so far, then
            # let the helper hand back the next token/state.
            reported_reset = float(response.headers['x-rate-limit-reset'])
            rate_limit_reset_time = min(reported_reset, rate_limit_reset_time)
            (
                bearer_tokens_catridge,
                bearer_token,
                request_left,
                rate_limit_reset_time,
            ) = functions_twitter.catridge_reset(
                rate_limit_reset_time, bearer_tokens_catridge, bearer_tokens, 15
            )
            continue

        response, request_left = functions_twitter.twitter_get_followers_by_users(
            origin_id, page_param, bearer_token
        )

        if response.status_code != 200:
            # Request failed: the follower list of this user is incomplete.
            print(f'{origin_username} - Incomplete run. Did not finish extracting all followers')
            error_log.append({
                'error': 'Incomplete run. Did not finish extracting all followers',
                'username': origin_username,
            })
            break

        try:
            payload = response.json()
            collected.extend(payload['data'])
            meta = payload['meta']
        except KeyError:
            # Response carries no 'data' (or no 'meta') entry.
            print(f'{origin_username} - Might be a private account. No access allowed.')
            error_log.append({
                'error': 'Might be a private account. No access allowed.',
                'username': origin_username,
            })
            break

        if 'next_token' not in meta:
            break  # last page reached for this user
        # More followers remain: remember the pagination token for the next call.
        page_param = f'&pagination_token={meta["next_token"]}'

    # Tag every follower record with the user it belongs to; users for whom
    # nothing was collected contribute nothing.
    if collected:
        users_followers.extend(
            dict(follower, origin_username=origin_username) for follower in collected
        )

# Persist the collected followers and the per-user error log.
users_followers_df = pd.DataFrame(users_followers)
users_followers_df.to_csv('output_0/users_followers_df.csv')

error_log_df = pd.DataFrame(error_log)
error_log_df.to_csv('output_0/error_log_df.csv')

0 - GMX_IO
