In [3]:
import numpy as np
import pandas as pd
import requests
import datetime
import time
import functions_transform
import functions_twitter
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type

In [6]:
# Load input.csv, which holds the Twitter usernames of interest.
input_df = pd.read_csv('input.csv', index_col=0)

# Standardise every username to lowercase. Non-string cells (e.g. NaN for a
# missing username) are passed through unchanged so dropna() can remove them.
input_df['twitter_username'] = input_df['twitter_username'].apply(
    lambda x: x.lower() if isinstance(x, str) else x
)

# Keep only the rows that actually have a username.
twitter_socials = input_df['twitter_username'].dropna()

# De-duplicate while preserving first-seen order (dict keys keep insertion order).
unique_twitter_users = list(dict.fromkeys(twitter_socials))

# Group the unique usernames into batches of 100 and convert each batch into a
# request string (the exact string format is defined by functions_transform).
user_groups = list(functions_transform.chunks(unique_twitter_users, 100))
user_strings, counts = functions_transform.string_chunks(user_groups)

# Read the list of secrets. Due to rate limits, multiple Twitter developer
# accounts are recommended.
secrets_pool = pd.read_csv('twitter_dev_secrets.csv', index_col=False)
bearer_tokens = list(secrets_pool['bearer_token'])
if not bearer_tokens:
    # Fail with a clear message instead of an IndexError from pop(0) below.
    raise ValueError('twitter_dev_secrets.csv contains no bearer_token entries')

# Work on a copy of the token pool: tokens are popped from this "cartridge"
# as they are used, while bearer_tokens keeps the full list for refills.
bearer_tokens_catridge = bearer_tokens.copy()
bearer_token = bearer_tokens_catridge.pop(0)

# Initial reset timestamp (Unix epoch seconds), 1000 s from now. The request
# loop lowers it with min() against the API's x-rate-limit-reset header.
# time.time() is used directly: it is already epoch-based and, unlike a
# round-trip through a local timetuple, is unambiguous across DST changes.
rate_limit_reset_time = time.time() + 1000

# Full set of user fields that could be requested, kept for reference:
# user_fields = 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'
user_fields = 'created_at,id,public_metrics,username,verified'

# Call the Twitter API once per username batch, switching bearer tokens when
# the current token has no requests left.
users_data = []    # user objects returned by the API
users_errors = []  # per-user error objects returned by the API
total_batch = len(user_strings)
request_left = 300  # starting request budget; the same value is passed to catridge_reset
for i, user_string in enumerate(user_strings):
    print(f'{i} / {total_batch} - {request_left}')
    nxt_page = True
    while nxt_page:
        if request_left > 0:
            request, request_left = functions_twitter.twitter_get_users_by_usernames(user_string, bearer_token, user_fields)
            if request.status_code == 200:
                request_json = request.json()
                # A lookup can succeed partially: the payload then carries the
                # resolved users under 'data' AND the failures under 'errors'.
                # Collect both so resolved users are never discarded just
                # because some other username in the batch failed.
                users_errors.extend(request_json.get('errors', []))
                users_data.extend(request_json.get('data', []))
            else:
                # The batch is skipped (not retried); say which one and why.
                print(f'Batch {i} failed with HTTP {request.status_code}; skipping it.')
            nxt_page = False
        else:
            # Budget exhausted: read the reset time from the most recent
            # response, then let catridge_reset pick the next token/budget.
            request_reset_unixtime = float(request.headers['x-rate-limit-reset']) # timestamp when the request limit for bearer token is expected to reset.
            rate_limit_reset_time = min(request_reset_unixtime,rate_limit_reset_time)
            bearer_tokens_catridge, bearer_token, request_left, rate_limit_reset_time = functions_twitter.catridge_reset(rate_limit_reset_time, bearer_tokens_catridge, bearer_tokens, 300)
            
users_data_df = pd.json_normalize(users_data)
users_errors_df = pd.json_normalize(users_errors)

today = datetime.datetime.today()

# The derived columns need at least one returned user. With none,
# json_normalize yields a frame without a 'created_at' column, and indexing it
# would raise before the errors file below gets written.
if not users_data_df.empty:
    # Reduce created_at to a timezone-naive calendar date (midnight of the UTC
    # day), so it can be subtracted from the naive `today` timestamp.
    users_data_df['created_at'] = (
        pd.to_datetime(users_data_df['created_at'], utc=True)
        .dt.tz_localize(None)
        .dt.normalize()
    )

    # 'days_since_inception' - whole days elapsed since the account was created.
    users_data_df['days_since_inception'] = (today - users_data_df['created_at']).dt.days

    # Divisor for the averages below. Accounts created today (0 days), or a
    # local clock that is behind the UTC creation date (negative days), would
    # otherwise yield inf or negative rates that distort the sort order, so
    # every account is treated as at least one day old.
    account_age_days = users_data_df['days_since_inception'].clip(lower=1)

    # 'wkly_avg_tweet_count' - weekly average tweet count since inception.
    users_data_df['wkly_avg_tweet_count'] = users_data_df['public_metrics.tweet_count'] * 7 / account_age_days

    # 'daily_avg_growth_follower_count' - daily average follower growth since inception.
    users_data_df['daily_avg_growth_follower_count'] = users_data_df['public_metrics.followers_count'] / account_age_days

    # 'daily_avg_growth_listed_count' - daily average listed-count growth since inception.
    users_data_df['daily_avg_growth_listed_count'] = users_data_df['public_metrics.listed_count'] / account_age_days

    # Rank accounts by listed-count growth, highest first, with a clean 0..n-1 index.
    users_data_df = users_data_df.sort_values(
        'daily_avg_growth_listed_count', ascending=False
    ).reset_index(drop=True)

# Export users_data_df as a tab-separated file
users_data_df.to_csv('output_0/twitter_summary.tsv', sep='\t')

# Export users_errors_df as a tab-separated file
users_errors_df.to_csv('output_0/twitter_errors.tsv', sep='\t')

0 / 1 - 300
