In [None]:
import json
import os
import pickle as pkl
import time

import mysql.connector
import pandas as pd
import tweepy
from tqdm import tqdm

In [None]:
# Twitter API credentials
# Secrets are read from environment variables so they are never saved inside
# the notebook. A missing variable raises KeyError naming that variable.
bearer_token = os.environ["TWITTER_BEARER_TOKEN"]

# Create API client
client = tweepy.Client(bearer_token)

# --- Database connection details ---
DB_HOST = os.environ["DB_HOST"]
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_NAME = os.environ["DB_NAME"]

# --- Connect to the database ---
# utf8mb4 is requested for the connection charset/collation; tweet text is
# stored through this connection.
db_connection = mysql.connector.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME,
    charset="utf8mb4",
    collation="utf8mb4_unicode_ci"
)

# Create a cursor object (shared by the insert helper and closed in the
# clean-up cell at the end of the notebook)
cursor = db_connection.cursor()

In [None]:
# Load the pickled tweet IDs and convert every ID to a string.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('top_tweetIds.pkl', 'rb') as pickle_file:
    tweet_ids = list(map(str, pkl.load(pickle_file)))
print(len(tweet_ids))

In [None]:
# Function to fetch tweets by IDs
def fetch_tweets(ids):
    """Look up a batch of tweets by ID and return them as plain dicts.

    Args:
        ids: Tweet IDs passed to a single ``client.get_tweets`` call.
            (Presumably limited to 100 IDs per request by the API -- confirm.)

    Returns:
        list[dict]: One ``dict(...)`` copy per object in the response's
        ``data``. Empty when the response carries no data, e.g. when none
        of the requested tweets could be returned.
    """
    tweets_response = client.get_tweets(ids,
                                        tweet_fields=['author_id','created_at','public_metrics','source'],
                                        user_fields=['affiliation','public_metrics','username','verified','verified_type','parody'])
    # NOTE(review): only `.data` is consumed here and no `expansions` are
    # requested, so `user_fields` looks like it has no effect -- confirm
    # against the API docs before relying on user attributes.

    # `data` is None (not an empty list) when nothing was found; treating it
    # as empty avoids a TypeError from iterating None.
    data = tweets_response.data or []
    return [dict(tweet_data) for tweet_data in data]

# Function to flatten tweet data
def flatten_tweet_data(data):
    flat_data = {}
    for key, value in data.items():
        if isinstance(value, dict):
            flat_data.update(flatten_tweet_data(value))
        else:
            flat_data[key] = value
    return flat_data

# Function to insert tweets into the database
def store_tweets(tweets_data):
    insert_query = """
INSERT IGNORE INTO tweets (
    created_at,
    retweet_count,
    reply_count,
    like_count,
    quote_count,
    bookmark_count,
    impression_count,
    text,
    edit_history_tweet_ids,
    author_id,
    id
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
    # Insert each tweet into the database
    for tweet in tweets_data:
        edit_history_json = json.dumps(tweet["edit_history_tweet_ids"])
        values = (
            tweet['created_at'],
            tweet['retweet_count'],
            tweet['reply_count'],
            tweet['like_count'],
            tweet['quote_count'],
            tweet['bookmark_count'],
            tweet['impression_count'],
            tweet['text'],
            edit_history_json,
            tweet['author_id'],
            tweet['id']
        )
    
        cursor.execute(insert_query, values)
    
    # Commit the transaction
    db_connection.commit()
    
# Function to fetch and store tweets
def fetch_and_store(ids):
    tweets_data = fetch_tweets(ids)
    flat_tweets_data = [flatten_tweet_data(tweet) for tweet in tweets_data]
    # convert datetime to string
    for tweet_data in flat_tweets_data:
        tweet_data['created_at'] = tweet_data['created_at'].strftime('%Y-%m-%d %H:%M:%S')
    store_tweets(flat_tweets_data)

In [None]:
# Fetch and store tweets in batches
batch_size = 100
# Index in tweet_ids at which processing begins.
# NOTE(review): presumably a resume offset from an earlier partial run --
# set to 0 to process the whole list.
start_at = 1450
# Pause between requests to stay under the API rate limit.
sleep_seconds = 70
# (start, end) index ranges whose fetch/store raised, kept so they can be retried.
failed_batches = []
for i in tqdm(range(start_at, len(tweet_ids), batch_size), desc="Processing tweets", unit="batch", unit_scale=True):
    end_i = min(i + batch_size, len(tweet_ids))
    print(f"Processing tweets {i} to {end_i}")
    try:
        fetch_and_store(tweet_ids[i:end_i])
    except Exception as e:
        # Best effort: report the failure, remember the range, keep going.
        print(f"Error processing tweets {i} to {end_i}: {e}")
        failed_batches.append((i, end_i))
    # Pause after every batch, including failed ones: an error (for example
    # a rate-limit response) must not cause the next request to be sent
    # immediately. No pause is needed once the last batch is done.
    if end_i < len(tweet_ids):
        time.sleep(sleep_seconds)

if failed_batches:
    print(f"Failed batches (start, end): {failed_batches}")
        

In [None]:
# --- Clean up ---
# Release the cursor first, then the connection it was created from.
for db_resource in (cursor, db_connection):
    db_resource.close()