In [18]:
import requests
import json
import time
import random 
import os
from typing import Tuple
import pandas as pd 

In [26]:

# The credential is read from the environment so it never appears in the notebook.
# os.environ.get returns None when the variable is not set.
BEARER_TOKEN = os.environ.get("BEARER_TOKEN")

# Recent-search endpoint that every request in this notebook is sent to.
ENDPOINT_URL = "https://api.x.com/2/tweets/search/recent"

# Query-string parameters for the search request. The comma-separated field
# lists are assembled from sequences so single fields are easy to add or drop.
query_parameters = {
    # English-language, non-retweet posts matching any of the three terms.
    "query": '("crypto news" OR "crypto" OR tokens) lang:en -is:retweet',
    "tweet.fields": ",".join(["id", "text", "author_id", "created_at"]),
    "user.fields": ",".join(
        [
            "id",
            "name",
            "username",
            "created_at",
            "description",
            "location",
            "verified",
        ]
    ),
    # Ask for the author objects alongside the tweets.
    "expansions": "author_id",
    "max_results": 10,
}

def request_headers(token: str) -> dict:
    """Build the HTTP headers used for bearer-token authentication.

    Args:
        token: The bearer token to send with each request.

    Returns:
        A one-entry dictionary of the form ``{"Authorization": "Bearer <token>"}``.

    Raises:
        ValueError: If ``token`` is not a non-blank string (for example ``None``
            because the environment variable was not set). Without this check
            the header would silently become the literal ``"Bearer None"``.
    """
    if not isinstance(token, str) or not token.strip():
        raise ValueError(
            "Bearer token is missing or empty; set the BEARER_TOKEN environment variable."
        )

    return {"Authorization": f"Bearer {token}"}

# Build the authentication headers once; they are passed to connect_to_endpoint below.
headers = request_headers(BEARER_TOKEN)

In [40]:

def _rate_limit_delay(response_headers) -> float:
    """Return how many seconds to wait after an HTTP 429 response."""
    retry_after = response_headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(int(retry_after), 0)
        except ValueError:
            # Retry-After may also be an HTTP-date; fall through to the other sources.
            pass

    # NOTE(review): x-rate-limit-reset is expected to hold the epoch second at
    # which the rate-limit window resets - confirm against the API documentation.
    reset_at = response_headers.get("x-rate-limit-reset")
    if reset_at is not None:
        try:
            return max(int(reset_at) - int(time.time()), 0) + 1
        except ValueError:
            pass

    # No usable hint from the server: same random fallback as before.
    return random.randint(5, 60)


def connect_to_endpoint(header: dict, parameters: dict, max_retries: int = 5) -> dict:
    """
    Send a GET request to ENDPOINT_URL and return the decoded JSON payload.

    Each attempt uses a 10 second timeout. At most ``max_retries`` attempts
    are made:

    * 200 - the decoded JSON body is returned.
    * 429 - wait for the delay derived from the rate-limit headers (or a
      random 5-60 seconds if none is usable), then retry.
    * other 4xx - treated as a permanent problem; raises immediately.
    * any other status, connection error or timeout - retried with
      exponential backoff plus jitter.

    Args:
        header: HTTP headers to send (e.g. the output of ``request_headers``).
        parameters: Query-string parameters for the request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response body decoded with ``response.json()``.

    Raises:
        requests.exceptions.HTTPError: On a 4xx status other than 429.
        requests.exceptions.ConnectionError, requests.exceptions.Timeout:
            If the final attempt fails at the network level.
        requests.exceptions.RetryError: If every attempt ended in a
            retryable HTTP status (or ``max_retries`` is not positive).
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        try:
            response = requests.get(url=ENDPOINT_URL, headers=header, params=parameters, timeout=10)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as error:
            # Network hiccups are temporary problems too; only give up on the last attempt.
            if is_last_attempt:
                raise
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"Network problem, retrying in {wait_time:.2f} seconds...\n{error}")
            time.sleep(wait_time)
            continue

        response_status_code = response.status_code

        if response_status_code == 200:
            return response.json()

        if response_status_code == 429:
            wait_time = _rate_limit_delay(response.headers)
            message = f"Rate limited. Retrying in {wait_time} seconds..."
        elif 400 <= response_status_code < 500:
            raise requests.exceptions.HTTPError(
                f"Cannot get data, the program will stop!\nHTTP {response_status_code}: {response.text}",
                response=response,
            )
        else:
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            message = (
                f"Temporary issue, retrying in {wait_time:.2f} seconds...\n"
                f"HTTP {response_status_code}: {response.text}"
            )

        # Sleeping after the final attempt would only delay the error below.
        if is_last_attempt:
            break
        print(message)
        time.sleep(wait_time)

    raise requests.exceptions.RetryError("Max retries exceeded. Unable to get data. ")


In [None]:

# Performs a live HTTP request to ENDPOINT_URL using the headers and query
# parameters defined in the earlier cells; the result is the decoded JSON payload.
json_response = connect_to_endpoint(headers, query_parameters)

In [29]:
# Top-level sections of the response payload.
json_response.keys()

dict_keys(['data', 'includes', 'meta'])

In [30]:
# Metadata describing this page of results (id range, result count, next_token).
json_response["meta"]

{'newest_id': '1899580628435349651',
 'oldest_id': '1899580620797255763',
 'result_count': 10,
 'next_token': 'b26v89c19zqg8o3frrctzp9i9l6k76vrorv92yvaajrel'}

In [31]:
# Number of tweet records returned in this response.
len(json_response["data"])

10

In [32]:
# Inspect the first tweet record to see which fields came back.
json_response["data"][0]

{'created_at': '2025-03-11T21:58:08.000Z',
 'text': "The Boost social &amp; on-chain engagement protocol is poised to evolve brand-audience interaction for major brands and creators worldwide\n\nI'm contributing to the @boostdotgg protocol to lead the way in consumer crypto\n\nLet's $BOOST. Join Season 1 with me\n\nhttps://t.co/XCd7uxMjIT",
 'author_id': '1489195098906963974',
 'id': '1899580628435349651',
 'edit_history_tweet_ids': ['1899580628435349651']}

In [33]:
# Token found in the response metadata.
# NOTE(review): presumably sent back with a follow-up request to fetch the next page - confirm against the API docs.
json_response['meta']['next_token']

'b26v89c19zqg8o3frrctzp9i9l6k76vrorv92yvaajrel'

In [34]:
# NOTE(review): repeats the earlier json_response.keys() inspection; candidate for removal.
json_response.keys()

dict_keys(['data', 'includes', 'meta'])

In [35]:

def process_x_data(json_response: dict,
                   query_tag: str,
                   tweets_data: pd.DataFrame,
                   users_data: pd.DataFrame
                   ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Append the tweets and users of one API response to the running tables.

    When the response has a "data" entry, its records are appended to
    ``tweets_data`` and the result is pickled to ``tweets_<query_tag>.pkl``.
    When it also carries a non-empty ``includes["users"]`` list, those
    records are appended to ``users_data``, de-duplicated on "id" (first
    occurrence wins) and pickled to ``users_<query_tag>.pkl``. Both files are
    written to the current working directory.

    Args:
        json_response: Decoded response payload.
        query_tag: Label used in the pickle file names.
        tweets_data: Tweets collected so far (may be empty).
        users_data: Users collected so far (may be empty).

    Returns:
        ``(tweets_data, users_data)``: new, extended frames with a fresh
        0..n-1 index, or the inputs unchanged when there is nothing to add.
        The input frames are never modified in place.
    """
    if "data" not in json_response:
        return tweets_data, users_data

    new_tweets = pd.DataFrame(json_response["data"])
    # ignore_index avoids repeated 0..k labels when several pages are appended.
    tweets_data = pd.concat([tweets_data, new_tweets], ignore_index=True)
    tweets_data.to_pickle("tweets_" + query_tag + ".pkl")

    # "includes" can be missing entirely, so do not index it directly.
    new_users = json_response.get("includes", {}).get("users")
    if new_users:
        users_data = (
            pd.concat([users_data, pd.DataFrame(new_users)], ignore_index=True)
            .drop_duplicates(subset="id")
            .reset_index(drop=True)
        )
        users_data.to_pickle("users_" + query_tag + ".pkl")

    return tweets_data, users_data


In [36]:
# Start from empty tables; process_x_data returns the extended versions.
tweets_data = pd.DataFrame()
users_data = pd.DataFrame()
# Label that process_x_data uses in the names of the pickle files it writes.
query_tag = "crypto"

# Add the tweets/users of the fetched response to the tables (also writes the pickle files).
tweets_data, users_data = process_x_data(
        json_response, query_tag, tweets_data, users_data
    )

In [38]:
# Preview the first rows of the collected tweets table.
tweets_data.head()

Unnamed: 0,created_at,text,author_id,id,edit_history_tweet_ids
0,2025-03-11T21:58:08.000Z,The Boost social &amp; on-chain engagement pro...,1489195098906963974,1899580628435349651,[1899580628435349651]
1,2025-03-11T21:58:08.000Z,"@HaykCryptoKing @lynk0x Take the $CHANCE, embr...",1870049003955421184,1899580627684299054,[1899580627684299054]
2,2025-03-11T21:58:08.000Z,📢 Exciting times! I pocketed 14500$ in the $DO...,1828384152548184064,1899580624945508375,[1899580624945508375]
3,2025-03-11T21:58:08.000Z,Exciting news in the crypto world! Coinbase's ...,908628906,1899580624945443158,[1899580624945443158]
4,2025-03-11T21:58:07.000Z,I’m watching my 401k and the housing market an...,28864965,1899580623594922432,[1899580623594922432]


In [39]:
# Preview the first rows of the collected users table.
# NOTE(review): the rendered output shows real account names and descriptions; consider clearing it before sharing.
users_data.head()

Unnamed: 0,verified,id,username,name,description,created_at,location
0,False,1489195098906963974,Looweo,Luzeno,crypto enthusiast ~\nbao bao 🐼,2022-02-03T11:12:42.000Z,
1,False,1870049003955421184,alkuice,Alkuice,CryptoPro,2024-12-20T10:10:12.000Z,Nigeria
2,False,1828384152548184064,HollandRaq,moepokes,Not a singer but an artist.. a storyteller .....,2024-08-27T10:48:52.000Z,"Montgomery, IL"
3,False,908628906,LinaShaney,lina shaney caicedo,,2012-10-27T18:09:27.000Z,
4,False,28864965,DawktaJawlz,JewelsVerne 🏳️‍🌈,Psychology PhD. Sr User Researcher. Previously...,2009-04-04T20:33:19.000Z,
