In [1]:
#Install required packages (run once per env)

!pip -q install tweepy python-dotenv pandas vaderSentiment matplotlib


In [3]:
import os
import re
import math
import time
import unicodedata
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from dotenv import load_dotenv
import tweepy

# Load env
load_dotenv()
BEARER = os.getenv("X_BEARER_TOKEN")
if not BEARER:
    raise RuntimeError("X_BEARER_TOKEN not found. Please create a .env file with X_BEARER_TOKEN.")
client = tweepy.Client(bearer_token=BEARER, wait_on_rate_limit=True)

# Define search queries
We focus on Australian politics. Note the Australian Labor Party is spelled **Labor** (not Labour). You can tweak '''MAX_TWEET_PER_QUERY''', date window, and the keyword lists.

In [7]:
# Query config
MAX_TWEETS_PER_QUERY = 100 # Adjust based on your API tier/limits
DAYS_BACK = 30 # Adjust based on period of time, here we are analysing sentiment in the past month
LANG = "en" # English only

SINCE = (dt.datetime.utcnow() - dt.timedelta(days=DAYS_BACK)).strftime("%Y-%m-%dT%H:%M:%SZ")

# Hastags and Aussie context terms

AUS_CONTEXT = ["auspol", "Australia", "Australian", "Canberra", "Parliament"]

# Party-focused keyword sets

LABOR_TERMS = [
    "Australian Labor Party", "ALP", "Labor Australia", "Anthony Albanese", "Albo", "@AlboMP", "@AustralianLabor"
]

LIBERAL_TERMS = [
    "Liberal Party of Australia", "Liberal Australia", "LNP", "Coalition Australia", "Susan Ley"
]

def build_query(terms):
    # Compose an OR list, exclude retweets, filter by language.
    or_block = " OR ".join([f'("{t}")' if " " in t else t for t in terms + AUS_CONTEXT])
    # Avoid very common noise words here; you can add -is:reply to limit to original posts
    q = f"({or_block}) lang:{LANG} -is:retweet"
    return q

QUERIES = {
    "Labor": build_query(LABOR_TERMS),
    "Liberal": build_query(LIBERAL_TERMS),
}

QUERIES

  SINCE = (dt.datetime.utcnow() - dt.timedelta(days=DAYS_BACK)).strftime("%Y-%m-%dT%H:%M:%SZ")


{'Labor': '(("Australian Labor Party") OR ALP OR ("Labor Australia") OR ("Anthony Albanese") OR Albo OR @AlboMP OR @AustralianLabor OR auspol OR Australia OR Australian OR Canberra OR Parliament) lang:en -is:retweet',
 'Liberal': '(("Liberal Party of Australia") OR ("Liberal Australia") OR LNP OR ("Coalition Australia") OR ("Susan Ley") OR auspol OR Australia OR Australian OR Canberra OR Parliament) lang:en -is:retweet'}

# Fetch recent post with expansions and metrics

In [8]:
\
def fetch_recent(query, max_results=MAX_TWEETS_PER_QUERY, since_iso=SINCE):
    # Fetch recent tweets for a given query using X API v2.
    # Returns a list of dicts.
    results = []
    # Tweepy client.search_recent_tweets returns up to 100 per call
    per_call = 100 if max_results >= 100 else max_results
    next_token = None
    fetched = 0
    while fetched < max_results:
        try:
            resp = client.search_recent_tweets(
                query=query,
                max_results=per_call,
                start_time=since_iso,
                tweet_fields=["id","text","lang","created_at","public_metrics","possibly_sensitive","source"],
                user_fields=["username","name","public_metrics","verified"],
                expansions=["author_id"],
            )
        except tweepy.TooManyRequests:
            # Rate limited: wait per API guidance
            time.sleep(60)
            continue

        if not resp.data:
            break

        users = {u["id"]: u for u in (resp.includes.get("users", []) if resp.includes else [])}

        for t in resp.data:
            u = users.get(t.author_id, {}) if hasattr(t, "author_id") else {}
            pm = t.public_metrics or {}
            results.append({
                "id": t.id,
                "created_at": t.created_at,
                "text": t.text,
                "lang": t.lang,
                "retweets": pm.get("retweet_count"),
                "replies": pm.get("reply_count"),
                "likes": pm.get("like_count"),
                "quotes": pm.get("quote_count"),
                "author_id": getattr(t, "author_id", None),
                "author_username": u.get("username"),
                "author_name": u.get("name"),
                "author_verified": u.get("verified"),
                "author_followers": (u.get("public_metrics") or {}).get("followers_count"),
                "source": getattr(t, "source", None),
                "possibly_sensitive": getattr(t, "possibly_sensitive", None),
            })
        fetched += len(resp.data)

        # Tweepy v4's search_recent_tweets response handles pagination internally via .meta
        next_token = resp.meta.get("next_token") if hasattr(resp, "meta") else None
        if not next_token:
            break

    return results

# Fetch both sets
all_rows = []
for party, q in QUERIES.items():
    rows = fetch_recent(q, MAX_TWEETS_PER_QUERY)
    for r in rows:
        r["party"] = party
    all_rows.extend(rows)

df_raw = pd.DataFrame(all_rows)
len(df_raw), df_raw.head(3)

Rate limit exceeded. Sleeping for 878 seconds.


KeyboardInterrupt: 