In [1]:
!pip install tweepy python-dotenv pandas --quiet

import os
import tweepy
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment
load_dotenv()

# Twitter API credentials
BEARER_TOKEN = os.getenv("BEARER_TOKEN")
if not BEARER_TOKEN:
    # Fail here with a clear message instead of at the first API request.
    raise RuntimeError(
        "BEARER_TOKEN is not set. Add it to your .env file or environment "
        "before running this notebook."
    )

# Authenticate with Twitter API v2.
# wait_on_rate_limit=True makes the client sleep when the rate limit is hit
# instead of raising, so collection cells can block for several minutes.
client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)

In [3]:
# Display names for the individual company stocks we track, keyed by ticker.
company_names = {
    "AAPL": "Apple",
    "TSLA": "Tesla",
    "MSFT": "Microsoft",
    "GOOGL": "Google",
    "AMZN": "Amazon",
}

# Display names for the market indices we track, keyed by the keyword we search.
index_names = {
    "SP500": "S&P 500",
    "NASDAQ": "NASDAQ",
    "IBEX35": "IBEX 35",
}

# Combined ticker -> name lookup (insertion order: companies, then indices).
ticker_name_map = {**company_names, **index_names}

# Keyword batches are derived from the dicts above so that the list of tickers
# and the name lookup cannot drift apart when a ticker is added or removed.
keywords1 = list(company_names)
keywords2 = list(index_names)

# Upper bound on the number of tweets collected per keyword.
max_tweets_per_keyword = 50

In [4]:
def _build_query(ticker, company_name):
    """Return the recent-search query string for one ticker.

    The query matches the bare ticker, its cashtag, its hashtag and (when
    known) the quoted company name, combined with a finance-related term,
    restricted to English tweets that are not retweets.
    """
    terms = [ticker, f"${ticker}", f'"#{ticker}"']
    if company_name:
        # Skip the name clause for unmapped tickers; an empty "" phrase would
        # otherwise end up in the query.
        terms.append(f'"{company_name}"')
    any_of = " OR ".join(terms)
    return (
        f'({any_of}'
        f') (stock OR price OR shares OR market OR buy OR sell OR volatility OR bullish OR bearish OR earnings) '
        f'lang:en -is:retweet'
    )


def collect_tweets(keywords):
    """Search recent tweets for each keyword and save one CSV per keyword.

    For every ticker in ``keywords`` this runs a recent-tweet search through the
    module-level ``client``, keeps at most ``max_tweets_per_keyword`` tweets and
    writes them to ``../data/raw/tweets_partial_<ticker>.csv`` with the columns
    ticker, date, text, likes, retweets.

    Args:
        keywords: Iterable of ticker strings. Names are looked up in
            ``ticker_name_map``; unmapped tickers are searched without a
            company-name clause.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    output_dir = "../data/raw"
    columns = ["ticker", "date", "text", "likes", "retweets"]

    # Create the output directory up front so a missing folder fails before
    # any rate-limited API request is spent.
    os.makedirs(output_dir, exist_ok=True)

    # Request only as many tweets per page as we will keep. The recent-search
    # endpoint accepts max_results between 10 and 100.
    page_size = max(10, min(100, max_tweets_per_keyword))

    for ticker in keywords:
        company_name = ticker_name_map.get(ticker, "")
        print(f"Searching tweets for {ticker} ({company_name})...")

        query = _build_query(ticker, company_name)
        print(f"Query: {query}")

        tweets = tweepy.Paginator(
            client.search_recent_tweets,
            query=query,
            tweet_fields=["created_at", "public_metrics", "text", "lang"],
            max_results=page_size
        ).flatten(limit=max_tweets_per_keyword)

        tweets_per_keyword = [
            {
                "ticker": ticker,
                "date": tweet.created_at,
                "text": tweet.text,
                "likes": tweet.public_metrics["like_count"],
                "retweets": tweet.public_metrics["retweet_count"],
            }
            for tweet in tweets
        ]

        print(f"Collected {len(tweets_per_keyword)} tweets for {ticker}")

        # Passing explicit columns guarantees a header row even when no tweets
        # were found, so the file stays readable with pd.read_csv.
        df_partial = pd.DataFrame(tweets_per_keyword, columns=columns)
        df_partial.to_csv(f"{output_dir}/tweets_partial_{ticker}.csv", index=False)
        print(f"Saved partial dataset: tweets_partial_{ticker}.csv")

    print("Finished collecting tweets for current batch.")

In [None]:
# Collect tweets for the first keyword batch (keywords1).
# The client is created with wait_on_rate_limit=True, so this cell may sleep
# for several minutes whenever the API rate limit is reached.
collect_tweets(keywords1)

Rate limit exceeded. Sleeping for 689 seconds.


Searching tweets for MSFT (Microsoft)...
Query: (MSFT OR $MSFT OR "#MSFT" OR "Microsoft" ) (stock OR price OR shares OR market OR buy OR sell OR volatility OR bullish OR bearish OR earnings) lang:en -is:retweet


Rate limit exceeded. Sleeping for 902 seconds.


In [None]:
# Collect tweets for the second keyword batch (keywords2).
collect_tweets(keywords2)

In [None]:
# Merge the per-ticker partial CSVs into a single dataset.
all_keywords = keywords1 + keywords2
all_dfs = []

for ticker in all_keywords:
    csv_path = f"../data/raw/tweets_partial_{ticker}.csv"
    if not os.path.exists(csv_path):
        print(f"Warning: File {csv_path} not found, skipping.")
        continue

    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A partial file written from zero tweets may have no header row at all;
        # skip it instead of aborting the whole merge.
        print(f"Warning: File {csv_path} is empty, skipping.")
        continue

    if df.empty:
        # Header-only file: nothing to contribute to the combined dataset.
        print(f"Warning: File {csv_path} contains no tweets, skipping.")
        continue

    all_dfs.append(df)

if all_dfs:
    df_combined = pd.concat(all_dfs, ignore_index=True)
    final_path = "../data/raw/tweet_finance.csv"
    df_combined.to_csv(final_path, index=False)
    print(f"Saved combined dataset {final_path} with {len(df_combined)} tweets total.")
    print(df_combined.head())
else:
    print("No tweet data found to combine.")