# Twitter API Validation: Official vs TwitterAPI.io

Compare the new tweets each API returns for the same sample of users (tweets with an ID greater than each user's last stored tweet, `since_id`) to validate data consistency.

In [None]:
# Import standard libraries (fast)
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta, timezone
from pathlib import Path
import os
import sys
import time

print('Standard libraries imported')

In [None]:
# Import tweepy (may take a moment)
import tweepy
print('Tweepy imported')

In [None]:
# Environment and configuration bootstrap.
# NOTE(review): assumes the notebook runs from a directory directly below the
# project root (both `src/` and `.env` are looked up in the parent) - confirm.
project_root = Path.cwd().parent

# Make the project's `src` directory importable.
sys.path.insert(0, str(project_root / 'src'))

# Read .env by hand rather than through Config, to avoid Config.validate() issues.
from dotenv import load_dotenv
load_dotenv(project_root / '.env')

# Credentials are taken straight from the environment.
# The TwitterAPI.io key is looked up under its current name first and then
# under the older, differently-cased name.
X_BEARER_TOKEN = os.getenv("X_BEARER_TOKEN")
TWITTERAPIIO_API_KEY = os.getenv("TWITTERAPIIO_API_KEY")
if not TWITTERAPIIO_API_KEY:
    TWITTERAPIIO_API_KEY = os.getenv("twitterapiio_API_KEY")

print('Environment loaded')
# Report presence only; the secrets themselves are never printed.
for label, secret in (
    ('Official API Bearer Token', X_BEARER_TOKEN),
    ('TwitterAPI.io API Key', TWITTERAPIIO_API_KEY),
):
    print(f'{label}: {"SET" if secret else "NOT SET"}')

In [None]:
# Connect to database
# `engine` is the project's SQLAlchemy engine; `text` wraps raw SQL strings
# for execution in the query cell below.
# NOTE(review): this cell only imports the engine object. Whether a connection
# is actually opened at import time depends on xminer.io.db - confirm before
# relying on the message printed below.
from xminer.io.db import engine
from sqlalchemy import text

print('Database connection established')

In [None]:
# Configuration info - just for reference.
# These messages only describe the approach; no settings are defined in this cell.
print('Validation method: Fetching NEW tweets using since_id')
print('Each user will be queried for tweets with ID > their last fetched tweet')

In [None]:
# Get sample of X profiles with their latest tweet_id (since_id)
#
# For each profile that also appears in politicians_12_2025, the inner query
# keeps the stored tweet with the most recent created_at (DISTINCT ON + ORDER BY).
# Profiles with no stored tweet are dropped, and SAMPLE_SIZE rows are then
# drawn at random.
SAMPLE_SIZE = 5  # number of randomly selected users to compare

query = """
SELECT x_user_id, username, since_id, last_tweet_date FROM (
    SELECT DISTINCT ON (xp.x_user_id)
        xp.x_user_id,
        xp.username,
        t.tweet_id as since_id,
        t.created_at as last_tweet_date
    FROM x_profiles xp
    JOIN politicians_12_2025 p ON xp.username = p.username
    LEFT JOIN tweets t ON t.username = xp.username
    WHERE xp.x_user_id IS NOT NULL
      AND xp.username IS NOT NULL
    ORDER BY xp.x_user_id, t.created_at DESC NULLS LAST
) sub
WHERE since_id IS NOT NULL
ORDER BY RANDOM()
LIMIT :sample_size
"""

# The sample size is passed as a bound parameter rather than being hard-coded
# in the SQL text, so it can be changed in one place.
with engine.connect() as conn:
    sample_users = pd.read_sql(
        text(query),
        conn,
        params={'sample_size': SAMPLE_SIZE},
    )

print(f'Sample users for comparison ({len(sample_users)}):')
for _, row in sample_users.iterrows():
    print(f'  - @{row["username"]} (ID: {row["x_user_id"]}) | since_id: {row["since_id"]} | last: {row["last_tweet_date"]}')

In [None]:
# Initialize BOTH API clients

# 1. Official Twitter API client (tweepy), authenticated with the bearer token
#    read from the environment.
official_client = tweepy.Client(bearer_token=X_BEARER_TOKEN, wait_on_rate_limit=True)
print('Official Twitter API client initialized')

# 2. TwitterAPI.io client with pagination support
class TwitterAPIIOClient:
    """Minimal HTTP client for the TwitterAPI.io ``last_tweets`` endpoint.

    Every request is authenticated with the ``X-API-Key`` header and bounded
    by ``timeout`` seconds so a stalled connection cannot hang the notebook.
    """

    BASE_URL = "https://api.twitterapi.io"
    DEFAULT_TIMEOUT = 30.0  # seconds, applied to each HTTP request

    def __init__(self, api_key: str, timeout: float = DEFAULT_TIMEOUT):
        """Store the API key, the auth header and the per-request timeout.

        Args:
            api_key: Key sent in the ``X-API-Key`` header.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.api_key = api_key
        self.headers = {"X-API-Key": api_key}
        self.timeout = timeout

    def get_user_tweets(self, user_id: int, max_pages: int = 5):
        """Fetch a user's tweets (replies included), following pagination.

        Args:
            user_id: Numeric user ID; sent as the ``userId`` query parameter.
            max_pages: Upper bound on the number of pages requested.

        Returns:
            A list with the raw tweet objects found under ``data.tweets`` of
            every page fetched, in the order the API returned them.

        Raises:
            requests.HTTPError: When a page responds with an error status
                (via ``raise_for_status``).
            requests.Timeout: When a request exceeds ``self.timeout``.
        """
        url = f"{self.BASE_URL}/twitter/user/last_tweets"
        all_tweets = []
        cursor = None

        for _ in range(max_pages):
            params = {
                "userId": str(user_id),
                "includeReplies": "true",
            }
            if cursor:
                params["cursor"] = cursor

            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(
                url,
                headers=self.headers,
                params=params,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()

            data_obj = data.get('data', {})
            tweets = data_obj.get('tweets', []) if isinstance(data_obj, dict) else []
            # `tweets` may be an explicit null; treat that as an empty page.
            all_tweets.extend(tweets or [])

            # Stop unless the API both announces another page and supplies a cursor.
            next_cursor = data.get('next_cursor')
            if not data.get('has_next_page') or not next_cursor:
                break
            cursor = next_cursor
            time.sleep(0.5)  # Rate limit courtesy

        return all_tweets

# Notebook-wide TwitterAPI.io client, built with the key read from the environment.
twitterapiio_client = TwitterAPIIOClient(TWITTERAPIIO_API_KEY)
print('TwitterAPI.io client initialized')

In [None]:
# Tweet fields requested from the official API (passed as `tweet_fields`).
TWEET_FIELDS = [
    'created_at',
    'lang',
    'public_metrics',
    'conversation_id',
    'in_reply_to_user_id',
    'possibly_sensitive',
    'source',
    'entities',
    'referenced_tweets',
]

def normalize_official_tweet(tweet, author_id: int):
    """Map an official-API tweet object onto the shared comparison schema.

    Args:
        tweet: Object exposing ``id``, ``text``, ``created_at``, ``lang``,
            ``public_metrics`` and ``source`` attributes.
        author_id: ID of the user the tweet was fetched for; stored as given.

    Returns:
        A dict with the common keys. Counters missing from ``public_metrics``
        (or all of them, when ``public_metrics`` is empty/None) default to 0.
    """
    raw_metrics = tweet.public_metrics
    counters = raw_metrics if raw_metrics else {}

    record = {
        'tweet_id': str(tweet.id),
        'author_id': author_id,
        'text': tweet.text,
        'created_at': tweet.created_at,
        'lang': tweet.lang,
    }
    # The counter names are identical in the API payload and the common schema.
    for counter_name in (
        'like_count',
        'reply_count',
        'retweet_count',
        'quote_count',
        'impression_count',
    ):
        record[counter_name] = counters.get(counter_name, 0)

    # Stored as 'tweet_source' to keep it distinct from 'api_source'.
    record['tweet_source'] = tweet.source
    record['api_source'] = 'official'
    return record

def parse_twitterapiio_datetime(dt_str):
    """Parse a TwitterAPI.io timestamp string into a ``datetime``.

    Two formats are tried in order:
      1. Twitter's classic format, e.g. ``"Sat Sep 27 09:05:04 +0000 2025"``.
      2. ISO 8601, with a trailing ``Z`` accepted as UTC.

    Args:
        dt_str: Timestamp string; may be None or empty.

    Returns:
        The parsed ``datetime``, or None when the input is empty, not a
        string, or matches neither format.
    """
    if not dt_str or not isinstance(dt_str, str):
        return None

    # Only ValueError is caught (what both parsers raise for a string they
    # cannot parse), so unrelated failures are not silently swallowed.
    try:
        # Format: "Sat Sep 27 09:05:04 +0000 2025"
        return datetime.strptime(dt_str, "%a %b %d %H:%M:%S %z %Y")
    except ValueError:
        pass

    try:
        return datetime.fromisoformat(dt_str.replace('Z', '+00:00'))
    except ValueError:
        return None

def normalize_twitterapiio_tweet(tweet):
    """Map a raw TwitterAPI.io tweet dict onto the shared comparison schema.

    Args:
        tweet: Raw tweet dict; the keys read are ``id``, ``author``, ``text``,
            ``createdAt``, ``lang``, ``likeCount``, ``replyCount``,
            ``retweetCount``, ``quoteCount``, ``viewCount`` and ``source``.

    Returns:
        A dict with the common keys. Counters that are missing or null become
        0, so later arithmetic on them cannot fail on None.
    """
    # `.get(key, default)` only covers a missing key; `or` also covers an
    # explicit null in the payload.
    author = tweet.get('author') or {}
    return {
        'tweet_id': str(tweet.get('id')),
        'author_id': author.get('id'),
        'text': tweet.get('text'),
        'created_at': parse_twitterapiio_datetime(tweet.get('createdAt')),
        'lang': tweet.get('lang'),
        'like_count': tweet.get('likeCount') or 0,
        'reply_count': tweet.get('replyCount') or 0,
        'retweet_count': tweet.get('retweetCount') or 0,
        'quote_count': tweet.get('quoteCount') or 0,
        # The view count is stored under the common 'impression_count' key.
        'impression_count': tweet.get('viewCount') or 0,
        'tweet_source': tweet.get('source'),
        'api_source': 'twitterapiio'
    }

print('Normalization functions defined')

In [None]:
# Fetch NEW tweets from BOTH APIs for each sample user (using since_id)
#
# For every sampled user this cell queries the official API (since_id passed
# to the API) and TwitterAPI.io (IDs filtered here), normalizes both result
# sets and appends one dict per user to `results`. A failure in either API is
# stored as an error string and does not stop the loop.
results = []

# enumerate() supplies the progress counter so it does not depend on the
# DataFrame's index labels happening to be 0..n-1.
for position, (_, user) in enumerate(sample_users.iterrows(), start=1):
    user_id = int(user['x_user_id'])
    username = user['username']
    since_id = str(user['since_id'])
    last_tweet_date = user['last_tweet_date']

    print(f'\n{"="*60}')
    print(f'[{position}/{len(sample_users)}] Fetching NEW tweets for @{username}')
    print(f'  User ID: {user_id}')
    print(f'  Since ID: {since_id} (last tweet: {last_tweet_date})')
    print(f'{"="*60}')

    official_tweets = []
    twitterapiio_tweets = []
    official_error = None
    twitterapiio_error = None

    # 1. Fetch from Official API (using since_id)
    # NOTE: this is a single request with max_results=100; no further pages
    # are requested here, so at most 100 tweets per user are compared.
    print('\n[Official Twitter API]')
    try:
        response = official_client.get_users_tweets(
            id=user_id,
            max_results=100,
            since_id=since_id,  # Only get tweets newer than this
            tweet_fields=TWEET_FIELDS
        )
        if response.data:
            official_tweets = [normalize_official_tweet(t, user_id) for t in response.data]
            print(f'  Retrieved: {len(official_tweets)} NEW tweets')
            dates = [t['created_at'] for t in official_tweets if t['created_at']]
            if dates:
                print(f'  Date range: {min(dates)} to {max(dates)}')
        else:
            print('  Retrieved: 0 NEW tweets')
    except Exception as e:
        # Deliberately broad: any failure is recorded and the run continues.
        official_error = str(e)
        print(f'  Error: {e}')

    # 2. Fetch from TwitterAPI.io (filter by since_id client-side)
    print('\n[TwitterAPI.io]')
    try:
        tweets_raw = twitterapiio_client.get_user_tweets(user_id)
        print(f'  Total from API: {len(tweets_raw)} tweets')

        # Filter to only tweets with ID > since_id (compared as integers)
        since_id_int = int(since_id)
        for t in tweets_raw:
            tweet_id = t.get('id')
            if tweet_id and int(tweet_id) > since_id_int:
                twitterapiio_tweets.append(normalize_twitterapiio_tweet(t))

        print(f'  NEW tweets (ID > since_id): {len(twitterapiio_tweets)}')
        dates = [t['created_at'] for t in twitterapiio_tweets if t['created_at']]
        if dates:
            print(f'  Date range: {min(dates)} to {max(dates)}')
    except Exception as e:
        # Deliberately broad: any failure is recorded and the run continues.
        twitterapiio_error = str(e)
        print(f'  Error: {e}')

    results.append({
        'username': username,
        'user_id': user_id,
        'since_id': since_id,
        'official_tweets': official_tweets,
        'twitterapiio_tweets': twitterapiio_tweets,
        'official_count': len(official_tweets),
        'twitterapiio_count': len(twitterapiio_tweets),
        'official_error': official_error,
        'twitterapiio_error': twitterapiio_error
    })

print(f'\n{"="*60}')
print('Fetch complete!')

In [None]:
# Create summary comparison
#
# Prints one row per sampled user with the number of NEW tweets each API
# returned and whether either fetch failed, followed by overall totals.
SUMMARY_COLUMNS = ['Username', 'Since ID', 'Official', 'TwitterAPI.io', 'Diff', 'Off Err', 'TIO Err']

print('\n' + '='*80)
print('API COMPARISON SUMMARY - NEW TWEETS ONLY')
print('='*80)
print('Method: Fetching tweets with ID > since_id (last fetched tweet)')
print(f'Sample size: {len(results)} users\n')

summary_data = []
for r in results:
    summary_data.append({
        'Username': f"@{r['username']}",
        # Only the trailing 8 characters are shown, so the ellipsis goes in
        # front to mark the part that was cut off.
        'Since ID': '...' + r['since_id'][-8:],
        'Official': r['official_count'],
        'TwitterAPI.io': r['twitterapiio_count'],
        'Diff': r['official_count'] - r['twitterapiio_count'],
        'Off Err': 'Yes' if r['official_error'] else '',
        'TIO Err': 'Yes' if r['twitterapiio_error'] else ''
    })

# Naming the columns explicitly keeps the totals below working when `results`
# is empty (otherwise the frame would have no columns to sum).
df_summary = pd.DataFrame(summary_data, columns=SUMMARY_COLUMNS)
print(df_summary.to_string(index=False))

print(f'\n{"-"*40}')
print('TOTALS (NEW tweets only):')
print(f'  Official API:   {df_summary["Official"].sum()} tweets')
print(f'  TwitterAPI.io:  {df_summary["TwitterAPI.io"].sum()} tweets')
print(f'  Net Difference: {df_summary["Diff"].sum()}')

In [None]:
# Detailed comparison: Match tweets by ID
#
# For each user, splits the tweet IDs into "both", "only official" and
# "only TwitterAPI.io", prints up to two example tweets per one-sided group,
# and compares engagement metrics for up to two tweets present in both.


def _preview(text, limit):
    """Return *text* cut to *limit* characters, with '...' appended when cut.

    None is treated as an empty string so a tweet without text cannot crash
    the report.
    """
    text = text or ''
    return text[:limit] + '...' if len(text) > limit else text


def _metric(tweet, key):
    """Return metric *key* of a normalized tweet, treating None/missing as 0."""
    return tweet.get(key) or 0


print('\n' + '='*80)
print('DETAILED TWEET-BY-TWEET COMPARISON')
print('='*80)

for r in results:
    if r['official_count'] == 0 and r['twitterapiio_count'] == 0:
        print(f'\n@{r["username"]}: No tweets from either API')
        continue

    print(f'\n@{r["username"]}')
    print('-' * 40)

    # Index tweets by ID once instead of scanning the lists for every lookup.
    # Iterating in reverse keeps the FIRST occurrence of a duplicated ID.
    official_by_id = {t['tweet_id']: t for t in reversed(r['official_tweets'])}
    twitterapiio_by_id = {t['tweet_id']: t for t in reversed(r['twitterapiio_tweets'])}

    official_ids = set(official_by_id)
    twitterapiio_ids = set(twitterapiio_by_id)

    common = official_ids & twitterapiio_ids
    only_official = official_ids - twitterapiio_ids
    only_twitterapiio = twitterapiio_ids - official_ids

    print(f'  Tweets in BOTH APIs:      {len(common)}')
    print(f'  Only in Official API:     {len(only_official)}')
    print(f'  Only in TwitterAPI.io:    {len(only_twitterapiio)}')

    # Show tweets only in one API. Sets are unordered, so the IDs are sorted
    # to make the printed examples the same on every run.
    if only_official:
        print(f'\n  Tweets ONLY in Official API:')
        for tid in sorted(only_official)[:2]:
            text_preview = _preview(official_by_id[tid]['text'], 60)
            print(f'    - {tid}: "{text_preview}"')

    if only_twitterapiio:
        print(f'\n  Tweets ONLY in TwitterAPI.io:')
        for tid in sorted(only_twitterapiio)[:2]:
            text_preview = _preview(twitterapiio_by_id[tid]['text'], 60)
            print(f'    - {tid}: "{text_preview}"')

    # Compare metrics for common tweets
    if common:
        print(f'\n  Metric comparison for matching tweets:')
        for tweet_id in sorted(common)[:2]:  # Compare first 2
            off_tweet = official_by_id[tweet_id]
            tio_tweet = twitterapiio_by_id[tweet_id]

            text_preview = _preview(off_tweet['text'], 50)
            print(f'\n    Tweet: "{text_preview}"')
            print(f'    ID: {tweet_id}')
            print(f'                   Official  |  TwitterAPI.io  |  Diff')
            for label, key in (
                ('Likes:', 'like_count'),
                ('Retweets:', 'retweet_count'),
                ('Impressions:', 'impression_count'),
            ):
                off_value = _metric(off_tweet, key)
                tio_value = _metric(tio_tweet, key)
                # The label is padded to 15 characters so the columns line up
                # with the header row above.
                print(f'    {label:<15}{off_value:>8}  |  {tio_value:>13}  |  {off_value - tio_value:>5}')

In [None]:
# Calculate overall match statistics
#
# Aggregates the per-user ID sets into totals and match rates. Users for whom
# either API call failed are left out: a failed fetch leaves that API's list
# empty (or partial), so including it would report the other API's tweets as
# "missing" and deflate every match rate.
print('\n' + '='*80)
print('OVERALL VALIDATION STATISTICS')
print('='*80)

total_official = 0
total_twitterapiio = 0
total_common = 0
total_only_official = 0
total_only_twitterapiio = 0
excluded_users = []

for r in results:
    if r['official_error'] or r['twitterapiio_error']:
        excluded_users.append(r['username'])
        continue

    official_ids = {t['tweet_id'] for t in r['official_tweets']}
    twitterapiio_ids = {t['tweet_id'] for t in r['twitterapiio_tweets']}

    total_official += len(official_ids)
    total_twitterapiio += len(twitterapiio_ids)
    total_common += len(official_ids & twitterapiio_ids)
    total_only_official += len(official_ids - twitterapiio_ids)
    total_only_twitterapiio += len(twitterapiio_ids - official_ids)

print(f'\nMethod: NEW tweets only (ID > since_id)')
print(f'Users sampled: {len(results)}')
if excluded_users:
    excluded_label = ', '.join(f'@{name}' for name in excluded_users)
    print(f'Users excluded (API error): {len(excluded_users)} ({excluded_label})')
print(f'\nTweets from Official API:     {total_official}')
print(f'Tweets from TwitterAPI.io:    {total_twitterapiio}')
print(f'\nTweets in BOTH APIs:          {total_common}')
print(f'Only in Official API:         {total_only_official}')
print(f'Only in TwitterAPI.io:        {total_only_twitterapiio}')

# Each rate is printed only when its denominator is non-zero.
print(f'\n--- Match Rates ---')
if total_official > 0:
    official_match = (total_common / total_official) * 100
    print(f'Official tweets also in TIO:  {official_match:.1f}%')

if total_twitterapiio > 0:
    tio_match = (total_common / total_twitterapiio) * 100
    print(f'TIO tweets also in Official:  {tio_match:.1f}%')

all_unique = total_common + total_only_official + total_only_twitterapiio
if all_unique > 0:
    overlap = (total_common / all_unique) * 100
    print(f'Overall overlap rate:         {overlap:.1f}%')

In [None]:
# Visualize comparison
#
# Left panel: grouped bars with the per-user NEW tweet count from each API.
# Right panel: pie chart of the overall both / only-official / only-TwitterAPI.io split.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

panel_titles = ['NEW Tweet Count by User', 'Overall Tweet Coverage']
panel_specs = [[{'type': 'bar'}, {'type': 'pie'}]]
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles, specs=panel_specs)

# Per-user series for the bar chart
usernames = [f"@{r['username']}" for r in results]
official_counts = [r['official_count'] for r in results]
twitterapiio_counts = [r['twitterapiio_count'] for r in results]

bar_series = (
    ('Official API', official_counts, '#1DA1F2'),
    ('TwitterAPI.io', twitterapiio_counts, '#FF6B6B'),
)
for series_name, series_values, series_color in bar_series:
    bar_trace = go.Bar(name=series_name, x=usernames, y=series_values, marker_color=series_color)
    fig.add_trace(bar_trace, row=1, col=1)

# Overlap pie, built from the totals computed in the statistics cell
overlap_pie = go.Pie(
    labels=['Both APIs', 'Only Official', 'Only TwitterAPI.io'],
    values=[total_common, total_only_official, total_only_twitterapiio],
    marker_colors=['#2ECC71', '#1DA1F2', '#FF6B6B'],
    textinfo='label+percent+value',
)
fig.add_trace(overlap_pie, row=1, col=2)

chart_title = f'API Validation: NEW Tweets (since_id) - {len(results)} users'
fig.update_layout(title=chart_title, barmode='group', height=500, showlegend=True)

fig.show()

In [None]:
# Save detailed results to CSV
#
# Flattens every normalized tweet from both APIs into one table, tagged with
# the username it was fetched for, and writes it to ../outputs/.
all_tweets_data = []

for r in results:
    # Official tweets first, then TwitterAPI.io tweets, per user.
    for t in r['official_tweets'] + r['twitterapiio_tweets']:
        # Build a new dict so the tweet dicts stored in `results` are not
        # modified as a side effect of exporting.
        all_tweets_data.append({**t, 'username': r['username']})

if all_tweets_data:
    df_all = pd.DataFrame(all_tweets_data)
    output_path = Path('../outputs/api_validation_tweets.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_all.to_csv(output_path, index=False)

    # Count rows per API from the frame instead of re-scanning the list.
    source_counts = df_all['api_source'].value_counts()
    print(f'Saved {len(df_all)} tweet records to {output_path}')
    print(f'  - Official API tweets: {source_counts.get("official", 0)}')
    print(f'  - TwitterAPI.io tweets: {source_counts.get("twitterapiio", 0)}')
else:
    print('No tweets to save')

## Interpretation Guide

### What the metrics mean:
- **Tweets in BOTH APIs**: These tweets were returned by both APIs - high confidence data
- **Only in Official API**: Tweets the official API returned but TwitterAPI.io missed
- **Only in TwitterAPI.io**: Tweets TwitterAPI.io returned but official API missed

### Expected outcomes:
- **High overlap (>90%)**: Both APIs are returning consistent data
- **Metric differences**: Small differences in engagement counts are normal due to API timing
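- **Missing tweets**: May be due to API pagination limits (this notebook requests a single page of up to 100 tweets from the official API and up to 5 pages from TwitterAPI.io) or timing of data availability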

### Recommendations:
- If Official API has more tweets: Use it as primary source
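- If TwitterAPI.io has more tweets: It may have better coverage of the compared window (tweets newer than `since_id`); historical coverage is not measured by this notebook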
- For engagement metrics: Use most recent fetch regardless of source