In [None]:
from dotenv import load_dotenv
import os
import requests
import json
import pandas as pd
import time
from requests.exceptions import HTTPError

# Load API keys
load_dotenv()
BASE_URL = "https://twitter241.p.rapidapi.com"
HEADERS = {
    "x-rapidapi-host": "twitter241.p.rapidapi.com",
    "x-rapidapi-key": "056375d7f5mshd4546f4fb1a7f4ep129c82jsna85290da7d03"
}

Influencer list
INFLUENCER_USERNAMES = [
    "Ninja", "shroud", "Myth_", "DrLupo", "TimTheTatman", "Syndicate", "Summit1g", "Pokimane",
    "Tfue", "Jacksepticeye", "Valkyrae", "Quackity", "TheGrefg", "Jynxzi", "markiplier",
    "SSSniperWolf", "OMGitsAliA", "scump", "LazarBeam", "Pokelawls"
]


# Retry decorator for 429 errors
# Retry decorator: infinite retries with no delay


def retry_on_429():
    def decorator(func):
        def wrapper(*args, **kwargs):
            while True:
                try:
                    return func(*args, **kwargs)
                except HTTPError as e:
                    if e.response.status_code == 429:
                        print(f"⏳ 429 Too Many Requests. Retrying in 0.5s...")
                        time.sleep(0.5)
                        continue
                    else:
                        raise
        return wrapper
    return decorator



# API calls
@retry_on_429()
def get_user_info(username):
    url = f"{BASE_URL}/user"
    params = {"username": username}
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

@retry_on_429()
def get_user_followers(user_id, count=100):
    url = f"{BASE_URL}/followers"
    params = {"user": user_id, "count": count}
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

@retry_on_429()
def get_user_following(user_id, count=100):
    url = f"{BASE_URL}/followings"
    params = {"user": user_id, "count": count}
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

@retry_on_429()
def get_user_tweets(user_id, count=50):
    url = f"{BASE_URL}/user-tweets"
    params = {"user": user_id, "count": count}
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

# Output folders
os.makedirs("data/raw/profiles", exist_ok=True)
os.makedirs("data/raw/followers", exist_ok=True)
os.makedirs("data/raw/following", exist_ok=True)
os.makedirs("data/raw/tweets", exist_ok=True)

# Collect data
for username in INFLUENCER_USERNAMES:
    print(f"\n📦 Collecting data for {username}...")

    user_info = get_user_info(username)
    if not user_info:
        print(f"⚠️ Failed to fetch user info for {username}")
        continue

    # Extract user ID from nested structure
    user_data = user_info.get("result", {}).get("data", {}).get("user", {}).get("result", {})
    user_id = user_data.get("rest_id")

    if not user_id:
        print(f"❌ User ID not found for {username}")
        continue

    # Save profile
    with open(f"data/raw/profiles/{username}.json", "w") as f:
        json.dump(user_info, f, indent=2)
    print(f"✅ Profile saved for {username}")

    # Get and save tweets
    tweets = get_user_tweets(user_id)
    if tweets:
        with open(f"data/raw/tweets/{username}.json", "w") as f:
            json.dump([{"tweets": tweets}], f, indent=2)
        print(f"✅ Tweets saved for {username}")

    # Get and save followers
    followers = get_user_followers(user_id)
    if followers:
        with open(f"data/raw/followers/{username}.json", "w") as f:
            json.dump(followers, f, indent=2)
        print(f"✅ Followers saved for {username}")

    # Get and save following
    following = get_user_following(user_id)
    if following:
        with open(f"data/raw/following/{username}.json", "w") as f:
            json.dump(following, f, indent=2)
        print(f"✅ Following saved for {username}")

    # Delay to reduce risk of rate limits

print("\n🎉 All data collected and saved per influencer.")



📦 Collecting data for TheGrefg...
✅ Profile saved for TheGrefg
✅ Tweets saved for TheGrefg
✅ Followers saved for TheGrefg
✅ Following saved for TheGrefg

📦 Collecting data for Jynxzi...
✅ Profile saved for Jynxzi
⏳ 429 Too Many Requests. Retrying in 0.5s...
✅ Tweets saved for Jynxzi
✅ Followers saved for Jynxzi
✅ Following saved for Jynxzi

📦 Collecting data for markiplier...
✅ Profile saved for markiplier
⏳ 429 Too Many Requests. Retrying in 0.5s...
⏳ 429 Too Many Requests. Retrying in 0.5s...
⏳ 429 Too Many Requests. Retrying in 0.5s...
✅ Tweets saved for markiplier
✅ Followers saved for markiplier
✅ Following saved for markiplier

📦 Collecting data for SSSniperWolf...
✅ Profile saved for SSSniperWolf
⏳ 429 Too Many Requests. Retrying in 0.5s...
✅ Tweets saved for SSSniperWolf
⏳ 429 Too Many Requests. Retrying in 0.5s...
⏳ 429 Too Many Requests. Retrying in 0.5s...
✅ Followers saved for SSSniperWolf
⏳ 429 Too Many Requests. Retrying in 0.5s...
⏳ 429 Too Many Requests. Retrying in 0.

In [67]:
def extract_features_from_json(json_path: str) -> pd.DataFrame:
    """Extracts and aggregates tweet and profile features from an influencer's tweet JSON file."""
    
    # Load tweet JSON
    with open(json_path, "r") as f:
        data = json.load(f)

    # Extract tweets
    entries = data[0]['tweets']['result']['timeline']['instructions']
    tweet_items = []
    for instruction in entries:
        if instruction.get('type') == "TimelineAddEntries":
            for entry in instruction['entries']:
                try:
                    tweet = entry['content']['itemContent']['tweet_results']['result']
                    tweet_items.append(tweet)
                except KeyError:
                    continue

    # Helper: parse timestamp
    def parse_datetime(t): return datetime.strptime(t, "%a %b %d %H:%M:%S %z %Y")

    # Helper: sentiment
    def get_sentiment(text):
        blob = TextBlob(text)
        return blob.sentiment.polarity, blob.sentiment.subjectivity

    # Extract features per tweet
    records = []
    for t in tweet_items:
        legacy = t['legacy']
        views = t.get('views', {}).get('count', '0')
        text = legacy.get('full_text', '')
        media = legacy.get('entities', {}).get('media', [])
        urls = legacy.get('entities', {}).get('urls', [])
        mentions = legacy.get('entities', {}).get('user_mentions', [])
        hashtags = legacy.get('entities', {}).get('hashtags', [])
        polarity, subjectivity = get_sentiment(text)

        views_count = int(views.replace(',', '')) if views else 1
        total_engagement = legacy['favorite_count'] + legacy['retweet_count'] + legacy['reply_count']
        engagement_rate = total_engagement / views_count if views_count > 0 else 0

        records.append({
            "created_at": parse_datetime(legacy['created_at']),
            "text_length": len(text),
            "word_count": len(re.findall(r'\w+', text)),
            "has_media": int(bool(media)),
            "has_url": int(bool(urls)),
            "has_mentions": int(bool(mentions)),
            "has_hashtags": int(bool(hashtags)),
            "favorite_count": legacy['favorite_count'],
            "retweet_count": legacy['retweet_count'],
            "reply_count": legacy['reply_count'],
            "bookmark_count": legacy.get('bookmark_count', 0),
            "views": views_count,
            "engagement_rate": engagement_rate,
            "sentiment_polarity": polarity,
            "sentiment_subjectivity": subjectivity,
            "hour": parse_datetime(legacy['created_at']).hour,
            "weekday": parse_datetime(legacy['created_at']).weekday(),
            "is_weekend": int(parse_datetime(legacy['created_at']).weekday() >= 5),
        })

    df = pd.DataFrame(records)

    if df.empty:
        raise ValueError("No valid tweets found in JSON.")

    # === Aggregation ===
    agg_features = {
        "text_length": ["mean", "std", "max"],
        "word_count": ["mean", "std", "max"],
        "has_media": "mean",
        "has_url": "mean",
        "has_mentions": "mean",
        "has_hashtags": "mean",
        "favorite_count": ["mean", "max"],
        "retweet_count": ["mean", "max"],
        "reply_count": ["mean", "max"],
        "bookmark_count": ["mean", "max"],
        "views": ["mean", "max"],
        "engagement_rate": ["mean", "max", "std"],
        "sentiment_polarity": ["mean", "std"],
        "sentiment_subjectivity": ["mean", "std"],
        "hour": ["mean"],
        "is_weekend": "mean"
    }

    agg_df = df.agg(agg_features)
    agg_df.columns = ['{}_{}'.format(col[0], col[1]) if isinstance(col, tuple) else col for col in agg_df.columns]

    # === Add user profile features ===
    user_info = tweet_items[0]['core']['user_results']['result']['legacy']
    created_at_user = parse_datetime(user_info['created_at'])
    profile_features = {
        "followers_count": user_info["followers_count"],
        "friends_count": user_info["friends_count"],
        "listed_count": user_info["listed_count"],
        "statuses_count": user_info["statuses_count"],
        "media_count": user_info["media_count"],
        "favourites_count": user_info["favourites_count"],
        "account_age_days": (datetime.utcnow() - created_at_user.replace(tzinfo=None)).days,
        "follower_following_ratio": user_info["followers_count"] / (user_info["friends_count"] + 1)
    }

    final_features = pd.concat([agg_df.reset_index(drop=True), pd.DataFrame([profile_features])], axis=1)
    return final_features


In [68]:
import os

input_dir = "data/raw/tweets/"
output_dir = "data/outputs/engagement_features/"
os.makedirs(output_dir, exist_ok=True)

for file in os.listdir(input_dir):
    if file.endswith(".json"):
        screen_name = file.replace(".json", "")
        try:
            feature_df = extract_features_from_json(os.path.join(input_dir, file))
            feature_df.to_csv(f"{output_dir}/{screen_name}_features.csv", index=False)
            print(f"✅ Saved: {screen_name}")
        except Exception as e:
            print(f"❌ Error processing {screen_name}: {e}")


import json
import pandas as pd
import numpy as np
from textblob import TextBlob
from datetime import datetime
import re

✅ Saved: TheGrefg
✅ Saved: OMGitsAliA
✅ Saved: LazarBeam
✅ Saved: Myth_
✅ Saved: Jynxzi
❌ Error processing Pokelawls: 'legacy'
✅ Saved: Jacksepticeye
✅ Saved: Summit1g
✅ Saved: scump
✅ Saved: markiplier
✅ Saved: Tfue
✅ Saved: DrLupo
✅ Saved: TimTheTatman
✅ Saved: shroud
✅ Saved: Syndicate
❌ Error processing Pokimane: No valid tweets found in JSON.
✅ Saved: Ninja
❌ Error processing SSSniperWolf: 'legacy'
❌ Error processing Valkyrae: 'legacy'


In [None]:
print(final_features.head(3))

In [69]:
import pandas as pd

# Load centrality data
centrality_df = pd.read_csv("data/outputs/centrality/influencer_centrality.csv")

# Drop redundant profile fields (to avoid overlap with final_features)
redundant_cols = ['followers_count', 'friends_count', 'statuses_count', 'name', 'screen_name']

# List of influencer Twitter handles
INFLUENCER_USERNAMES = [
    "Ninja", "shroud", "Myth_", "DrLupo", "TimTheTatman", "Syndicate", "Summit1g", "Pokimane",
    "Tfue", "Jacksepticeye", "Valkyrae", "Quackity", "TheGrefg", "Jynxzi", "markiplier",
    "SSSniperWolf", "OMGitsAliA", "scump", "LazarBeam", "Pokelawls"
]

# Placeholder to store all merged datasets
all_merged = []

for influencer_screen_name in INFLUENCER_USERNAMES:
    # Load or generate final_features for this influencer
    # Replace this with your actual logic
    try:
        final_features = pd.read_csv(f"data/outputs/engagement_features/{influencer_screen_name}_features.csv")
        final_features = final_features.loc[:, ~final_features.columns.str.contains('^Unnamed')]
    except FileNotFoundError:
        print(f"⚠️ Skipping {influencer_screen_name}: features file not found.")
        continue

    # Match centrality row
    influencer_centrality = centrality_df[
        centrality_df['screen_name'].str.lower() == influencer_screen_name.lower()
    ].reset_index(drop=True)

    if influencer_centrality.empty:
        print(f"⚠️ Skipping {influencer_screen_name}: no centrality data found.")
        continue

    influencer_centrality = influencer_centrality.drop(columns=redundant_cols, errors='ignore')

    # Keep only "mean" row if final_features has multiple rows (mean/std/max)
    if "mean" in final_features.index or final_features.shape[0] > 1:
        try:
            final_features = final_features.set_index("stat").loc[["mean"]].reset_index(drop=True)
        except:
            final_features = final_features.iloc[[0]]  # fallback to first row
    else:
        final_features = final_features.iloc[[0]]

    # Merge horizontally
    merged = pd.concat([final_features.reset_index(drop=True), influencer_centrality], axis=1)
    merged["screen_name"] = influencer_screen_name  # Add label for traceability

    print(f"\n✅ Merged dataset for {influencer_screen_name}:")
    print(merged.T)

    all_merged.append(merged)

# Optionally combine all into one DataFrame
final_dataset = pd.concat(all_merged, ignore_index=True)

# Save merged dataset
final_dataset.to_csv("data/outputs/influencer_features_combined_all.csv", index=False)



✅ Merged dataset for Ninja:
                                      0
text_length                  104.647059
word_count                    18.294118
has_media                      0.705882
has_url                        0.117647
has_mentions                   0.588235
has_hashtags                   0.058824
favorite_count              3147.411765
retweet_count                251.411765
reply_count                   56.882353
bookmark_count               251.176471
views                     217888.705882
engagement_rate               18.627375
sentiment_polarity             0.172638
sentiment_subjectivity         0.292157
hour                          16.235294
is_weekend                     0.235294
followers_count               6527829.0
friends_count                    1516.0
listed_count                     3577.0
statuses_count                  46063.0
media_count                      6984.0
favourites_count                25961.0
account_age_days                 5320.0
follower_fo

In [None]:
import os
import json
from datetime import datetime
import pandas as pd
from textblob import TextBlob


def parse_datetime(date_str: str) -> datetime | None:
    if not date_str:
        return None
    return datetime.strptime(date_str, "%a %b %d %H:%M:%S %z %Y")


def get_sentiment(text: str) -> tuple[float, float]:
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity


def extract_tweet_features(json_path: str) -> pd.DataFrame:
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    tweet_records = []
    try:
        instructions = data['tweets']['result']['timeline']['instructions']
    except KeyError:
        raise ValueError("Invalid JSON structure: Missing 'instructions'.")

    entries = []
    for instr in instructions:
        if instr.get("type") == "TimelineAddEntries":
            entries.extend(instr.get("entries", []))
        elif instr.get("entry", {}).get("content", {}).get("itemContent"):
            entries.append(instr["entry"])  # Handle pinned tweets

    for entry in entries:
        content = entry.get("content", {})
        item_content = content.get("itemContent", {})
        tweet_result = item_content.get("tweet_results", {}).get("result", {})

        if tweet_result.get("__typename") != "Tweet":
            continue

        legacy = tweet_result.get("legacy", {})
        user_legacy = tweet_result.get("core", {}).get("user_results", {}).get("result", {}).get("legacy", {})

        full_text = legacy.get("full_text", "")
        polarity, subjectivity = get_sentiment(full_text)

        tweet_info = {
            "tweet_id": legacy.get("id_str"),
            "created_at": parse_datetime(legacy.get("created_at")),
            "full_text": full_text,
            "like_count": legacy.get("favorite_count"),
            "retweet_count": legacy.get("retweet_count"),
            "reply_count": legacy.get("reply_count"),
            "quote_count": legacy.get("quote_count"),
            "view_count": int(tweet_result.get("views", {}).get("count", 0)),
            "text_length": len(full_text),
            "word_count": len(full_text.split()),
            "has_media": int("media" in legacy),
            "has_url": int(bool(legacy.get("urls"))),
            "has_mentions": int(bool(legacy.get("user_mentions"))),
            "has_hashtags": int(bool(legacy.get("hashtags"))),
            "sentiment_polarity": polarity,
            "sentiment_subjectivity": subjectivity,
            "hour": parse_datetime(legacy.get("created_at")).hour if legacy.get("created_at") else None,
            "is_weekend": int(parse_datetime(legacy.get("created_at")).weekday() >= 5) if legacy.get("created_at") else None,
            "engagement_rate": (
                legacy.get("favorite_count", 0) +
                legacy.get("retweet_count", 0) +
                legacy.get("reply_count", 0) +
                legacy.get("quote_count", 0)
            ) / (int(tweet_result.get("views", {}).get("count", 1))),
            "_user_legacy": user_legacy,
        }

        tweet_records.append(tweet_info)

    df = pd.DataFrame(tweet_records)
    if df.empty:
        raise ValueError(f"No valid tweets found in {json_path}")
    df = df.sort_values("created_at").reset_index(drop=True)
    return df


def aggregate_features(tweet_df: pd.DataFrame) -> pd.DataFrame:
    agg_map = {
        "text_length": ["mean", "std", "max"],
        "word_count": ["mean", "std", "max"],
        "has_media": "mean",
        "has_url": "mean",
        "has_mentions": "mean",
        "has_hashtags": "mean",
        "like_count": ["mean", "max"],
        "retweet_count": ["mean", "max"],
        "reply_count": ["mean", "max"],
        "quote_count": ["mean", "max"],
        "view_count": ["mean", "max"],
        "engagement_rate": ["mean", "max", "std"],
        "sentiment_polarity": ["mean", "std"],
        "sentiment_subjectivity": ["mean", "std"],
        "hour": "mean",
        "is_weekend": "mean"
    }

    agg_df = tweet_df.agg(agg_map)
    agg_df.columns = ['{}_{}'.format(col[0], col[1]) if isinstance(col, tuple) else col for col in agg_df.columns]

    user_info = tweet_df.iloc[0]["_user_legacy"]
    created_at_user = parse_datetime(user_info.get("created_at"))
    account_age = (datetime.utcnow() - created_at_user.replace(tzinfo=None)).days if created_at_user else None

    profile_features = {
        "followers_count": user_info.get("followers_count", 0),
        "friends_count": user_info.get("friends_count", 0),
        "listed_count": user_info.get("listed_count", 0),
        "statuses_count": user_info.get("statuses_count", 0),
        "media_count": user_info.get("media_count", 0),
        "favourites_count": user_info.get("favourites_count", 0),
        "account_age_days": account_age,
        "follower_following_ratio": user_info.get("followers_count", 0) / (user_info.get("friends_count", 0) + 1)
    }

    final_features = pd.concat(
        [agg_df.reset_index(drop=True), pd.DataFrame([profile_features])], axis=1
    )
    return final_features


def load_multiple_influencers(json_paths: dict) -> pd.DataFrame:
    """
    Load and concatenate tweet-level features from multiple influencers.
    json_paths: dict {influencer_id: json_file_path}
    Returns concatenated DataFrame with influencer_id column.
    """
    all_tweets = []
    failed = []

    for influencer_id, path in json_paths.items():
        try:
            df = extract_tweet_features(path)
            df['influencer_id'] = influencer_id
            all_tweets.append(df)
        except Exception as e:
            print(f"[ERROR] Failed to process {influencer_id} at {path}: {e}")
            failed.append(influencer_id)

    if not all_tweets:
        raise ValueError("No valid tweet data extracted. Check if JSON structure has changed or files are empty.")

    print(f"✅ Successfully processed {len(all_tweets)} influencers.")
    if failed:
        print(f"⚠️ Failed to process {len(failed)} influencers: {failed}")

    return pd.concat(all_tweets, ignore_index=True)



def train_test_split_time_based(df: pd.DataFrame, split_ratio=0.8) -> tuple[pd.DataFrame, pd.DataFrame]:
    train_list, test_list = [], []
    for influencer_id, group in df.groupby("influencer_id"):
        group_sorted = group.sort_values("created_at")
        split_idx = int(len(group_sorted) * split_ratio)
        train_list.append(group_sorted.iloc[:split_idx])
        test_list.append(group_sorted.iloc[split_idx:])
    return pd.concat(train_list).reset_index(drop=True), pd.concat(test_list).reset_index(drop=True)


In [None]:
input_dir = "data/raw/tweets/"
output_dir = "data/outputs/engagement_features/"
os.makedirs(output_dir, exist_ok=True)

# Auto-generate json_paths dict: {screen_name: full_path}
json_paths = {
    file.replace(".json", ""): os.path.join(input_dir, file)
    for file in os.listdir(input_dir)
    if file.endswith(".json")
}

print(f"Found {len(json_paths)} influencer JSON files.")

# Load all influencers' tweet data into one DataFrame
print(json_paths)
all_tweets_df = load_multiple_influencers(json_paths)

print(f"Loaded tweets for {all_tweets_df['influencer_id'].nunique()} influencers, total {len(all_tweets_df)} tweets.")

# Split into train/test sets by time (80% train, 20% test)
train_df, test_df = train_test_split_time_based(all_tweets_df, split_ratio=0.8)

print(f"Train set: {len(train_df)} tweets, Test set: {len(test_df)} tweets.")

# Aggregate features per influencer (using all tweets)
agg_features_list = []
for influencer_id in all_tweets_df['influencer_id'].unique():
    inf_tweets = all_tweets_df[all_tweets_df['influencer_id'] == influencer_id]
    agg_df = aggregate_features(inf_tweets)
    agg_df['influencer_id'] = influencer_id
    agg_features_list.append(agg_df)

agg_features_df = pd.concat(agg_features_list, ignore_index=True)

# Save aggregated features to CSV
output_path = os.path.join(output_dir, "aggregated_engagement_features.csv")
agg_features_df.to_csv(output_path, index=False)

print(f"Saved aggregated features for {len(agg_features_df)} influencers to {output_path}")