In [None]:
import pandas as pd                         
import numpy as np                          
from tqdm.notebook import tqdm              
from itertools import combinations          

## Tweet IDs and Twitter/X Policy

According to Twitter/X's Developer Policy, direct sharing of tweet content (such as full text, user data, or media) is not permitted. Instead, researchers and developers are encouraged to share only **tweet IDs**, which can be used to retrieve the original tweet data via the official Twitter API—a process known as **hydration**.

All tweet IDs used in this project are provided in the `Twitter/dataset` folder. These files contain only tweet identifiers, in compliance with Twitter/X's data sharing policy.

### How to Reconstruct the Network

To reconstruct the original tweet data and network:

1. **Hydrate the tweet IDs** using a tool such as:
   - [twarc](https://github.com/DocNow/twarc)
   - [Hydrator](https://github.com/DocNow/hydrator)
   - Or any custom script using the [Twitter API](https://developer.twitter.com/en/docs)

2. **Collect the full tweet objects** by querying the IDs listed in the CSV files.

3. **Rebuild the network** using the hydrated tweets and the provided edge definitions.

> ⚠️ Please note: The success of hydration depends on the availability of the tweets. Tweets that have been deleted, protected, or otherwise made unavailable will not be retrievable.

By following this approach, we ensure ethical data sharing and full compliance with Twitter/X’s terms of use.


## 🔁 Retweet Co-Engagement Network Construction

This script processes a dataset of tweets to build **retweet co-engagement networks** for specific target dates.

Each network captures **user-to-user edges** based on co-retweeting behavior:  
If two users retweeted the **same original tweet** on the same day, they are connected in the network.

### 🔧 Key Steps:
1. **Load tweet data** with selected columns only.
2. **Construct edges** between user pairs who retweeted the same tweet:
   - `weight_count`: number of shared retweets.
   - `weight_time`: mean absolute time difference (in seconds) between the two users' retweet timestamps, averaged over the tweets they both retweeted.
3. **Export** the edge list as a `.csv` for further network analysis.

In [None]:
import os
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter environments

# ----------------------------------------
# Load and prepare tweet dataset
# ----------------------------------------

# Columns read from the CSV export
required_columns = [
    'id', 'conversation_id', 'referenced_tweets.replied_to.id', 'referenced_tweets.retweeted.id',
    'referenced_tweets.quoted.id', 'author_id', 'in_reply_to_user_id', 'retweeted_user_id',
    'quoted_user_id', 'created_at', 'text', 'entities.hashtags', 'entities.mentions',
    'author.id', 'author.created_at', 'author.verified',
    'public_metrics.retweet_count', 'public_metrics.like_count', 'public_metrics.quote_count',
    'public_metrics.reply_count', 'public_metrics.impression_count'
]

# Raw name of the column holding the ID of the tweet being retweeted
RETWEET_REF_COLUMN = 'referenced_tweets.retweeted.id'


def load_retweets(csv_source, usecols=None):
    """Load the tweet CSV and return only the rows that are retweets.

    Parameters
    ----------
    csv_source : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    usecols : list of str, optional
        Columns to read; defaults to ``required_columns``. Must contain
        ``id``, ``author_id``, ``created_at`` and the retweet reference column.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``author_id``, ``created_at`` (parsed datetime),
        ``retweeted_tweet_id`` (string) and ``date`` (``datetime.date``).
        Rows with an unparseable timestamp or no retweet reference are dropped.
    """
    if usecols is None:
        usecols = required_columns

    # The retweet reference is blank for non-retweets, so pandas would infer
    # float64 for it. Tweet IDs are larger than 2**53, so nearby IDs would
    # round to the same float and distinct tweets would be merged into one
    # group. Reading the column as text keeps every ID exact; only equality
    # is needed downstream.
    raw = pd.read_csv(csv_source, usecols=usecols, dtype={RETWEET_REF_COLUMN: str})

    # Unparseable timestamps become NaT and are dropped below
    raw['created_at'] = pd.to_datetime(raw['created_at'], errors='coerce')

    is_usable = raw['created_at'].notnull() & raw[RETWEET_REF_COLUMN].notnull()
    retweets = raw.loc[is_usable, ['id', 'author_id', 'created_at', RETWEET_REF_COLUMN]]
    retweets = retweets.rename(columns={RETWEET_REF_COLUMN: 'retweeted_tweet_id'})

    # Simplified date column (date only, no time) used to select a target day
    retweets['date'] = retweets['created_at'].dt.date
    return retweets


def build_coretweet_edges(day_retweets, progress=None):
    """Build the co-retweet edge list for one set of retweets.

    Two users are linked when they both retweeted the same original tweet.

    Parameters
    ----------
    day_retweets : pandas.DataFrame
        Needs the columns ``author_id``, ``created_at`` and
        ``retweeted_tweet_id``.
    progress : callable, optional
        ``progress(groups, total)`` must return an iterable yielding the same
        items as ``groups`` (e.g. a tqdm wrapper). No progress reporting when
        omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``src``, ``trg`` (user pair, ``src < trg``), ``weight_count``
        (number of tweets both users retweeted) and ``weight_time`` (mean of
        the absolute time differences, in seconds, between their retweets).
        Rows are ordered by first appearance of each pair.
    """
    # One pass over the data instead of re-filtering the frame per tweet
    grouped = day_retweets.groupby('retweeted_tweet_id', sort=False)
    groups = grouped if progress is None else progress(grouped, grouped.ngroups)

    # (src, trg) -> list of absolute time differences, one per shared tweet
    pair_time_diffs = {}

    for _, retweets_of_tweet in groups:
        # user_id -> retweet timestamp; if a user appears more than once for
        # the same tweet, the last row wins
        user_time_map = dict(zip(retweets_of_tweet['author_id'], retweets_of_tweet['created_at']))

        # Sorting makes (a, b) and (b, a) the same undirected edge key
        for user_pair in combinations(sorted(user_time_map), 2):
            first_user, second_user = user_pair
            gap = abs((user_time_map[first_user] - user_time_map[second_user]).total_seconds())
            pair_time_diffs.setdefault(user_pair, []).append(gap)

    # The number of recorded gaps is exactly the number of shared retweets
    edge_rows = [
        (src, trg, len(gaps), np.mean(gaps))
        for (src, trg), gaps in pair_time_diffs.items()
    ]
    return pd.DataFrame(edge_rows, columns=['src', 'trg', 'weight_count', 'weight_time'])


# True when run as a script or inside a notebook kernel; names assigned below
# remain module-level globals.
if __name__ == "__main__":
    df = load_retweets('dataset/complete_data.csv')

    # ----------------------------------------
    # Set analysis parameters
    # ----------------------------------------

    # Dates for which we want to build the networks
    target_dates = ['2022-11-01', '2023-01-08']
    target_dates = [pd.to_datetime(date).date() for date in target_dates]

    # to_csv does not create missing directories
    os.makedirs('networks', exist_ok=True)

    # Loop over each target date to build a retweet co-engagement network
    for target_date in target_dates:
        print(f"Processing date: {target_date}")

        # Keep retweets from the target date only (no temporal windowing).
        # NOTE: no minimum-activity filter is applied to authors; every
        # retweeting author of that day is kept.
        df_filtered = df[df['date'] == target_date]

        edge_df = build_coretweet_edges(
            df_filtered,
            progress=lambda groups, total: tqdm(groups, total=total, desc="Building edges"),
        )

        # ----------------------------------------
        # Export the edge list
        # ----------------------------------------
        output_path = f"networks/{target_date}-edges-data.csv"
        edge_df.to_csv(output_path, index=False)

        print(f"Saved edge list to {output_path}")