In [111]:
# Cell 1: Imports and Installs
import subprocess
import sys

def install_package(package, import_name=None):
    """
    Install ``package`` with pip if its module cannot be found.

    Args:
        package: Name passed to ``pip install``.
        import_name: Module name used for the availability check. Defaults to
            ``package``; pass it when the two differ (e.g. distribution
            ``scikit-learn`` is imported as ``sklearn``).

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
    """
    # Local imports: this cell only imports subprocess and sys at the top.
    import importlib
    import importlib.util

    module_name = import_name or package

    # Look the module up without importing it, so the check has no side effects.
    try:
        already_available = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec raises for a missing parent package or a module without a spec.
        already_available = False

    if already_available:
        return

    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # Modules installed while the interpreter is running may be invisible to the
    # import system until its finder caches are cleared.
    importlib.invalidate_caches()

# Install required packages
# install_package() uses the same string for the availability check and for
# `pip install`, so each entry here must be valid as both a module name and a
# pip distribution name.
for package in ['pandas', 'requests', 'tqdm']:
    install_package(package)

# Imports
import pandas as pd
import requests
import json
from datetime import datetime, timedelta, timezone
import time
from typing import List, Dict, Optional, Tuple
from tqdm import tqdm

print("All packages imported successfully!")


All packages imported successfully!


In [129]:
# Configuration Constants
# Module-level settings. The functions in this notebook look these names up when
# they are called, so re-running this cell changes their behavior without
# redefining the functions.
# NOTE(review): SAFE_THRESHOLD, LOOKBACK_HOURS and MIN_PROBABILITY_7D are not read
# by any function visible in this part of the notebook — confirm they are still needed.
DEBUG = False  # Set to True for verbose output in functions
MIN_VOLUME = 100000  # Minimum volume in USD
SAFE_THRESHOLD = 0.90  # Minimum probability to be considered "safe"
LOOKBACK_HOURS = 48  # Hours before endDate to fetch price
MIN_PROBABILITY_7D = 0.70  # Minimum probability at 7d to include market (filters high variability)
MARKET_MIN_AGE_DAYS = 7  # Markets must have been alive for at least this many days

In [113]:
# Cell 2: Fetch filtered markets for a day
def fetch_markets_for_day(date: datetime, min_volume: Optional[int] = None) -> pd.DataFrame:
    """
    Fetch all closed markets for a given day from the Polymarket Gamma API.

    Results are paged 100 at a time until the API returns an empty page. The
    fetch is best-effort: if a request fails or returns something that is not
    a JSON list, paging stops and whatever was collected so far is returned
    (the reason is printed only when DEBUG is set).

    Args:
        date: The date to fetch markets for (markets whose endDate falls on this date)
        min_volume: Minimum volume in USD (defaults to MIN_VOLUME constant)

    Returns:
        DataFrame with market info: id, question, slug, clobTokenIds, closedTime,
        endDate, outcomePrices, outcomes, volumeNum, startDate. Empty DataFrame
        (no columns) when nothing was fetched.
    """
    if min_volume is None:
        min_volume = MIN_VOLUME

    # The API window is [date 00:00Z, date+1 00:00Z]; markets must also have
    # started at least MARKET_MIN_AGE_DAYS days before `date`.
    timestamp_format = "%Y-%m-%dT00:00:00Z"
    end_date_min = date.strftime(timestamp_format)
    end_date_max = (date + timedelta(days=1)).strftime(timestamp_format)
    start_date_max = (date - timedelta(days=MARKET_MIN_AGE_DAYS)).strftime(timestamp_format)

    # Fields copied from each API record into the result frame (column order).
    fields = (
        'id', 'question', 'slug', 'clobTokenIds', 'closedTime',
        'endDate', 'outcomePrices', 'outcomes', 'volumeNum', 'startDate',
    )

    all_markets = []
    offset = 0
    limit = 100

    while True:
        url = (
            f"https://gamma-api.polymarket.com/markets?"
            f"ascending=true&closed=true"
            f"&end_date_min={end_date_min}"
            f"&end_date_max={end_date_max}"
            f"&volume_num_min={min_volume}"
            f"&start_date_max={start_date_max}"
            f"&limit={limit}&offset={offset}&order=volume"
        )

        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            markets = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers malformed JSON on requests versions whose JSON
            # decode error is not a RequestException subclass.
            if DEBUG:
                print(f"Error fetching markets for {date.date()}: {e}")
            break

        if not isinstance(markets, list):
            # e.g. an error object; extending with a dict would add its keys.
            if DEBUG:
                print(f"Unexpected response payload for {date.date()}: {type(markets).__name__}")
            break

        if not markets:
            break

        all_markets.extend(m for m in markets if isinstance(m, dict))
        offset += limit

        # Rate limiting
        time.sleep(0.2)

    if not all_markets:
        return pd.DataFrame()

    records = [{field: market.get(field) for field in fields} for market in all_markets]

    df = pd.DataFrame(records)
    if DEBUG:
        print(f"Fetched {len(df)} markets for {date.date()}")
    return df


In [114]:
# Cell 3: Filter markets by keyword
def filter_markets_by_keywords(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter out markets containing sports, esports, crypto, or weather keywords.

    Matching is case-insensitive against "<question> <slug>". A keyword edge
    that is a letter or digit must sit on a word boundary, with an optional
    plural "s"/"es" on the trailing edge. Keywords that start or end with
    punctuation (e.g. 'nba-', 'vs.', '(BO3)') are matched literally on that
    side. This stops short keywords from firing inside unrelated words
    ("rain" in "Ukraine", "ada" in "Canada", "eth" in "whether",
    "matic" in "diplomatic", "g2" in "G20").

    Args:
        df: DataFrame with market data (must have 'question' and 'slug' columns)

    Returns:
        Filtered DataFrame (a copy) with matching markets removed
    """
    # Local import: `re` is not among the notebook's top-level imports.
    import re

    if df.empty:
        return df

    sports_keywords = [
        'vs.', ' vs ', 'NFL', 'NBA', 'MLB', 'NHL', 'UFC', 'MMA',
        'Soccer', 'Football', 'Basketball', 'Baseball', 'Hockey',
        'Tennis', 'Golf', 'Boxing', 'Premier League', 'Champions League',
        'ucl-', 'mlb-', 'nba-', 'nfl-', 'nhl-', 'wnba-', 'Serie A',
        'La Liga', 'Bundesliga', 'Ligue 1', 'UEFA', 'FIFA', 'World Cup',
        'Cavaliers', 'Lakers', 'Warriors', 'Celtics', 'Knicks', 'Nets',
        'Yankees', 'Dodgers', 'Astros', 'Heisman', "Davey O'Brien",
        'Doak Walker', 'Biletnikoff', 'Award Winner', 'cfb-', 'ncaa',
        'Bowl Game', 'Championship Game', 'playoffs', 'tournament',
        'fantasy', 'NFC', 'AFC', 'epl-'
    ]

    esports_keywords = [
        'LoL:', 'Dota', 'CS:GO', 'Valorant', 'Overwatch', 'Rocket League',
        'Fortnite', 'PUBG', 'Apex Legends', 'Rainbow Six', 'Call of Duty',
        'esports', 'e-sports', '(BO3)', '(BO5)', 'Gen.G', 'T1', 'TSM',
        'Team Liquid', 'Cloud9', 'FaZe', 'NaVi', 'Fnatic', 'G2',
        'Mobile Legends', 'MLBB', 'Honor of Kings', 'Arena of Valor',
        'League of Legends:', 'StarCraft', 'Hearthstone', 'Overwatch League'
    ]

    crypto_keywords = [
        'bitcoin', 'ethereum', 'btc', 'eth', 'solana', 'sol', 'xrp',
        'crypto', 'coin', 'token', 'above', 'below', 'hit',
        'multistrike', '4pm et', '8pm et', '12pm et', 'trading',
        'market cap', 'defi', 'nft', 'blockchain', '3:00pm', '3:15pm',
        '3:30pm', '3:45pm', 'price -', 'above ___', 'below ___',
        'price on october', 'price on november', 'price on december',
        'price on january', 'price on february', 'price on march',
        'what price will', 'binance', 'coinbase', 'doge', 'shib',
        'cardano', 'ada', 'bnb', 'polygon', 'matic', 'avalanche',
        'avax', 'polkadot', 'dot', 'chainlink', 'link',
        # Compounds that used to match only as substrings of 'crypto' / 'coin'.
        'cryptocurrency', 'dogecoin', 'memecoin', 'stablecoin', 'altcoin'
    ]

    weather_keywords = [
        'temperature', 'degrees', 'rain', 'snow', 'weather', 'storm',
        'hurricane', 'tornado', 'hotter', 'colder', 'warmest', 'coldest',
        'precipitation', 'humidity', 'forecast', 'climate',
        # Compounds that used to match only as substrings of 'rain' / 'snow'.
        'rainfall', 'snowfall'
    ]

    all_keywords = sports_keywords + esports_keywords + crypto_keywords + weather_keywords

    def keyword_to_regex(keyword: str) -> str:
        """Turn one keyword into a boundary-aware, lower-case regex fragment."""
        kw = keyword.lower()
        # [^\W_] is "any letter or digit"; the lookarounds require that the
        # keyword is not glued to one on an alphanumeric edge.
        prefix = r'(?<![^\W_])' if kw[0].isalnum() else ''
        suffix = r'(?:e?s)?(?![^\W_])' if kw[-1].isalnum() else ''
        return prefix + re.escape(kw) + suffix

    # One compiled alternation instead of a Python loop over keywords per row.
    pattern = re.compile('|'.join(keyword_to_regex(kw) for kw in all_keywords))

    # Missing values become the text "None"/"nan", which matches no keyword.
    text = (df['question'].astype(str) + ' ' + df['slug'].astype(str)).str.lower()
    has_keyword = text.map(lambda value: pattern.search(value) is not None).astype(bool)
    filtered_df = df[~has_keyword].copy()

    if DEBUG:
        removed = len(df) - len(filtered_df)
        print(f"Filtered out {removed} markets by keywords, {len(filtered_df)} remaining")

    return filtered_df


In [115]:
# Cell 4: Fetch 7-day price history using single API call with fidelity=1440
def fetch_7day_price_history(clob_token_id: str, reference_date: datetime, start_date: Optional[datetime] = None) -> Dict[int, float]:
    """
    Fetch price history for 7 days before reference date using a single API call.
    Returns daily prices (one per day) for days 7 through 1 before reference date.

    The request uses fidelity=1440 and targets 00:00 GMT on each day.
    For a market with reference date Jan 1, we want data at 00:00 GMT on:
    Dec 25, 26, 27, 28, 29, 30, 31 (7 days before through 1 day before)

    For each target the entry nearest in time is used if it lies within
    2 hours; otherwise the most recent entry at or before the target is used.

    Args:
        clob_token_id: The CLOB token ID for the market outcome
        reference_date: The reference date (earlier of closedTime and endDate).
            Naive values are taken to be UTC; aware values are converted to UTC.
        start_date: When the market was created (days before it are skipped)

    Returns:
        Dictionary mapping days before (7, 6, 5, 4, 3, 2, 1) to prices. Days
        without a usable entry are omitted; {} on request failure or empty history.
    """
    match_tolerance_seconds = 7200  # "close enough" window around 00:00 GMT

    def to_utc(moment):
        # replace() only relabels the zone; an aware non-UTC value must be
        # converted, otherwise the day boundaries land on the wrong instant.
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    reference_date = to_utc(reference_date)
    if start_date:
        start_date = to_utc(start_date)

    # Query from 8 days before (00:00) to the day before (23:59:59) so the
    # 00:00 points for days 7..1 are all inside the window.
    reference_midnight = reference_date.replace(hour=0, minute=0, second=0, microsecond=0)
    api_start = reference_midnight - timedelta(days=8)
    api_end = reference_midnight - timedelta(days=1) + timedelta(hours=23, minutes=59, seconds=59)

    start_ts = int(api_start.timestamp())
    end_ts = int(api_end.timestamp())

    url = (
        f"https://clob.polymarket.com/prices-history?"
        f"fidelity=1440&market={clob_token_id}&startTs={start_ts}&endTs={end_ts}"
    )

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError: body was not valid JSON.
        if DEBUG:
            print(f"Error fetching price history for {clob_token_id}: {e}")
        return {}

    history = data.get('history') or []
    if not history:
        return {}

    # Sorted once, newest first, for the "latest entry at or before target" fallback.
    newest_first = sorted(history, key=lambda entry: entry['t'], reverse=True)

    result = {}
    for days_before in range(7, 0, -1):
        target_time = reference_midnight - timedelta(days=days_before)
        target_ts = int(target_time.timestamp())

        # Skip if this is before the market was created
        if start_date and target_time < start_date:
            continue

        # Prefer the entry nearest to the target within the tolerance
        # (first one wins on ties, in API order).
        nearby = [
            entry for entry in history
            if abs(entry['t'] - target_ts) < match_tolerance_seconds
        ]
        if nearby:
            closest = min(nearby, key=lambda entry: abs(entry['t'] - target_ts))
        else:
            # Fallback: the most recent price at or before the target.
            closest = next(
                (entry for entry in newest_first if entry['t'] <= target_ts),
                None,
            )

        if closest is not None:
            result[days_before] = closest['p']

    return result


def parse_datetime(dt_str: str) -> Optional[datetime]:
    """
    Parse an ISO-8601-like timestamp string into a timezone-aware datetime.

    Accepts the shapes seen in the market data, e.g. '2025-01-01T12:00:00Z',
    '2024-12-26 02:34:04+00' and '2024-08-29T17:36:52.585Z'. The string is
    normalized first so parsing does not depend on how lenient the running
    Python's ``datetime.fromisoformat`` is:
      * a trailing 'Z' becomes '+00:00'
      * offsets written '+HH' or '+HHMM' become '+HH:MM'
      * fractional seconds are padded/truncated to 6 digits

    Args:
        dt_str: Timestamp text. Non-string or empty input yields None.

    Returns:
        An aware datetime (values without an offset are taken to be UTC),
        or None if the text cannot be parsed.
    """
    # Local import: `re` is not among the notebook's top-level imports.
    import re

    if not isinstance(dt_str, str):
        return None

    text = dt_str.strip()
    if not text:
        return None

    if text[-1] in ('Z', 'z'):
        text = text[:-1] + '+00:00'

    match = re.fullmatch(
        r'(\d{4}-\d{2}-\d{2})[T ](\d{2}:\d{2}:\d{2})(?:\.(\d+))?([+-]\d{2}(?::?\d{2})?)?',
        text,
    )
    if match:
        date_part, time_part, fraction, offset = match.groups()
        text = f"{date_part}T{time_part}"
        if fraction:
            # Exactly 6 digits is accepted by every fromisoformat implementation.
            text += '.' + fraction[:6].ljust(6, '0')
        if offset:
            digits = offset[1:].replace(':', '')
            text += f"{offset[0]}{digits[:2]}:{digits[2:] or '00'}"
    # Anything else (e.g. a bare date) is handed to fromisoformat unchanged.

    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None

    # Always return an aware value so callers can compare results safely.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def append_7day_price_history(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fetch 7-day price history for each market and append daily columns.
    Deduplicates by clobID (first entry of clobTokenIds) before fetching.

    Uses the earlier of closedTime and endDate as the reference point for the
    lookback; if only one of them parses, that one is used.

    Drops markets that closed more than 1 day before their endDate.

    Per-market failures are swallowed (reported only when DEBUG is set) and
    leave that market's probability columns empty.

    Args:
        df: DataFrame with market data (must have 'clobTokenIds', 'endDate', 'closedTime', 'startDate')

    Returns:
        Copy of the input with 'clobID' plus probability7d, probability6d,
        probability5d, probability4d, probability3d, probability2d, probability1d
    """
    if df.empty:
        return df

    df = df.copy()

    def first_clob_id(raw_ids):
        """Return the first token id from the JSON-encoded list, or None."""
        try:
            clob_ids = json.loads(raw_ids)
        except (TypeError, ValueError):
            return None
        if isinstance(clob_ids, list) and clob_ids:
            return clob_ids[0]
        return None

    # Deduplicate by clobID before fetching
    df['clobID'] = df['clobTokenIds'].apply(first_clob_id)
    df = df.drop_duplicates(subset=['clobID'], keep='first').reset_index(drop=True)

    if DEBUG:
        print(f"After deduplication by clobID: {len(df)} markets")

    # Parse dates and calculate reference date (earlier of closedTime and endDate)
    df['_endDate_parsed'] = df['endDate'].apply(parse_datetime)
    df['_closedTime_parsed'] = df['closedTime'].apply(parse_datetime)
    df['_startDate_parsed'] = df['startDate'].apply(parse_datetime)

    # pandas may store the parsed values in a datetime column, where a failed
    # parse shows up as NaT rather than None; pd.isna() covers both.
    def get_reference_date(row):
        end_date = row['_endDate_parsed']
        closed_time = row['_closedTime_parsed']
        if pd.isna(end_date):
            return None if pd.isna(closed_time) else closed_time
        if pd.isna(closed_time):
            return end_date
        return min(end_date, closed_time)

    df['_reference_date'] = df.apply(get_reference_date, axis=1)

    # Filter out markets that closed too early (more than 1 day before endDate)
    def closed_too_early(row):
        end_date = row['_endDate_parsed']
        closed_time = row['_closedTime_parsed']
        if pd.isna(end_date) or pd.isna(closed_time):
            return False
        return closed_time < end_date - timedelta(days=1)

    early_close_mask = df.apply(closed_too_early, axis=1)
    if DEBUG and early_close_mask.any():
        print(f"Filtering out {early_close_mask.sum()} markets that closed > 1 day before endDate")

    df = df[~early_close_mask].reset_index(drop=True)

    if df.empty:
        return df

    # Initialize columns for daily probabilities
    probability_columns = [f'probability{days}d' for days in range(7, 0, -1)]
    for col in probability_columns:
        df[col] = None

    desc = "Fetching 7-day price history" if DEBUG else None
    disable_tqdm = not DEBUG

    for idx, row in tqdm(df.iterrows(), total=len(df), desc=desc, disable=disable_tqdm):
        try:
            # clobID was already extracted above; no need to re-parse the JSON.
            clob_id = row['clobID']
            reference_date = row['_reference_date']
            start_date = row['_startDate_parsed']

            if pd.isna(clob_id) or pd.isna(reference_date):
                continue
            if pd.isna(start_date):
                start_date = None

            # Fetch 7-day price history using reference date
            prices = fetch_7day_price_history(clob_id, reference_date, start_date)

            # Store each day's price in the appropriate column
            for days_before, price in prices.items():
                df.at[idx, f'probability{days_before}d'] = price

            # Rate limiting
            time.sleep(0.1)

        except Exception as e:
            # Best effort: one bad market must not abort the whole batch.
            if DEBUG:
                print(f"Error processing row {idx}: {e}")
            continue

    # Count successful fetches
    if DEBUG:
        for col in probability_columns:
            successful = df[col].notna().sum()
            print(f"Successfully fetched {col} for {successful}/{len(df)} markets")

    # Clean up temporary columns
    temp_cols = ['_endDate_parsed', '_closedTime_parsed', '_startDate_parsed', '_reference_date']
    df = df.drop(columns=[c for c in temp_cols if c in df.columns], errors='ignore')

    return df


In [126]:
# Cell 5: Process all markets (no filtering) and produce final dataset
def process_all_markets(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process all filtered markets (no threshold filter) and produce final dataset.
    The 7-day price history should already be in the DataFrame from append_7day_price_history.

    Determines outcome: True if the higher-probability side (the "safe bet") won,
    where the favored side is taken from the earliest available daily price
    (probability7d, falling back to 6d ... 1d). outcome is None when no price
    is available, when outcomePrices cannot be read, or when the final prices
    are tied (no single winning side).

    Then flips every probability column to max(prob, 1 - prob).

    Markets with partial 7-day data are still included.

    Args:
        df: DataFrame with market data and 7-day price history columns
            (needs 'question', 'slug', 'clobID', 'endDate', 'outcomePrices', 'volumeNum')

    Returns:
        Final DataFrame with columns: market, slug, clobID, closingDate, outcome,
        probability7d, probability6d, probability5d, probability4d, probability3d,
        probability2d, probability1d, volume
    """
    if df.empty:
        return pd.DataFrame()

    df = df.copy()
    probability_columns = [f'probability{days}d' for days in range(7, 0, -1)]

    # Determine outcome BEFORE flipping probabilities
    def get_safe_bet_won(row):
        try:
            # Raw probability of outcome 0 from the earliest day that has data.
            raw_prob = None
            for col in probability_columns:
                if col in row and row[col] is not None and not pd.isna(row[col]):
                    raw_prob = float(row[col])
                    break

            if raw_prob is None:
                return None

            # raw_prob >= 0.5 means outcome 0 was favored, otherwise outcome 1.
            safe_bet_idx = 0 if raw_prob >= 0.5 else 1

            final_prices = [float(price) for price in json.loads(row['outcomePrices'])]
            top_price = max(final_prices)
            if final_prices.count(top_price) != 1:
                # Tied final prices (e.g. 0.5 / 0.5): nobody "won".
                return None

            return final_prices.index(top_price) == safe_bet_idx
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing column or unreadable prices: outcome unknown.
            return None

    df['outcome'] = df.apply(get_safe_bet_won, axis=1)

    def flip_probability(prob):
        """Return max(prob, 1 - prob), or None for missing/non-numeric values."""
        if prob is None or pd.isna(prob):
            return None
        try:
            prob = float(prob)
        except (ValueError, TypeError):
            return None
        return max(prob, 1 - prob)

    # Ensure all probability columns exist (even if empty), then flip them
    # (AFTER determining outcome).
    # NOTE(review): each column is flipped independently, so if the price
    # crossed 0.5 during the week the flipped columns describe different sides
    # than the one `outcome` was computed for — confirm this is intended.
    for col in probability_columns:
        if col not in df.columns:
            df[col] = None
        df[col] = df[col].apply(flip_probability)

    # Build final dataset. Source columns are required; a missing one raises KeyError.
    column_map = {
        'question': 'market',
        'slug': 'slug',
        'clobID': 'clobID',
        'endDate': 'closingDate',
        'outcome': 'outcome',
        **{col: col for col in probability_columns},
        'volumeNum': 'volume',
    }
    final_df = df[list(column_map)].rename(columns=column_map)

    if DEBUG:
        # Report how many markets have complete vs partial data
        complete_data = final_df[probability_columns].notna().all(axis=1).sum()
        print(f"Final dataset: {len(final_df)} markets ({complete_data} with complete 7-day data)")

    return final_df


In [131]:
# Cell 6: Wrapper function to collect data for a date range
def collect_dataset(start_date: str, end_date: str,
                    min_volume: Optional[int] = None,
                    save_path: Optional[str] = None) -> pd.DataFrame:
    """
    Collect and process market data for a range of dates.
    No threshold filter - collects all filtered markets.

    For each day: fetch markets, drop keyword matches, attach the 7-day price
    history, build the final rows, and drop any clobID already collected on an
    earlier day.

    Args:
        start_date: Start date in 'YYYY-MM-DD' format
        end_date: End date in 'YYYY-MM-DD' format (inclusive)
        min_volume: Minimum volume filter (defaults to MIN_VOLUME constant)
        save_path: Optional CSV path. The file is started fresh with the first
            batch, extended after each later batch, and rewritten in full at the end.

    Returns:
        Complete DataFrame with all filtered markets in the date range
        (empty DataFrame if nothing was collected)

    Raises:
        ValueError: If start_date or end_date is not in 'YYYY-MM-DD' format.
    """
    if min_volume is None:
        min_volume = MIN_VOLUME

    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    all_data = []
    # Running state, so each day costs O(batch) instead of re-concatenating
    # everything collected so far.
    seen_clob_ids = set()
    rows_saved = 0

    total_days = (end - start).days + 1
    print(f"Collecting data for {total_days} days: {start_date} to {end_date}")
    print("="*60)

    # max(..., 0): an end date before the start date processes no days.
    for day_offset in range(max(total_days, 0)):
        current = start + timedelta(days=day_offset)
        print(f"\n--- Processing {current.date()} ---")

        # Step 1: Fetch markets for the day
        markets_df = fetch_markets_for_day(current, min_volume)
        if markets_df.empty:
            if DEBUG:
                print(f"No markets found for {current.date()}")
            continue

        # Step 2: Filter by keywords
        filtered_df = filter_markets_by_keywords(markets_df)
        if filtered_df.empty:
            if DEBUG:
                print("No markets remaining after keyword filter")
            continue

        # Step 3: Fetch 7-day price history (deduplicates by clobID internally)
        with_price_history_df = append_7day_price_history(filtered_df)

        # Step 4: Process all markets (no threshold filter) and produce final dataset
        final_df = process_all_markets(with_price_history_df)

        if not final_df.empty:
            # Drop clobIDs already collected on earlier days.
            before_dedup = len(final_df)
            final_df = final_df[~final_df['clobID'].isin(seen_clob_ids)].copy()
            removed = before_dedup - len(final_df)
            if DEBUG and removed > 0:
                print(f"Removed {removed} duplicate clobIDs (already in dataset)")

            # Also deduplicate within the current batch (shouldn't happen, but safety check)
            final_df = final_df.drop_duplicates(subset=['clobID'], keep='first')

            if not final_df.empty:
                all_data.append(final_df)
                seen_clob_ids.update(final_df['clobID'].dropna())

                # Incremental save: first batch starts the file, later ones append.
                if save_path:
                    first_write = rows_saved == 0
                    final_df.to_csv(
                        save_path,
                        mode='w' if first_write else 'a',
                        header=first_write,
                        index=False,
                    )
                    rows_saved += len(final_df)
                    if DEBUG:
                        print(f"Saved {rows_saved} total records to {save_path}")

        # Small delay between days to be respectful to API
        time.sleep(0.5)

    print("\n" + "="*60)

    if not all_data:
        print("No data collected!")
        return pd.DataFrame()

    final_dataset = pd.concat(all_data, ignore_index=True)
    print(f"Collection complete! Total markets: {len(final_dataset)}")

    # Final save (rewrites the whole file from the combined frame)
    if save_path:
        final_dataset.to_csv(save_path, index=False)
        print(f"Final dataset saved to {save_path}")

    return final_dataset


In [122]:
# Test 1: Fetch markets for January 1st, 2025
# No min_volume is passed, so fetch_markets_for_day falls back to MIN_VOLUME.
# NOTE(review): the recorded output includes the "Fetched N markets" line, which
# is printed only when DEBUG is True — the saved config cell shows DEBUG = False,
# so the cells were not executed in their saved order.
test_date = datetime(2025, 1, 1)
raw_markets_df = fetch_markets_for_day(test_date)

print(f"\n=== Raw Markets DataFrame ===")
print(f"Shape: {raw_markets_df.shape}")
print(f"\nColumns: {list(raw_markets_df.columns)}")
print(f"\nHead:")
raw_markets_df.head()


Fetched 20 markets for 2025-01-01

=== Raw Markets DataFrame ===
Shape: (20, 10)

Columns: ['id', 'question', 'slug', 'clobTokenIds', 'closedTime', 'endDate', 'outcomePrices', 'outcomes', 'volumeNum', 'startDate']

Head:


Unnamed: 0,id,question,slug,clobTokenIds,closedTime,endDate,outcomePrices,outcomes,volumeNum,startDate
0,516211,76ers vs. Celtics,nba-phi-bos-2024-12-25,"[""10260102152120325117037936020682032118936290...",2024-12-26 02:34:04+00,2025-01-01T22:00:00Z,"[""1"", ""0""]","[""76ers"", ""Celtics""]",1130028.0,2024-12-22T05:08:16.423255Z
1,506668,September temperature increase by between 1.23...,september-temperature-increase-by-between-1pt2...,"[""32867164006675525949065653942615290229805210...",2024-10-21 19:30:42+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",170613.9,2024-09-10T17:08:04.775023Z
2,514401,Will Solana hit $210 in December?,will-solana-hit-210-in-december,"[""57385243338730163448622852164456859675586769...",2024-12-09 23:35:41+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",170732.5,2024-12-02T18:15:43.190892Z
3,516242,Chiefs vs. Steelers,nfl-kc-pit-2024-12-25,"[""56724154750377002831649637065803386870259449...",2024-12-25 22:57:34+00,2025-01-01T18:00:00Z,"[""1"", ""0""]","[""Chiefs"", ""Steelers""]",2186450.0,2024-12-22T06:03:56.667201Z
4,506672,September temperature increase by between 1.29...,september-temperature-increase-by-between-1pt2...,"[""36832878757548959760983297172765835834504846...",2024-10-21 19:26:01+00,2025-01-01T12:00:00Z,"[""0"", ""1""]","[""Yes"", ""No""]",225614.5,2024-09-10T17:08:35.234502Z


In [123]:
# Test 2: Filter markets by keywords
# Depends on raw_markets_df from Test 1.
filtered_df = filter_markets_by_keywords(raw_markets_df)

print(f"\n=== Filtered Markets DataFrame ===")
print(f"Shape: {filtered_df.shape}")
print(f"\nRemaining markets:")
# Question text is truncated to its first 60 characters for display.
for idx, row in filtered_df.iterrows():
    print(f"  - {row['question'][:60]}...")
print(f"\nHead:")
# The final expression displays the whole filtered frame, not only its head.
filtered_df

Filtered out 18 markets by keywords, 2 remaining

=== Filtered Markets DataFrame ===
Shape: (2, 10)

Remaining markets:
  - Will Trump be Speaker by January 1?...
  - No Israel x Hamas ceasefire in 2024?...

Head:


Unnamed: 0,id,question,slug,clobTokenIds,closedTime,endDate,outcomePrices,outcomes,volumeNum,startDate
6,508204,Will Trump be Speaker by January 1?,will-trump-be-speaker-by-january-1,"[""90762671961607378550984019509208138759337675...",2025-01-02 07:10:46+00,2025-01-01T12:00:00Z,"[""0"", ""1""]","[""Yes"", ""No""]",2771364.0,2024-09-26T21:42:48.851735Z
17,502265,No Israel x Hamas ceasefire in 2024?,next-israel-x-hamas-ceasefire-not-in-2024,"[""41248677391516436501520443748383894699563681...",2025-01-01 10:23:14+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",617817.7,2024-08-29T17:36:52.585Z


In [127]:
# Test 3: Fetch 7-day price history
# Depends on filtered_df from Test 2; makes one price-history request per market.
with_price_history_df = append_7day_price_history(filtered_df)

print(f"\n=== DataFrame with 7-Day Price History ===")
print(f"Shape: {with_price_history_df.shape}")
print(f"\nNew columns added: probability7d, probability6d, probability5d, probability4d, probability3d, probability2d, probability1d")

with_price_history_df


After deduplication by clobID: 2 markets


Fetching 7-day price history:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 7-day price history: 100%|██████████| 2/2 [00:01<00:00,  1.57it/s]

Successfully fetched probability7d for 2/2 markets
Successfully fetched probability6d for 2/2 markets
Successfully fetched probability5d for 2/2 markets
Successfully fetched probability4d for 2/2 markets
Successfully fetched probability3d for 2/2 markets
Successfully fetched probability2d for 2/2 markets
Successfully fetched probability1d for 2/2 markets

=== DataFrame with 7-Day Price History ===
Shape: (2, 18)

New columns added: probability7d, probability6d, probability5d, probability4d, probability3d, probability2d, probability1d





Unnamed: 0,id,question,slug,clobTokenIds,closedTime,endDate,outcomePrices,outcomes,volumeNum,startDate,clobID,probability7d,probability6d,probability5d,probability4d,probability3d,probability2d,probability1d
0,508204,Will Trump be Speaker by January 1?,will-trump-be-speaker-by-january-1,"[""90762671961607378550984019509208138759337675...",2025-01-02 07:10:46+00,2025-01-01T12:00:00Z,"[""0"", ""1""]","[""Yes"", ""No""]",2771364.0,2024-09-26T21:42:48.851735Z,9076267196160737855098401950920813875933767533...,0.004,0.0025,0.002,0.003,0.0035,0.0025,0.002
1,502265,No Israel x Hamas ceasefire in 2024?,next-israel-x-hamas-ceasefire-not-in-2024,"[""41248677391516436501520443748383894699563681...",2025-01-01 10:23:14+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",617817.7,2024-08-29T17:36:52.585Z,4124867739151643650152044374838389469956368134...,0.915,0.935,0.945,0.9555,0.977,0.9905,0.9945


In [128]:
# Test 4, Final Processing
# Depends on with_price_history_df from Test 3. In the result, each probability
# column holds max(p, 1 - p) and `outcome` says whether the favored side won.
final_df = process_all_markets(with_price_history_df)
final_df

Final dataset: 2 markets (2 with complete 7-day data)


Unnamed: 0,market,slug,clobID,closingDate,outcome,probability7d,probability6d,probability5d,probability4d,probability3d,probability2d,probability1d,volume
0,Will Trump be Speaker by January 1?,will-trump-be-speaker-by-january-1,9076267196160737855098401950920813875933767533...,2025-01-01T12:00:00Z,True,0.996,0.9975,0.998,0.997,0.9965,0.9975,0.998,2771364.0
1,No Israel x Hamas ceasefire in 2024?,next-israel-x-hamas-ceasefire-not-in-2024,4124867739151643650152044374838389469956368134...,2025-01-01T12:00:00Z,True,0.915,0.935,0.945,0.9555,0.977,0.9905,0.9945,617817.7


In [132]:
# Full Dataset Collection: January 1, 2024 - December 25, 2025
# Long-running: collect_dataset walks the range one day at a time and issues API
# requests for each day. Because save_path is set, the CSV is updated after every
# day that contributes new markets and written in full once more at the end.

full_dataset = collect_dataset(
    start_date="2024-01-01",
    end_date="2025-12-25",
    save_path="all_filtered_markets_full_2024_2025.csv"
)

Collecting data for 725 days: 2024-01-01 to 2025-12-25

--- Processing 2024-01-01 ---

--- Processing 2024-01-02 ---

--- Processing 2024-01-03 ---

--- Processing 2024-01-04 ---

--- Processing 2024-01-05 ---

--- Processing 2024-01-06 ---

--- Processing 2024-01-07 ---

--- Processing 2024-01-08 ---

--- Processing 2024-01-09 ---

--- Processing 2024-01-10 ---

--- Processing 2024-01-11 ---

--- Processing 2024-01-12 ---

--- Processing 2024-01-13 ---

--- Processing 2024-01-14 ---

--- Processing 2024-01-15 ---

--- Processing 2024-01-16 ---

--- Processing 2024-01-17 ---

--- Processing 2024-01-18 ---

--- Processing 2024-01-19 ---

--- Processing 2024-01-20 ---

--- Processing 2024-01-21 ---

--- Processing 2024-01-22 ---

--- Processing 2024-01-23 ---

--- Processing 2024-01-24 ---

--- Processing 2024-01-25 ---

--- Processing 2024-01-26 ---

--- Processing 2024-01-27 ---

--- Processing 2024-01-28 ---

--- Processing 2024-01-29 ---

--- Processing 2024-01-30 ---

--- Processing