In [40]:
# Cell 1: Imports and Installs
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip when importing it fails.

    The same string is used as the import name and as the pip requirement,
    so this only suits packages where the two coincide.

    Args:
        package: Module name to probe and, if missing, to pass to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    try:
        __import__(package)
    except ImportError:
        already_importable = False
    else:
        already_importable = True

    if not already_importable:
        # Invoke pip through the running interpreter (sys.executable).
        pip_command = [sys.executable, "-m", "pip", "install", package]
        subprocess.check_call(pip_command)

# Install required packages
for package in ['pandas', 'requests', 'tqdm']:
    install_package(package)

# Imports
import json
import re
import time
import warnings
from datetime import datetime, timedelta
from typing import List, Dict, Optional, Tuple

import pandas as pd
import requests
from tqdm import tqdm

print("All packages imported successfully!")


All packages imported successfully!


In [41]:
# Pipeline configuration (read as module-level globals by the functions below)
DEBUG = False               # True switches on the verbose prints inside the functions
MIN_VOLUME = 100_000        # lowest market volume (USD) requested from the markets query
SAFE_THRESHOLD = 0.90       # a market counts as "safe" when one outcome is priced at or above this
LOOKBACK_HOURS = 48         # price is sampled this many hours before endDate
MIN_PROBABILITY_7D = 0.70   # markets whose favourite was below this 7 days out are dropped (high variability)
MARKET_MIN_AGE_DAYS = 7     # markets must have started at least this many days before the target day

In [42]:
# Cell 2: Fetch filtered markets for a day
def fetch_markets_for_day(date: datetime, min_volume: Optional[int] = None) -> pd.DataFrame:
    """
    Fetch closed markets for a given day from the Polymarket Gamma API.

    The query asks for closed markets with ``end_date_min`` set to ``date`` at
    00:00:00Z, ``end_date_max`` set to the following day at 00:00:00Z, and
    ``start_date_max`` set MARKET_MIN_AGE_DAYS days before ``date``. Results are
    paged 100 at a time until an empty page is returned.

    If a request fails or the API returns something other than a JSON list,
    paging stops, a RuntimeWarning is emitted, and whatever was collected so
    far is returned (best effort, but never silent).

    Args:
        date: The day to query (only its calendar date is used).
        min_volume: Minimum volume in USD (defaults to MIN_VOLUME constant)

    Returns:
        DataFrame with market info: id, question, slug, clobTokenIds, closedTime,
        endDate, outcomePrices, outcomes, volumeNum, startDate. An empty
        DataFrame (no columns) when nothing was fetched.
    """
    if min_volume is None:
        min_volume = MIN_VOLUME

    # Format dates for API
    day_format = "%Y-%m-%dT00:00:00Z"
    end_date_min = date.strftime(day_format)
    end_date_max = (date + timedelta(days=1)).strftime(day_format)
    # Markets must have been alive for at least MARKET_MIN_AGE_DAYS days
    start_date_max = (date - timedelta(days=MARKET_MIN_AGE_DAYS)).strftime(day_format)

    all_markets = []
    offset = 0
    limit = 100

    while True:
        url = (
            f"https://gamma-api.polymarket.com/markets?"
            f"ascending=true&closed=true"
            f"&end_date_min={end_date_min}"
            f"&end_date_max={end_date_max}"
            f"&volume_num_min={min_volume}"
            f"&start_date_max={start_date_max}"
            f"&limit={limit}&offset={offset}&order=volume"
        )

        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            markets = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON. Surface the failure
            # even when DEBUG is off: the caller otherwise cannot tell a truncated
            # day from a complete one.
            warnings.warn(
                f"Error fetching markets for {date.date()} at offset {offset}: {e}; "
                f"returning {len(all_markets)} markets collected so far",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        if not isinstance(markets, list):
            # An error object (dict) would otherwise be extended into the result as
            # its string keys and break the field extraction below.
            warnings.warn(
                f"Unexpected {type(markets).__name__} payload for {date.date()} at "
                f"offset {offset}; returning {len(all_markets)} markets collected so far",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        if not markets:
            break

        all_markets.extend(markets)
        offset += limit

        # Rate limiting
        time.sleep(0.2)

    if not all_markets:
        return pd.DataFrame()

    # Extract relevant fields; keys missing from a market become None.
    wanted_fields = (
        'id', 'question', 'slug', 'clobTokenIds', 'closedTime',
        'endDate', 'outcomePrices', 'outcomes', 'volumeNum', 'startDate',
    )
    records = [
        {field: market.get(field) for field in wanted_fields}
        for market in all_markets
    ]

    df = pd.DataFrame(records)
    if DEBUG:
        print(f"Fetched {len(df)} markets for {date.date()}")
    return df


In [None]:
# Cell 3: Filter markets by keyword
def filter_markets_by_keywords(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter out markets containing sports, esports, crypto, or weather keywords.

    Matching is case-insensitive against "<question> <slug>". A keyword only
    matches when it is not glued to other letters or digits: wherever a keyword
    starts or ends with a letter/digit, the neighbouring character in the text
    must not be a letter/digit. This keeps short keywords from firing inside
    unrelated words ('nfl' in "inflation", 'rain' in "Ukraine", 'ada' in
    "Canada"), while keywords that carry their own delimiter ('nba-', ' vs ',
    '(BO3)') still match exactly as written. Compound words are no longer
    caught by their parts (e.g. "memecoin" is not matched by 'coin'); add such
    compounds to the lists explicitly if they should be excluded.

    Args:
        df: DataFrame with market data (must have 'question' and 'slug' columns)

    Returns:
        Filtered DataFrame (a copy) with matching markets removed; the input is
        returned unchanged when it is empty.
    """
    if df.empty:
        return df

    sports_keywords = [
        'vs.', ' vs ', 'NFL', 'NBA', 'MLB', 'NHL', 'UFC', 'MMA',
        'Soccer', 'Football', 'Basketball', 'Baseball', 'Hockey',
        'Tennis', 'Golf', 'Boxing', 'Premier League', 'Champions League',
        'ucl-', 'mlb-', 'nba-', 'nfl-', 'nhl-', 'wnba-', 'Serie A',
        'La Liga', 'Bundesliga', 'Ligue 1', 'UEFA', 'FIFA', 'World Cup',
        'Cavaliers', 'Lakers', 'Warriors', 'Celtics', 'Knicks', 'Nets',
        'Yankees', 'Dodgers', 'Astros', 'Heisman', "Davey O'Brien",
        'Doak Walker', 'Biletnikoff', 'Award Winner', 'cfb-', 'ncaa',
        'Bowl Game', 'Championship Game', 'playoffs', 'tournament',
        'fantasy', 'NFC', 'AFC', 'epl-'
    ]

    esports_keywords = [
        'LoL:', 'Dota', 'CS:GO', 'Valorant', 'Overwatch', 'Rocket League',
        'Fortnite', 'PUBG', 'Apex Legends', 'Rainbow Six', 'Call of Duty',
        'esports', 'e-sports', '(BO3)', '(BO5)', 'Gen.G', 'T1', 'TSM',
        'Team Liquid', 'Cloud9', 'FaZe', 'NaVi', 'Fnatic', 'G2',
        'Mobile Legends', 'MLBB', 'Honor of Kings', 'Arena of Valor',
        'League of Legends:', 'StarCraft', 'Hearthstone', 'Overwatch League'
    ]

    crypto_keywords = [
        'bitcoin', 'ethereum', 'btc', 'eth', 'solana', 'sol', 'xrp',
        'crypto', 'coin', 'token', 'above', 'below', 'hit',
        'multistrike', '4pm et', '8pm et', '12pm et', 'trading',
        'market cap', 'defi', 'nft', 'blockchain', '3:00pm', '3:15pm',
        '3:30pm', '3:45pm', 'price -', 'above ___', 'below ___',
        'price on october', 'price on november', 'price on december',
        'price on january', 'price on february', 'price on march',
        'what price will', 'binance', 'coinbase', 'doge', 'shib',
        'cardano', 'ada', 'bnb', 'polygon', 'matic', 'avalanche',
        'avax', 'polkadot', 'dot', 'chainlink', 'link'
    ]

    weather_keywords = [
        'temperature', 'degrees', 'rain', 'snow', 'weather', 'storm',
        'hurricane', 'tornado', 'hotter', 'colder', 'warmest', 'coldest',
        'precipitation', 'humidity', 'forecast', 'climate'
    ]

    all_keywords = sports_keywords + esports_keywords + crypto_keywords + weather_keywords

    def keyword_regex(keyword: str) -> str:
        """Escape one keyword and guard its alphanumeric edges with lookarounds."""
        lowered = keyword.lower()
        fragment = re.escape(lowered)
        if lowered[0].isalnum():
            fragment = r'(?<![a-z0-9])' + fragment
        if lowered[-1].isalnum():
            fragment = fragment + r'(?![a-z0-9])'
        return fragment

    # One alternation compiled once per call instead of scanning every keyword per row.
    excluded_pattern = re.compile('|'.join(keyword_regex(kw) for kw in all_keywords))

    def contains_keyword(row):
        text = f"{row['question']} {row['slug']}".lower()
        return excluded_pattern.search(text) is not None

    mask = ~df.apply(contains_keyword, axis=1)
    filtered_df = df[mask].copy()

    if DEBUG:
        removed = len(df) - len(filtered_df)
        print(f"Filtered out {removed} markets by keywords, {len(filtered_df)} remaining")

    return filtered_df


In [44]:
# Cell 4: Fetch price history and append to DataFrame
def fetch_price_at_timestamp(clob_token_id: str, target_ts: int, window_seconds: int = 300) -> Optional[float]:
    """
    Look up a token's price near a given moment via the CLOB prices-history endpoint.

    The request covers ``target_ts - window_seconds`` to ``target_ts + window_seconds``;
    from the returned ``history`` entries the one whose ``t`` is nearest to
    ``target_ts`` is chosen and its ``p`` value returned.

    Args:
        clob_token_id: The CLOB token ID for the market outcome
        target_ts: Unix timestamp to get price for
        window_seconds: Half-width of the search window in seconds (default 300)

    Returns:
        The ``p`` value of the nearest history entry, or None when the request
        fails or the history is missing/empty.
    """
    window_start = target_ts - window_seconds
    window_end = target_ts + window_seconds

    endpoint = (
        f"https://clob.polymarket.com/prices-history?"
        f"market={clob_token_id}&startTs={window_start}&endTs={window_end}"
    )

    try:
        reply = requests.get(endpoint, timeout=30)
        reply.raise_for_status()
        points = reply.json().get('history', [])
    except requests.RequestException:
        return None

    if not points:
        return None

    def distance_from_target(point):
        return abs(point['t'] - target_ts)

    # min() keeps the earliest entry when two are equally close.
    nearest = min(points, key=distance_from_target)
    return nearest['p']


def append_price_history_lookback(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add each market's price LOOKBACK_HOURS before its end date as a new column.

    For every row the first entry of the JSON-encoded 'clobTokenIds' list is
    priced at ``endDate - LOOKBACK_HOURS`` via fetch_price_at_timestamp. Rows
    with an empty token list, or for which anything raises along the way, get
    None.

    Args:
        df: DataFrame with market data (must have 'clobTokenIds' and 'endDate')

    Returns:
        A copy of ``df`` with a 'probability{LOOKBACK_HOURS}h' column appended;
        the input itself when it is empty.
    """
    if df.empty:
        return df

    enriched = df.copy()

    def price_at_lookback(market_row):
        """Price of the row's first token at endDate - LOOKBACK_HOURS (None if no tokens)."""
        # clobTokenIds is a JSON string array; index 0 corresponds to the first outcome
        token_ids = json.loads(market_row['clobTokenIds'])
        if not token_ids:
            return None

        # ISO timestamps such as "2025-01-01T12:00:00Z": rewrite Z as an explicit offset
        end_text = market_row['endDate']
        if end_text.endswith('Z'):
            end_text = end_text.replace('Z', '+00:00')
        lookback_moment = datetime.fromisoformat(end_text) - timedelta(hours=LOOKBACK_HOURS)

        fetched = fetch_price_at_timestamp(token_ids[0], int(lookback_moment.timestamp()))

        # Rate limiting
        time.sleep(0.1)
        return fetched

    progress_label = f"Fetching {LOOKBACK_HOURS}h prices" if DEBUG else None
    hide_progress = not DEBUG

    lookback_prices = []
    for _, market_row in tqdm(enriched.iterrows(), total=len(enriched),
                              desc=progress_label, disable=hide_progress):
        try:
            row_price = price_at_lookback(market_row)
        except Exception:
            row_price = None
        lookback_prices.append(row_price)

    # Column name follows LOOKBACK_HOURS
    column = f'probability{LOOKBACK_HOURS}h'
    enriched[column] = lookback_prices

    # Count successful fetches
    if DEBUG:
        fetched_ok = enriched[column].notna().sum()
        print(f"Successfully fetched {LOOKBACK_HOURS}h price for {fetched_ok}/{len(enriched)} markets")

    return enriched


In [46]:
# Cell 5: Filter safe markets and fetch 7-day price history
def process_safe_markets(df: pd.DataFrame, safe_threshold: Optional[float] = None) -> pd.DataFrame:
    """
    Filter markets to only include "safe" ones (probability >= threshold at lookback),
    then fetch 7-day prior price and produce final dataset.

    Steps:
      1. Keep rows where either outcome is priced >= safe_threshold in the
         'probability{LOOKBACK_HOURS}h' column (p or 1 - p).
      2. For those rows, fetch the first token's price 7 days before endDate.
      3. Drop rows whose 7-day favourite probability, max(p7d, 1 - p7d), is not
         >= MIN_PROBABILITY_7D. Rows whose 7-day price could not be fetched
         fail this comparison and are dropped as well.
      4. Work out which outcome won (highest entry in outcomePrices) and which
         outcome was the safe bet at lookback, and compare them.

    Args:
        df: DataFrame with the 'probability{LOOKBACK_HOURS}h' column plus
            question, slug, clobTokenIds, closedTime, endDate, outcomes,
            outcomePrices and volumeNum.
        safe_threshold: Minimum probability to be considered "safe" (defaults to SAFE_THRESHOLD)

    Returns:
        Final DataFrame with columns: market, slug, clobID, closingDate, outcome
        (True if the safe bet won), probability{LOOKBACK_HOURS}h (the safe bet's
        probability), probability7d (the favourite's probability 7 days out),
        volume. An empty DataFrame when nothing survives the filters.
    """
    if df.empty:
        return pd.DataFrame()

    if safe_threshold is None:
        safe_threshold = SAFE_THRESHOLD

    # Get the dynamic column name based on LOOKBACK_HOURS
    prob_col = f'probability{LOOKBACK_HOURS}h'

    def is_safe(prob):
        """True when either outcome (p or 1 - p) reaches the threshold."""
        if prob is None:
            return False
        return prob >= safe_threshold or (1 - prob) >= safe_threshold

    def favourite_probability(prob):
        """Probability of the more likely outcome; None stays None."""
        if prob is None:
            return None
        return max(prob, 1 - prob)

    safe_df = df[df[prob_col].apply(is_safe)].copy()

    if DEBUG:
        print(f"Found {len(safe_df)} safe markets (>= {safe_threshold*100}% probability)")

    if safe_df.empty:
        return pd.DataFrame()

    # Fetch 7-day price history for safe markets
    probabilities_7d = []

    desc = "Fetching 7d prices" if DEBUG else None
    disable_tqdm = not DEBUG

    for _, row in tqdm(safe_df.iterrows(), total=len(safe_df), desc=desc, disable=disable_tqdm):
        try:
            clob_ids = json.loads(row['clobTokenIds'])
            clob_id = clob_ids[0]

            # Use endDate instead of closedTime for 7d calculation
            end_date_str = row['endDate']
            if end_date_str.endswith('Z'):
                end_date_str = end_date_str.replace('Z', '+00:00')
            end_date = datetime.fromisoformat(end_date_str)
            target_time = end_date - timedelta(days=7)
            target_ts = int(target_time.timestamp())

            price = fetch_price_at_timestamp(clob_id, target_ts)
            probabilities_7d.append(price)

            time.sleep(0.1)

        except Exception:
            # Best effort per market: unparsable ids/dates leave the 7d price empty.
            probabilities_7d.append(None)

    safe_df['probability7d'] = probabilities_7d

    # Filter out markets where probability7d < MIN_PROBABILITY_7D (high variability filter)
    safe_df['probability7d_safe'] = safe_df['probability7d'].apply(favourite_probability)
    before_filter = len(safe_df)
    min_prob_mask = safe_df['probability7d_safe'] >= MIN_PROBABILITY_7D
    safe_df = safe_df[min_prob_mask].copy()

    if DEBUG:
        removed = before_filter - len(safe_df)
        if removed > 0:
            print(f"Filtered out {removed} markets with 7d probability < {MIN_PROBABILITY_7D*100}%")

    if safe_df.empty:
        return pd.DataFrame()

    # Determine the outcome (which outcome won based on outcomePrices)
    def get_winning_outcome(row):
        try:
            outcomes = json.loads(row['outcomes'])
            prices = json.loads(row['outcomePrices'])
            # The winning outcome has price = 1 (or closest to 1)
            winning_idx = prices.index(max(prices, key=lambda x: float(x)))
            return outcomes[winning_idx]
        except Exception:
            # Malformed outcome data yields None; interrupts are no longer swallowed.
            return None

    # Determine which outcome was the "safe" bet at lookback
    def get_safe_bet_outcome(row):
        try:
            outcomes = json.loads(row['outcomes'])
            prob = row[prob_col]
            if prob >= safe_threshold:
                return outcomes[0]  # First outcome was the safe bet
            else:
                return outcomes[1]  # Second outcome was the safe bet
        except Exception:
            return None

    safe_df['winningOutcome'] = safe_df.apply(get_winning_outcome, axis=1)
    safe_df['safeBetOutcome'] = safe_df.apply(get_safe_bet_outcome, axis=1)
    safe_df['safeBetWon'] = safe_df['winningOutcome'] == safe_df['safeBetOutcome']

    # Normalize probabilities to be the safe bet's probability
    safe_df[f'{prob_col}_safe'] = safe_df[prob_col].apply(favourite_probability)

    # Build final dataset
    final_df = pd.DataFrame({
        'market': safe_df['question'],
        'slug': safe_df['slug'],
        'clobID': safe_df['clobTokenIds'].apply(lambda x: json.loads(x)[0] if x else None),
        'closingDate': safe_df['closedTime'],
        'outcome': safe_df['safeBetWon'],  # True if safe bet won
        prob_col: safe_df[f'{prob_col}_safe'],
        'probability7d': safe_df['probability7d_safe'],
        'volume': safe_df['volumeNum']
    })

    if DEBUG:
        print(f"Final dataset: {len(final_df)} markets")
    return final_df


In [47]:
# Cell 6: Wrapper function to collect data for a date range
def collect_dataset(start_date: str, end_date: str, 
                    min_volume: int = None,
                    safe_threshold: float = None,
                    save_path: Optional[str] = None) -> pd.DataFrame:
    """
    Run the per-day pipeline over an inclusive date range and stack the results.

    Each day goes through fetch_markets_for_day -> filter_markets_by_keywords ->
    append_price_history_lookback -> process_safe_markets. Days that produce no
    rows at any stage are skipped. When ``save_path`` is given, the accumulated
    result is rewritten to that CSV after every day that contributed rows, and
    once more at the end.

    Args:
        start_date: Start date in 'YYYY-MM-DD' format
        end_date: End date in 'YYYY-MM-DD' format (inclusive)
        min_volume: Minimum volume filter (defaults to MIN_VOLUME constant)
        safe_threshold: Probability threshold for "safe" markets (defaults to SAFE_THRESHOLD)
        save_path: Optional CSV path for the incremental and final saves

    Returns:
        One DataFrame with every day's safe markets (fresh 0..n-1 index), or an
        empty DataFrame when no day produced any.
    """
    min_volume = MIN_VOLUME if min_volume is None else min_volume
    safe_threshold = SAFE_THRESHOLD if safe_threshold is None else safe_threshold

    first_day = datetime.strptime(start_date, "%Y-%m-%d")
    last_day = datetime.strptime(end_date, "%Y-%m-%d")
    total_days = (last_day - first_day).days + 1

    print(f"Collecting data for {total_days} days: {start_date} to {end_date}")
    print("="*60)

    daily_frames = []

    # range() is empty when end_date precedes start_date, so nothing is processed then.
    for day_offset in range(total_days):
        day = first_day + timedelta(days=day_offset)
        print(f"\n--- Processing {day.date()} ---")

        # Step 1: Fetch markets for the day
        day_markets = fetch_markets_for_day(day, min_volume)
        if day_markets.empty:
            if DEBUG:
                print(f"No markets found for {day.date()}")
            continue

        # Step 2: Filter by keywords
        day_filtered = filter_markets_by_keywords(day_markets)
        if day_filtered.empty:
            if DEBUG:
                print(f"No markets remaining after keyword filter")
            continue

        # Step 3: Fetch lookback price history
        day_with_lookback = append_price_history_lookback(day_filtered)

        # Step 4: Filter safe markets and get 7d history
        day_final = process_safe_markets(day_with_lookback, safe_threshold)

        if not day_final.empty:
            daily_frames.append(day_final)

            # Incremental save if path provided
            if save_path:
                so_far = pd.concat(daily_frames, ignore_index=True)
                so_far.to_csv(save_path, index=False)
                if DEBUG:
                    print(f"Saved {len(so_far)} total records to {save_path}")

        # Small delay between days to be respectful to API
        # (only reached for days that got past the keyword filter)
        time.sleep(0.5)

    print("\n" + "="*60)

    if not daily_frames:
        print("No data collected!")
        return pd.DataFrame()

    final_dataset = pd.concat(daily_frames, ignore_index=True)
    print(f"Collection complete! Total markets: {len(final_dataset)}")

    # Final save
    if save_path:
        final_dataset.to_csv(save_path, index=False)
        print(f"Final dataset saved to {save_path}")

    return final_dataset


In [9]:
# Testing Pipeline - January 1st, 2025
# The following cells test each function step by step, printing intermediate results.

In [48]:
# Test 1: run the fetch step on a single day (January 1st, 2025)
test_date = datetime(year=2025, month=1, day=1)
raw_markets_df = fetch_markets_for_day(test_date)

# Report shape and column names, then preview the first rows as the cell output.
print(
    "\n=== Raw Markets DataFrame ===",
    f"Shape: {raw_markets_df.shape}",
    f"\nColumns: {list(raw_markets_df.columns)}",
    "\nHead:",
    sep="\n",
)
raw_markets_df.head()



=== Raw Markets DataFrame ===
Shape: (20, 10)

Columns: ['id', 'question', 'slug', 'clobTokenIds', 'closedTime', 'endDate', 'outcomePrices', 'outcomes', 'volumeNum', 'startDate']

Head:


Unnamed: 0,id,question,slug,clobTokenIds,closedTime,endDate,outcomePrices,outcomes,volumeNum,startDate
0,516211,76ers vs. Celtics,nba-phi-bos-2024-12-25,"[""10260102152120325117037936020682032118936290...",2024-12-26 02:34:04+00,2025-01-01T22:00:00Z,"[""1"", ""0""]","[""76ers"", ""Celtics""]",1130028.0,2024-12-22T05:08:16.423255Z
1,506668,September temperature increase by between 1.23...,september-temperature-increase-by-between-1pt2...,"[""32867164006675525949065653942615290229805210...",2024-10-21 19:30:42+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",170613.9,2024-09-10T17:08:04.775023Z
2,514401,Will Solana hit $210 in December?,will-solana-hit-210-in-december,"[""57385243338730163448622852164456859675586769...",2024-12-09 23:35:41+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",170732.5,2024-12-02T18:15:43.190892Z
3,516242,Chiefs vs. Steelers,nfl-kc-pit-2024-12-25,"[""56724154750377002831649637065803386870259449...",2024-12-25 22:57:34+00,2025-01-01T18:00:00Z,"[""1"", ""0""]","[""Chiefs"", ""Steelers""]",2186450.0,2024-12-22T06:03:56.667201Z
4,506672,September temperature increase by between 1.29...,september-temperature-increase-by-between-1pt2...,"[""36832878757548959760983297172765835834504846...",2024-10-21 19:26:01+00,2025-01-01T12:00:00Z,"[""0"", ""1""]","[""Yes"", ""No""]",225614.5,2024-09-10T17:08:35.234502Z


In [49]:
# Test 2: apply the keyword filter to the markets fetched in Test 1
filtered_df = filter_markets_by_keywords(raw_markets_df)

print(
    "\n=== Filtered Markets DataFrame ===",
    f"Shape: {filtered_df.shape}",
    "\nRemaining markets:",
    sep="\n",
)
# One line per surviving market, question truncated to 60 characters.
for question in filtered_df['question']:
    print(f"  - {question[:60]}...")
print("\nHead:")
filtered_df.head()



=== Filtered Markets DataFrame ===
Shape: (2, 10)

Remaining markets:
  - Will Trump be Speaker by January 1?...
  - No Israel x Hamas ceasefire in 2024?...

Head:


Unnamed: 0,id,question,slug,clobTokenIds,closedTime,endDate,outcomePrices,outcomes,volumeNum,startDate
6,508204,Will Trump be Speaker by January 1?,will-trump-be-speaker-by-january-1,"[""90762671961607378550984019509208138759337675...",2025-01-02 07:10:46+00,2025-01-01T12:00:00Z,"[""0"", ""1""]","[""Yes"", ""No""]",2771364.0,2024-09-26T21:42:48.851735Z
17,502265,No Israel x Hamas ceasefire in 2024?,next-israel-x-hamas-ceasefire-not-in-2024,"[""41248677391516436501520443748383894699563681...",2025-01-01 10:23:14+00,2025-01-01T12:00:00Z,"[""1"", ""0""]","[""Yes"", ""No""]",617817.7,2024-08-29T17:36:52.585Z


In [50]:
# Test 3: attach the LOOKBACK_HOURS price to the filtered markets
with_lookback_df = append_price_history_lookback(filtered_df)
prob_col = f'probability{LOOKBACK_HOURS}h'

print(
    f"\n=== DataFrame with {LOOKBACK_HOURS}h Prices ===",
    f"Shape: {with_lookback_df.shape}",
    f"\nNew column added: '{prob_col}'",
    f"\n{prob_col} values:",
    sep="\n",
)
# One line per market: question (first 40 characters) and its fetched price.
for question, lookback_price in zip(with_lookback_df['question'], with_lookback_df[prob_col]):
    print(f"  - {question[:40]}... : {lookback_price}")
print("\nHead:")
with_lookback_df[['question', prob_col]].head()



=== DataFrame with 48h Prices ===
Shape: (2, 11)

New column added: 'probability48h'

probability48h values:
  - Will Trump be Speaker by January 1?... : 0.002
  - No Israel x Hamas ceasefire in 2024?... : 0.992

Head:


Unnamed: 0,question,probability48h
6,Will Trump be Speaker by January 1?,0.002
17,No Israel x Hamas ceasefire in 2024?,0.992


In [51]:
# Test 4: safe-market filter, 7d prices and final dataset assembly
final_df = process_safe_markets(with_lookback_df)

print(
    "\n=== Final Dataset ===",
    f"Shape: {final_df.shape}",
    f"\nColumns: {list(final_df.columns)}",
    "\nHead:",
    sep="\n",
)
final_df.head()



=== Final Dataset ===
Shape: (2, 8)

Columns: ['market', 'slug', 'clobID', 'closingDate', 'outcome', 'probability48h', 'probability7d', 'volume']

Head:


Unnamed: 0,market,slug,clobID,closingDate,outcome,probability48h,probability7d,volume
6,Will Trump be Speaker by January 1?,will-trump-be-speaker-by-january-1,9076267196160737855098401950920813875933767533...,2025-01-02 07:10:46+00,True,0.998,0.9985,2771364.0
17,No Israel x Hamas ceasefire in 2024?,next-israel-x-hamas-ceasefire-not-in-2024,4124867739151643650152044374838389469956368134...,2025-01-01 10:23:14+00,True,0.992,0.925,617817.7


In [53]:
# Test 5: recap of how many markets survived each stage of Tests 1-4
prob_col = f'probability{LOOKBACK_HOURS}h'
divider = "=" * 60
valid_lookback_prices = with_lookback_df[prob_col].notna().sum()

print(divider)
print("PIPELINE SUMMARY FOR JANUARY 1st, 2025")
print(divider)
print(f"\n1. Raw markets fetched:        {len(raw_markets_df)}")
print(f"2. After keyword filtering:    {len(filtered_df)}")
print(f"3. With {LOOKBACK_HOURS}h prices:            {len(with_lookback_df)} (with {valid_lookback_prices} valid prices)")
# final_df has already been through the 7d filter inside process_safe_markets.
print(f"4. Safe markets (>={SAFE_THRESHOLD*100}%):       {len(final_df)}")

if not final_df.empty:
    safe_bet_won = final_df['outcome']
    print("\n--- Safe Market Statistics ---")
    print(f"Safe bets that won:    {safe_bet_won.sum()} / {len(final_df)} ({100*safe_bet_won.mean():.1f}%)")
    print(f"Avg {prob_col}:    {final_df[prob_col].mean():.4f}")
    if final_df['probability7d'].notna().any():
        print(f"Avg probability7d:     {final_df['probability7d'].mean():.4f}")

    print("\n--- Final Dataset ---")
    display(final_df)


PIPELINE SUMMARY FOR JANUARY 1st, 2025

1. Raw markets fetched:        20
2. After keyword filtering:    2
3. With 48h prices:            2 (with 2 valid prices)
4. Safe markets (>=90.0%):       2

--- Safe Market Statistics ---
Safe bets that won:    2 / 2 (100.0%)
Avg probability48h:    0.9950
Avg probability7d:     0.9618

--- Final Dataset ---


Unnamed: 0,market,slug,clobID,closingDate,outcome,probability48h,probability7d,volume
6,Will Trump be Speaker by January 1?,will-trump-be-speaker-by-january-1,9076267196160737855098401950920813875933767533...,2025-01-02 07:10:46+00,True,0.998,0.9985,2771364.0
17,No Israel x Hamas ceasefire in 2024?,next-israel-x-hamas-ceasefire-not-in-2024,4124867739151643650152044374838389469956368134...,2025-01-01 10:23:14+00,True,0.992,0.925,617817.7


In [None]:
# Full Dataset Collection: January 1, 2024 - December 24, 2025
# Single source of truth for the output file, so the final message cannot drift from save_path.
FULL_DATASET_PATH = "safe_markets_full_2024_2025.csv"

print("=" * 70)
print("COLLECTING DATA FROM JANUARY 1, 2024 TO DECEMBER 24, 2025")
print("=" * 70)

full_dataset = collect_dataset(
    start_date="2024-01-01",
    end_date="2025-12-24",
    save_path=FULL_DATASET_PATH
)

prob_col = f'probability{LOOKBACK_HOURS}h'

print("\n" + "=" * 70)
print("FULL DATASET - SAFE MARKET STATISTICS")
print("=" * 70)

if not full_dataset.empty:
    print(f"\nTotal Safe Markets Collected: {len(full_dataset)}")
    # closingDate values are strings here, so min/max compare them as text.
    print(f"Date Range: {full_dataset['closingDate'].min()} to {full_dataset['closingDate'].max()}")
    
    print(f"\n--- Outcome Statistics ---")
    safe_bets_won = full_dataset['outcome'].sum()
    safe_bets_lost = len(full_dataset) - safe_bets_won
    win_rate = 100 * full_dataset['outcome'].mean()
    print(f"Safe bets that won:     {safe_bets_won} / {len(full_dataset)} ({win_rate:.2f}%)")
    print(f"Safe bets that lost:    {safe_bets_lost} / {len(full_dataset)} ({100-win_rate:.2f}%)")
    
    print(f"\n--- Probability Statistics ---")
    print(f"Average {prob_col}: {full_dataset[prob_col].mean():.4f}")
    print(f"Median {prob_col}:   {full_dataset[prob_col].median():.4f}")
    print(f"Min {prob_col}:     {full_dataset[prob_col].min():.4f}")
    print(f"Max {prob_col}:     {full_dataset[prob_col].max():.4f}")
    
    if full_dataset['probability7d'].notna().any():
        valid_7d = full_dataset['probability7d'].notna().sum()
        print(f"\n7-day probability data available for {valid_7d} markets:")
        print(f"Average probability7d:  {full_dataset['probability7d'].mean():.4f}")
        print(f"Median probability7d:    {full_dataset['probability7d'].median():.4f}")
        print(f"Min probability7d:      {full_dataset['probability7d'].min():.4f}")
        print(f"Max probability7d:      {full_dataset['probability7d'].max():.4f}")
    
    print(f"\n--- Distribution by Probability Bands ---")
    bands = [
        (0.90, 0.95, "90-95%"),
        (0.95, 0.98, "95-98%"),
        (0.98, 0.99, "98-99%"),
        (0.99, 1.00, "99-100%")
    ]
    top_band_high = bands[-1][1]
    for low, high, label in bands:
        # Bands are half-open [low, high) except the top one, which is closed so
        # that a probability of exactly 1.0 is counted instead of falling in no band.
        if high == top_band_high:
            in_band = (full_dataset[prob_col] >= low) & (full_dataset[prob_col] <= high)
        else:
            in_band = (full_dataset[prob_col] >= low) & (full_dataset[prob_col] < high)
        count = in_band.sum()
        if count > 0:
            won = full_dataset.loc[in_band, 'outcome'].sum()
            print(f"{label}: {count} markets ({won} won, {count-won} lost) - {100*won/count:.1f}% win rate")
    
    print(f"\n--- Monthly Breakdown ---")
    # These two helper columns are added to full_dataset in place (after the CSV was written).
    closing_timestamps = pd.to_datetime(full_dataset['closingDate'])
    full_dataset['date'] = closing_timestamps.dt.date
    full_dataset['month'] = closing_timestamps.dt.to_period('M')
    monthly_stats = full_dataset.groupby('month').agg({
        'outcome': ['count', 'sum'],
        prob_col: 'mean'
    }).round(4)
    monthly_stats.columns = ['Total', 'Won', f'Avg_{prob_col}']
    monthly_stats['Win_Rate'] = (monthly_stats['Won'] / monthly_stats['Total'] * 100).round(1)
    print(monthly_stats)
    
    print(f"\n--- Full Dataset Preview ---")
    display(full_dataset.head(20))
    
    print(f"\nDataset saved to: {FULL_DATASET_PATH}")
else:
    print("No safe markets found!")


COLLECTING DATA FROM JANUARY 1, 2024 TO DECEMBER 24, 2025
Collecting data for 724 days: 2024-01-01 to 2025-12-24

--- Processing 2024-01-01 ---

--- Processing 2024-01-02 ---

--- Processing 2024-01-03 ---

--- Processing 2024-01-04 ---

--- Processing 2024-01-05 ---

--- Processing 2024-01-06 ---

--- Processing 2024-01-07 ---

--- Processing 2024-01-08 ---

--- Processing 2024-01-09 ---

--- Processing 2024-01-10 ---

--- Processing 2024-01-11 ---

--- Processing 2024-01-12 ---

--- Processing 2024-01-13 ---

--- Processing 2024-01-14 ---

--- Processing 2024-01-15 ---

--- Processing 2024-01-16 ---

--- Processing 2024-01-17 ---

--- Processing 2024-01-18 ---

--- Processing 2024-01-19 ---

--- Processing 2024-01-20 ---

--- Processing 2024-01-21 ---

--- Processing 2024-01-22 ---

--- Processing 2024-01-23 ---

--- Processing 2024-01-24 ---

--- Processing 2024-01-25 ---

--- Processing 2024-01-26 ---

--- Processing 2024-01-27 ---

--- Processing 2024-01-28 ---

--- Processing 202

In [38]:
# Rows whose safe bet lost (outcome is False).
# NOTE(review): `january_dataset` is not defined in any cell visible in this notebook —
# presumably left over from an earlier kernel session; confirm before Restart & Run All.
january_dataset.loc[january_dataset['outcome'] == False]

Unnamed: 0,market,slug,clobID,closingDate,outcome,probability72h,probability7d,date
39,Will Saquon Barkley be the top Fantasy Running...,will-saquon-barkley-be-the-top-fantasy-running...,1025953301974656857221535117231627604321943847...,2025-01-06 09:27:26+00,False,0.9875,0.9705,2025-01-06
97,Will Biden issue more pardons than Trump?,will-biden-issue-more-pardons-than-trump,7149556142147187976262420142034058584535708653...,2025-01-20 20:28:24+00,False,0.93,0.55,2025-01-20
99,Will Caitlyn Jenner attend presidential inaugu...,will-caitlyn-jenner-attend-presidential-inaugu...,4490769197326194991239940679213142349132102650...,2025-01-24 00:48:27+00,False,0.94,0.755,2025-01-24
