# Complete Exploratory Data Analysis

This is where your full EDA goes. We look forward to digging deeper into your analysis here. 

Read [eda_outline.md](eda_outline.md) for more details.

In [36]:
# load libraries
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
import os
from typing import Optional

import matplotlib.pyplot as plt
import polars as pl
import psutil
import seaborn as sns

In [37]:
# --- Configuration ---
# Resolve the directory this code runs from, then derive all paths from it.

try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    # __file__ is undefined (e.g. in a Jupyter kernel): use the working directory
    SCRIPT_DIR = Path.cwd()

# Assumes the notebook sits one level below the project root; use SCRIPT_DIR
# itself instead if the notebook lives in the root.
PROJECT_ROOT = SCRIPT_DIR.parent

# Input data locations
DATA_DIR = PROJECT_ROOT.joinpath("data")
COINMETRICS_PATH = DATA_DIR.joinpath("Coin Metrics", "coinmetrics_btc.csv")
POLYMARKET_DIR = DATA_DIR.joinpath("Polymarket")

# Output location for figures; created on first run, reused afterwards
PLOTS_DIR = SCRIPT_DIR.joinpath("plots")
PLOTS_DIR.mkdir(exist_ok=True)

print(f"Project Root identified as: {PROJECT_ROOT}")

Project Root identified as: /Users/kshitijgurung/Desktop/OMSA GT/practicum/bitcoin-analytics-capstone


In [38]:

# --- Memory Tracking Utilities ---


def get_memory_usage_mb() -> float:
    """
    Report the resident set size (RSS) of the running process.

    Returns:
        RSS of the current process, converted from bytes to megabytes
    """
    rss_bytes = psutil.Process().memory_info().rss
    # bytes -> KB -> MB
    return rss_bytes / 1024 / 1024


def format_memory(mb: float) -> str:
    """
    Format a memory value in MB as a human-readable string.

    The unit is chosen from the magnitude of the value, so a negative input
    (for example a memory delta after memory was released) switches to GB at
    the same threshold as a positive one.

    Args:
        mb: Memory value in megabytes; may be negative

    Returns:
        Formatted string (e.g., "123.45 MB", "1.23 GB" or "-1.23 GB")
    """
    # Compare the magnitude: a signed comparison would keep every negative
    # value in MB, however large it is.
    if abs(mb) < 1024:
        return f"{mb:.2f} MB"
    return f"{mb / 1024:.2f} GB"


@contextmanager
def track_memory(operation_name: str):
    """
    Print process memory usage on entry to and exit from a ``with`` block.

    The "after" line is printed even when the wrapped code raises, and
    includes the change relative to the "before" reading.

    Args:
        operation_name: Label for the wrapped operation, used in both messages

    Yields:
        None
    """
    start_mb = get_memory_usage_mb()
    print(f"[Memory] Before {operation_name}: {format_memory(start_mb)}")

    try:
        yield
    finally:
        # Runs on normal exit and on exceptions alike
        end_mb = get_memory_usage_mb()
        after_text = format_memory(end_mb)
        delta_text = format_memory(end_mb - start_mb)
        print(f"[Memory] After {operation_name}: {after_text} (Δ {delta_text})")


# --- Data Loading Functions ---


def load_bitcoin_data(filepath: Path) -> Optional[pl.DataFrame]:
    """
    Read the Bitcoin CSV into memory through a Polars lazy query.

    The "time" column is parsed from string to datetime as part of the query.
    Any exception raised while loading is printed rather than propagated.

    Args:
        filepath: Path to the Coin Metrics CSV file

    Returns:
        Polars DataFrame with a parsed "time" column, or None if loading fails
    """
    print(f"Loading Bitcoin data from {filepath}...")
    try:
        with track_memory("loading Bitcoin data"):
            # Build the query step by step; nothing is read until collect()
            lazy_csv = pl.scan_csv(filepath, infer_schema_length=10000)
            with_parsed_time = lazy_csv.with_columns(
                pl.col("time").str.to_datetime()
            )
            btc = with_parsed_time.collect()
        print(f"Successfully loaded {len(btc)} rows.")
    except Exception as exc:
        print(f"Error loading Bitcoin data: {exc}")
        return None
    return btc


# Substrings (matched case-insensitively) that mark a column as time-related.
_TIMESTAMP_COLUMN_HINTS = ("timestamp", "trade", "created_at", "end_date")

# Temporal values earlier than this are treated as corrupt or placeholders.
_MIN_VALID_TIMESTAMP = datetime(2020, 1, 1)


def _parse_string_datetime_columns(
    df: pl.DataFrame, column_names: tuple[str, ...]
) -> pl.DataFrame:
    """Parse the named columns to datetimes when they are present as strings."""
    parse_exprs = []
    for name in column_names:
        if name not in df.columns:
            continue
        dtype = df[name].dtype
        # Parquet files may already carry proper datetime types; only string
        # columns need parsing.
        if dtype == pl.String or dtype == pl.Utf8:
            parse_exprs.append(pl.col(name).str.to_datetime())
    return df.with_columns(parse_exprs) if parse_exprs else df


def _repair_timestamp_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Rescale and null out implausibly early values in time-related columns."""
    for col in df.columns:
        lowered = col.lower()
        if not any(hint in lowered for hint in _TIMESTAMP_COLUMN_HINTS):
            continue

        dtype = df[col].dtype
        if dtype == pl.Datetime:
            # max() is None for an empty or all-null column; comparing None
            # with a datetime would raise TypeError, so skip the rescale then.
            # NOTE(review): a timezone-aware column would still make this
            # naive comparison raise — confirm the parquet columns are naive.
            latest = df[col].max()
            if latest is not None and latest < _MIN_VALID_TIMESTAMP:
                # Every value predates the cutoff: multiply the integer
                # representation by 1000 and reinterpret it as a datetime.
                # Presumably the values were stored at 1/1000 of the intended
                # scale — TODO confirm against the data export.
                df = df.with_columns(
                    (pl.col(col).cast(pl.Int64) * 1000).cast(pl.Datetime)
                )
            floor = _MIN_VALID_TIMESTAMP
        elif dtype == pl.Date:
            # Date columns are not rescaled: their max() is a datetime.date,
            # which Python refuses to order against a datetime.datetime.
            floor = _MIN_VALID_TIMESTAMP.date()
        else:
            continue

        # Enforce 2020+ constraint (replace placeholders/zeros with null)
        df = df.with_columns(
            pl.when(pl.col(col) < floor)
            .then(None)
            .otherwise(pl.col(col))
            .alias(col)
        )
    return df


def load_polymarket_data(datadir: Path) -> Optional[dict[str, pl.DataFrame]]:
    """
    Load Polymarket data from parquet files using Polars lazy scan.

    Looks for three files in ``datadir`` and loads each one that exists:
    ``finance_politics_markets.parquet`` (key "markets"),
    ``finance_politics_odds_history.parquet`` (key "odds") and
    ``finance_politics_summary.parquet`` (key "summary"). For the markets
    table, "created_at" and "end_date" are parsed to datetimes if they are
    stored as strings. In every table, time-related Datetime/Date columns have
    values before 2020-01-01 replaced with null (after a x1000 rescale when an
    entire Datetime column predates 2020).

    Any exception raised while loading is printed rather than propagated.

    Args:
        datadir: Directory containing Polymarket parquet files

    Returns:
        Dictionary mapping data type names to Polars DataFrames, or None if
        loading fails or none of the expected files exist
    """
    print(f"Loading Polymarket data from {datadir}...")

    # result key -> (parquet file name, label used in the progress message)
    sources = {
        "markets": ("finance_politics_markets.parquet", "markets"),
        "odds": ("finance_politics_odds_history.parquet", "odds history records"),
        "summary": ("finance_politics_summary.parquet", "summary records"),
    }

    data: dict[str, pl.DataFrame] = {}

    try:
        with track_memory("loading Polymarket data"):
            for key, (filename, label) in sources.items():
                path = datadir / filename
                if not path.exists():
                    continue

                df = pl.scan_parquet(path).collect()
                if key == "markets":
                    df = _parse_string_datetime_columns(
                        df, ("created_at", "end_date")
                    )
                df = _repair_timestamp_columns(df)

                data[key] = df
                print(f"Loaded {len(df)} {label}.")

        return data if data else None
    except Exception as e:
        print(f"Error loading Polymarket data: {e}")
        return None



In [39]:
# Load both datasets (each loader scans lazily, then collects into memory).
# Either loader returns None instead of raising when loading fails.
btc_df = load_bitcoin_data(COINMETRICS_PATH)
poly_data = load_polymarket_data(POLYMARKET_DIR)

Loading Bitcoin data from /Users/kshitijgurung/Desktop/OMSA GT/practicum/bitcoin-analytics-capstone/data/Coin Metrics/coinmetrics_btc.csv...
[Memory] Before loading Bitcoin data: 41.22 MB
[Memory] After loading Bitcoin data: 57.22 MB (Δ 16.00 MB)
Successfully loaded 6221 rows.
Loading Polymarket data from /Users/kshitijgurung/Desktop/OMSA GT/practicum/bitcoin-analytics-capstone/data/Polymarket...
[Memory] Before loading Polymarket data: 57.33 MB
Loaded 78321 markets.
Loaded 2143181 odds history records.
Loaded 78321 summary records.
[Memory] After loading Polymarket data: 234.73 MB (Δ 177.41 MB)


In [40]:
# Preview the first rows of the Bitcoin frame
btc_df.head()

time,AdrActCnt,AdrBalCnt,AssetCompletionTime,AssetEODCompletionTime,BlkCnt,CapMVRVCur,CapMrktCurUSD,CapMrktEstUSD,FeeTotNtv,FlowInExNtv,FlowInExUSD,FlowOutExNtv,FlowOutExUSD,HashRate,IssTotNtv,IssTotUSD,PriceBTC,PriceUSD,ROI1yr,ROI30d,ReferenceRate,ReferenceRateETH,ReferenceRateEUR,ReferenceRateUSD,SplyCur,SplyExNtv,SplyExUSD,SplyExpFut10yr,TxCnt,TxTfrCnt,volume_reported_spot_usd_1d
datetime[μs],i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,f64
2009-01-03 00:00:00,0,0,1614334883,1614334883,0,,,,0.0,0.0,,0.0,,,,,1,,,,,,,,0.0,0.0,,,0,0,
2009-01-04 00:00:00,0,0,1614334883,1614334883,0,,,,0.0,0.0,,0.0,,,,,1,,,,,,,,0.0,0.0,,,0,0,
2009-01-05 00:00:00,0,0,1614334883,1614334883,0,,,,0.0,0.0,,0.0,,,,,1,,,,,,,,0.0,0.0,,,0,0,
2009-01-06 00:00:00,0,0,1614334884,1614334884,0,,,,0.0,0.0,,0.0,,,,,1,,,,,,,,0.0,0.0,,,0,0,
2009-01-07 00:00:00,0,0,1614334885,1614334885,0,,,,0.0,0.0,,0.0,,,,,1,,,,,,,,0.0,0.0,,,0,0,


In [41]:
# Display every Polymarket frame that was loaded
poly_data.values()

dict_values([shape: (78_321, 10)
┌───────────┬────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬────────────┐
│ market_id ┆ question   ┆ slug       ┆ event_slug ┆ … ┆ active ┆ closed ┆ created_at ┆ end_date   │
│ ---       ┆ ---        ┆ ---        ┆ ---        ┆   ┆ ---    ┆ ---    ┆ ---        ┆ ---        │
│ str       ┆ str        ┆ str        ┆ str        ┆   ┆ bool   ┆ bool   ┆ datetime[μ ┆ datetime[μ │
│           ┆            ┆            ┆            ┆   ┆        ┆        ┆ s]         ┆ s]         │
╞═══════════╪════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪════════════╡
│ 242851    ┆ Will       ┆ will-bongb ┆ will-bongb ┆ … ┆ true   ┆ true   ┆ 2022-04-05 ┆ 2022-05-09 │
│           ┆ Bongbong   ┆ ong-marcos ┆ ong-marcos ┆   ┆        ┆        ┆ 20:37:30   ┆ 00:00:00   │
│           ┆ Marcos be  ┆ -be-electe ┆ -be-electe ┆   ┆        ┆        ┆            ┆            │
│           ┆ electe…    ┆ …          ┆ …          ┆   ┆  

In [42]:
# Pull out the markets table (None if the markets parquet file was not found)
markets_df = poly_data.get("markets")

In [52]:
# List the available columns, then show the ten markets with the highest volume
print(markets_df.columns)
markets_df.sort("volume", descending=True).head(10)

['market_id', 'question', 'slug', 'event_slug', 'category', 'volume', 'active', 'closed', 'created_at', 'end_date']


market_id,question,slug,event_slug,category,volume,active,closed,created_at,end_date
str,str,str,str,str,f64,bool,bool,datetime[μs],datetime[μs]
"""253591""","""Will Donald Trump win the 2024…","""will-donald-trump-win-the-2024…","""presidential-election-winner-2…","""""",1531500000.0,True,True,2024-01-04 17:33:51,2024-11-05 12:00:00
"""253597""","""Will Kamala Harris win the 202…","""will-kamala-harris-win-the-202…","""presidential-election-winner-2…","""""",1037000000.0,True,True,2024-01-04 17:40:17,2024-11-04 12:00:00
"""511754""","""Will Donald Trump be inaugurat…","""will-donald-trump-be-inaugurat…","""who-will-be-inaugurated-as-pre…","""""",400410000.0,True,True,2024-11-01 20:59:58,2025-01-20 12:00:00
"""512340""","""Will Nicolae Ciucă win the 202…","""will-nicolae-ciuca-win-the-202…","""romania-presidential-election""","""""",326510000.0,True,True,2024-11-07 23:53:29,2024-12-08 12:00:00
"""253642""","""Will any other Republican Poli…","""will-any-other-republican-poli…","""presidential-election-winner-2…","""""",241660000.0,True,True,2024-01-06 19:52:35,2024-11-05 00:00:00
"""253727""","""Kamala Harris wins the popular…","""will-kamala-harris-win-the-pop…","""presidential-election-popular-…","""""",163780000.0,True,True,2024-01-09 18:29:49,2024-11-05 12:00:00
"""253609""","""Will Michelle Obama win the 20…","""will-michelle-obama-win-the-20…","""presidential-election-winner-2…","""""",153380000.0,True,True,2024-01-05 20:53:17,2024-11-05 00:00:00
"""538932""","""Will Zohran Mamdani win the 20…","""will-zohran-mamdani-win-the-20…","""new-york-city-mayoral-election""","""""",143250000.0,True,True,2025-04-22 15:32:27,2025-11-04 12:00:00
"""253595""","""Will Robert F. Kennedy Jr. win…","""will-robert-f-kennedy-jr-win-t…","""presidential-election-winner-2…","""""",141610000.0,True,True,2024-01-04 17:39:07,2024-11-04 12:00:00
"""504494""","""Fed increases interest rates b…","""fed-increases-interest-rates-b…","""fed-interest-rates-november-20…","""""",133960000.0,True,True,2024-08-02 19:36:17,2024-11-07 12:00:00
