In [81]:
import pandas as pd
import requests

In [82]:
# helper function to get time series data from a given URL (blockchain API)
def get_series(url, timeout=30):
    """Download one chart time series and return it as a date-indexed DataFrame.

    Parameters
    ----------
    url : str
        Full chart URL. The last path segment, with any query string or
        fragment removed, becomes the name of the value column
        (e.g. ``.../charts/hash-rate?format=json`` -> ``"hash-rate"``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever.

    Returns
    -------
    pandas.DataFrame
        One value column indexed by a ``DatetimeIndex`` named ``"date"``.
        An empty DataFrame (no rows, no columns) is returned when the JSON
        response has no ``"values"`` entries.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of turning them into an empty frame.
    r.raise_for_status()

    data = r.json().get("values", [])
    df = pd.DataFrame(data)
    if df.empty:
        return pd.DataFrame()

    # Use only the chart slug as the column name, not the query string.
    chart_name = url.split("#", 1)[0].split("?", 1)[0].rstrip("/").split("/")[-1]

    # "x" is parsed as seconds since the Unix epoch.
    df["x"] = pd.to_datetime(df["x"], unit="s")
    df = df.rename(columns={"x": "date", "y": chart_name})
    return df.set_index("date")


In [83]:
# api calls to get various bitcoin time series data
# Every chart uses the same endpoint and query string (5-year span, sampled,
# JSON); only the chart slug differs, so keep the shared part in one place.
CHART_URL = "https://api.blockchain.info/charts/{chart}?timespan=5year&sampled=true&format=json"

difficulty = get_series(CHART_URL.format(chart="difficulty"))
hashrate = get_series(CHART_URL.format(chart="hash-rate"))
price = get_series(CHART_URL.format(chart="market-price"))
mempool = get_series(CHART_URL.format(chart="mempool-count"))
miner_rev = get_series(CHART_URL.format(chart="miners-revenue"))

In [85]:
# merge all series on the date index using outer join
merged = difficulty.join([hashrate, price, mempool, miner_rev], how="outer")

# Columns are named by position, so this list must stay in the same order as
# the frames in the join above.
merged.columns = ["difficulty", "hashrate", "price", "mempool", "miner_rev"]

# reindex to a continuous daily index
daily_index = pd.date_range(start=merged.index.min(), end=merged.index.max(), freq="D")

# Forward-fill carries the last observation into later gaps only; days before
# a column's first observation stay NaN.
df = merged.reindex(daily_index).ffill()

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print(); the trailing expression is what the cell displays.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1826 entries, 2020-12-10 to 2025-12-09
Freq: D
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   difficulty  1826 non-null   float64
 1   hashrate    1826 non-null   float64
 2   price       1826 non-null   float64
 3   mempool     1312 non-null   float64
 4   miner_rev   1826 non-null   float64
dtypes: float64(5)
memory usage: 85.6 KB
None


# Bitcoin Mining Dataset Explanation

This table contains daily Bitcoin network and mining statistics used for predicting mining profitability.

| Column        | Description                                                                 | Units / Notes                                                                                     | Important Equations / Relationships |
|---------------|-----------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|-----------------------------------|
| `difficulty`  | Mining difficulty is a network-wide parameter that determines how hard it is to mine a new block. | Unitless, relative value; higher values = harder to mine. Adjusted every 2016 blocks (~2 weeks). | Expected blocks found by one miner during a 2016-block difficulty period: <br> `Expected blocks per miner = (Miner Hashrate / Network Hashrate) * 2016` |
| `hashrate`    | Total computational power of the Bitcoin network.                            | Hashes per second (H/s); often expressed as TH/s = 10¹² H/s.                                     | Miner probability of finding a block: <br> `Miner Hashrate / Network Hashrate` |
| `price`       | Bitcoin market price in USD.                                                | USD per BTC                                                                                       | Revenue in USD: <br> `BTC mined * price` |
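| `mempool`     | Number of unconfirmed transactions waiting to be included in blocks.       | Count of transactions (taken from the `mempool-count` chart). Not available for the whole date range: only 1312 of the 1826 days are non-null in the run shown above. | Fee revenue from a single transaction: <br> `fee_rate_per_byte * tx_size` |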
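| `miner_rev`   | Total miner revenue for the day, including block reward + transaction fees. | BTC or USD depending on source (here USD)                                                        | Gross revenue per unit of hashrate per day (per TH/s when `hashrate` is expressed in TH/s): <br> `miner_rev / Network Hashrate` <br> This is revenue, not profit: electricity and hardware costs must still be subtracted. |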