# Tumult with Turbo Demo 

In [1]:
import math
import warnings
from pathlib import Path
warnings.filterwarnings(action='ignore', category=UserWarning)

from pyspark import SparkFiles
from pyspark.sql import SparkSession
from tmlt.analytics.privacy_budget import PureDPBudget
from tmlt.analytics.protected_change import AddMaxRows
from tmlt.analytics.query_builder import QueryBuilder
from termcolor import colored

def print_budget(prev_budget, remaining_privacy_budget):
    """Report how much privacy budget was spent and how much is left.

    Args:
        prev_budget: Budget object captured before the operation being measured.
        remaining_privacy_budget: Budget object captured after the operation.

    Both arguments must expose an ``_epsilon`` value that supports subtraction
    and ``to_float(round_up=True)``. The consumed amount is printed in red and
    the remaining amount in green; nothing is returned.

    NOTE(review): ``_epsilon`` is a private attribute of the budget object, so
    this helper depends on library internals -- confirm whether a public
    accessor provides the same rounded-up conversion.
    """
    # Spent budget is the difference between the "before" and "after" epsilons.
    consumed = prev_budget._epsilon - remaining_privacy_budget._epsilon
    consumed_text = f"Consumed Budget: {consumed.to_float(round_up=True)} \n"
    print(colored(consumed_text, "red"))

    remaining = remaining_privacy_budget._epsilon
    remaining_text = f"Remaining Budget: {remaining.to_float(round_up=True)} \n"
    print(colored(remaining_text, "green"))
    

# Load the Citibike dataset into a Spark DataFrame.
# Obtain (or reuse) the Spark session and turn Spark's own logging off.
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("OFF")

# The CSV is expected under ~/turbo-suite/datasets/.
turbo_suite_path = Path.home() / "turbo-suite"
citibike_path = turbo_suite_path / "datasets/citibike.csv"

# Register the file with Spark, then read it back through SparkFiles using
# its bare file name; the first row is the header and column types are inferred.
spark.sparkContext.addFile(str(citibike_path))
citibike_df = spark.read.csv(
    SparkFiles.get("citibike.csv"),
    inferSchema=True,
    header=True,
)

23/10/21 03:10:07 WARN Utils: Your hostname, ds-07 resolves to a loopback address: 127.0.1.1; using 128.59.23.56 instead (on interface eth0)
23/10/21 03:10:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/10/21 03:10:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

### Step 1: Import `TurboSession` and `Accuracy` from Turbo

In [2]:
from turbo.core import Accuracy
from tmlt.turbo import TurboSession

### Step 2: Set up the Turbo configuration
- Turbo requires users to specify information about the data domain.
- Optionally, users can also specify parameters that configure Turbo (the default configuration is used otherwise).

In [3]:
# Turbo configuration that the user must provide.
# - "alpha" / "beta" are later passed to `Accuracy(alpha, beta)` when queries
#   are evaluated (presumably the error bound and failure probability --
#   TODO confirm against the Turbo documentation).
# - "histogram_cfg" tunes Turbo's histogram; "heuristic" is the setting the
#   later "Case 3" cell changes to make histogram runs more eager.
# - "attributes_info" lists each attribute of the data domain as a
#   (name, possible values) pair.
turbo_config = {
    "alpha": 0.05,
    "beta": 0.001,
    "histogram_cfg": {
        "learning_rate": 4,
        "heuristic": "bin_visits:5-1",
        "tau": 0.01,
    },
    "attributes_info": [
        (
            "weekday",
            ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
        ),
        (
            "hour",
            ["00:00-4:00", "4:00-8:00", "8:00-12:00", "12:00-16:00", "16:00-20:00", "20:00-00:00"],
        ),
        (
            "duration_minutes",
            ["0'-20'", "20'-40'", "40'-60'", "60'-80'", "80'-100'", "100'-120'"],
        ),
        # Station ids are the strings "0" through "9".
        ("start_station", [str(station_id) for station_id in range(10)]),
        ("end_station", [str(station_id) for station_id in range(10)]),
        ("usertype", ["customer", "subscriber"]),
        ("gender", ["unknown", "male", "female"]),
        ("age", ["0-17", "18-49", "50-64", "65+"]),
    ],
}

### Step 3: Create a `TurboSession` and pass `turbo_config` as an argument
- Note: setting `AddMaxRows(2)` is mandatory. Turbo uses a `ReplaceOneRow` neighborhood definition, which is not supported by Tumult. `AddMaxRows(2)` entails `ReplaceOneRow`, since replacing one row amounts to removing one row and adding another (see `https://github.com/columbia/turbo-suite/blob/dp_audit/tumult-turbo/tmlt/turbo/neighborhood_definitions.pdf`).

In [4]:
# Create the TurboSession over the Citibike DataFrame, registered under the
# source id "citibike" that queries refer to.
session = TurboSession.from_dataframe(
    source_id="citibike",
    dataframe=citibike_df,
    # Total pure-DP budget available to this session.
    privacy_budget=PureDPBudget(1),
    # Turbo requires AddMaxRows(2) as the protected change.
    protected_change=AddMaxRows(2),
    turbo_config=turbo_config,
)

### Step 4: Evaluate queries 
- `TurboSession` supports passing either a `privacy budget` or a desired `accuracy` target. If a user specifies an accuracy target, the Turbo session automatically converts it to a `privacy budget` (this works only when using `PureDP` and `counts`; otherwise it throws an error).

In [5]:
# Count query over the "citibike" source: rows whose gender is 'male'.
query1 = (
    QueryBuilder("citibike")
    .filter("gender = 'male'")
    .count()
)

### Case 1: Exact-Cache miss / Histogram Bypass
- The first time we evaluate the query, we miss on the `exact-cache` and bypass the `histogram`.
- We pay `requested_epsilon = 0.0008358447411768716`.

In [6]:
%%time
# Snapshot the budget first so the cost of this evaluation can be reported.
prev_budget = session.remaining_privacy_budget

# Evaluate with an accuracy target rather than an explicit privacy budget.
accuracy_target = Accuracy(turbo_config["alpha"], turbo_config["beta"])
count = session.evaluate(query1, accuracy_target)
count.show()

print_budget(prev_budget, session.remaining_privacy_budget)

                                                                                

+------+
| count|
+------+
|225864|
+------+

[31mConsumed Budget: 0.0008358447411768716 
[0m
[32mRemaining Budget: 0.9991641552588241 
[0m
CPU times: user 1.13 s, sys: 176 ms, total: 1.31 s
Wall time: 13.1 s


### Case 2: Exact-Cache hit
- The second time we evaluate the query, we hit on the `exact-cache` and get the exact same DP output.
- We pay 0 epsilon.

In [7]:
%%time
# Snapshot the budget so the (expected zero) cost of the repeat can be shown.
prev_budget = session.remaining_privacy_budget

# Same query and same accuracy target as the previous evaluation.
accuracy_target = Accuracy(turbo_config["alpha"], turbo_config["beta"])
count = session.evaluate(query1, accuracy_target)
count.show()

print_budget(prev_budget, session.remaining_privacy_budget)

                                                                                

+------+
| count|
+------+
|225864|
+------+

[31mConsumed Budget: 0 
[0m
[32mRemaining Budget: 0.9991641552588241 
[0m
CPU times: user 195 ms, sys: 31.2 ms, total: 226 ms
Wall time: 3.83 s


### Case 3: Exact-Cache miss / Histogram Miss
- Let's create a new setup in which the histogram heuristic is too eager to accept a histogram run, and then run the query again!
- You'll see some SV-check debugging messages that I left. This is proof that we use the histogram this time!

In [8]:
# Derive a separate configuration for this experiment instead of editing
# `turbo_config` in place. The original config stays untouched, so re-running
# the earlier session cell still uses the "bin_visits:5-1" heuristic rather
# than silently picking up this cell's change.
eager_turbo_config = {
    **turbo_config,
    "histogram_cfg": {
        **turbo_config["histogram_cfg"],
        # Heuristic value that makes Turbo eager to accept a histogram run.
        "heuristic": "bin_visits:0-1",
    },
}

# Rebuild the session with the eager configuration; this rebinds `session`,
# so the following cells run against a brand-new session with a full budget.
session = TurboSession.from_dataframe(
    privacy_budget=PureDPBudget(1),
    source_id="citibike",
    dataframe=citibike_df,
    protected_change=AddMaxRows(2),
    turbo_config=eager_turbo_config,
)

- Run the query again in this new setup!
- The debugging message shows that the dp-result was `0.333`. 
- This makes sense! We have 3 genders `unknown, male, female` in the dataset and the histogram is initialized uniformly!
- Look at the budget consumed! It's a lot! 
- We paid for: 
    - `SV initialization` = `0.002835668584812941`, and 
    - `tumult's Laplace run` = `0.0008358447411768716`
- Note that Tumult might round up the budget to account for floating-point precision errors.

In [9]:
%%time
# Snapshot the new session's budget before running the query.
prev_budget = session.remaining_privacy_budget

# Same query and accuracy target as before, now against the rebuilt session.
accuracy_target = Accuracy(turbo_config["alpha"], turbo_config["beta"])
count = session.evaluate(query1, accuracy_target)
count.show()
print_budget(prev_budget, session.remaining_privacy_budget)

[32m2023-10-21 03:12:06.276[0m | [34m[1mDEBUG   [0m | [36mturbo.core.turbo[0m:[36mprobeL2[0m:[36m158[0m - [34m[1m[33mdp_result, 0.33333333333334275[0m[0m
[32m2023-10-21 03:12:06.278[0m | [34m[1mDEBUG   [0m | [36mturbo.core.turbo[0m:[36mprobeL2[0m:[36m162[0m - [34m[1m[33mtrue_result, 0.6817585063646484[0m[0m
[32m2023-10-21 03:12:06.279[0m | [34m[1mDEBUG   [0m | [36mturbo.core.turbo[0m:[36m_run_sv_check[0m:[36m186[0m - [34m[1m[31mSV_init_budget, 0.002835668584812941[0m[0m
[32m2023-10-21 03:12:06.280[0m | [34m[1mDEBUG   [0m | [36mturbo.core.cache.sparse_vectors[0m:[36mcheck[0m:[36m35[0m - [34m[1m[33mtrue_error, 0.34842517303130566[0m[0m
[32m2023-10-21 03:12:06.281[0m | [34m[1mDEBUG   [0m | [36mturbo.core.cache.sparse_vectors[0m:[36mcheck[0m:[36m38[0m - [34m[1m[33mnoisy_error, 0.3491520611681897[0m[0m
[32m2023-10-21 03:12:06.282[0m | [34m[1mDEBUG   [0m | [36mturbo.core.cache.sparse_vectors[0m:[36mcheck

+------+
| count|
+------+
|226408|
+------+

[31mConsumed Budget: 0.0036715133259898128 
[0m
[32mRemaining Budget: 0.9963284866740102 
[0m
CPU times: user 1.28 s, sys: 180 ms, total: 1.46 s
Wall time: 6.41 s


- And what happens if I re-run? 
- After failing the SV check and running the Laplace mechanism, we update the exact-cache and the histogram.
- So, now we hit the `exact-cache`!

In [10]:
%%time
# Snapshot the budget so the cost of the re-run can be reported.
prev_budget = session.remaining_privacy_budget

# Re-issue the identical query with the identical accuracy target.
accuracy_target = Accuracy(turbo_config["alpha"], turbo_config["beta"])
count = session.evaluate(query1, accuracy_target)
count.show()

print_budget(prev_budget, session.remaining_privacy_budget)

                                                                                

+------+
| count|
+------+
|226408|
+------+

[31mConsumed Budget: 0 
[0m
[32mRemaining Budget: 0.9963284866740102 
[0m
CPU times: user 166 ms, sys: 41.3 ms, total: 207 ms
Wall time: 3.72 s
