## BigQuery 1

### Install requirements

`pip3 install google-cloud-bigquery google-cloud-bigquery-storage pyarrow tqdm ipywidgets pandas matplotlib db-dtypes pandas-gbq`

In [None]:
# Google Cloud project ID that the cells below pass to the BigQuery client.
# This name will probably be different for you.
project = "cs544-spring2024"

In [None]:
from google.cloud import bigquery

In [None]:
# Create the BigQuery client that every later cell uses as `bq`.
# No project or credentials are passed here, so the library falls back to
# its own defaults (presumably taken from the environment -- verify for
# your setup).
bq = bigquery.Client()

In [None]:
# Join the public county and state boundary tables on their shared state
# FIPS code and keep only the rows whose state name is Wisconsin.
counties_sql = """
SELECT counties.geo_id, county_name, states.state_name
FROM `bigquery-public-data.geo_us_boundaries.counties` AS counties
INNER JOIN `bigquery-public-data.geo_us_boundaries.states` AS states
ON counties.state_fips_code = states.state_fips_code
WHERE states.state_name = "Wisconsin"
LIMIT 100
"""
q = bq.query(counties_sql)
# Last expression of the cell: fetch the result as a DataFrame so the
# notebook displays it.
q.to_dataframe()

### Structure

A "project" contains "datasets", and each dataset contains "tables".

#### What datasets do I have in my project?

In [None]:
# Ask for the datasets that belong to `project`. The call returns an
# iterable rather than a printed list, so the notebook shows only that
# object here; it has to be looped over to see the dataset IDs.
bq.list_datasets(project)

In [None]:
# Print the ID of every dataset in `project`, one per line.
project_datasets = bq.list_datasets(project)
for ds in project_datasets:
    print(ds.dataset_id)

### Dataset creation

In [None]:
# Describe a dataset named "lec_demo" inside `project`.
dataset_id = f"{project}.lec_demo"
ds = bigquery.Dataset(dataset_id)
# Optional: uncomment to pin the dataset to a single region before creating it.
# ds.location = "us-central1"
# exists_ok=True lets this cell be re-run without failing when the dataset
# has already been created.
bq.create_dataset(ds, exists_ok=True)

### Public datasets

In [None]:
# Print the ID of every dataset in the shared "bigquery-public-data" project.
public_datasets = bq.list_datasets("bigquery-public-data")
for ds in public_datasets:
    print(ds.dataset_id)

### List tables

In [None]:
# Print the ID of every table in the public "github_repos" dataset.
github_tables = bq.list_tables("bigquery-public-data.github_repos")
for t in github_tables:
    print(t.table_id)

### Extension access

In [None]:
# Load the IPython extension from the google.cloud.bigquery package; the
# `%%bigquery` cell magic used in the cells below comes from it.
%load_ext google.cloud.bigquery

#### Run a query using `%%bigquery`

In [None]:
%%bigquery
-- Preview five rows of the public GitHub languages table.
SELECT *
FROM `bigquery-public-data.github_repos.languages`
LIMIT 5

#### Save a query result into `df` using `%%bigquery df`

In [None]:
%%bigquery df
-- Same five-row preview; the `df` argument on the magic line names the
-- variable that receives the result.
SELECT *
FROM `bigquery-public-data.github_repos.languages`
LIMIT 5

In [None]:
# Display the result that the preceding `%%bigquery df` cell stored in `df`.
df

#### Python API

In [None]:
# Job configuration with the query cache switched off. Presumably this is so
# the byte counts inspected further down reflect a real run rather than a
# cache hit -- confirm against the BigQuery docs.
no_cache = bigquery.QueryJobConfig(use_query_cache=False)

In [None]:
# Run the five-row languages preview through the Python API, using the
# `no_cache` job configuration defined earlier in the notebook.
languages_sql = """
SELECT *
FROM `bigquery-public-data.github_repos.languages`
LIMIT 5
"""
q = bq.query(languages_sql, job_config=no_cache)

In [None]:
# Fetch the rows of query job `q` as a DataFrame so the notebook displays them.
q.to_dataframe()

#### Total bytes processed and billed

In [None]:
# Bytes the query job reports as processed, converted from bytes to MB.
q.total_bytes_processed / 1024**2 # MB, with 1 MB taken as 1024**2 bytes

In [None]:
# Bytes the query job reports as billed, converted from bytes to MB.
q.total_bytes_billed / 1024**2 # MB, with 1 MB taken as 1024**2 bytes

#### How many times can we do this in the free tier?

In [None]:
# Size of the free-tier allowance this example assumes: 1 TB, taken as
# 1024**4 bytes. `tb` is also read by the pricing cell that follows.
tb = 1024**4
# How many queries with this billed size fit into that allowance.
tb / q.total_bytes_billed

#### How much will it cost per query after that in, say, Tokyo?

In [None]:
# Price per TB of billed bytes used for this example (presumably USD, for
# the Tokyo region -- check the current pricing page before relying on it).
price_per_tb = 7.5
# Cost of one query: billed bytes expressed as a fraction of a TB, times the
# per-TB price. `tb` is the bytes-per-TB constant from the previous cell.
q.total_bytes_billed / tb * price_per_tb

### Pricing factors

1. you pay for storage too (not just queries)
2. they have a minimum of 10 MB per query
3. they round up to the nearest 1 MB per query

In [None]:
%%bigquery df
-- Keep only rows whose repo_name starts with 'open-lambda/' and return at
-- most five of them; the `df` argument names the variable that receives
-- the result.
SELECT *
FROM `bigquery-public-data.github_repos.languages`
WHERE repo_name LIKE 'open-lambda/%'
LIMIT 5

In [None]:
# Display the filtered result that the preceding `%%bigquery df` cell stored in `df`.
df