## BigQuery 1 & 2

### Things to do before lecture

1. Install ``pip3 install google-cloud-bigquery google-cloud-bigquery-storage pyarrow tqdm ipywidgets pandas matplotlib db-dtypes pandas-gbq geopandas`` (geopandas is needed for the `to_geodataframe` calls below)
2. gcloud authentication: `gcloud auth application-default login --scopes=openid,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/drive.readonly`
3. Start notebook on your VM: `python3 -m jupyterlab --no-browser`
4. Establish SSH tunnel for port 8888

In [None]:
# GCP project ID; later cells use it to build fully-qualified BigQuery names.
# this name will probably be different for you
project = "cs544-spring2024"

In [None]:
# import statement
from google.cloud import bigquery

In [None]:
# bigquery Client
bq = bigquery.Client()

# Register the %%bigquery cell magic used by later cells; without this
# IPython fails with "Cell magic `%%bigquery` not found".
# Equivalent to the line magic: %load_ext google.cloud.bigquery
get_ipython().run_line_magic("load_ext", "google.cloud.bigquery")

### Dataset creation

In [None]:
# Describe the lec_demo dataset inside our project, then create it.
# exists_ok=True makes the cell safe to re-run when the dataset already exists.
ds = bigquery.Dataset(f"{project}.lec_demo")
# ds.location = "us-central1"
bq.create_dataset(ds, exists_ok=True)

In [None]:
# Disable the query cache so BigQuery actually runs the query; a cached
# result would report 0 bytes processed/billed in the cells below.
no_cache = bigquery.QueryJobConfig(use_query_cache=False)

# Peek at five rows of the public GitHub languages table.
q = bq.query("""
SELECT *
FROM `bigquery-public-data.github_repos.languages`
LIMIT 5
""", job_config=no_cache)

In [None]:
# Fetch the query results as a pandas DataFrame and display them.
q.to_dataframe()

#### Total bytes processed and billed (in MB)

In [None]:
# Bytes processed by the query, in MB (1 MB taken as 1024 * 1024 bytes).
q.total_bytes_processed / (1024 * 1024)

In [None]:
# Bytes billed for the query, in MB (1 MB taken as 1024 * 1024 bytes).
q.total_bytes_billed / (1024 * 1024)

### `open-lambda` repositories

#### What are the ten most common languages on GitHub?

In [None]:
%%bigquery
-- Exploratory preview, so no destination variable: the magic only displays
-- the rows when it is not asked to store them in a Python variable.
-- CROSS JOIN UNNEST(language) yields one row per (repo, language entry).
SELECT *
FROM bigquery-public-data.github_repos.languages
CROSS JOIN UNNEST(language) AS L
LIMIT 5

In [None]:
%%bigquery top10
-- Count language entries per language name and keep the ten most frequent.
-- The result is stored in the Python variable top10.
SELECT
  lang.name,
  COUNT(*) AS count
FROM bigquery-public-data.github_repos.languages
CROSS JOIN UNNEST(language) AS lang
GROUP BY lang.name
ORDER BY count DESC
LIMIT 10

In [None]:
# Display the DataFrame that the %%bigquery magic stored in top10.
top10

In [None]:
# Use the language name as the index; the result is displayed, not assigned.
top10.set_index("name")

In [None]:
# Bar chart of the counts, one bar per language name.
top10.set_index("name").plot(kind="bar")

#### What software licenses are used most often for Python projects?

In [None]:
%%bigquery
-- Exploratory preview only. No destination variable, so the rows are shown
-- and the top10 DataFrame from the earlier cell is not overwritten.
SELECT *
FROM bigquery-public-data.github_repos.languages
CROSS JOIN UNNEST(language) AS L
LIMIT 5

In [None]:
%%bigquery lic
-- Join each repo's language entries to its license, keep the Python rows,
-- and count rows per license (most common first). Stored in variable lic.
SELECT
  repo_lic.license,
  COUNT(*) AS count
FROM bigquery-public-data.github_repos.languages AS langs
CROSS JOIN UNNEST(langs.language) AS L
INNER JOIN bigquery-public-data.github_repos.licenses AS repo_lic
  ON langs.repo_name = repo_lic.repo_name
WHERE L.name = 'Python'
GROUP BY repo_lic.license
ORDER BY count DESC

In [None]:
# Bar chart of the counts, one bar per license.
lic.set_index("license").plot(kind="bar")

### Using BigQuery on our custom data

### Example 1: BigQuery Table

In [None]:
# Load a Parquet file from GCS into the table <project>.<dataset>.loans.
# WRITE_TRUNCATE replaces the table contents if the table already exists.

# Get this "gsutil URI" from your GCP account
source = "gs://s24_msyamkumar/hdma-wi-2021.parquet"
dataset = "lec_demo"

config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
job = bq.load_table_from_uri(
    source,
    f"{project}.{dataset}.loans",
    job_config=config,
)
# Block until the load job has finished.
job.result()

### Example 2: External Table (GCS)

In [None]:
# Define a table whose data is read from the Parquet file in GCS
# (same `source` URI as the load job above).
table = bigquery.Table(f"{project}.{dataset}.loans-external")

config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.PARQUET)
config.source_uris = [source]
# config.autodetect = True
table.external_data_configuration = config

# exists_ok=True makes the cell safe to re-run.
bq.create_table(table, exists_ok=True)

### Example 3: external table (sheets)
Form: https://forms.gle/wwqt8XBXmFj6pES56 <br>
Sheet: https://docs.google.com/spreadsheets/d/1FfalqAWdzz01D1zIvBxsDWLW05-lvANWjjAj2vI4A04/

In [None]:
# Define a table whose data is read from the Google Sheet linked above;
# autodetect asks BigQuery to infer the schema.
table = bigquery.Table(f"{project}.{dataset}.applications")

config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.GOOGLE_SHEETS)
config.source_uris = [
    "https://docs.google.com/spreadsheets/d/1FfalqAWdzz01D1zIvBxsDWLW05-lvANWjjAj2vI4A04/"
]
config.autodetect = True
table.external_data_configuration = config

# exists_ok=True makes the cell safe to re-run.
bq.create_table(table, exists_ok=True)

In [None]:
%%bigquery
-- Read the Sheets-backed external table created in the previous cell;
-- it lives in the lec_demo dataset (adjust the project ID to your own).
SELECT *
FROM `cs544-spring2024.lec_demo.applications`

### Geopandas GeoDataFrame

- a geopandas GeoDataFrame is a subclass of pandas's DataFrame, and it has a geo column
- `to_dataframe` gives a pandas DataFrame
- `to_geodataframe` gives a geopandas GeoDataFrame

In [None]:
# County geometries for Wisconsin from the public US boundaries dataset.
# FIPS code for WI is 55
wi_counties_sql = """
SELECT county_geom
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE state_fips_code = '55'
"""
wi = bq.query(wi_counties_sql).to_geodataframe()

In [None]:
# Check which class to_geodataframe returned.
type(wi)

In [None]:
# Matplotlib reads a numeric string as a gray level: "0.9" is light gray
# (fill) and "0" is black (edges).
wi.plot(color="0.9", edgecolor="0")

### Export WI to our private table

In [None]:
# NOTE: replace the ??? placeholders with your own project and dataset
# before running.
# CREATE OR REPLACE TABLE is a DDL statement that returns no rows, so block
# on the job with result() rather than trying to build a GeoDataFrame.
bq.query("""
CREATE OR REPLACE TABLE `???.???.wi`
AS
SELECT county_name, county_geom
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE state_fips_code = '55'
""").result()

In [None]:
# NOTE: replace ??? with the dataset that holds the wi table created above.
# Read the private copy back as a GeoDataFrame and show the first 3 rows.
wi = bq.query("SELECT * FROM ???.wi").to_geodataframe()
wi.head(3)

In [None]:
# Save the GeoDataFrame's JSON serialization to wi.geojson.
with open("wi.geojson", mode="w") as geojson_file:
    geojson_file.write(wi.to_json())

### ST_\<SOME_FUNCTION\>

ST => Spatial Type

In [None]:
# Draw every row of test2.wi in light gray with black edges.
counties = bq.query("SELECT * FROM test2.wi").to_geodataframe()
counties.plot(color="0.9", edgecolor="0")

In [None]:
# ST_UNION_AGG aggregates all county geometries into one geometry.
merged = bq.query("SELECT ST_UNION_AGG(county_geom) FROM test2.wi").to_geodataframe()
merged.plot(color="0.9", edgecolor="0")

In [None]:
# Base layer: the shapes in test2.wi. On top: ST_CENTROID of each row.
ax = bq.query("SELECT * FROM test2.wi").to_geodataframe().plot(color="0.9", edgecolor="0")
centroids = bq.query("SELECT ST_CENTROID(county_geom) FROM test2.wi").to_geodataframe()
centroids.plot(ax=ax)

In [None]:
# Base layer: the shapes in test2.wi. On top: the single aggregate centroid
# computed by ST_CENTROID_AGG over all rows.
ax = bq.query("SELECT * FROM test2.wi").to_geodataframe().plot(color="0.9", edgecolor="0")
overall_centroid = bq.query("SELECT ST_CENTROID_AGG(county_geom) FROM test2.wi").to_geodataframe()
overall_centroid.plot(ax=ax)

In [None]:
# Copy test2.applications into test2.houses, adding a GEOGRAPHY point `loc`
# built from each row's longitude/latitude columns.
# CREATE OR REPLACE TABLE is a DDL statement that returns no rows, so block
# on the job with result() rather than trying to build a GeoDataFrame.
bq.query("""
CREATE OR REPLACE TABLE test2.houses
AS
SELECT *, ST_GEOGPOINT(longitude, latitude) AS loc
FROM test2.applications
""").result()

In [None]:
# Base layer: the shapes in test2.wi. On top: every row of test2.houses.
ax = bq.query("SELECT * FROM test2.wi").to_geodataframe().plot(color="0.9", edgecolor="0")
houses = bq.query("SELECT * FROM test2.houses").to_geodataframe()
houses.plot(ax=ax)

#### What are the ten houses closest to the WI capitol?  
- ST_DISTANCE
- ST_GEOGPOINT(-89.384107, 43.074715)

In [None]:
%%bigquery
-- Distance from each house to the capitol point, divided by 1000 and
-- reported as km; keep the ten smallest.
SELECT
  *,
  ST_DISTANCE(loc, ST_GEOGPOINT(-89.384107, 43.074715)) / 1000 AS km
FROM test2.houses
ORDER BY km
LIMIT 10

In [None]:
# Base layer: the shapes in test2.wi. On top: the ten houses with the
# smallest distance to the capitol point.
ax = bq.query("SELECT * FROM test2.wi").to_geodataframe().plot(color="0.9", edgecolor="0")
closest_sql = """
SELECT *, ST_DISTANCE(loc, ST_GEOGPOINT(-89.384107, 43.074715)) / 1000 AS km
FROM test2.houses
ORDER BY km ASC
LIMIT 10
"""
bq.query(closest_sql).to_geodataframe().plot(ax=ax)

In [None]:
%%bigquery
-- Spatial join: pair each house with the county whose geometry contains it.
SELECT
  houses.*,
  wi.county_name
FROM test2.houses
INNER JOIN test2.wi
  ON ST_WITHIN(houses.loc, wi.county_geom)

### `ST_SNAPTOGRID`, `ST_GEOHASH`

In [None]:
# Base layer: the shapes in test2.wi, light gray with black edges.
ax = bq.query("SELECT * FROM test2.wi").to_geodataframe().plot(color="0.9", edgecolor="0")

# exact locations (black):
exact_sql = """
SELECT houses.*, wi.county_name
FROM test2.houses CROSS JOIN test2.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(exact_sql).to_geodataframe().plot(ax=ax, color="k")

# approx locations (red): the same points snapped to a grid of size 0.1
approx_sql = """
SELECT ST_SNAPTOGRID(houses.loc, 0.1), wi.county_name
FROM test2.houses CROSS JOIN test2.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(approx_sql).to_geodataframe().plot(ax=ax, color="r")

In [None]:
# Show each house location next to its ST_GEOHASH value (second argument 3)
# and the name of the county that contains it.
geohash_sql = """
SELECT houses.loc, ST_GEOHASH(houses.loc, 3), wi.county_name
FROM test2.houses CROSS JOIN test2.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(geohash_sql).to_dataframe()

### BigQuery ML

In [None]:
import pandas as pd
import numpy as np

# Synthetic regression data: 100 rows where y = x1 + x2 + uniform noise
# in [0, 1). x1 is drawn from 0..9 and x2 from 0..2, both stored as floats.
n_rows = 100
x1_values = np.random.randint(0, 10, n_rows).astype(float)
x2_values = np.random.randint(0, 3, n_rows).astype(float)
df = pd.DataFrame({"x1": x1_values, "x2": x2_values})
df["y"] = df["x1"] + df["x2"] + np.random.rand(len(df))
df.head()

In [None]:
# gbq: google big query
# NOTE: replace the ??? placeholders with your own project and dataset.
# if_exists="replace" overwrites the sampleml table if it already exists.
df.to_gbq("???.???.sampleml", if_exists="replace")

In [None]:
%%bigquery
-- Split the uploaded sample data: each row gets a random boolean `test`
-- that is TRUE for roughly 25% of rows.
-- Reads sampleml, the table name that df.to_gbq uploads to above.
CREATE OR REPLACE TABLE test2.traintest
AS
SELECT x1, x2, y, RAND() < 0.25 AS test
FROM test2.sampleml

In [None]:
%%bigquery
-- Train a linear regression on the rows where test is FALSE;
-- y is the label column, x1 and x2 are the inputs.
CREATE OR REPLACE MODEL test2.lr
OPTIONS(model_type="LINEAR_REG", INPUT_LABEL_COLS=["y"])
AS
SELECT x1, x2, y
FROM test2.traintest
WHERE NOT test

In [None]:
%%bigquery
-- Inspect the weights of the trained model.
SELECT * FROM ML.WEIGHTS(MODEL test2.lr)

In [None]:
%%bigquery
-- Run the model on the rows where test is TRUE.
SELECT *
FROM ML.PREDICT(
  MODEL test2.lr,
  (
    SELECT x1, x2, y
    FROM test2.traintest
    WHERE test
  )
)

In [None]:
%%bigquery df
-- Evaluate the model on the rows where test is TRUE.
-- The result is stored in the Python variable df.
SELECT *
FROM ML.EVALUATE(
  MODEL test2.lr,
  (
    SELECT x1, x2, y
    FROM test2.traintest
    WHERE test
  )
)

In [None]:
# Display the ML.EVALUATE result that the %%bigquery magic stored in df.
df

In [None]:
# Pull the r2_score value out of the first row (label 0).
df.loc[0, "r2_score"]