## BigQuery 3 (continued)

In [None]:
# Google Cloud project ID and BigQuery dataset; later cells combine them into
# fully qualified table names.
project, dataset = "cs544-spring2024", "lec_demo"

In [None]:
# import statement
from google.cloud import bigquery

In [None]:
# BigQuery client, created with default settings; the Python cells below run
# their SQL through bq.query(...).
bq = bigquery.Client()

In [None]:
# Load the IPython extension that provides the %%bigquery cell magic used by later cells.
%load_ext google.cloud.bigquery

### Geopandas GeoDataFrame

- Installation requirements: `pip3 install geopandas`
- a geopandas GeoDataFrame is a subclass of pandas's DataFrame, and it has a geo column
- `to_dataframe` gives a pandas DataFrame
- `to_geodataframe` gives a geopandas GeoDataFrame

In [None]:
# Wisconsin's state FIPS code is 55. Fetch only the county geometry column and
# convert the result to a plain pandas DataFrame.
wi_counties_sql = """
SELECT county_geom
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE state_fips_code = '55'
"""
wi = bq.query(wi_counties_sql).to_dataframe()

In [None]:
# Show the pandas DataFrame produced by to_dataframe().
wi

In [None]:
# Same query as before (Wisconsin, state FIPS code 55), but this time the
# result is converted to a geopandas GeoDataFrame.
wi_counties_sql = """
SELECT county_geom
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE state_fips_code = '55'
"""
wi = bq.query(wi_counties_sql).to_geodataframe()

In [None]:
# Show the GeoDataFrame produced by to_geodataframe().
wi

In [None]:
# Check which class to_geodataframe() returned.
type(wi)

In [None]:
# Draw the county geometries with the default plot styling.
wi.plot()

In [None]:
# Same plot with explicit styling: grayscale fill "0.9" and edge color "0".
wi.plot(color="0.9", edgecolor="0")

### Export WI to our private table

Why? So that we don't have to keep querying the nationwide table just to retrieve the Wisconsin counties.

In [None]:
# Copy the Wisconsin counties (name + geometry) into our own table.
# CREATE TABLE is a DDL statement and yields no rows, so wait for the job to
# finish with result() rather than converting an empty result set into a
# GeoDataFrame. The trailing ";" suppresses the cell output.
bq.query("""
CREATE OR REPLACE TABLE `cs544-spring2024.lec_demo.wi`
AS
SELECT county_name, county_geom
FROM bigquery-public-data.geo_us_boundaries.counties
WHERE state_fips_code = '55'
""").result();

In [None]:
# Read the private copy of the Wisconsin table back as a GeoDataFrame and
# preview its first three rows.
wi_job = bq.query("SELECT * FROM lec_demo.wi")
wi = wi_job.to_geodataframe()
wi.head(3)

In [None]:
# Save the GeoDataFrame to wi.geojson using the text returned by to_json().
with open("wi.geojson", "w") as geojson_file:
    geojson_text = wi.to_json()
    geojson_file.write(geojson_text)

### ST_\<SOME_FUNCTION\>

ST => Spatial Type

In [None]:
# Baseline map: every row of the wi table, drawn with the shared styling.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
counties.plot(color="0.9", edgecolor="0")

### `ST_UNION_AGG`

- could be used with `GROUP BY`

In [None]:
# ST_UNION_AGG aggregates the county geometries into one result, which is
# then plotted with the same styling as the county map.
state_outline = bq.query("SELECT ST_UNION_AGG(county_geom) FROM lec_demo.wi").to_geodataframe()
state_outline.plot(color="0.9", edgecolor="0")

### `ST_CENTROID` and `ST_CENTROID_AGG`

In [None]:
# Base layer: the county polygons.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
ax = counties.plot(color="0.9", edgecolor="0")
# Overlay: ST_CENTROID applied to each row's geometry, drawn in red on the same axes.
centroids = bq.query("SELECT ST_CENTROID(county_geom) FROM lec_demo.wi").to_geodataframe()
centroids.plot(ax=ax, color="r")

In [None]:
# Base layer: the county polygons.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
ax = counties.plot(color="0.9", edgecolor="0")
# Overlay: ST_CENTROID_AGG aggregates over all rows; its result is drawn in red.
overall_centroid = bq.query("SELECT ST_CENTROID_AGG(county_geom) FROM lec_demo.wi").to_geodataframe()
overall_centroid.plot(ax=ax, color="r")

### `ST_GEOGPOINT`

In [None]:
# Load every column of the applications table into an ordinary pandas DataFrame.
applications_sql = """
SELECT *
FROM lec_demo.applications
"""
bq.query(applications_sql).to_dataframe()

In [None]:
# Demonstration of a failure: to_geodataframe() needs a geo column, but
# lat / lon are just floats. The error is caught and printed so that
# "Restart & Run All" can continue past this cell.
try:
    bq.query("""
SELECT *
FROM lec_demo.applications
""").to_geodataframe()
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# ST_GEOGPOINT(longitude, latitude) adds a geography column named loc, which
# gives to_geodataframe() a geo column to work with.
applications_geo_sql = """
SELECT *, ST_GEOGPOINT(longitude, latitude) AS loc
FROM lec_demo.applications
"""
bq.query(applications_geo_sql).to_geodataframe()

#### Creating a `houses` table

In [None]:
# Persist the applications rows plus their loc geography point as the houses table.
# CREATE TABLE is a DDL statement and yields no rows, so wait for the job to
# finish with result() rather than converting an empty result set into a
# GeoDataFrame. The trailing ";" suppresses the cell output.
bq.query("""
CREATE OR REPLACE TABLE lec_demo.houses
AS
SELECT *, ST_GEOGPOINT(longitude, latitude) AS loc
FROM lec_demo.applications
""").result();

In [None]:
# Base layer: the county polygons.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
ax = counties.plot(color="0.9", edgecolor="0")
# Overlay: every row of the houses table on the same axes.
houses = bq.query("SELECT * FROM lec_demo.houses").to_geodataframe()
houses.plot(ax=ax)

In [None]:
import geopandas

# Base layer: the 'naturalearth_lowres' dataset located via geopandas.datasets.
# NOTE(review): geopandas.datasets appears to be deprecated in geopandas 0.14
# and removed in 1.0 — confirm against the installed geopandas version.
world_path = geopandas.datasets.get_path('naturalearth_lowres')
world = geopandas.read_file(world_path)
ax = world.plot(color="0.9", edgecolor="0")
# Overlay: every row of the houses table, in red, on the same axes.
houses = bq.query("SELECT * FROM lec_demo.houses").to_geodataframe()
houses.plot(ax=ax, color="r")

#### What are the ten houses closest to the WI capitol?  
- `ST_DISTANCE`
- `ST_GEOGPOINT(-89.384107, 43.074715)`

In [None]:
%%bigquery
-- Distance from each house to the capitol point, divided by 1000 and
-- reported as km; keep the ten smallest values.
SELECT
  *,
  ST_DISTANCE(loc, ST_GEOGPOINT(-89.384107, 43.074715)) / 1000 AS km
FROM lec_demo.houses
ORDER BY km ASC
LIMIT 10

In [None]:
# Base layer: the county polygons.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
ax = counties.plot(color="0.9", edgecolor="0")
# Overlay: the ten houses with the smallest distance to the capitol point, in red.
closest_houses_sql = """
SELECT *, ST_DISTANCE(loc, ST_GEOGPOINT(-89.384107, 43.074715)) / 1000 AS km
FROM lec_demo.houses
ORDER BY km ASC
LIMIT 10
"""
bq.query(closest_houses_sql).to_geodataframe().plot(ax=ax, color="r")

### Spatial JOIN

- `CROSS JOIN` followed by a filter using `ST_WITHIN` or `ST_CONTAINS`
- or `INNER JOIN` with the filter inside `ON` clause
  
#### What houses are in WI?

In [None]:
%%bigquery
-- Spatial join: pair every house with every county, then keep only the pairs
-- where the house location lies within the county geometry.
SELECT
  houses.*,
  wi.county_name
FROM lec_demo.houses
CROSS JOIN lec_demo.wi
WHERE ST_WITHIN(houses.loc, wi.county_geom)

In [None]:
# Base layer: the county polygons.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
ax = counties.plot(color="0.9", edgecolor="0")
# Overlay: only the houses whose location falls within some county geometry, in red.
houses_in_wi_sql = """
SELECT houses.*, wi.county_name
FROM lec_demo.houses CROSS JOIN lec_demo.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(houses_in_wi_sql).to_geodataframe().plot(ax=ax, color="r")

### `ST_SNAPTOGRID`, `ST_GEOHASH`

- enables us to provide approximate location instead of actual location
- very useful to avoid privacy concerns

In [None]:
# Base layer: the county polygons.
counties = bq.query("SELECT * FROM lec_demo.wi").to_geodataframe()
ax = counties.plot(color="0.9", edgecolor="0")

# Exact locations (black): houses that fall within a county geometry.
exact_sql = """
SELECT houses.*, wi.county_name
FROM lec_demo.houses CROSS JOIN lec_demo.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(exact_sql).to_geodataframe().plot(ax=ax, color="k")

# Approximate locations (red): the same houses after ST_SNAPTOGRID with a
# grid size of 0.1.
approx_sql = """
SELECT ST_SNAPTOGRID(houses.loc, 0.1), wi.county_name
FROM lec_demo.houses CROSS JOIN lec_demo.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(approx_sql).to_geodataframe().plot(ax=ax, color="r")

In [None]:
# Show each in-state house location next to ST_GEOHASH(loc, 3) and the
# name of the county it falls within.
geohash_sql = """
SELECT houses.loc, ST_GEOHASH(houses.loc, 3), wi.county_name
FROM lec_demo.houses CROSS JOIN lec_demo.wi
WHERE ST_Within(houses.loc, wi.county_geom)
"""
bq.query(geohash_sql).to_dataframe()

### BigQuery ML

In [None]:
import pandas as pd
import numpy as np

# Seeded generator so the synthetic dataset is identical on every run.
rng = np.random.default_rng(544)
# Two integer-valued float features: x1 drawn from 0..9, x2 drawn from 0..2.
df = pd.DataFrame({"x1": rng.integers(0, 10, 100).astype(float),
                   "x2": rng.integers(0, 3, 100).astype(float)})
# Label: x1 + x2 plus uniform noise in [0, 1).
df["y"] = df["x1"] + df["x2"] + rng.random(len(df))
df.head()

In [None]:
# gbq = Google BigQuery. Upload the frame to <project>.<dataset>.sampleml,
# replacing the table if it already exists.
sampleml_table = f"{project}.{dataset}.sampleml"
df.to_gbq(sampleml_table, if_exists="replace")

### Train-test split using `rand()`

In [None]:
%%bigquery
-- Preview a random split: test is TRUE when rand() falls below 0.25.
SELECT
  x1,
  x2,
  y,
  RAND() < 0.25 AS test
FROM lec_demo.sampleml

In [None]:
%%bigquery
-- Store the random split in a table so later cells can reuse the same
-- train/test assignment.
CREATE OR REPLACE TABLE lec_demo.traintest AS
SELECT
  x1,
  x2,
  y,
  RAND() < 0.25 AS test
FROM lec_demo.sampleml

### Creating and training a `LINEAR_REG` model

In [None]:
%%bigquery
-- Training rows only: those where test is FALSE.
SELECT x1, x2, y
FROM lec_demo.traintest
WHERE NOT test

In [None]:
%%bigquery
-- Train a linear regression on the training rows, with y as the label column.
CREATE OR REPLACE MODEL lec_demo.lr
OPTIONS(model_type="LINEAR_REG", INPUT_LABEL_COLS=["y"]) AS
SELECT x1, x2, y
FROM lec_demo.traintest
WHERE NOT test

### Model weights: `ML.WEIGHTS`

In [None]:
%%bigquery
-- Inspect the weights of the lr model.
SELECT * FROM ML.WEIGHTS(MODEL lec_demo.lr)

### Using the model to make predictions: `ML.PREDICT`

In [None]:
%%bigquery
-- Predict on the test rows; the input query includes the label column y.
SELECT *
FROM ML.PREDICT(
  MODEL lec_demo.lr,
  (
    SELECT x1, x2, y
    FROM lec_demo.traintest
    WHERE test
  )
)

In [None]:
%%bigquery
-- Same prediction query with the input columns listed in a different
-- order (y first).
SELECT *
FROM ML.PREDICT(
  MODEL lec_demo.lr,
  (
    SELECT y, x1, x2
    FROM lec_demo.traintest
    WHERE test
  )
)

For `ML.PREDICT`, the label column is optional.

In [None]:
%%bigquery
-- Prediction query whose input omits the label column y.
SELECT *
FROM ML.PREDICT(
  MODEL lec_demo.lr,
  (
    SELECT x1, x2
    FROM lec_demo.traintest
    WHERE test
  )
)

### Evaluating how well the model is performing: `ML.EVALUATE`

For `ML.EVALUATE`, the label column is required.

In [None]:
%%bigquery df
-- Evaluation attempt whose input omits the label column y. The note above
-- this cell says ML.EVALUATE requires the label, so this query is presumably
-- expected to be rejected.
SELECT *
FROM ML.EVALUATE(
  MODEL lec_demo.lr,
  (
    SELECT x1, x2
    FROM lec_demo.traintest
    WHERE test
  )
)

In [None]:
%%bigquery df
-- Evaluate lr on the test rows, label included; the result is stored in df.
SELECT *
FROM ML.EVALUATE(
  MODEL lec_demo.lr,
  (
    SELECT x1, x2, y
    FROM lec_demo.traintest
    WHERE test
  )
)

In [None]:
# Show the evaluation result that the previous %%bigquery cell stored in df.
df

In [None]:
# Pull the r2_score value out of the row labeled 0.
df.at[0, "r2_score"]

**FINAL EXAM CUT OFF POINT**
***

### Transformers

In [None]:
# Seeded generator so the synthetic dataset is identical on every run.
rng = np.random.default_rng(544)
# Two integer-valued float features: x1 drawn from 0..9, x2 drawn from 0..2.
df = pd.DataFrame({"x1": rng.integers(0, 10, 100).astype(float),
                   "x2": rng.integers(0, 3, 100).astype(float)})
# Label depends on the square of x1, so it is not linear in the raw features:
# y = 2 * x1^2 - 3 * x2 plus uniform noise in [0, 1).
df["y"] = 2 * df["x1"] ** 2 - 3 * df["x2"] + rng.random(len(df))
df.head()

In [None]:
# Upload the frame to <project>.<dataset>.tbl1. if_exists="replace" lets the
# cell be re-run once the table exists, matching the sampleml upload.
df.to_gbq(f"{project}.{dataset}.tbl1", if_exists="replace")

In [None]:
%%bigquery
-- m1: linear regression on the raw features x1 and x2, with y as the label.
CREATE OR REPLACE MODEL lec_demo.m1
OPTIONS(model_type="LINEAR_REG", INPUT_LABEL_COLS=["y"]) AS
SELECT x1, x2, y
FROM lec_demo.tbl1

In [None]:
%%bigquery df
-- Evaluate m1 on all of tbl1; the result is stored in df.
SELECT *
FROM ML.EVALUATE(
  MODEL lec_demo.m1,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl1
  )
)

In [None]:
# Show the m1 evaluation result stored in df by the previous cell.
df

In [None]:
%%bigquery
-- Preview five rows with each raw feature next to its square.
SELECT
  x1,
  POWER(x1, 2) AS x1_2,
  x2,
  POWER(x2, 2) AS x2_2,
  y
FROM lec_demo.tbl1
LIMIT 5

In [None]:
%%bigquery
-- m2: same input query as m1, but the TRANSFORM clause adds the squared
-- features x1_2 and x2_2 to the raw columns.
CREATE OR REPLACE MODEL lec_demo.m2
TRANSFORM(
  x1,
  POWER(x1, 2) AS x1_2,
  x2,
  POWER(x2, 2) AS x2_2,
  y
)
OPTIONS(model_type="LINEAR_REG", INPUT_LABEL_COLS=["y"]) AS
SELECT x1, x2, y
FROM lec_demo.tbl1

In [None]:
%%bigquery df
-- Evaluate m2 on all of tbl1, passing only the raw columns; the result is
-- stored in df.
SELECT *
FROM ML.EVALUATE(
  MODEL lec_demo.m2,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl1
  )
)

In [None]:
# Show the m2 evaluation result stored in df by the previous cell.
df

#### Compare m1 to m2

In [None]:
%%bigquery
-- Stack the full evaluation rows of m1 and m2 into one result.
SELECT *
FROM ML.EVALUATE(
  MODEL lec_demo.m1,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl1
  )
)
UNION ALL
SELECT *
FROM ML.EVALUATE(
  MODEL lec_demo.m2,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl1
  )
)

In [None]:
%%bigquery df
-- One row per model, labeled "m1" / "m2", keeping only r2_score; the result
-- is stored in df.
SELECT
  "m1" AS model,
  r2_score
FROM ML.EVALUATE(
  MODEL lec_demo.m1,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl1
  )
)
UNION ALL
SELECT
  "m2" AS model,
  r2_score
FROM ML.EVALUATE(
  MODEL lec_demo.m2,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl1
  )
)

In [None]:
# Show the per-model r2_score table stored in df by the previous cell.
df

In [None]:
# Bar chart of r2_score with the model name as the index.
r2_by_model = df.set_index("model")
ax = r2_by_model.plot.bar(figsize=(3, 3))
ax.set_ylabel("R^2 Score")

### One-hot encoding example

In [None]:
# Seeded generator so the synthetic dataset is identical on every run.
rng = np.random.default_rng(544)
# x1 is a categorical feature ("A"/"B"/"C"); x2 is an integer-valued float from 0..9.
df = pd.DataFrame({"x1": rng.choice(["A", "B", "C"], 100),
                   "x2": rng.integers(0, 10, 100).astype(float)})
# Label: a per-category offset (A=1, B=5, C=15) + 2 * x2 plus uniform noise in [0, 1).
df["y"] = df["x1"].map({"A":1, "B":5, "C":15}) + 2 * df["x2"] + rng.random(len(df))
df.head(3)

In [None]:
# Upload the frame to <project>.<dataset>.tbl2. if_exists="replace" lets the
# cell be re-run once the table exists, matching the sampleml upload.
df.to_gbq(f"{project}.{dataset}.tbl2", if_exists="replace")

In [None]:
%%bigquery
-- m3: linear regression on the categorical x1 and numeric x2, with y as the label.
-- CREATE OR REPLACE (rather than plain CREATE) lets the cell be re-run once
-- lec_demo.m3 exists, matching how the other models in this notebook are created.
CREATE OR REPLACE MODEL lec_demo.m3
OPTIONS(model_type="LINEAR_REG", input_label_cols=["y"])

AS

SELECT x1, x2, y
FROM lec_demo.tbl2

In [None]:
%%bigquery
-- r2_score of m3 evaluated on all of tbl2, labeled with the model name.
SELECT
  "m3" AS model,
  r2_score
FROM ML.EVALUATE(
  MODEL lec_demo.m3,
  (
    SELECT x1, x2, y
    FROM lec_demo.tbl2
  )
)

In [None]:
%%bigquery df
-- Fetch the weights of m3; the result is stored in df.
SELECT * FROM ML.WEIGHTS(MODEL lec_demo.m3)

In [None]:
# Show the m3 weights stored in df by the previous cell.
df

In [None]:
# Look up the category_weights entry for the input x1 and convert it to a
# plain Python list.
weights_by_input = df.set_index("processed_input")
w = weights_by_input.at["x1", "category_weights"].tolist()
w

In [None]:
# Turn the list of x1 category weights into a DataFrame and display it.
weights = pd.DataFrame(w)
weights

In [None]:
# Bar chart of the weights with the category as the index.
weights_by_category = weights.set_index("category")
weights_by_category.plot.bar()