### Create modeled dataset with design principles

The structure of our data is already broken down into entities, so we just copied over the original tables from the staging dataset. The only problem with the original tables is that many fields were autodetected with type String because the data uses "/N" to represent nulls. Because of this, we were not able to cast columns to Integers, so we plan to accomplish this with Apache Beam.

In [1]:
dataset_id = "musicbrainz_modeled"

In [2]:
!bq --location=US mk --dataset {dataset_id}

Dataset 'earnest-keep-266820:musicbrainz_modeled' successfully created.


Example:
%%bigquery
-- Template example: stack current and new students into one Student table,
-- tagging each row with a status code ('CUR' or 'PRO').
create table college_modeled.Student as
select distinct
    sid,
    fname,
    lname,
    dob,
    'CUR' as status
from college_staging.Current_Students
union all
select distinct
    sid,
    fname,
    lname,
    cast(dob as string) as dob,  -- cast so both branches yield a string dob
    'PRO' as status
from college_staging.New_Students;

In [4]:
%%bigquery
-- Modeled Area entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Area as
select
    area_id,
    area_name,
    area_type,
    begin_year,
    begin_month,
    begin_day,
    end_year,
    end_month,
    end_day,
    ended
from musicbrainz_staging.Area

In [5]:
%%bigquery
-- Modeled Artist entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Artist as
select
    artist_id,
    artist_name,
    sort_name,
    begin_year,
    begin_month,
    begin_day,
    end_year,
    end_month,
    end_day,
    artist_type,
    area_id,
    gender,
    comment,
    ended,
    begin_area_id,
    end_area_id
from musicbrainz_staging.Artist

In [6]:
%%bigquery
-- Modeled Event entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Event as
select
    event_id,
    event_name,
    begin_year,
    begin_month,
    begin_day,
    end_year,
    end_month,
    end_day,
    start_time,
    event_type,
    cancelled,
    setlist,
    comment
from musicbrainz_staging.Event

In [1]:
%%bigquery
-- Modeled Label entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Label as
select
    label_id,
    label_name,
    begin_year,
    begin_month,
    begin_day,
    end_year,
    end_month,
    end_day,
    label_code,
    label_type,
    label_area_id,
    comment,
    ended
from musicbrainz_staging.Label

In [8]:
%%bigquery
-- Modeled Place entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Place as
select
    place_id,
    place_name,
    place_type,
    address,
    area_id,
    coordinates,
    comment,
    begin_year,
    begin_month,
    begin_day,
    end_year,
    end_month,
    end_day,
    ended
from musicbrainz_staging.Place

In [10]:
%%bigquery
-- Modeled Recording entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Recording as
select
    rec_id,
    rec_name,
    artist_id,
    length,
    comment
from musicbrainz_staging.Recording

In [11]:
%%bigquery
-- Modeled Release entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Release as
select
    rel_id,
    rel_name,
    artist_id,
    rel_group,
    status,
    packaging,
    language,
    script,
    barcode,
    comment,
    quality
from musicbrainz_staging.Release

In [3]:
%%bigquery
-- Modeled Release_Group entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Release_Group as
select
    rel_gr_id,
    rel_gr_name,
    artist_id,
    rel_gr_type,
    comment
from musicbrainz_staging.Release_Group

In [14]:
%%bigquery
-- Modeled URL entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.URL as
select
    url_id,
    link
from musicbrainz_staging.URL

In [15]:
%%bigquery
-- Modeled Work entity: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Work as
select
    work_id,
    work_name,
    work_type,
    comment
from musicbrainz_staging.Work

In [29]:
%%bigquery
-- Area_Type lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Area_Type as
select
    area_type_id,
    area_type,
    area_comment
from musicbrainz_staging.Area_Type

In [30]:
%%bigquery
-- Artist_Type lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Artist_Type as
select
    artist_type_id,
    artist_type
from musicbrainz_staging.Artist_Type

In [7]:
%%bigquery
-- Event_Type lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Event_Type as
select
    event_type_id,
    type,
    comment
from musicbrainz_staging.Event_Type

In [32]:
%%bigquery
-- Gender lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Gender as
select
    gender_id,
    gender_type,
    comment
from musicbrainz_staging.Gender

In [34]:
%%bigquery
-- Label_Type lookup table: the listed staging columns are copied unchanged.
-- NOTE(review): the key column here is named label_id, not label_type_id; confirm this is intended.
create table musicbrainz_modeled.Label_Type as
select
    label_id,
    label_type
from musicbrainz_staging.Label_Type

In [21]:
%%bigquery
-- Language lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Language as
select
    language_id,
    language_name,
    lang_name_short
from musicbrainz_staging.Language

In [35]:
%%bigquery
-- Place_Type lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Place_Type as
select
    place_type_id,
    place_name,
    comment
from musicbrainz_staging.Place_Type

In [36]:
%%bigquery
-- Release_Group_Type lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Release_Group_Type as
select
    release_group_type_id,
    release_group_name
from musicbrainz_staging.Release_Group_Type

In [37]:
%%bigquery
-- Release_Status lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Release_Status as
select
    release_status_id,
    release_status_name,
    comment
from musicbrainz_staging.Release_Status

In [25]:
%%bigquery
-- Script lookup table: the listed staging columns are copied unchanged.
create table musicbrainz_modeled.Script as
select
    script_id,
    short_name,
    script_code,
    script_name
from musicbrainz_staging.Script

### Run the Apache Beam DirectRunner to clean 500 rows of the Area data

In [1]:
%run area_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.internal.gcp.auth:Setting socket default timeout to 60 seconds.
INFO:apache_beam.internal.gcp.auth:socket default timeout is 60.0 seconds.
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Area'> referenced by query SELECT * from musicbrainz_modeled.Area limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.1

Check the table created in BigQuery by Beam to ensure that the FK and PK relationships still exist.
First check if it has a valid primary key.

In [3]:
%%bigquery
-- Distinct area_id values in the Beam sample table; compare with the row count.
select count(distinct area_id)
from musicbrainz_modeled.Area_Beam

Unnamed: 0,f0_
0,500


In [4]:
%%bigquery
-- Non-null area_id values in the Beam sample table; compare with the distinct count.
select count(area_id)
from musicbrainz_modeled.Area_Beam

Unnamed: 0,f0_
0,500


Then check if it still has a foreign key corresponding to the Area_Type table.

In [6]:
%%bigquery
-- Orphan check: Area_Beam rows whose non-null area_type has no matching
-- Area_Type row. Expect 0 when the foreign key holds.
-- Rows with a NULL area_type are excluded: a missing type is not a dangling reference.
select count(*)
from musicbrainz_modeled.Area_Beam as ab
left join musicbrainz_modeled.Area_Type as aty
    on cast(aty.area_type_id as INT64) = ab.area_type
where ab.area_type is not null
    and aty.area_type_id is null

Unnamed: 0,f0_
0,0


<font size = "40">Milestone 6</font>

For this milestone, we ran Dataflow transformations to cast values in our tables that could not be cast in SQL. We then checked that each of these tables has a primary key and that the foreign key relationships we discovered before still exist. We did not perform Beam transformations on every table in our modeled dataset because many of our tables are 'type' tables that contain only a few rows and require no transformations (we checked with the TAs that this was okay!)

### AREA TRANSFORMATIONS
Use dataflow to run the area transformations.

In [2]:
%run area_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/area-beam-dataflow.1583693977.124099/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/area-beam-dataflow.1583693977.124099/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpwl4ugpke', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/area-beam-dataflow.1583693977.124099/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/area-beam-dataflow.1583693977.124099/da

Check primary key for full area table.

In [21]:
%%bigquery
-- Total rows in the Dataflow-cleaned Area table; compare with the distinct area_id count.
select count(*)
from musicbrainz_modeled.Area_Beam_DF

Unnamed: 0,f0_
0,118076


In [22]:
%%bigquery
-- Distinct area_id values in the Dataflow-cleaned Area table; compare with the row count.
select count(distinct area_id)
from musicbrainz_modeled.Area_Beam_DF

Unnamed: 0,f0_
0,118076


Area foreign key relationships:

In [24]:
%%bigquery
-- Orphan check: Place rows whose non-null area_id has no matching Area row.
-- Expect 0 when the foreign key holds.
-- count(*) is required: count(a.area_id) would skip exactly the NULL parent
-- keys this filter selects and would therefore always return 0.
select count(*)
from musicbrainz_modeled.Place_Beam_DF as p
left join musicbrainz_modeled.Area_Beam_DF as a
    on a.area_id = p.area_id
where p.area_id is not null
    and a.area_id is null

Unnamed: 0,f0_
0,0


In [27]:
%%bigquery
-- Orphan check: Label rows whose non-null label_area_id has no matching Area row.
-- Expect 0 when the foreign key holds.
-- count(*) is required: count(a.area_id) would skip exactly the NULL parent
-- keys this filter selects and would therefore always return 0.
select count(*)
from musicbrainz_modeled.Label_Beam_DF as l
left join musicbrainz_modeled.Area_Beam_DF as a
    on a.area_id = l.label_area_id
where l.label_area_id is not null
    and a.area_id is null

Unnamed: 0,f0_
0,0


In [38]:
%%bigquery
-- Orphan check: Area rows whose non-null area_type has no matching Area_Type row.
-- Expect 0 when the foreign key holds.
-- The child table (Area) must be the preserved side of the join; keeping
-- Area_Type instead would only reveal unused types, not dangling references.
select count(*)
from musicbrainz_modeled.Area_Beam_DF as a
left join musicbrainz_modeled.Area_Type as aty
    on aty.area_type_id = a.area_type
where a.area_type is not null
    and aty.area_type_id is null

Unnamed: 0,f0_
0,0


### RELEASE TRANSFORMATIONS

In [14]:
%run 'beam programs'/release_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:oauth2client.transport:Refreshing due to a 401 (attempt 1/2)
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Release'> referenced by query SELECT * from musicbrainz_modeled.Release limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.fileb

In [19]:
%run 'beam programs'/release_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/release-beam-dataflow.1583723468.781853/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/release-beam-dataflow.1583723468.781853/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmprhst1az1', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/release-beam-dataflow.1583723468.781853/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/release-beam-dataflow.15837234

Check release primary key

In [41]:
%%bigquery
-- Total rows in the Dataflow-cleaned Release table; compare with the distinct rel_id count.
select count(*)
from musicbrainz_modeled.Release_Beam_DF

Unnamed: 0,f0_
0,2434536


In [42]:
%%bigquery
-- Distinct rel_id values in the Dataflow-cleaned Release table; compare with the row count.
select count(distinct rel_id)
from musicbrainz_modeled.Release_Beam_DF

Unnamed: 0,f0_
0,2434536


Check release foreign key relationships

In [43]:
%%bigquery
-- Orphan check: Release rows whose non-null artist_id has no matching Artist row.
-- Expect 0 when the foreign key holds.
-- count(*) is required: count(a.artist_id) would skip exactly the NULL parent
-- keys this filter selects and would therefore always return 0.
select count(*)
from musicbrainz_modeled.Release_Beam_DF as r
left join musicbrainz_modeled.Artist_Beam_DF as a
    on a.artist_id = r.artist_id
where r.artist_id is not null
    and a.artist_id is null

Unnamed: 0,f0_
0,0


In [44]:
%%bigquery
-- Orphan check: Release rows whose non-null language has no matching Language row.
-- Expect 0 when the foreign key holds.
-- The child table (Release) must be the preserved side of the join; keeping
-- Language instead would only reveal unused languages, not dangling references.
select count(*)
from musicbrainz_modeled.Release_Beam_DF as r
left join musicbrainz_modeled.Language_Beam_DF as l
    on l.language_id = r.language
where r.language is not null
    and l.language_id is null

Unnamed: 0,f0_
0,0


In [45]:
%%bigquery
-- Orphan check: Release rows whose non-null status has no matching Release_Status row.
-- Expect 0 when the foreign key holds.
-- The child table (Release) must be the preserved side of the join; keeping
-- Release_Status instead would only reveal unused statuses, not dangling references.
select count(*)
from musicbrainz_modeled.Release_Beam_DF as r
left join musicbrainz_modeled.Release_Status as rs
    on rs.release_status_id = r.status
where r.status is not null
    and rs.release_status_id is null

Unnamed: 0,f0_
0,0


### LABEL BEAM TRANSFORMS

In [34]:
%run 'beam programs'/label_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Label'> referenced by query SELECT * from musicbrainz_modeled.Label limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.gcp.big

In [35]:
%run 'beam programs'/label_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/label-beam-dataflow.1583727051.802232/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/label-beam-dataflow.1583727051.802232/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpsmcas6pb', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/label-beam-dataflow.1583727051.802232/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/label-beam-dataflow.1583727051.80223

Check that label still has a primary key

In [48]:
%%bigquery
-- Total rows in the Dataflow-cleaned Label table; compare with the distinct label_id count.
select count(*)
from musicbrainz_modeled.Label_Beam_DF

Unnamed: 0,f0_
0,174391


In [49]:
%%bigquery
-- Distinct label_id values in the Dataflow-cleaned Label table; compare with the row count.
select count(distinct label_id)
from musicbrainz_modeled.Label_Beam_DF

Unnamed: 0,f0_
0,174391


Check foreign key relationships

In [50]:
%%bigquery
-- Orphan check: Label rows whose non-null label_type has no matching Label_Type row
-- (Label_Type's key column is named label_id). Expect 0 when the foreign key holds.
-- The child table (Label) must be the preserved side of the join; keeping
-- Label_Type instead would only reveal unused types, not dangling references.
select count(*)
from musicbrainz_modeled.Label_Beam_DF as l
left join musicbrainz_modeled.Label_Type as lt
    on lt.label_id = l.label_type
where l.label_type is not null
    and lt.label_id is null

Unnamed: 0,f0_
0,0


### LANGUAGE BEAM TRANSFORMS

In [27]:
%run 'beam programs'/language_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Language'> referenced by query SELECT * from musicbrainz_modeled.Language limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.g

In [9]:
%run 'beam programs'/language_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/language-beam-dataflow.1583731416.390911/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/language-beam-dataflow.1583731416.390911/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpf0cng3wk', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/language-beam-dataflow.1583731416.390911/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/language-beam-dataflow.1583

Check that language has a valid primary key

In [51]:
%%bigquery
-- Total rows in the Dataflow-cleaned Language table; compare with the distinct language_id count.
select count(*)
from musicbrainz_modeled.Language_Beam_DF

Unnamed: 0,f0_
0,7843


In [52]:
%%bigquery
-- Distinct language_id values in the Dataflow-cleaned Language table; compare with the row count.
select count(distinct language_id)
from musicbrainz_modeled.Language_Beam_DF

Unnamed: 0,f0_
0,7843


### PLACE BEAM TRANSFORMS

In [33]:
%run 'beam programs'/place_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Place'> referenced by query SELECT * from musicbrainz_modeled.Place limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.gcp.big

In [10]:
%run 'beam programs'/place_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/place-beam-dataflow.1583731929.648058/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/place-beam-dataflow.1583731929.648058/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpnmybf78s', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/place-beam-dataflow.1583731929.648058/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/place-beam-dataflow.1583731929.64805

Check that place has a valid primary key

In [53]:
%%bigquery
-- Total rows in the Dataflow-cleaned Place table; compare with the distinct place_id count.
select count(*)
from musicbrainz_modeled.Place_Beam_DF

Unnamed: 0,f0_
0,39888


In [54]:
%%bigquery
-- Distinct place_id values in the Dataflow-cleaned Place table; compare with the row count.
select count(distinct place_id)
from musicbrainz_modeled.Place_Beam_DF

Unnamed: 0,f0_
0,39888


Check foreign keys

In [None]:
%%bigquery
-- Orphan check: Place rows whose non-null place_type has no matching Place_Type row.
-- Expect 0 when the foreign key holds.
-- The child table (Place) must be the preserved side of the join; keeping
-- Place_Type instead would only reveal unused types, not dangling references.
select count(*)
from musicbrainz_modeled.Place_Beam_DF as p
left join musicbrainz_modeled.Place_Type as pt
    on pt.place_type_id = p.place_type
where p.place_type is not null
    and pt.place_type_id is null

### RECORDING BEAM TRANSFORMS

In [37]:
%run 'beam programs'/recording_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Recording'> referenced by query SELECT * from musicbrainz_modeled.Recording limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io

In [None]:
%run 'beam programs'/recording_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/recording-beam-dataflow.1583813459.364519/pipeline.pb...
INFO:oauth2client.transport:Refreshing due to a 401 (attempt 1/2)
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/recording-beam-dataflow.1583813459.364519/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmprldxpbij', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/recording-beam-dataflow.1583813459.364519/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upl

Check that recording has a primary key

In [87]:
%%bigquery
-- Total rows in the Dataflow-cleaned Recording table; compare with the distinct rec_id count.
select count(*)
from musicbrainz_modeled.Recording_Beam_DF

Unnamed: 0,f0_
0,21874183


In [88]:
%%bigquery
-- Distinct rec_id values in the Dataflow-cleaned Recording table; compare with the row count.
select count(distinct rec_id)
from musicbrainz_modeled.Recording_Beam_DF

Unnamed: 0,f0_
0,21874183


### RELEASE GROUP TRANSFORMS

In [8]:
%run 'beam programs'/release_group_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Release_Group'> referenced by query SELECT * from musicbrainz_modeled.Release_Group limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache

In [12]:
%run 'beam programs'/release_group_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/release-group-beam-dataflow.1583733603.802385/pipeline.pb...
INFO:oauth2client.transport:Refreshing due to a 401 (attempt 1/2)
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/release-group-beam-dataflow.1583733603.802385/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpv3utoyj5', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/release-group-beam-dataflow.1583733603.802385/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Star

Check that release group has a primary key

In [59]:
%%bigquery
-- Total rows in the Dataflow-cleaned Release_Group table; compare with the distinct rel_gr_id count.
select count(*)
from musicbrainz_modeled.Release_Group_Beam_DF

Unnamed: 0,f0_
0,1923172


In [60]:
%%bigquery
-- Distinct rel_gr_id values in the Dataflow-cleaned Release_Group table; compare with the row count.
select count(distinct rel_gr_id)
from musicbrainz_modeled.Release_Group_Beam_DF

Unnamed: 0,f0_
0,1923172


Find foreign key relationships

In [61]:
%%bigquery
-- Orphan check: Release_Group rows whose non-null rel_gr_type has no matching
-- Release_Group_Type row. Expect 0 when the foreign key holds.
-- The child table (Release_Group) must be the preserved side of the join; keeping
-- Release_Group_Type instead would only reveal unused types, not dangling references.
select count(*)
from musicbrainz_modeled.Release_Group_Beam_DF as rg
left join musicbrainz_modeled.Release_Group_Type as rgt
    on rgt.release_group_type_id = rg.rel_gr_type
where rg.rel_gr_type is not null
    and rgt.release_group_type_id is null

Unnamed: 0,f0_
0,0


### WORK TRANSFORMS

In [62]:
%run 'beam programs'/work_beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:oauth2client.transport:Refreshing due to a 401 (attempt 1/2)
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Work'> referenced by query SELECT * from musicbrainz_modeled.Work limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsi

In [None]:
%run 'beam programs'/work_beam_dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/work-beam-dataflow.1583734219.011242/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/work-beam-dataflow.1583734219.011242/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpgp4v45ww', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/work-beam-dataflow.1583734219.011242/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/work-beam-dataflow.1583734219.011242/da

Check primary key relationships for work

In [63]:
%%bigquery
-- Total rows in the Dataflow-cleaned Work table; compare with the distinct work_id count.
select count(*)
from musicbrainz_modeled.Work_Beam_DF

Unnamed: 0,f0_
0,1270291


In [64]:
%%bigquery
-- Distinct work_id values in the Dataflow-cleaned Work table; compare with the row count.
select count(distinct work_id)
from musicbrainz_modeled.Work_Beam_DF

Unnamed: 0,f0_
0,1270291


### Clean URL Tables

In [4]:
%run 'beam programs'/URL_Beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.internal.gcp.auth:Setting socket default timeout to 60 seconds.
INFO:apache_beam.internal.gcp.auth:socket default timeout is 60.0 seconds.
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'URL'> referenced by query SELECT * from musicbrainz_modeled.URL limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.11 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 

In [6]:
%run 'beam programs'/URL_Beam_Dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/url-beam-dataflow.1583779211.645743/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/url-beam-dataflow.1583779211.645743/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpx4ln0j47', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/url-beam-dataflow.1583779211.645743/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/url-beam-dataflow.1583779211.645743/datafl

Check URL Primary key

In [65]:
%%bigquery
-- Total rows in the Dataflow-cleaned URL table; compare with the distinct url_id count.
select count(*)
from musicbrainz_modeled.URL_Beam_DF

Unnamed: 0,f0_
0,6373091


In [66]:
%%bigquery
-- Distinct url_id values in the Dataflow-cleaned URL table; compare with the row count.
select count(distinct url_id)
from musicbrainz_modeled.URL_Beam_DF

Unnamed: 0,f0_
0,6373091


### CLEAN EVENT TABLES

In [11]:
%run 'beam programs'/Event_Beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Event'> referenced by query SELECT * from musicbrainz_modeled.Event limit 500
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.gcp.big

In [13]:
%run 'beam programs'/Event_Beam_Dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/event-beam-dataflow.1583780672.895681/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/event-beam-dataflow.1583780672.895681/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpx822fdn7', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/event-beam-dataflow.1583780672.895681/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/event-beam-dataflow.1583780672.89568

Check event primary key

In [68]:
%%bigquery
-- Total rows in the Dataflow-cleaned Event table; compare with the distinct event_id count.
select count(*)
from musicbrainz_modeled.Event_Beam_DF

Unnamed: 0,f0_
0,40693


In [69]:
%%bigquery
-- Distinct event_id values in the Dataflow-cleaned Event table; compare with the row count.
select count(distinct event_id)
from musicbrainz_modeled.Event_Beam_DF

Unnamed: 0,f0_
0,40693


Check foreign keys

In [70]:
%%bigquery
-- Orphan check: Event rows whose non-null event_type has no matching Event_Type row.
-- Expect 0 when the foreign key holds.
-- The child table (Event) must be the preserved side of the join; keeping
-- Event_Type instead would only reveal unused types, not dangling references.
select count(*)
from musicbrainz_modeled.Event_Beam_DF as e
left join musicbrainz_modeled.Event_Type as et
    on et.event_type_id = e.event_type
where e.event_type is not null
    and et.event_type_id is null

Unnamed: 0,f0_
0,0


### CLEAN ARTIST TABLES

In [16]:
%run 'beam programs'/Artist_Beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'musicbrainz_modeled'
 projectId: 'earnest-keep-266820'
 tableId: 'Artist'> referenced by query SELECT * from musicbrainz_modeled.Artist limit 500
INFO:apache_beam.io.gcp.bigquery_tools:Waiting on response from query: SELECT * from musicbrainz_modeled.Artist limit 500 ...
INFO:apache_beam.io.gcp.bigquery_tools:Waiting on response from query: SELECT * from musicbrainz_modeled.Artist limit 500 ...
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.10 seconds.
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:R

In [18]:
%run 'beam programs'/Artist_Beam_Dataflow.py

  kms_key=transform.kms_key))
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/artist-beam-dataflow.1583781343.474323/pipeline.pb...
INFO:apache_beam.runners.dataflow.internal.apiclient:Completed GCS upload to gs://jeffersonballers-yeet/staging/artist-beam-dataflow.1583781343.474323/pipeline.pb in 0 seconds.
INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/home/jupyter/venv/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpll6kh1wa', 'apache-beam==2.19.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI to gs://jeffersonballers-yeet/staging/artist-beam-dataflow.1583781343.474323/dataflow_python_sdk.tar
INFO:apache_beam.runners.dataflow.internal.apiclient:Starting GCS upload to gs://jeffersonballers-yeet/staging/artist-beam-dataflow.1583781343.4

Check primary key for artist

In [71]:
%%bigquery
-- Total rows in the Dataflow-cleaned Artist table; compare with the distinct artist_id count.
select count(*)
from musicbrainz_modeled.Artist_Beam_DF

Unnamed: 0,f0_
0,1604371


In [72]:
%%bigquery
-- Distinct artist_id values in the Dataflow-cleaned Artist table; compare with the row count.
select count(distinct artist_id)
from musicbrainz_modeled.Artist_Beam_DF

Unnamed: 0,f0_
0,1604371


Check foreign keys

In [90]:
%%bigquery
-- Orphan check: Recording rows whose non-null artist_id has no matching Artist row.
-- Expect 0 when the foreign key holds.
-- count(*) is required: count(a.artist_id) would skip exactly the NULL parent
-- keys this filter selects and would therefore always return 0.
select count(*)
from musicbrainz_modeled.Recording_Beam_DF as r
left join musicbrainz_modeled.Artist_Beam_DF as a
    on a.artist_id = r.artist_id
where r.artist_id is not null
    and a.artist_id is null

Unnamed: 0,f0_
0,0


In [74]:
%%bigquery
-- Orphan check: Release_Group rows whose non-null artist_id has no matching Artist row.
-- Expect 0 when the foreign key holds.
-- count(*) is required: count(artist.artist_id) would skip exactly the NULL
-- parent keys this filter selects and would therefore always return 0.
select count(*)
from musicbrainz_modeled.Release_Group_Beam_DF as rg
left join musicbrainz_modeled.Artist_Beam_DF as artist
    on artist.artist_id = rg.artist_id
where rg.artist_id is not null
    and artist.artist_id is null

Unnamed: 0,f0_
0,0


In [76]:
%%bigquery
-- Orphan check: Release rows whose non-null artist_id has no matching Artist row.
-- Expect 0 when the foreign key holds.
-- count(*) is required: count(a.artist_id) would skip exactly the NULL parent
-- keys this filter selects and would therefore always return 0.
select count(*)
from musicbrainz_modeled.Release_Beam_DF as r
left join musicbrainz_modeled.Artist_Beam_DF as a
    on a.artist_id = r.artist_id
where r.artist_id is not null
    and a.artist_id is null

Unnamed: 0,f0_
0,0


In [77]:
%%bigquery
-- Orphan check: Artist rows whose non-null artist_type has no matching Artist_Type row.
-- Expect 0 when the foreign key holds.
-- The child table (Artist) must be the preserved side of the join; keeping
-- Artist_Type instead would only reveal unused types, not dangling references.
select count(*)
from musicbrainz_modeled.Artist_Beam_DF as a
left join musicbrainz_modeled.Artist_Type as aty
    on aty.artist_type_id = a.artist_type
where a.artist_type is not null
    and aty.artist_type_id is null

Unnamed: 0,f0_
0,0


In [78]:
%%bigquery
-- Orphan check: Artist rows whose non-null gender has no matching Gender row.
-- Expect 0 when the foreign key holds.
-- The child table (Artist) must be the preserved side of the join; keeping
-- Gender instead would only reveal unused genders, not dangling references.
select count(*)
from musicbrainz_modeled.Artist_Beam_DF as a
left join musicbrainz_modeled.Gender as g
    on g.gender_id = a.gender
where a.gender is not null
    and g.gender_id is null

Unnamed: 0,f0_
0,0
