### Create modeled dataset with design principles

In [2]:
# BigQuery dataset that receives the modeled tables; interpolated into the `bq mk` shell cell.
dataset_id: str = "discogs_modeled"

In [47]:
# Create the target dataset in the US location with the bq CLI; the dataset name is taken
# from the notebook variable dataset_id.
# NOTE(review): the recorded output shows bq received the placeholder text literally rather
# than the variable's value, which suggests dataset_id was not defined in the kernel when
# this cell ran -- run the cell that assigns dataset_id first, then re-run this one.
!bq --location=US mk --dataset {dataset_id}

BigQuery error in mk operation: Invalid dataset ID "{dataset_id}". Dataset IDs
must be alphanumeric (plus underscores and dashes) and must be at most 1024
characters long.


Create Label table

In [46]:
%%bigquery
-- Build the modeled Label table: one output row per row of discogs_staging.Label, with the
-- nested staging fields projected into flat, renamed columns.
-- `create or replace` (instead of plain `create`) lets the cell be re-run: a plain create
-- fails with "Already Exists" once the table is present. Re-running rebuilds the table
-- from staging.
-- NOTE(review): presumably `.Val` is the element text and `.Zid` an id attribute carried
-- over from the staging schema -- confirm against discogs_staging.Label.
create or replace table discogs_modeled.Label as
select
    id.Val as label_id,
    contactinfo.Val as label_contact,
    parentLabel.Val as par_name,
    parentLabel.Zid as par_id,
    name.Val as label_name,
    profile.Val as label_profile
from discogs_staging.Label

Check Label Primary Key

In [50]:
%%bigquery
-- Total row count of Label; the primary-key check compares it with the distinct label_id count.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Label

Unnamed: 0,f0_
0,1506930


In [51]:
%%bigquery
-- Number of distinct label_id values; equal to the total row count when label_id is unique.
SELECT
    COUNT(DISTINCT label_id)
FROM
    discogs_modeled.Label

Unnamed: 0,f0_
0,1506930


Create the Sublabel table. It acts like a junction table, so sublabel information does not have to be repeated in Label. The primary key is the combination of both columns in the table.

In [57]:
%%bigquery
-- Build Sublabel: one row per (label, entry of that label's sublabels.label array).
-- The comma join against l.sublabels.label unnests the array, so labels whose array is
-- empty or NULL contribute no rows.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists.
create or replace table discogs_modeled.Sublabel as
select
    l.id.Val as origin_label_id,  -- id of the label that owns the sublabel entry
    sl.Zid as sub_id              -- presumably the sublabel's own label id -- TODO confirm
from discogs_staging.Label as l, l.sublabels.label as sl

Check sublabel primary key

In [60]:
%%bigquery
-- Number of distinct (origin_label_id, sub_id) pairs; equal to the total row count when the
-- pair is a valid composite key.
WITH unique_pairs AS (
    SELECT DISTINCT
        sub.origin_label_id,
        sub.sub_id
    FROM discogs_modeled.Sublabel AS sub
)
SELECT COUNT(*)
FROM unique_pairs

Unnamed: 0,f0_
0,81804


In [61]:
%%bigquery
-- Total row count of Sublabel, for comparison with the distinct-pair count.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Sublabel

Unnamed: 0,f0_
0,81804


In [62]:
%%bigquery
-- Build LabelURL: one row per (label, entry of that label's urls.url array), keyed by a
-- generated UUID.
-- The comma join unnests l.urls.url, so labels with an empty or NULL array contribute no rows.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists. Note that every rebuild generates fresh uuid values.
create or replace table discogs_modeled.LabelURL as
select
    GENERATE_UUID() as uuid,
    l.id.Val as label_id,
    lu.Val as url
from discogs_staging.Label as l, l.urls.url as lu

Check LabelURL Primary key

In [64]:
%%bigquery
-- Total row count of LabelURL; compared with the distinct uuid count below it.
SELECT
    COUNT(*)
FROM
    discogs_modeled.LabelURL

Unnamed: 0,f0_
0,242001


In [65]:
%%bigquery
-- Number of distinct uuid values in LabelURL; equal to the row count when uuid is unique.
SELECT
    COUNT(DISTINCT uuid)
FROM
    discogs_modeled.LabelURL

Unnamed: 0,f0_
0,242001


Check foreign keys for Label

In [67]:
%%bigquery
-- Anti-join: count Sublabel rows whose origin_label_id matches no Label.label_id.
-- A result of 0 means every origin_label_id refers to an existing label.
SELECT COUNT(*)
FROM discogs_modeled.Sublabel AS child
LEFT JOIN discogs_modeled.Label AS parent
    ON parent.label_id = child.origin_label_id
WHERE parent.label_id IS NULL

Unnamed: 0,f0_
0,0


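The next check finds sub_id values in Sublabel that have no matching row in Label (550 at the time of this run). We will fix this in Beam.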

In [69]:
%%bigquery
-- Anti-join: count Sublabel rows whose sub_id matches no Label.label_id.
-- A non-zero result is the number of sublabel references without a Label row.
SELECT COUNT(*)
FROM discogs_modeled.Sublabel AS child
LEFT JOIN discogs_modeled.Label AS parent
    ON parent.label_id = child.sub_id
WHERE parent.label_id IS NULL

Unnamed: 0,f0_
0,550


In [70]:
%%bigquery
-- Anti-join: count LabelURL rows whose label_id matches no Label.label_id.
-- A result of 0 means every URL row refers to an existing label.
SELECT COUNT(*)
FROM discogs_modeled.LabelURL AS link
LEFT JOIN discogs_modeled.Label AS parent
    ON parent.label_id = link.label_id
WHERE parent.label_id IS NULL

Unnamed: 0,f0_
0,0


Create Artist URL Table - Primary Key is the UUID

In [9]:
%%bigquery
-- Build Artist_URL: one row per combination of (artist, groups.name entry, urls.url entry),
-- keyed by a generated UUID.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists. Every rebuild generates fresh uuid values.
-- NOTE(review): the two comma joins pair every group entry with every URL of the same
-- artist, and an artist whose groups or urls array is empty or NULL yields no rows at
-- all -- confirm that this cross product is the intended grain for a URL table.
create or replace table discogs_modeled.Artist_URL as
select
    GENERATE_UUID() uuid,
    z.Zid as group_id,
    a.id.Val as artist_id,
    y.Val as url
from discogs_staging.Artist_Clean a, a.groups.name z, a.urls.url y

In [35]:
%%bigquery
-- Total row count of Artist_URL; compared with the distinct uuid count below it.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Artist_URL

Unnamed: 0,f0_
0,721879


In [36]:
%%bigquery
-- Number of distinct uuid values in Artist_URL; equal to the row count when uuid is unique.
SELECT
    COUNT(DISTINCT uuid)
FROM
    discogs_modeled.Artist_URL

Unnamed: 0,f0_
0,721879


Create Band table

In [10]:
%%bigquery
-- Build Band from the groups.name entries of every artist: one row per (artist, group
-- entry), so the same band appears once for each member found in staging.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists, which is always the case after the de-duplication cell has run.
create or replace table discogs_modeled.Band as
select
    n.val as name,
    n.Zid as id
from discogs_staging.Artist_Clean a, a.groups.name as n

Remove primary key violations from Band. Rows were duplicated for each artist in the band (e.g., if a band had 5 artists, Band would have 5 rows holding exactly the same information). This is easily fixed by selecting only distinct rows.

In [25]:
%%bigquery
-- Rebuild Band from its own contents, keeping a single copy of each identical row.
CREATE OR REPLACE TABLE discogs_modeled.Band AS
SELECT DISTINCT
    band.*
FROM discogs_modeled.Band AS band

Check that Band has a primary key

In [26]:
%%bigquery
-- Total row count of Band; compared with the distinct id count below it.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Band

Unnamed: 0,f0_
0,410971


In [27]:
%%bigquery
-- Number of distinct Band ids; equal to the row count when id is unique.
SELECT
    COUNT(DISTINCT band.id)
FROM
    discogs_modeled.Band AS band

Unnamed: 0,f0_
0,410971


Create Artist table - The primary key is the uuid. An alternative is to use the artist id and band id together as a composite primary key.

In [12]:
%%bigquery
-- Build Artist: one row per (artist, groups.name entry), keyed by a generated UUID.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists. Every rebuild generates fresh uuid values.
-- NOTE(review): the comma join unnests a.groups.name, so artists whose groups array is
-- empty or NULL are absent from this table -- confirm that is intended.
-- NOTE(review): `name` is taken from realname here -- confirm this is the field wanted.
create or replace table discogs_modeled.Artist as
select
    GENERATE_UUID() uuid,
    a.id.val as id,
    x.Zid as band_id,
    a.realname.val as name,
    a.profile.val as profile
from discogs_staging.Artist_Clean a, a.groups.name x

In [31]:
%%bigquery
-- Total row count of Artist; the key checks that follow compare against this number.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Artist

Unnamed: 0,f0_
0,1374642


In [32]:
%%bigquery
-- Number of distinct uuid values in Artist; equal to the row count when uuid is unique.
SELECT
    COUNT(DISTINCT uuid)
FROM
    discogs_modeled.Artist

Unnamed: 0,f0_
0,1374642


In [34]:
%%bigquery
-- Number of distinct (id, band_id) pairs in Artist; equal to the row count when the pair
-- could serve as a composite key.
WITH unique_pairs AS (
    SELECT DISTINCT
        id,
        band_id
    FROM discogs_modeled.Artist
)
SELECT COUNT(*)
FROM unique_pairs

Unnamed: 0,f0_
0,1374642


Create Aliases table - Primary key is the uuid.

In [44]:
%%bigquery
-- Build Aliases: one row per combination of (artist, groups.name entry, aliases.name
-- entry), keyed by a generated UUID.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists. Every rebuild generates fresh uuid values.
-- NOTE(review): the two comma joins pair every group entry with every alias of the same
-- artist, and an artist whose groups or aliases array is empty or NULL yields no rows --
-- confirm this grain is intended.
create or replace table discogs_modeled.Aliases as
select
    GENERATE_UUID() uuid,
    a.id.val as artist_id,
    x.Zid as band_id,
    a.name.Val as name,
    y.Zid as alias_id,
    y.val as alias
from discogs_staging.Artist_Clean a, a.groups.name x, a.aliases.name y

In [37]:
%%bigquery
-- Total row count of Aliases; compared with the distinct uuid count below it.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Aliases

Unnamed: 0,f0_
0,647545


In [38]:
%%bigquery
-- Number of distinct uuid values in Aliases; equal to the row count when uuid is unique.
SELECT
    COUNT(DISTINCT uuid)
FROM
    discogs_modeled.Aliases

Unnamed: 0,f0_
0,647545


Create Variations table - Primary key is the uuid

In [45]:
%%bigquery
-- Build Variations: one row per combination of (artist, groups.name entry,
-- namevariations.name entry), keyed by a generated UUID.
-- `create or replace` keeps the cell re-runnable; a plain `create table` fails once the
-- table already exists. Every rebuild generates fresh uuid values.
-- NOTE(review): the two comma joins pair every group entry with every name variation of
-- the same artist, and an artist whose groups or namevariations array is empty or NULL
-- yields no rows -- confirm this grain is intended.
create or replace table discogs_modeled.Variations as
select
    GENERATE_UUID() uuid,
    a.id.val as artist_id,
    x.Zid as band_id,
    y.val as variation
from discogs_staging.Artist_Clean a, a.groups.name x, a.namevariations.name y

In [39]:
%%bigquery
-- Total row count of Variations; compared with the distinct uuid count below it.
SELECT
    COUNT(*)
FROM
    discogs_modeled.Variations

Unnamed: 0,f0_
0,3647433


In [40]:
%%bigquery
-- Number of distinct uuid values in Variations; equal to the row count when uuid is unique.
SELECT
    COUNT(DISTINCT uuid)
FROM
    discogs_modeled.Variations

Unnamed: 0,f0_
0,3647433


Check Foreign Key relationships between Artist tables
- Artist and Artist_URL (on artist.id = artist_url.artist_id)
- Artist and Band (on artist.band_id = band.id)

In [42]:
%%bigquery
-- Anti-join: count Artist_URL rows whose artist_id matches no Artist.id.
-- A result of 0 means every URL row refers to an existing artist.
SELECT COUNT(*)
FROM discogs_modeled.Artist_URL AS link
LEFT JOIN discogs_modeled.Artist AS artist
    ON artist.id = link.artist_id
WHERE artist.id IS NULL

Unnamed: 0,f0_
0,0


In [43]:
%%bigquery
-- Anti-join: count Artist rows whose band_id matches no Band.id.
-- A result of 0 means every band_id refers to an existing band.
SELECT COUNT(*)
FROM discogs_modeled.Artist AS artist
LEFT JOIN discogs_modeled.Band AS band
    ON artist.band_id = band.id
WHERE band.id IS NULL

Unnamed: 0,f0_
0,0
