### Milestone 9

Create modeled dataset with design principles

In [2]:
dataset_id = "discogs_modeled"

In [47]:
!bq --location=US mk --dataset {dataset_id}

BigQuery error in mk operation: Invalid dataset ID "{dataset_id}". Dataset IDs
must be alphanumeric (plus underscores and dashes) and must be at most 1024
characters long.


Create Label table

In [6]:
%%bigquery
create table discogs_modeled.Label as
select 
    id.Val as label_id,
    contactinfo.Val as label_contact, 
    parentLabel.Val as par_name,
    parentLabel.Zid as par_id,
    name.Val as label_name,
    profile.Val as label_profile,
from discogs_staging.Label_Clean

Check Label Primary Key

In [7]:
%%bigquery
select count(*) from discogs_modeled.Label

Unnamed: 0,f0_
0,1506925


In [8]:
%%bigquery
select count(distinct label_id) from discogs_modeled.Label

Unnamed: 0,f0_
0,1506925


Create sublabel table. Like a junction table to avoid repeating information in label. The primary key is a combination of both columns in the table.

In [57]:
%%bigquery
create table discogs_modeled.Sublabel as
select  
    id.Val as origin_label_id,
    sl.Zid as sub_id
from discogs_staging.Label_Clean as l, l.sublabels.label as sl

Check sublabel primary key

In [3]:
%%bigquery
select count(distinct sub_id) from discogs_modeled.Sublabel

Unnamed: 0,f0_
0,81804


In [61]:
%%bigquery
select count(*) from discogs_modeled.Sublabel

Unnamed: 0,f0_
0,81804


In [3]:
%%bigquery
create table discogs_modeled.Label_URL as
select
    GENERATE_UUID() as uuid,
    id.Val as label_id,
    lu.Val as url
from discogs_staging.Label_Clean as l, l.urls.url as lu

Check Label_URL Primary key

In [4]:
%%bigquery
select count(*) from discogs_modeled.Label_URL

Unnamed: 0,f0_
0,241992


In [5]:
%%bigquery
select count(distinct uuid) from discogs_modeled.Label_URL

Unnamed: 0,f0_
0,241992


Check foreign keys for Label

In [67]:
%%bigquery
select count(*) from discogs_modeled.Sublabel s
left join discogs_modeled.Label l
on l.label_id = s.origin_label_id
where l.label_id is null

Unnamed: 0,f0_
0,0


We will fix this in beam

In [69]:
%%bigquery
select count(*) from discogs_modeled.Sublabel s
left join discogs_modeled.Label l
on l.label_id = s.sub_id
where l.label_id is null

Unnamed: 0,f0_
0,550


In [10]:
%%bigquery
select count(*) from discogs_modeled.Label_URL u
left join discogs_modeled.Label l
on l.label_id = u.label_id
where l.label_id is null

Unnamed: 0,f0_
0,0


Create Artist URL Table - Primary Key is the UUID

In [9]:
%%bigquery
create table discogs_modeled.Artist_URL as
select GENERATE_UUID() uuid, z.Zid as group_id, a.id.Val as artist_id, y.Val as url 
from discogs_staging.Artist_Clean a, a.groups.name z, a.urls.url y

In [35]:
%%bigquery
select count(*) from discogs_modeled.Artist_URL

Unnamed: 0,f0_
0,721879


In [21]:
%%bigquery
select count(distinct url) from discogs_modeled.Artist_URL

Unnamed: 0,f0_
0,246187


In [22]:
%%bigquery
# fucking around
select * from discogs_modeled.Artist_URL where url = "https://soundcloud.com/susan-alcorn"

Unnamed: 0,uuid,group_id,artist_id,url
0,25cfed74-c67a-4042-ace6-27d79120c3d8,7000644,547684,https://soundcloud.com/susan-alcorn
1,d5eef0a9-ed8c-4956-b54b-844268c358e3,5339463,547684,https://soundcloud.com/susan-alcorn


Create Band table

In [10]:
%%bigquery
create table discogs_modeled.Band as
select n.val as name, n.Zid as id from discogs_staging.Artist_Clean a, a.groups.name as n

Remove PK violations from Band - Rows were duplicated for each artist in the band (ex. if a band had 5 artists Band would have 5 rows holding the exact same information). This is easily fixed by selecting only distinct bands.

In [25]:
%%bigquery
create or replace table discogs_modeled.Band as
select distinct * from discogs_modeled.Band

Check that Band has a primary key

In [26]:
%%bigquery
select count(*) from discogs_modeled.Band

Unnamed: 0,f0_
0,410971


In [27]:
%%bigquery
select count(distinct b.id) from discogs_modeled.Band as b

Unnamed: 0,f0_
0,410971


Create Artist table - Primary key is the uuid. An alternative is taking the artist id and band id as the primary key together.

In [17]:
%%bigquery
create or replace table discogs_modeled.Artist as
select GENERATE_UUID() uuid, id.val as id, x.Zid as band_id, 
a.realname.val as name, a.profile.val as profile 
from discogs_staging.Artist_Clean a, a.groups.name x

In [18]:
%%bigquery
select count(*) from discogs_modeled.Artist

Unnamed: 0,f0_
0,1374642


In [19]:
%%bigquery
select count(distinct uuid) from discogs_modeled.Artist

Unnamed: 0,f0_
0,1374642


In [20]:
%%bigquery
select count(*) from (
  select distinct id, band_id from discogs_modeled.Artist
)

Unnamed: 0,f0_
0,1374642


Create Aliases table - Primary key is the uuid.

In [44]:
%%bigquery
create table discogs_modeled.Aliases as
select GENERATE_UUID() uuid, id.val as artist_id, x.Zid as band_id,
a.name.Val as name, y.Zid as alias_id, y.val as alias
from discogs_staging.Artist_Clean a, a.groups.name x, a.aliases.name y

In [37]:
%%bigquery
select count(*) from discogs_modeled.Aliases

Unnamed: 0,f0_
0,647545


In [38]:
%%bigquery
select count(distinct uuid) from discogs_modeled.Aliases

Unnamed: 0,f0_
0,647545


Create Variations table - Primary key is the uuid

In [45]:
%%bigquery
create table discogs_modeled.Variations as
select GENERATE_UUID() uuid, id.val as artist_id, x.Zid as band_id, y.val as variation
from discogs_staging.Artist_Clean a, a.groups.name x, a.namevariations.name y

In [39]:
%%bigquery
select count(*) from discogs_modeled.Variations

Unnamed: 0,f0_
0,3647433


In [40]:
%%bigquery
select count(distinct uuid) from discogs_modeled.Variations

Unnamed: 0,f0_
0,3647433


Check Foreign Key relationships between Artist tables
- Artist and Artist_URL (on artist.id = artist_url.artist_id)
- Artist and Band (on artist.band_id = band.id)

In [42]:
%%bigquery
select count(*) from discogs_modeled.Artist_URL u
left join discogs_modeled.Artist a on a.id = u.artist_id
where a.id is null

Unnamed: 0,f0_
0,0


In [43]:
%%bigquery
select count(*) from discogs_modeled.Artist a
left join discogs_modeled.Band b on a.band_id = b.id
where b.id is null

Unnamed: 0,f0_
0,0


### Milestone 10

The Lable table contains two attributes called par_id and par_name that identify the parent label of the label identified with the a given record. As each label can have only one parent label, this relationship should be captured in the Sublabel table, where the child label (a.k.a. sublabel) ID is the primary key. However, in the previous milestone we were unable to export all of the parent-child relationship from the Label table to the Sublabel table because par_id and par_name attributes contained the names and IDs of labels that did not have their own records within the Label table. Any attempt at modeling the parent-child relationship with these "recordless" parent labels within the Sublabel table would have violated the Sublabel table's foreign key.

We have corrected this issue by creating new records for these formerly "recordless" parent labels that were added to the Label table. As all the parent labels now have their own records, they can be included within the Sublabel table without violating its foreign key. The isolation of "recordless" parent labels and the creation of their records were performed using the SQL transformations shown below.

**Query#1:** Identifies the parent labels that did not have their own records in the Label table. A self join was used so that the parent information contained within the child label records could be paired with the parents own records. The child labels that had a parent label, but the parent label did not have its own record, were identified through the use of a right join and filtering for null values from the results. The recordless parent label information was projected in these cases.

In [12]:
%%bigquery
select distinct l2.par_id, l2.par_name
from discogs_modeled.Label l1 right join discogs_modeled.Label l2
on l1.label_id = l2.par_id
where l1.label_id is null and l2.par_name is not null

order by l2.par_id, l2.par_name
limit 5

Unnamed: 0,par_id,par_name
0,0,BangbamRecords
1,0,Core Tex Labs
2,0,Sanctuary Records (UK)
3,0,Venusworks
4,43,Axis


These results are then stored in an intermediate table:

In [14]:
%%bigquery
create table discogs_modeled.Label_SQL_1 as
select distinct l2.par_id, l2.par_name
from discogs_modeled.Label l1 right join discogs_modeled.Label l2
on l1.label_id = l2.par_id
where l1.label_id is null and l2.par_name is not null

**Query#2:** While the recordless parent labels from Query#1 have IDs, some of the IDs are not unique. Since none of the parent labels have their own records in the Label table yet new unique IDs will be assigned to each of them. These are assigned by adding their row number in the previously projected table with the maximum ID that is present in the Label tabel, which will prevent collisions of the new IDs with IDs already exisiting in the table.

First, the row numbers must be generated for the set of recordless parent labels. A column containing the greatest label ID in the whole Label table is added as well.

In [22]:
%%bigquery
select par_name,par_id as old_id,
  (select max(L.label_id) from discogs_modeled.Label L) as max_id,
  row_number() over() as row_num
from discogs_modeled.Label_SQL_1

order by row_num, par_name
limit 5

Unnamed: 0,par_name,old_id,max_id,row_num
0,BangbamRecords,0,1803125,1
1,Venusworks,0,1803125,2
2,Sanctuary Records (UK),0,1803125,3
3,Core Tex Labs,0,1803125,4
4,C & P Dance Classics,68353,1803125,5


These are then stored in an intermediate table:

In [15]:
%%bigquery
create table discogs_modeled.Label_SQL_2 as
select par_name,par_id as old_id,
  (select max(L.label_id) from discogs_modeled.Label L) as max_id,
  row_number() over() as row_num
from discogs_modeled.Label_SQL_1

**Query#3:** Then the new IDs are calculated for the recordless parent labels using the row number and the max ID.

In [23]:
%%bigquery
select par_name, old_id, (max_id + row_num) as new_id
from discogs_modeled.Label_SQL_2

order by new_id
limit 5

Unnamed: 0,par_name,old_id,new_id
0,BangbamRecords,0,1803126
1,Venusworks,0,1803127
2,Sanctuary Records (UK),0,1803128
3,Core Tex Labs,0,1803129
4,C & P Dance Classics,68353,1803130


In [16]:
%%bigquery
create table discogs_modeled.Label_SQL_3 as
select par_name, old_id, (max_id + row_num) as new_id
from discogs_modeled.Label_SQL_2

**Query#4:** The new records are then unioned with the existing table.

In [17]:
%%bigquery
create table discogs_modeled.Label_SQL_Final as
select * from discogs_modeled.Label
union all
select
    new_id as label_id,
    null as contact,
    null as par_name,
    null as par_id,
    par_name as label_name,
    null as label_profile
from discogs_modeled.Label_SQL_3

### Check that PK and FK relationships are still valid

We deicded that a sublabel table is not necessary to model the parent/child relationships in the labels. Each lable in the Lable Table that has a parent has a corresponding parent id, so it's clear which parent a child has. Which children a parent has can be ascertained by a simple query on the table (see transforms.txt)

Here are checks on all the primary and foreign keys without the sublabel table, which has been deleted from our dataset.

Label tables: label_id is the primary key

In [18]:
%%bigquery
select count(*) from discogs_modeled.Label_SQL_Final

Unnamed: 0,f0_
0,1507042


In [19]:
%%bigquery
select count(distinct label_id) from discogs_modeled.Label_SQL_Final

Unnamed: 0,f0_
0,1507042


Label_URL still has a valid foreign key relationship to Label.label_id

In [20]:
%%bigquery
select count(*) from discogs_modeled.Label_URL u
left join discogs_modeled.Label_SQL_Final l
on l.label_id = u.label_id
where l.label_id is null

Unnamed: 0,f0_
0,0


The parent IDs in the Label table all correspond to records within the table. Therefore, par_id is a foreign key to label_id.