In [2]:

from google.cloud import bigquery

project_id = "dylanericsp25"
dataset = "movies_entertainment_int"
region = "us-central1"

bq_client = bigquery.Client()

dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region
resp = bq_client.create_dataset(dataset_id, exists_ok=True)
print("Created dataset {}.{}".format(bq_client.project, resp.dataset_id))

Created dataset dylanericsp25.movies_entertainment_int


In [2]:
%load_ext google.cloud.bigquery



### **Criteria 8:**
There exists a table in the raw layer of the warehouse that stores a list of elements in a cell.
- This happens when multiple values are stored in one cell instead of separate rows:
    1. movies_metadata.csv → The "genres" column stores: "[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]"
    2. Instead of separate rows, genres are stored as lists inside a single cell. Thus, this criterion is fulfilled.

In [4]:
%%bigquery
select adult, budget, genres, split(genres, ',') as genres_array
from movies_entertainment_stg.movies_metadata

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,adult,budget,genres,genres_array
0,FALSE,0,"Action, War, Drama, History","[Action, War, Drama, History]"
1,FALSE,0,Drama,[Drama]
2,FALSE,0,Drama,[Drama]
3,FALSE,0,"Drama, Adventure","[Drama, Adventure]"
4,FALSE,0,"Action, Crime, Romance","[Action, Crime, Romance]"
...,...,...,...,...
45458,FALSE,0,Drama,[Drama]
45459,FALSE,2500000,"Drama, Horror, Thriller, Foreign","[Drama, Horror, Thriller, Foreign]"
45460,FALSE,0,"Drama, Romance","[Drama, Romance]"
45461,FALSE,31000000,"Drama, Romance","[Drama, Romance]"


In [5]:
%%bigquery
with genres as (select split(genres, ',') as genres_array,adult,
    budget,
    id,
    imdb_id,
    original_language,
    original_title,
    overview,
    _data_source, 
    _load_time
                    from movies_entertainment_stg.movies_metadata)
select
    adult,
    budget,
    genre,
    id,
    imdb_id,
    original_language,
    original_title,
    overview,
    _data_source, 
    _load_time
from genres, unnest(genres_array) as genre

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,adult,budget,genre,id,imdb_id,original_language,original_title,overview,_data_source,_load_time
0,FALSE,0,Action,104473,tt0223958,Unknown,La prise de Tournavos,"Three military men, seen inside a fortificatio...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00
1,FALSE,0,War,104473,tt0223958,Unknown,La prise de Tournavos,"Three military men, seen inside a fortificatio...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00
2,FALSE,0,Drama,104473,tt0223958,Unknown,La prise de Tournavos,"Three military men, seen inside a fortificatio...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00
3,FALSE,0,History,104473,tt0223958,Unknown,La prise de Tournavos,"Three military men, seen inside a fortificatio...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00
4,FALSE,0,Drama,44284,tt0111055,ar,صمت القصور,The death of a prince brings a young woman bac...,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
...,...,...,...,...,...,...,...,...,...,...
91089,FALSE,31000000,Drama,121602,tt2071441,zh,危險關係,Dangerous Liaisons is a Chinese film by Hur Ji...,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
91090,FALSE,31000000,Romance,121602,tt2071441,zh,危險關係,Dangerous Liaisons is a Chinese film by Hur Ji...,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
91091,FALSE,0,War,34561,tt0363290,zh,紫蝴蝶,"Ding Hui is a member of Purple Butterfly, a po...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00
91092,FALSE,0,Drama,34561,tt0363290,zh,紫蝴蝶,"Ding Hui is a member of Purple Butterfly, a po...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00


In [27]:
%%bigquery
create or replace table movies_entertainment_int.Movies_Metadata as
    with genres as (select split(genres, ',') as genres_array,adult,
        budget,
        id,
        imdb_id,
        original_language,
        original_title,
        overview,
        _data_source, 
        _load_time
                        from movies_entertainment_stg.movies_metadata)
    select
        adult,
        budget,
        genre,
        id,
        imdb_id,
        original_language,
        original_title,
        overview,
        _data_source, 
        _load_time
    from genres, unnest(genres_array) as genre

Query is running:   0%|          |

### **Criteria 9:**
There exists two tables in the raw layer of the warehouse which originated from different sources and which have similar data. These tables use two different identifier systems to refer to the same entity.
- Met – the results show that the same movie title appears with different movie_id values across datasets:
    1. "A Perfect Man" appears with movie_id = 2 in at least two different places.
    2. "Aftermath" appears under at least six different IDs.
    3. "Bad Boys" has four different IDs.
    4. This confirms that the same movie is being referenced with different unique identifiers across movies_metadata.csv and netflix_titles.csv.

In [7]:

%%bigquery
select m.id, m.original_title, n.show_id, n.title
from movies_entertainment_stg.movies_metadata m
inner join movies_entertainment_stg.netflix_movies_and_tvshows n
on m.original_title = n.title


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,original_title,show_id,title
0,523,Requiem,s4974,Requiem
1,55587,Michael,s2280,Michael
2,124639,Target,s4376,Target
3,107937,Bhoot,s6300,Bhoot
4,29621,Solo,s4205,Solo
...,...,...,...,...
2483,191112,Premachi Goshta,s7792,Premachi Goshta
2484,369033,Rebirth,s5831,Rebirth
2485,206157,Into the Badlands,s4029,Into the Badlands
2486,336807,Fatima,s1367,Fatima


In [3]:
%%bigquery
select CASE when SAFE_CAST(movie_id as string) IS NULL THEN netflix_id ELSE SAFE_CAST(movie_id as string) END AS id,
original_title,
type,
director,
group_cast,
country,
date_added,
release_year,
rating,
duration,
_data_source,
_load_time
from
    (select * from
        (select m.id as movie_id, n.title, n.type, n.director, n.cast as group_cast, n.country, n.date_added, n.release_year, n.rating, n.duration, n._data_source, n._load_time
        from movies_entertainment_stg.movies_metadata m
        inner join movies_entertainment_stg.netflix_movies_and_tvshows n
        on m.original_title = n.title) t1
    right join (select n.show_id as netflix_id, n.title as original_title from movies_entertainment_stg.netflix_movies_and_tvshows n) t2
    on t1.title = t2.original_title)



Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,original_title,type,director,group_cast,country,date_added,release_year,rating,duration,_data_source,_load_time
0,252,Willy Wonka & the Chocolate Factory,Movie,Mel Stuart,"Gene Wilder, Jack Albertson, Peter Ostrum, Roy...","United States, East Germany, West Germany",2020-01-01,1971,G,100 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
1,5255,The Polar Express,Movie,Robert Zemeckis,"Tom Hanks, Leslie Zemeckis, Eddie Deezen, Nona...",United States,2021-01-01,2004,G,100 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
2,123723,Gigi,Movie,"Vincente Minnelli, Charles Walters","Leslie Caron, Maurice Chevalier, Louis Jourdan...",United States,2019-11-01,1958,G,115 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
3,17281,Gigi,Movie,"Vincente Minnelli, Charles Walters","Leslie Caron, Maurice Chevalier, Louis Jourdan...",United States,2019-11-01,1958,G,115 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
4,17529,True Grit,Movie,Henry Hathaway,"John Wayne, Glen Campbell, Kim Darby, Jeremy S...",United States,2020-01-01,1969,G,128 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
9160,96599,Eden,TV Show,,"Marika Kono, Kentaro Ito, Kyoko Hikami, Tarusu...",,2021-05-27,2021,TV-PG,1 Season,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
9161,360339,Eden,TV Show,,"Marika Kono, Kentaro Ito, Kyoko Hikami, Tarusu...",,2021-05-27,2021,TV-PG,1 Season,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
9162,104528,Dogs,TV Show,,,United States,2021-07-07,2021,TV-PG,2 Seasons,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
9163,428501,City of Ghosts,TV Show,,"August Nuñez, Blue Chapman, Kirikou S'hai Muld...",United States,2021-03-05,2021,TV-Y7,1 Season,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00


In [6]:
%%bigquery
create or replace table movies_entertainment_int.Netflix_Movies_And_Tvshows AS
    select CASE when SAFE_CAST(movie_id as string) IS NULL THEN netflix_id ELSE SAFE_CAST(movie_id as string) END AS id,
    original_title,
    original_type,
    original_director,
    original_cast,
    original_country,
    original_date_added,
    original_release_year,
    original_rating,
    original_duration,
    original_data_source,
    original_load_time
    from
        (select * from
            (select m.id as movie_id, n.title, n.type, n.director, n.cast as group_cast, n.country, n.date_added, n.release_year, n.rating, n.duration, n._data_source, n._load_time
            from movies_entertainment_stg.movies_metadata m
            inner join movies_entertainment_stg.netflix_movies_and_tvshows n
            on m.original_title = n.title) t1
        right join (select n.show_id as netflix_id, n.title as original_title, n.type as original_type, n.director as original_director, n.cast as original_cast, n.country as original_country, n.date_added as original_date_added, n.release_year as original_release_year, n.rating as original_rating, n.duration as original_duration, n._data_source as original_data_source, n._load_time as original_load_time from movies_entertainment_stg.netflix_movies_and_tvshows n) t2
        on t1.title = t2.original_title)



Query is running:   0%|          |

#### **Criteria 10**

There exists a table in the raw layer of the warehouse that models more than one logical entity in the same table. This leads to data redundancy and storing repeated values. 
- Yes - some tables store multiple logical entities together, causing redundancy: box_office_gross contained entities of companies and movies

**Goal: Decompose box_office_gross into companies and box_office_gross**

In [4]:
%%bigquery
select brand, total, releases
from movies_entertainment_stg.box_office_gross


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,brand,total,releases
0,Alan Moore,276088604,4
1,MonsterVerse,580145113,4
2,Broken Lizard,73338237,5
3,Robert Ludlum,645459186,6
4,Laika,300158323,6
5,Roald Dahl,356238494,7
6,DisneyToon Studios,337576791,8
7,Disney Channel,359952780,8
8,Warner Animation Group,786497181,8
9,Disneynature,151620585,8


In [5]:
%%bigquery
create or replace table movies_entertainment_int.Companies as
    select brand, total, releases
    from movies_entertainment_stg.box_office_gross

Query is running:   0%|          |

In [23]:
%%bigquery
create or replace table movies_entertainment_int.Box_Office_Gross as
    select brand, number_1_release, lifetime_gross, _data_source, _load_time
    from movies_entertainment_stg.box_office_gross

Query is running:   0%|          |

IMDB Reviews Intermediate

In [3]:
%%bigquery
create or replace table movies_entertainment_int.Imdb_Reviews as
    select *
    from movies_entertainment_stg.imdb_reviews

Query is running:   0%|          |

In [26]:
%%bigquery
    select brand, number_1_release, lifetime_gross, _data_source, _load_time from
        (select t2.id, t2.original_title, t1.brand, t1.number_1_release, t1.lifetime_gross, t1._data_source, t1._load_time
        from movies_entertainment_stg.box_office_gross t1 inner join movies_entertainment_int.Movies_Metadata t2 on t1.number_1_release = t2.original_title)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,brand,number_1_release,lifetime_gross,_data_source,_load_time
0,Disneynature,Earth,32011576,box-office-gross,2025-02-06 23:13:06.561261+00:00
1,Amazon Studios,Manchester by the Sea,47695371,box-office-gross,2025-02-06 23:13:06.561261+00:00


**Removing Duplicate movies with diff IDs**

In [19]:
%%bigquery
CREATE OR REPLACE TABLE movies_entertainment_int.Movies_Metadata AS
WITH duplicates AS (
    SELECT original_title
    FROM movies_entertainment_int.Movies_Metadata
    GROUP BY original_title
    HAVING COUNT(*) > 1
)
SELECT * 
FROM movies_entertainment_int.Movies_Metadata
WHERE original_title NOT IN (SELECT original_title FROM duplicates)


Query is running:   0%|          |

#### **Removing Duplicates from box office gross**

In [24]:
%%bigquery
delete from movies_entertainment_int.Box_Office_Gross where brand in ('Platinum Dunes', 'Vertigo Entertainment', 'Bad Robot')


Query is running:   0%|          |

In [28]:
%%bigquery
    select id, brand, number_1_release, lifetime_gross, _data_source, _load_time from
        (select t2.id, t2.original_title, t1.brand, t1.number_1_release, t1.lifetime_gross, t1._data_source, t1._load_time
        from movies_entertainment_stg.box_office_gross t1 inner join movies_entertainment_int.Movies_Metadata t2 on t1.number_1_release = t2.original_title)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,brand,number_1_release,lifetime_gross,_data_source,_load_time
0,12092,Tim Burton-Johnny Depp,Alice in Wonderland,334191110,box-office-gross,2025-02-06 23:13:06.561261+00:00
1,25694,Tim Burton-Johnny Depp,Alice in Wonderland,334191110,box-office-gross,2025-02-06 23:13:06.561261+00:00
2,809,DreamWorks Animation,Shrek 2,441226247,box-office-gross,2025-02-06 23:13:06.561261+00:00
3,1498,Platinum Dunes,Teenage Mutant Ninja Turtles,191204754,box-office-gross,2025-02-06 23:13:06.561261+00:00
4,1498,Nickelodeon,Teenage Mutant Ninja Turtles,191204754,box-office-gross,2025-02-06 23:13:06.561261+00:00
...,...,...,...,...,...,...
146,809,DreamWorks Animation,Shrek 2,441226247,box-office-gross,2025-02-06 23:13:06.561261+00:00
147,11887,Disney Channel,High School Musical 3: Senior Year,90559416,box-office-gross,2025-02-06 23:13:06.561261+00:00
148,118,Roald Dahl,Charlie and the Chocolate Factory,206459076,box-office-gross,2025-02-06 23:13:06.561261+00:00
149,137106,Warner Animation Group,The Lego Movie,257760692,box-office-gross,2025-02-06 23:13:06.561261+00:00


#### **Intermediate table for Box Office Gross ensuring priumary key is id**

In [30]:
%%bigquery
CREATE OR REPLACE TABLE movies_entertainment_int.Box_Office_Gross AS
select id, brand, number_1_release, lifetime_gross, _data_source, _load_time from
    (select t2.id, t2.original_title, t1.brand, t1.number_1_release, t1.lifetime_gross, t1._data_source, t1._load_time
    from movies_entertainment_stg.box_office_gross t1 inner join movies_entertainment_stg.movies_metadata t2 on t1.number_1_release = t2.original_title)

Query is running:   0%|          |

#### **Primary Key Constraints**

In [31]:
%%bigquery
alter table movies_entertainment_int.Box_Office_Gross add primary key (id) not enforced;
alter table movies_entertainment_int.Movies_Metadata add primary key (id) not enforced;
alter table movies_entertainment_int.Netflix_Movies_And_Tvshows add primary key (id) not enforced;
alter table movies_entertainment_int.Companies add primary key (brand) not enforced

Query is running:   0%|          |

In [34]:
%%bigquery

alter table movies_entertainment_int.Box_Office_Gross add constraint box_office_gross_fk_companies foreign key (brand)
    references movies_entertainment_int.Companies (brand) not enforced;


Executing query with job ID: bd715a99-b51e-4522-a635-9a9818d0c74b
Query executing: 0.26s


ERROR:
 409 GET https://bigquery.googleapis.com/bigquery/v2/projects/dylanericsp25/queries/bd715a99-b51e-4522-a635-9a9818d0c74b?maxResults=0&location=us-central1&prettyPrint=false: Already Exists: Constraint box_office_gross_fk_companies

Location: us-central1
Job ID: bd715a99-b51e-4522-a635-9a9818d0c74b

