### **Create the staging layer for the Air Travel warehouse**


In [2]:
from google.cloud import bigquery

project_id = "dylanericsp25"
dataset = "movies_entertainment_stg"
region = "us-central1"

bq_client = bigquery.Client()

dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region
resp = bq_client.create_dataset(dataset_id, exists_ok=True)
print("Created dataset {}.{}".format(bq_client.project, resp.dataset_id))

Created dataset dylanericsp25.movies_entertainment_stg


In [3]:
%load_ext google.cloud.bigquery




## **Criteria 5**

#### **date_added to datetime (netflix_movies_tvshows table)**

In [13]:
%%bigquery
SELECT date_added AS orig_date_added, safe_cast(date_added as DATE) as new_date_added
FROM movies_entertainment_raw.netflix_movies_and_tvshows
WHERE date_added IS NOT NULL
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orig_date_added,new_date_added
0,1-Dec-16,NaT
1,1-Feb-18,NaT
2,1-Mar-17,NaT
3,26-Jan-17,NaT
4,15-Aug-16,NaT


In [32]:
%%bigquery
SELECT 
    date_added AS orig_date_added,
    PARSE_DATE('%d-%b-%y', date_added) AS new_date_added
FROM movies_entertainment_raw.netflix_movies_and_tvshows
WHERE date_added IS NOT NULL
LIMIT 5


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orig_date_added,new_date_added
0,1-Dec-16,2016-12-01
1,1-Feb-18,2018-02-01
2,1-Mar-17,2017-03-01
3,26-Jan-17,2017-01-26
4,15-Aug-16,2016-08-15


#### **putting it all togerher**

In [5]:
%%bigquery
SELECT 
    show_id,
    type,
    title,
    director,
    `cast`,
    country,
    PARSE_DATE('%d-%b-%y', date_added) AS new_date_added,
    release_year,
    rating,
    duration,
    _data_source,
    _load_time
FROM movies_entertainment_raw.netflix_movies_and_tvshows
LIMIT 5



Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,show_id,type,title,director,cast,country,new_date_added,release_year,rating,duration,_data_source,_load_time
0,s6828,TV Show,Gargantia on the Verdurous Planet,,"Kaito Ishikawa, Hisako Kanemoto, Ai Kayano, Ka...",Japan,2016-12-01,2013,,1 Season,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
1,s7313,TV Show,Little Lunch,,"Flynn Curry, Olivia Deeble, Madison Lu, Oisín ...",Australia,2018-02-01,2015,,1 Season,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
2,s7538,Movie,My Honor Was Loyalty,Alessandro Pepe,"Leone Frisa, Paolo Vaccarino, Francesco Miglio...",Italy,2017-03-01,2015,,115 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
3,s5990,Movie,13TH: A Conversation with Oprah Winfrey & Ava ...,,"Oprah Winfrey, Ava DuVernay",,2017-01-26,2017,,37 min,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00
4,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,Louis C.K.,United States,2016-08-15,2015,66 min,,netflix-movies-and-tv-shows,2025-02-05 03:16:45.466701+00:00


In [59]:
%%bigquery
CREATE or REPLACE TABLE movies_entertainment_stg.netflix_movies_and_tvshows as
    SELECT 
        show_id,
        type,
        title,
        director,
        `cast`,
        country,
        safe.PARSE_DATE('%d-%b-%y', date_added) AS date_added,
        release_year,
        rating,
        duration,
        _data_source,
        _load_time
    FROM movies_entertainment_raw.netflix_movies_and_tvshows


Query is running:   0%|          |

#### **adult to boolean (movies_metadata table)**

In [55]:
%%bigquery
SELECT adult AS orig_adult, safe_cast(adult as BOOLEAN) as new_adult
FROM movies_entertainment_raw.movies_metadata
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orig_adult,new_adult
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False


In [56]:
%%bigquery
SELECT 
    safe_cast(adult as BOOLEAN),
    budget,
    genres,
    id,
    imdb_id,
    original_language,
    original_title,
    overview,
    _data_source,
    _load_time
FROM movies_entertainment_raw.movies_metadata
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,f0_,budget,genres,id,imdb_id,original_language,original_title,overview,_data_source,_load_time
0,False,0,,381525,tt5376720,Unknown,WiNWiN,American investment fund buys Austrian compani...,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
1,False,0,"Action, War, Drama, History",104473,tt0223958,Unknown,La prise de Tournavos,"Three military men, seen inside a fortificatio...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00
2,False,0,Animation,257095,tt0225145,Unknown,Bajaja,The first fairy tale transformed into a full-l...,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
3,False,0,"Animation, Documentary",381096,tt5333518,Unknown,Garn,The traditional crafts of crochet and knitting...,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
4,False,0,Comedy,147050,tt0122580,Unknown,Lambchops,"""George and Gracie enter an elegant drawing ro...",the-movies-dataset,2025-02-06 22:48:24.489026+00:00


In [58]:
%%bigquery
CREATE or REPLACE TABLE movies_entertainment_stg.movies_metadata as
    SELECT 
        safe_cast(adult as BOOLEAN) as adult,
        budget,
        genres,
        id,
        imdb_id,
        original_language,
        original_title,
        overview,
        _data_source,
        _load_time
    FROM movies_entertainment_raw.movies_metadata


Query is running:   0%|          |

## **Criteria 6**

#### **null values represented by something other than 'null' in movie_name column (imdb_reviews table)**

In [5]:
%%bigquery
select movie_name, 
case movie_name when 'N/A' then null else movie_name end as movie_name2,
_data_source, _load_time
from movies_entertainment_raw.imdb_reviews


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,movie_name,movie_name2,_data_source,_load_time
0,Gen-Y Cops The Next Generation,Gen-Y Cops The Next Generation,imdb-reviews,2025-02-07 05:13:14.495364+00:00
1,,,imdb-reviews,2025-02-07 05:13:14.495364+00:00
2,Cat in the Brain,Cat in the Brain,imdb-reviews,2025-02-07 05:13:14.495364+00:00
3,An American Werewolf in London (1997),An American Werewolf in London (1997),imdb-reviews,2025-02-07 05:13:14.495364+00:00
4,Bad Wolf,Bad Wolf,imdb-reviews,2025-02-07 05:13:14.495364+00:00
...,...,...,...,...
1252,,,imdb-reviews,2025-02-07 05:13:14.495364+00:00
1253,,,imdb-reviews,2025-02-07 05:13:14.495364+00:00
1254,You Are Alone,You Are Alone,imdb-reviews,2025-02-07 05:13:14.495364+00:00
1255,,,imdb-reviews,2025-02-07 05:13:14.495364+00:00


In [None]:
%%bigquery
select movie_name, 
case movie_name when 'N/A' then null else movie_name end as movie_name2,
_data_source, _load_time
from movies_entertainment_raw.imdb_reviews


In [9]:
%%bigquery
select
    filename,
    case movie_name when 'N/A' then null else movie_name end as movie_name2,
    sentiment,
    key_themes,
    named_entities,
    emotional_tone,
    star_rating
    _data_source, 
    _load_time
from movies_entertainment_raw.imdb_reviews


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,filename,movie_name2,sentiment,key_themes,named_entities,emotional_tone,_data_source,_load_time
0,143_4.txt,Gen-Y Cops The Next Generation,Negative,Plot;Acting;Special Effects,Paul Rudd (Actor); N/A; N/A,Disappointment,4,2025-02-07 05:13:14.495364+00:00
1,344_4.txt,,Negative,Plot;Special Effects,,Disappointment,,2025-02-07 05:13:14.495364+00:00
2,384_4.txt,Cat in the Brain,Negative,Plot;Acting;Gore,Lucio Fulci (Director); David L. Thompson (Actor),Disappointment,,2025-02-07 05:13:14.495364+00:00
3,430_4.txt,An American Werewolf in London (1997),Negative,Plot;Acting;FX,David Naughton (Actor); John Landis (Director)...,Disappointment,,2025-02-07 05:13:14.495364+00:00
4,452_2.txt,Bad Wolf,Negative,Plot;Pacing,,Disappointment,,2025-02-07 05:13:14.495364+00:00
...,...,...,...,...,...,...,...,...
1252,572_10.txt,,Positive,Plot;N/A;N/A,,Satisfaction,,2025-02-07 05:13:14.495364+00:00
1253,728_10.txt,,Positive,Plot;N/A;N/A,,Satisfaction,,2025-02-07 05:13:14.495364+00:00
1254,464_10.txt,You Are Alone,Positive,Soundtrack;Acting;Plot,Jessica Bohl (Actor),Satisfaction,,2025-02-07 05:13:14.495364+00:00
1255,466_9.txt,,Positive,Soundtrack;Acting;Plot,Nick Drake (Actor);,Satisfaction,,2025-02-07 05:13:14.495364+00:00


In [10]:
%%bigquery
CREATE or REPLACE TABLE movies_entertainment_stg.imdb_reviews as
    select
        filename,
        case movie_name when 'N/A' then null else movie_name end as movie_name,
        sentiment,
        key_themes,
        named_entities,
        emotional_tone,
        star_rating
        _data_source, 
        _load_time
    from movies_entertainment_raw.imdb_reviews


Query is running:   0%|          |

## **Criteria 7**

#### **Splitting cells with multiple values (genre column in movies metadata column)**

In [9]:
%%bigquery

SELECT 
    adult,
    budget,
    genres,
    id,
    imdb_id,
    original_language,
    original_title,
    overview,

    CASE WHEN genres LIKE '%Action%' THEN 1 ELSE 0 END AS action,
    CASE WHEN genres LIKE '%Adventure%' THEN 1 ELSE 0 END AS adventure,
    CASE WHEN genres LIKE '%Animation%' THEN 1 ELSE 0 END AS animation,
    CASE WHEN genres LIKE '%Comedy%' THEN 1 ELSE 0 END AS comedy,
    CASE WHEN genres LIKE '%Crime%' THEN 1 ELSE 0 END AS crime,
    CASE WHEN genres LIKE '%Documentary%' THEN 1 ELSE 0 END AS documentary,
    CASE WHEN genres LIKE '%Drama%' THEN 1 ELSE 0 END AS drama,
    CASE WHEN genres LIKE '%Family%' THEN 1 ELSE 0 END AS family,
    CASE WHEN genres LIKE '%Fantasy%' THEN 1 ELSE 0 END AS fantasy,
    CASE WHEN genres LIKE '%Foreign%' THEN 1 ELSE 0 END AS foreign,
    CASE WHEN genres LIKE '%History%' THEN 1 ELSE 0 END AS history,
    CASE WHEN genres LIKE '%Horror%' THEN 1 ELSE 0 END AS horror,
    CASE WHEN genres LIKE '%Music%' THEN 1 ELSE 0 END AS music,
    CASE WHEN genres LIKE '%Mystery%' THEN 1 ELSE 0 END AS mystery,
    CASE WHEN genres LIKE '%Romance%' THEN 1 ELSE 0 END AS romance,
    CASE WHEN genres LIKE '%Science Fiction%' THEN 1 ELSE 0 END AS scifi,
    CASE WHEN genres LIKE '%TV Movie%' THEN 1 ELSE 0 END AS tv_movie,
    CASE WHEN genres LIKE '%Thriller%' THEN 1 ELSE 0 END AS thriller,
    CASE WHEN genres LIKE '%War%' THEN 1 ELSE 0 END AS war,
    CASE WHEN genres LIKE '%Western%' THEN 1 ELSE 0 END AS western,
    _data_source,
    _load_time

FROM movies_entertainment_raw.movies_metadata
limit 5


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,action,adventure,...,music,mystery,romance,scifi,tv_movie,thriller,war,western,_data_source,_load_time
0,False,0,,381525,tt5376720,Unknown,WiNWiN,American investment fund buys Austrian compani...,0,0,...,0,0,0,0,0,0,0,0,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
1,False,0,"Action, War, Drama, History",104473,tt0223958,Unknown,La prise de Tournavos,"Three military men, seen inside a fortificatio...",1,0,...,0,0,0,0,0,0,1,0,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
2,False,0,Animation,257095,tt0225145,Unknown,Bajaja,The first fairy tale transformed into a full-l...,0,0,...,0,0,0,0,0,0,0,0,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
3,False,0,"Animation, Documentary",381096,tt5333518,Unknown,Garn,The traditional crafts of crochet and knitting...,0,0,...,0,0,0,0,0,0,0,0,the-movies-dataset,2025-02-06 22:48:24.489026+00:00
4,False,0,Comedy,147050,tt0122580,Unknown,Lambchops,"""George and Gracie enter an elegant drawing ro...",0,0,...,0,0,0,0,0,0,0,0,the-movies-dataset,2025-02-06 22:48:24.489026+00:00


#### **Why Splitting Genres into Multiple Columns is Practical**

While adding 20 additional columns might seem excessive at first, this structure significantly improves **query performance, filtering, and analysis**. Instead of relying on inefficient text searches (`LIKE '%Action%'`), we can now leverage **indexed columns** to quickly find and aggregate movies by genre. This structure also ensures **data consistency**, preventing issues caused by variations in how genres are listed. Additionally, this approach simplifies **data visualization**, allowing for easier trend analysis and reporting. In large datasets, structured genre columns are far more efficient than scanning text fields, making this a scalable solution for data-driven decision-making.


#### **staging**

In [5]:
%%bigquery
CREATE or REPLACE TABLE movies_entertainment_stg.movies_metadata as
    SELECT 
        adult,
        budget,
        genres,
        id,
        imdb_id,
        original_language,
        original_title,
        overview,

        CASE WHEN genres LIKE '%Action%' THEN 1 ELSE 0 END AS action,
        CASE WHEN genres LIKE '%Adventure%' THEN 1 ELSE 0 END AS adventure,
        CASE WHEN genres LIKE '%Animation%' THEN 1 ELSE 0 END AS animation,
        CASE WHEN genres LIKE '%Comedy%' THEN 1 ELSE 0 END AS comedy,
        CASE WHEN genres LIKE '%Crime%' THEN 1 ELSE 0 END AS crime,
        CASE WHEN genres LIKE '%Documentary%' THEN 1 ELSE 0 END AS documentary,
        CASE WHEN genres LIKE '%Drama%' THEN 1 ELSE 0 END AS drama,
        CASE WHEN genres LIKE '%Family%' THEN 1 ELSE 0 END AS family,
        CASE WHEN genres LIKE '%Fantasy%' THEN 1 ELSE 0 END AS fantasy,
        CASE WHEN genres LIKE '%Foreign%' THEN 1 ELSE 0 END AS foreign,
        CASE WHEN genres LIKE '%History%' THEN 1 ELSE 0 END AS history,
        CASE WHEN genres LIKE '%Horror%' THEN 1 ELSE 0 END AS horror,
        CASE WHEN genres LIKE '%Music%' THEN 1 ELSE 0 END AS music,
        CASE WHEN genres LIKE '%Mystery%' THEN 1 ELSE 0 END AS mystery,
        CASE WHEN genres LIKE '%Romance%' THEN 1 ELSE 0 END AS romance,
        CASE WHEN genres LIKE '%Science Fiction%' THEN 1 ELSE 0 END AS scifi,
        CASE WHEN genres LIKE '%TV Movie%' THEN 1 ELSE 0 END AS tv_movie,
        CASE WHEN genres LIKE '%Thriller%' THEN 1 ELSE 0 END AS thriller,
        CASE WHEN genres LIKE '%War%' THEN 1 ELSE 0 END AS war,
        CASE WHEN genres LIKE '%Western%' THEN 1 ELSE 0 END AS western,
        _data_source,
        _load_time

    FROM movies_entertainment_raw.movies_metadata


Query is running:   0%|          |

In [4]:
%%bigquery
CREATE or REPLACE TABLE movies_entertainment_stg.box_office_gross as
    SELECT * FROM movies_entertainment_raw.box_office_gross


Query is running:   0%|          |

In [3]:
%%bigquery
delete from movies_entertainment_stg.box_office_gross where brand in ('Platinum Dunes', 'Vertigo Entertainment', 'Bad Robot')


Query is running:   0%|          |