### **Create BQ dataset for storing the raw data**

In [2]:
from google.cloud import bigquery
import os

# Target project, dataset name and location for the raw layer.
project_id = "dylanericsp25"
dataset = "movies_entertainment_raw"
region = "us-central1"

bq_client = bigquery.Client()

# Describe the dataset with a fully qualified "<project>.<dataset>" id and pin
# its location before asking BigQuery to create it.
dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region

# exists_ok=True keeps this cell re-runnable: an already existing dataset is
# returned instead of raising a conflict error.
resp = bq_client.create_dataset(dataset_id, exists_ok=True)

# Report the project the dataset actually lives in (taken from the response)
# rather than the client's default project, which may differ from project_id.
print("Created dataset {}.{}".format(resp.project, resp.dataset_id))

Created dataset dylanericsp25.movies_entertainment_raw


### **Common Functions**

In [3]:
from google.cloud import bigquery

# Cloud Storage side: files are read from gs://<bucket>/<parent_folder>/...
bucket = "movies-entertainment"
parent_folder = "initial-loads"

# BigQuery side: tables are written to <project_id>.<dataset>.
project_id = "dylanericsp25"
dataset = "movies_entertainment_raw"
region = "us-central1"

# Single client shared by the load helpers defined below.
bq_client = bigquery.Client()

def create_load_table_from_csv(folder, file_name, table, schema, delimiter=",", quote_character="\""):
  """Create a raw table (if missing) and load one CSV file from GCS into it.

  The table is created with the full `schema`. The load job uses the schema
  without its last two fields, because the callers put the `_data_source` and
  `_load_time` columns there and neither is present in the CSV.

  Args:
    folder: Folder below gs://{bucket}/{parent_folder}/ that holds the file.
    file_name: Name of the CSV file inside `folder`.
    table: Name of the destination table inside {project_id}.{dataset}.
    schema: Sequence of bigquery.SchemaField whose LAST TWO entries are the
      columns absent from the file. The sequence is not modified, so the same
      schema object can safely be passed again (e.g. when re-running a cell).
    delimiter: Field delimiter used in the file.
    quote_character: Character used to quote fields in the file.

  Prints the created table id, the schema used for the load, and the row count
  of the destination table once the load job has finished.
  """
  uri = f"gs://{bucket}/{parent_folder}/{folder}/{file_name}"
  table_id = f"{project_id}.{dataset}.{table}"

  # Create the table with the complete schema, including the two trailing columns.
  created_table = bq_client.create_table(
      bigquery.Table(table_id, schema=schema), exists_ok=True
  )
  print("Created table {}".format(created_table.table_id))

  # Drop the data_source and load_time fields for the load itself; neither one
  # is present in the csv. Slice into a new list instead of deleting in place
  # so the caller's schema stays intact.
  load_schema = list(schema[:-2])
  print(load_schema)

  job_config = bigquery.LoadJobConfig(
        schema=load_schema,
        skip_leading_rows=1,  # the first row of the file is a header
        source_format=bigquery.SourceFormat.CSV,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_EMPTY",  # only load into a table that holds no data yet
        field_delimiter=delimiter,
        quote_character=quote_character,
        allow_jagged_rows=True,
        ignore_unknown_values=True,
        max_bad_records=10  # allows skipping up to 10 bad rows
      )

  load_job = bq_client.load_table_from_uri(uri, table_id, job_config=job_config)
  load_job.result()  # block until the load job completes

  destination_table = bq_client.get_table(table_id)
  print("Loaded {} rows.".format(destination_table.num_rows))


def create_load_table_from_json(folder, file_name, table, schema):
  """Create a raw table (if missing) and load one newline-delimited JSON file into it.

  The table is created with the full `schema`. The load job uses the schema
  without its last two fields, because the callers put the `_data_source` and
  `_load_time` columns there and neither is present in the JSON.

  Args:
    folder: Folder below gs://{bucket}/{parent_folder}/ that holds the file.
    file_name: Name of the JSON file inside `folder`.
    table: Name of the destination table inside {project_id}.{dataset}.
    schema: Sequence of bigquery.SchemaField whose LAST TWO entries are the
      columns absent from the file. The sequence is not modified, so the same
      schema object can safely be passed again (e.g. when re-running a cell).

  Prints the created table id and the row count of the destination table once
  the load job has finished.
  """
  uri = f"gs://{bucket}/{parent_folder}/{folder}/{file_name}"
  table_id = f"{project_id}.{dataset}.{table}"

  # Create the table with the complete schema, including the two trailing columns.
  created_table = bq_client.create_table(
      bigquery.Table(table_id, schema=schema), exists_ok=True
  )
  print("Created table {}".format(created_table.table_id))

  # Drop the data_source and load_time fields for the load itself; neither one
  # is present in the json. Slice into a new list instead of deleting in place
  # so the caller's schema stays intact.
  load_schema = list(schema[:-2])

  job_config = bigquery.LoadJobConfig(schema=load_schema,
      source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
      write_disposition = "WRITE_EMPTY"  # only load into a table that holds no data yet
  )

  load_job = bq_client.load_table_from_uri(
      uri,
      table_id,
      location=region,
      job_config=job_config,
  )

  load_job.result()  # block until the load job completes

  destination_table = bq_client.get_table(table_id)
  print("Loaded {} rows.".format(destination_table.num_rows))

### **imdb-reviews**

In [4]:
# Source file in GCS and destination table for the IMDb reviews load.
folder = "imdb-reviews/out"
file_name = "all_imdb_reviews.csv"
table = "imdb_reviews"
delimiter = ","

# Every column read from the file is kept as STRING in the raw layer; only the
# mode differs per column.
schema = [
  bigquery.SchemaField(column, "STRING", mode=mode)
  for column, mode in (
    ("filename", "REQUIRED"),
    ("movie_name", "REQUIRED"),
    ("sentiment", "REQUIRED"),
    ("key_themes", "REQUIRED"),
    ("named_entities", "REQUIRED"),
    ("emotional_tone", "NULLABLE"),
    ("star_rating", "NULLABLE"),
  )
]

# The two trailing columns are not in the file; they are filled from their
# default value expressions. They must stay last because the load helper drops
# the final two schema entries before loading.
schema.append(
  bigquery.SchemaField("_data_source", "STRING", mode="REQUIRED", default_value_expression="'imdb-reviews'")
)
schema.append(
  bigquery.SchemaField("_load_time", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table_from_csv(
  folder=folder,
  file_name=file_name,
  table=table,
  schema=schema,
  delimiter=delimiter,
)

Created table imdb_reviews
[SchemaField('filename', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('movie_name', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('sentiment', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('key_themes', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('named_entities', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('emotional_tone', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('star_rating', 'STRING', 'NULLABLE', None, None, (), None)]
Loaded 1257 rows.


### **netflix_movies_and_tvshows**

In [22]:
# Source file in GCS and destination table for the Netflix titles load.
folder = "netflix-movies-and-tv-shows"
file_name = "netflix_titles.csv"
table = "netflix_movies_and_tvshows"
delimiter = ","

# Every column read from the file is kept as STRING in the raw layer; only the
# mode differs per column.
schema = [
    bigquery.SchemaField(column, "STRING", mode=mode)
    for column, mode in (
        ("show_id", "REQUIRED"),
        ("type", "NULLABLE"),
        ("title", "REQUIRED"),
        ("director", "NULLABLE"),
        ("cast", "NULLABLE"),
        ("country", "NULLABLE"),
        ("date_added", "NULLABLE"),
        ("release_year", "NULLABLE"),
        ("rating", "NULLABLE"),
        ("duration", "NULLABLE"),
    )
]

# The two trailing columns are not in the file; they are filled from their
# default value expressions. They must stay last because the load helper drops
# the final two schema entries before loading.
schema.append(
    bigquery.SchemaField("_data_source", "STRING", mode="REQUIRED", default_value_expression="'netflix-movies-and-tv-shows'")
)
schema.append(
    bigquery.SchemaField("_load_time", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table_from_csv(
    folder=folder,
    file_name=file_name,
    table=table,
    schema=schema,
    delimiter=delimiter,
)

Created table netflix_movies_and_tvshows
[SchemaField('show_id', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('type', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('title', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('director', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('cast', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('country', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('date_added', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('release_year', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('rating', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('duration', 'STRING', 'NULLABLE', None, None, (), None)]
Loaded 8807 rows.


### **movies_metadata**

In [29]:
# Source file in GCS and destination table for the movies metadata load.
folder = "the-movies-dataset/Movies Dataset"
file_name = "movies_metadata.csv"
table = "movies_metadata"
delimiter = ","

# (column name, BigQuery type, mode) for each column read from the file.
schema = [
    bigquery.SchemaField(column, field_type, mode=mode)
    for column, field_type, mode in (
        ("adult", "STRING", "REQUIRED"),
        ("budget", "INTEGER", "REQUIRED"),
        ("genres", "STRING", "NULLABLE"),
        ("id", "INTEGER", "REQUIRED"),
        ("imdb_id", "STRING", "REQUIRED"),
        ("original_language", "STRING", "REQUIRED"),
        ("original_title", "STRING", "REQUIRED"),
        ("overview", "STRING", "NULLABLE"),
    )
]

# The two trailing columns are not in the file; they are filled from their
# default value expressions. They must stay last because the load helper drops
# the final two schema entries before loading.
schema.append(
    bigquery.SchemaField("_data_source", "STRING", mode="REQUIRED", default_value_expression="'the-movies-dataset'")
)
schema.append(
    bigquery.SchemaField("_load_time", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table_from_csv(
    folder=folder,
    file_name=file_name,
    table=table,
    schema=schema,
    delimiter=delimiter,
)


Created table movies_metadata
[SchemaField('adult', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('budget', 'INTEGER', 'REQUIRED', None, None, (), None), SchemaField('genres', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('id', 'INTEGER', 'REQUIRED', None, None, (), None), SchemaField('imdb_id', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('original_language', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('original_title', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('overview', 'STRING', 'NULLABLE', None, None, (), None)]
Loaded 45463 rows.


### **box_office_gross**

In [36]:
# Source file in GCS and destination table for the box office brand load.
# The file has a .csv extension but is loaded with a tab as field delimiter.
folder = "box-office-gross/Top Box Office Revenue Data"
file_name = "bomojobrandindices.csv"
table = "box_office_gross"
delimiter = "\t"

# (column name, BigQuery type, mode) for each column read from the file.
schema = [
    bigquery.SchemaField(column, field_type, mode=mode)
    for column, field_type, mode in (
        ("brand", "STRING", "REQUIRED"),
        ("total", "INTEGER", "NULLABLE"),
        ("releases", "INTEGER", "NULLABLE"),
        ("number_1_release", "STRING", "NULLABLE"),
        ("lifetime_gross", "INTEGER", "NULLABLE"),
    )
]

# The two trailing columns are not in the file; they are filled from their
# default value expressions. They must stay last because the load helper drops
# the final two schema entries before loading.
schema.append(
    bigquery.SchemaField("_data_source", "STRING", mode="REQUIRED", default_value_expression="'box-office-gross'")
)
schema.append(
    bigquery.SchemaField("_load_time", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP")
)

create_load_table_from_csv(
    folder=folder,
    file_name=file_name,
    table=table,
    schema=schema,
    delimiter=delimiter,
)

Created table box_office_gross
[SchemaField('brand', 'STRING', 'REQUIRED', None, None, (), None), SchemaField('total', 'INTEGER', 'NULLABLE', None, None, (), None), SchemaField('releases', 'INTEGER', 'NULLABLE', None, None, (), None), SchemaField('number_1_release', 'STRING', 'NULLABLE', None, None, (), None), SchemaField('lifetime_gross', 'INTEGER', 'NULLABLE', None, None, (), None)]
Loaded 45 rows.


---
---
---
---
---
---
---
---
---