# Fahmi Abdulaziz - Qoala ETL Pipeline Assignment
This is the development notebook used for developing and prototyping the ETL pipeline.

## Preparing Dependencies

In [1]:
!pip install -q pyspark kaggle

[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[K     |████████████████████████████████| 198 kB 58.5 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Mount Google Drive at /content/drive; later cells read kaggle.json and the
# joined dataset files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Prepare the Kaggle credentials.

In [3]:
# Copy the Kaggle API token from the mounted Drive into ~/.kaggle.
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

## Preparing Source File

### Download & unzip tmdb dataset

In [None]:
%%bash --err null
# Download the TMDB archive into /content.
kaggle datasets download edgartanaka1/tmdb-movies-and-series -p /content
# -o: overwrite without prompting so the cell can be re-run;
# -q: do not print one line per extracted file;
# -d: always extract into /content (where the combining step looks for the
#     `series/series` and `movies/movies` folders), whatever the working directory is.
unzip -o -q /content/tmdb-movies-and-series.zip -d /content

### Combining all the files into one JSON
This step is not strictly necessary, but it speeds up file loading considerably.

In [None]:
from os import listdir
from os.path import isfile, join
import json

In [None]:
def get_files_in_dir(path: str):
    """Return the names (not full paths) of the files located directly in `path`.

    Entries for which `isfile` is false (e.g. sub-directories) are skipped.
    The names come back in the order `listdir` yields them.
    """
    file_names = []
    for entry in listdir(path):
        if isfile(join(path, entry)):
            file_names.append(entry)
    return file_names

In [None]:
def combine_all_file(dir: str, out: str):
    """Concatenate every file found directly in `dir` into the single file `out`.

    The content of each input file is written followed by a newline, so each
    input file ends up separated from the next by a line break.

    Args:
        dir: Directory whose files are combined (sub-directories are ignored).
        out: Path of the combined output file.

    Note:
        `out` is rewritten from scratch on every call. It used to be opened in
        append mode, which duplicated every record whenever the cell was re-run.
    """
    # List the inputs before creating the output so a fresh `out` is never one of them.
    file_paths = get_files_in_dir(dir)

    # `with` guarantees the output handle is closed even if reading an input fails.
    with open(out, "w") as joined_file:
        for file_path in file_paths:
            with open(join(dir, file_path)) as f:
                payload = f.read()
                joined_file.write(payload+'\n')

In [None]:
# Build one combined file per dataset from /content/<folder>/<folder>.
# The loop variable is named `source_dir` (not `dir`) so the builtin `dir()`
# stays usable in the notebook namespace.
for folder in ['series', 'movies']:
    source_dir = join("/content", folder, folder)
    out_file = join("/content", folder+"_joined.json")
    combine_all_file(source_dir, out_file)

## Extract

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F

In [5]:
# Create (or reuse) the Spark session for this notebook; the driver result
# size limit is raised to 10g.
spark = (
    SparkSession.builder
    .appName("tmdb")
    .config("spark.driver.maxResultSize", "10g")
    .getOrCreate()
)

In [6]:
# NOTE(review): the combining step writes to /content/*_joined.json, while the
# files are read from Drive here -- presumably they were copied there manually; confirm.
movies_path = "/content/drive/MyDrive/datasets/tmdb/movies_joined.json"
series_path = "/content/drive/MyDrive/datasets/tmdb/series_joined.json"

df_movies = spark.read.json(movies_path)
df_series = spark.read.json(series_path)

In [7]:
# Show the schema Spark inferred for the movies JSON.
df_movies.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: struct (nullable = true)
 |    |-- backdrop_path: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- poster_path: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |

In [8]:
# Show the schema Spark inferred for the series JSON.
df_series.printSchema()

root
 |-- backdrop_path: string (nullable = true)
 |-- created_by: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- credit_id: string (nullable = true)
 |    |    |-- gender: long (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- profile_path: string (nullable = true)
 |-- episode_run_time: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- first_air_date: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- in_production: boolean (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- last_air_date: string (nullable = true)
 |-- last_episode_to_air: struct (nullable = true)
 |    |-

## Exploration

### Array struct column type consistency check
Values that are repeated across rows in `Array - Struct` columns can be inconsistent with one another. In this part we check whether the data in each `Array - Struct` field is consistent.

In [17]:
# utility function
def explode_array_column(dataframe: DataFrame, column: str, new_column: str):
    """Return a one-column DataFrame with one row per element of an array column.

    Args:
        dataframe: Source DataFrame.
        column: Name of the array column to explode.
        new_column: Name given to the resulting element column.

    Returns:
        A DataFrame whose only column is `new_column`.
    """
    # `explode` emits no rows for null or empty arrays, so no separate
    # size filter is needed before it.
    return dataframe.select(F.explode(F.col(column)).alias(new_column))


def check_array_struct_uniqueness(dataframe: DataFrame, column: str, id_key: str):
    """Warn when one `id_key` value maps to more than one distinct struct.

    The structs in the array column `column` are exploded, de-duplicated and
    counted per `id_key`. Any key left with more than one distinct struct has
    differing values in at least one other field.

    Args:
        dataframe: DataFrame holding the array-of-struct column.
        column: Name of the array-of-struct column to check.
        id_key: Struct field that is expected to identify a struct uniquely.

    Returns:
        None. When inconsistencies exist, a message and up to 10 offending
        keys (with their distinct-struct counts) are printed.
    """
    res_table = explode_array_column(dataframe, column, "temp") \
        .select(F.col("temp.*")) \
        .distinct() \
        .groupBy(id_key).count() \
        .where(F.col("count") > 1)

    # Fetching a single row is enough to know whether any offender exists.
    if res_table.take(1):
        print(f"Data in column {column} is inconsistent")
        res_table.show(10)

In [19]:
# Run the consistency check on the array-of-struct columns of interest.
# (The movies `genres` check used to be listed twice; it now runs once.)
check_array_struct_uniqueness(df_movies, "production_companies", "id")
check_array_struct_uniqueness(df_movies, "genres", "id")
check_array_struct_uniqueness(df_movies, "spoken_languages", "iso_639_1")
check_array_struct_uniqueness(df_movies, "production_countries", "iso_3166_1")
check_array_struct_uniqueness(df_series, "genres", "id")
check_array_struct_uniqueness(df_series, "networks", "id")
check_array_struct_uniqueness(df_series, "production_companies", "id")

Data in column production_companies is inconsistent
+----+-----+
|  id|count|
+----+-----+
| 215|    2|
|6689|    2|
+----+-----+



In [23]:
# Show every occurrence of the two company ids flagged by the check above.
for company_id in (215, 6689):
    df_movies.select(F.explode("production_companies")) \
        .select("col.*") \
        .where(F.col("id") == company_id) \
        .show()

# The results below show that both companies have inconsistencies in their data.
# For this project, I will just leave a warning in the pipeline.

+---+--------------------+--------------------+--------------+
| id|           logo_path|                name|origin_country|
+---+--------------------+--------------------+--------------+
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|              |
|215|/tQyeqkCj24krhY2W...|Double Feature Films|            US|
|215|/tQyeqkCj24krhY2W...|Double Feature Films|        

In [15]:
# Same call as in the Extract section: re-prints the movies schema for reference.
df_movies.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: struct (nullable = true)
 |    |-- backdrop_path: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- poster_path: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |

In [14]:
# Same call as in the Extract section: re-prints the series schema for reference.
df_series.printSchema()

root
 |-- backdrop_path: string (nullable = true)
 |-- created_by: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- credit_id: string (nullable = true)
 |    |    |-- gender: long (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- profile_path: string (nullable = true)
 |-- episode_run_time: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- first_air_date: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- in_production: boolean (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- last_air_date: string (nullable = true)
 |-- last_episode_to_air: struct (nullable = true)
 |    |-

### Common field
Both the series and movies datasets share two columns: `genres` and `production_companies`. Each is an `ArrayType` column of `Struct` elements. However, we are not yet sure whether the two datasets use the same `id` and `name` values in these fields. This part checks whether `genres` and `production_companies` are the same in both datasets.

In [24]:
# Compare the genres of the two datasets side by side.

# Distinct (id, name) genre pairs of each dataset, with dataset-prefixed column names.
df_series_gen = (
    df_series
    .select(F.explode(F.col("genres")))
    .select(
        F.col("col.id").alias("series_id"),
        F.col("col.name").alias("series_genre"),
    )
    .distinct()
)
df_movies_gen = (
    df_movies
    .select(F.explode(F.col("genres")))
    .select(
        F.col("col.id").alias("movies_id"),
        F.col("col.name").alias("movies_genre"),
    )
    .distinct()
)

# A full outer join keeps genres that exist in only one of the datasets.
genre_comparison = df_series_gen.join(
    df_movies_gen,
    on=df_series_gen.series_id == df_movies_gen.movies_id,
    how="full",
)
genre_comparison.show(5)

# From the result below, the ids and genre names of the two tables appear to be the same.

+---------+------------+---------+------------+
|series_id|series_genre|movies_id|movies_genre|
+---------+------------+---------+------------+
|       12|   Adventure|       12|   Adventure|
|       14|     Fantasy|       14|     Fantasy|
|       16|   Animation|       16|   Animation|
|       18|       Drama|       18|       Drama|
|       22|     Musical|     null|        null|
+---------+------------+---------+------------+
only showing top 5 rows



In [27]:
# Check further: look for ids present in both datasets whose genre names differ.
present_in_both = F.col("movies_id").isNotNull() & F.col("series_id").isNotNull()
genre_name_differs = F.col("movies_genre") != F.col("series_genre")

genre_comparison.where(present_in_both & genre_name_differs).show()

# The empty result below shows that genre ids are consistent between the series
# and movies datasets, which leads to the decision to normalize the genre table.

+---------+------------+---------+------------+
|series_id|series_genre|movies_id|movies_genre|
+---------+------------+---------+------------+
+---------+------------+---------+------------+



In [76]:
# Merge the two sides of the comparison: take the series values when the
# series side is present, otherwise fall back to the movies values.
has_series_side = F.col("series_id").isNotNull()

dwh_genres = genre_comparison.select(
    F.when(has_series_side, F.col("series_id")).otherwise(F.col("movies_id")).alias("id"),
    F.when(has_series_side, F.col("series_genre")).otherwise(F.col("movies_genre")).alias("genre"),
)

dwh_genres.show(5)

# The table below will be used as the genre dimension table.

+---+---------+
| id|    genre|
+---+---------+
| 12|Adventure|
| 14|  Fantasy|
| 16|Animation|
| 18|    Drama|
| 22|  Musical|
+---+---------+
only showing top 5 rows



In [29]:
# Compare the production companies of the two datasets side by side.

# Distinct (id, name, origin_country) triples of each dataset, with
# dataset-prefixed column names.
df_series_com = (
    df_series
    .select(F.explode(F.col("production_companies")))
    .select(
        F.col("col.id").alias("series_id"),
        F.col("col.name").alias("series_company"),
        F.col("col.origin_country").alias("series_origin_country"),
    )
    .distinct()
)
df_movies_com = (
    df_movies
    .select(F.explode(F.col("production_companies")))
    .select(
        F.col("col.id").alias("movies_id"),
        F.col("col.name").alias("movies_company"),
        F.col("col.origin_country").alias("movies_origin_country"),
    )
    .distinct()
)

# A full outer join keeps companies that exist in only one of the datasets.
companies_comparison = df_series_com.join(
    df_movies_com,
    on=df_series_com.series_id == df_movies_com.movies_id,
    how="full",
)
companies_comparison.show(5)

# From the result below, the ids and company attributes of the two tables appear to be the same.

+---------+-------------------+---------------------+---------+-------------------+---------------------+
|series_id|     series_company|series_origin_country|movies_id|     movies_company|movies_origin_country|
+---------+-------------------+---------------------+---------+-------------------+---------------------+
|        1|     Lucasfilm Ltd.|                   US|        1|     Lucasfilm Ltd.|                   US|
|        5|  Columbia Pictures|                   US|        5|  Columbia Pictures|                   US|
|     null|               null|                 null|        6| RKO Radio Pictures|                   US|
|        7|DreamWorks Pictures|                   US|        7|DreamWorks Pictures|                   US|
|        9|            Gaumont|                   FR|        9|            Gaumont|                   FR|
+---------+-------------------+---------------------+---------+-------------------+---------------------+
only showing top 5 rows



In [43]:
# Check further: look for companies present in both datasets whose attributes disagree.
# A difference in EITHER the name OR the origin country is an inconsistency
# (the previous `&` only reported rows where both differed at once), and the
# comparison is null-safe so a value missing on just one side is reported too.
name_differs = ~F.col("movies_company").eqNullSafe(F.col("series_company"))
country_differs = ~F.col("movies_origin_country").eqNullSafe(F.col("series_origin_country"))

companies_comparison \
.where(F.col("movies_id").isNotNull() \
       & F.col("series_id").isNotNull() \
       & (name_differs | country_differs)) \
.show()

# An empty result means the company attributes agree between the two datasets;
# any row listed is a company whose name or origin country differs.

+---------+--------------+---------------------+---------+--------------+---------------------+
|series_id|series_company|series_origin_country|movies_id|movies_company|movies_origin_country|
+---------+--------------+---------------------+---------+--------------+---------------------+
+---------+--------------+---------------------+---------+--------------+---------------------+



In [77]:
# Merge the two sides of the comparison (series values when the series side is
# present, movies values otherwise), then reduce to exactly one row per company id.
# The source data holds the same id with differing attributes (e.g. id 215 appears
# with origin country "US" and with a blank one), which would otherwise leave
# duplicate keys in the dimension table. `max` is deterministic, ignores nulls and
# prefers a non-empty string over an empty one.
dwh_companies = companies_comparison \
    .withColumn("id", F.when(F.col("series_id").isNull(), F.col("movies_id")).otherwise(F.col("series_id"))) \
    .withColumn("company", F.when(F.col("series_id").isNull(), F.col("movies_company")).otherwise(F.col("series_company"))) \
    .withColumn("origin_country", F.when(F.col("series_id").isNull(), F.col("movies_origin_country")).otherwise(F.col("series_origin_country"))) \
    .select(["id", "company", "origin_country"]) \
    .groupBy("id") \
    .agg(F.max("company").alias("company"), F.max("origin_country").alias("origin_country")) \
    .where(F.col("origin_country").isNotNull())

dwh_companies.show(5)

# The table below will be used as the company dimension table (one row per id).

+---+-------------------+--------------+
| id|            company|origin_country|
+---+-------------------+--------------+
|  1|     Lucasfilm Ltd.|            US|
|  5|  Columbia Pictures|            US|
|  6| RKO Radio Pictures|            US|
|  7|DreamWorks Pictures|            US|
|  9|            Gaumont|            FR|
+---+-------------------+--------------+
only showing top 5 rows



## Transform

In [72]:
# Replace the nested `genres` / `production_companies` structs by arrays of their ids.
# Selecting a field through an array-of-struct column yields the array of that
# field's values, so every movie is kept -- including those whose arrays are
# empty. The previous explode + groupBy round trip dropped such rows (`explode`
# emits nothing for an empty or null array) and needed two wide shuffles.
dwh_movies = df_movies \
    .withColumn("genre_ids", F.col("genres.id")) \
    .withColumn("company_ids", F.col("production_companies.id")) \
    .drop("genres", "production_companies")

dwh_movies.show(5)

# The table below will be saved to the DWH as a fact table.

+-----+-------------+---------------------+------+--------+------+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+-------+-------+----------------+--------+-------+--------------------+-----+------------+----------+---------+-------------+
|adult|backdrop_path|belongs_to_collection|budget|homepage|    id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_countries|release_date|revenue|runtime|spoken_languages|  status|tagline|               title|video|vote_average|vote_count|genre_ids|  company_ids|
+-----+-------------+---------------------+------+--------+------+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+-------+-------+----------------+--------+-------+--------------------+-----+------------+----------+---------+-------------+
|false|         

In [73]:
# Replace the nested `genres` / `production_companies` structs by arrays of their ids.
# Selecting a field through an array-of-struct column yields the array of that
# field's values, so every series is kept -- including those whose arrays are
# empty. The previous explode + groupBy round trip dropped such rows (`explode`
# emits nothing for an empty or null array) and needed two wide shuffles.
dwh_series = df_series \
    .withColumn("genre_ids", F.col("genres.id")) \
    .withColumn("company_ids", F.col("production_companies.id")) \
    .drop("genres", "production_companies")

dwh_series.show(5)

# The table below will be saved to the DWH as a fact table.

+-------------+----------+----------------+--------------+--------------------+------+-------------+---------+-------------+-------------------+-----------------------------------+--------------------+-------------------+------------------+-----------------+--------------+-----------------+-----------------------------------+--------+----------+--------------------+--------------------+-------------+--------+------------+----------+--------------------+--------------+
|backdrop_path|created_by|episode_run_time|first_air_date|            homepage|    id|in_production|languages|last_air_date|last_episode_to_air|                               name|            networks|next_episode_to_air|number_of_episodes|number_of_seasons|origin_country|original_language|                      original_name|overview|popularity|         poster_path|             seasons|       status|    type|vote_average|vote_count|           genre_ids|   company_ids|
+-------------+----------+----------------+-----------

## Load
Save data to parquet files

In [78]:
# Save the fact and dimension tables in parquet format.
# "overwrite" makes the cell re-runnable; with the default save mode the write
# fails as soon as the output path already exists.
dwh_series.write.mode("overwrite").parquet("series.parquet")
dwh_movies.write.mode("overwrite").parquet("movies.parquet")
dwh_genres.write.mode("overwrite").parquet("genres.parquet")
dwh_companies.write.mode("overwrite").parquet("companies.parquet")

In [79]:
!pip install -q gcloud google-cloud-bigquery

[?25l[K     |▊                               | 10 kB 23.1 MB/s eta 0:00:01[K     |█▍                              | 20 kB 28.7 MB/s eta 0:00:01[K     |██▏                             | 30 kB 24.5 MB/s eta 0:00:01[K     |██▉                             | 40 kB 19.3 MB/s eta 0:00:01[K     |███▋                            | 51 kB 9.3 MB/s eta 0:00:01[K     |████▎                           | 61 kB 9.8 MB/s eta 0:00:01[K     |█████                           | 71 kB 9.2 MB/s eta 0:00:01[K     |█████▊                          | 81 kB 10.1 MB/s eta 0:00:01[K     |██████▌                         | 92 kB 10.8 MB/s eta 0:00:01[K     |███████▏                        | 102 kB 8.6 MB/s eta 0:00:01[K     |████████                        | 112 kB 8.6 MB/s eta 0:00:01[K     |████████▋                       | 122 kB 8.6 MB/s eta 0:00:01[K     |█████████▍                      | 133 kB 8.6 MB/s eta 0:00:01[K     |██████████                      | 143 kB 8.6 MB/s eta 0:00:01[

In [80]:
# Interactive login to Google Cloud. The prompt echoes the verification code and
# the account e-mail into the cell output -- clear that output before sharing the notebook.
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=yxiFNIHhouMFiGC7KXH7TINJcSdp9m&prompt=consent&access_type=offline&code_challenge=2qLX-y8hG3x5nKScJFLolsrNXI-aQ5LWW5jZ2CY6_i8&code_challenge_method=S256

Enter verification code: 4/1AX4XfWiO4X-IQf0TnA-N0M0lAqKnWedr2vnprtHj9LBx6fZ0tnrWmS6HYBw

You are now logged in as [afahmi13@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [81]:
# Select the project used by the following gcloud/gsutil commands.
!gcloud config set project de-porto

Updated property [core/project].


In [82]:
!gsutil cp -r ./movies.parquet gs://de-porto/qoala/movies.parquet

Copying file://./movies.parquet/._SUCCESS.crc [Content-Type=application/octet-stream]...
Copying file://./movies.parquet/.part-00000-6a6bac41-c8b0-4b99-a1f8-b875cd8bc7eb-c000.snappy.parquet.crc [Content-Type=application/octet-stream]...
Copying file://./movies.parquet/_SUCCESS [Content-Type=application/octet-stream]...
Copying file://./movies.parquet/part-00002-6a6bac41-c8b0-4b99-a1f8-b875cd8bc7eb-c000.snappy.parquet [Content-Type=application/octet-stream]...
\
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./movies.parquet/.part-00001-6a6bac41-c8b0-4b99-a1f8-b875cd8bc7eb-c000.snappy.parquet.crc [Content-Type=application/octet-stream]...
Copying file://./movies.parquet/.part-00002-6a6bac41-c8b0-4b99-a1f8-b875cd8bc7eb-c000.snappy.parquet.crc [Content-Type=application/oct

In [88]:
# load to bigquery
from google.cloud import bigquery

client = bigquery.Client.from_service_account_json("./de-porto-key.json")
config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET)
# Load every part file of the dataset through a wildcard. The previous URI named
# a single, run-specific part file (part-00001-<uuid>), so only one partition of
# the data was loaded and the cell broke whenever the parquet was regenerated.
# The pattern does not match the `.crc` and `_SUCCESS` side files.
# NOTE(review): no write disposition is set, so re-running presumably appends
# to the table again -- confirm the intended behaviour.
uri = "gs://de-porto/qoala/movies.parquet/*.parquet"
table_id = "de-porto.de_porto.test1"

job = client.load_table_from_uri(
    uri, table_id, job_config=config
)

# Block until the load job finishes (raises if the job failed).
job.result()

<google.cloud.bigquery.job.LoadJob at 0x7f7b62c6f210>

In [None]:
# List every production-company struct recorded for id 6689.
exploded_companies = explode_array_column(df_movies, "production_companies", "companies")
exploded_companies.select("companies.*").where(F.col("id") == 6689).show()

+----+--------------------+-----+--------------+
|  id|           logo_path| name|origin_country|
+----+--------------------+-----+--------------+
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59

In [None]:
# Persist the transformed movies table. This cell used to write the raw
# `df_movies` to the very path that already holds the `dwh_movies` output, which
# fails with the default save mode because the path exists.
dwh_movies.write.mode("overwrite").parquet("./movies.parquet")

In [None]:
# Interactive login to Google Cloud (same command as in the Load section above).
!gcloud auth login

In [None]:
# Upload the movies parquet folder into gs://de-porto/qoala, copying files in parallel (-m).
!gsutil -m cp -r ./movies.parquet gs://de-porto/qoala