# Fahmi Abdulaziz - Qoala ETL Pipeline Assignment
This is the development notebook used for ETL development and prototyping.

## Preparing Dependencies

In [2]:
!pip install -q pyspark kaggle

[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[K     |████████████████████████████████| 198 kB 54.1 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Preparing the Kaggle credentials

In [None]:
# Copy the Kaggle API token stored on Drive into ~/.kaggle/.
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

## Preparing Source File

### Download & unzip tmdb dataset

In [None]:
%%bash --err null
# Download the TMDB archive into /content and extract it there explicitly, so the
# /content/movies and /content/series folders used by later cells do not depend
# on the current working directory.
kaggle datasets download edgartanaka1/tmdb-movies-and-series -p /content
unzip /content/tmdb-movies-and-series.zip -d /content

### Combining all the files into one JSON
This step is not strictly necessary, but it speeds up file loading considerably.

In [None]:
from os import listdir
from os.path import isfile, join
import json

In [None]:
def get_files_in_dir(path: str) -> list:
    """Return the names of the regular files located directly inside ``path``.

    Sub-directories are skipped. The names are returned without the directory
    prefix and sorted, so repeated runs list the files in the same order.
    """
    return sorted(name for name in listdir(path) if isfile(join(path, name)))

In [None]:
def combine_all_file(dir: str, out: str):
    """Concatenate the content of every file in ``dir`` into the file ``out``.

    The content of each source file is written followed by a newline.
    ``out`` is rewritten from scratch on every call, so re-running the
    notebook does not append a second copy of the same records.
    """
    # List the inputs before creating the output file.
    file_names = get_files_in_dir(dir)

    # Context managers close both handles even if a read or write fails.
    with open(out, "w") as joined_file:
        for file_name in file_names:
            with open(join(dir, file_name)) as f:
                joined_file.write(f.read() + '\n')

In [None]:
# Build /content/series_joined.json and /content/movies_joined.json from the
# extracted /content/<folder>/<folder> directories.
for folder in ['series', 'movies']:
    # Named source_dir so the built-in dir() is not shadowed in the notebook namespace.
    source_dir = join("/content", folder, folder)
    out_file = join("/content", folder+"_joined.json")
    combine_all_file(source_dir, out_file)

## Extract

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F

In [5]:
# Spark session used by the TMDB cells below.
spark = (
    SparkSession.builder
    .appName("tmdb")
    .config("spark.driver.maxResultSize", "10g")
    .getOrCreate()
)

In [7]:
# Load the combined JSON files from Google Drive (no explicit schema is passed).
# NOTE(review): these files are copied into this Drive folder by a cell further
# down the notebook, so that cell must have been run at least once beforehand.
df_movies = spark.read.json("/content/drive/MyDrive/datasets/tmdb/movies_joined.json")
df_series = spark.read.json("/content/drive/MyDrive/datasets/tmdb/series_joined.json")

In [33]:
# Inspect the schema of the movies dataset.
df_movies.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: struct (nullable = true)
 |    |-- backdrop_path: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- poster_path: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |

In [9]:
# Inspect the schema of the series dataset.
df_series.printSchema()

root
 |-- backdrop_path: string (nullable = true)
 |-- created_by: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- credit_id: string (nullable = true)
 |    |    |-- gender: long (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- profile_path: string (nullable = true)
 |-- episode_run_time: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- first_air_date: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- in_production: boolean (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- last_air_date: string (nullable = true)
 |-- last_episode_to_air: struct (nullable = true)
 |    |-

## Transform

In [31]:
# Compare the genre (id, name) pairs found in the two datasets.

# One row per genre struct, flattened into columns and de-duplicated.
series_genre_structs = df_series.select(F.explode(F.col("genres")))
df_series_gen = (
    series_genre_structs
    .select(F.col("col.*"))
    .withColumnRenamed("id", "series_id")
    .withColumnRenamed("name", "series_genre")
    .distinct()
)
movies_genre_structs = df_movies.select(F.explode(F.col("genres")))
df_movies_gen = (
    movies_genre_structs
    .select(F.col("col.*"))
    .withColumnRenamed("id", "movies_id")
    .withColumnRenamed("name", "movies_genre")
    .distinct()
)

# The left join keeps every series genre; the movie columns are null where the
# id does not occur in the movies dataset.
genre_join_condition = df_series_gen.series_id == df_movies_gen.movies_id
genre_comparison = df_series_gen.join(df_movies_gen, on=genre_join_condition, how="left")
genre_comparison.show()

# In the output below, the ids that occur in both datasets carry the same genre name.

+---------+------------------+---------+---------------+
|series_id|      series_genre|movies_id|   movies_genre|
+---------+------------------+---------+---------------+
|       22|           Musical|     null|           null|
|    10759|Action & Adventure|     null|           null|
|    10752|               War|    10752|            War|
|    10763|              News|     null|           null|
|       27|            Horror|       27|         Horror|
|    10764|           Reality|     null|           null|
|     1115|        Road Movie|     null|           null|
|    10749|           Romance|    10749|        Romance|
|    10751|            Family|    10751|         Family|
|       37|           Western|       37|        Western|
|    10767|              Talk|     null|           null|
|       80|             Crime|       80|          Crime|
|    10762|              Kids|     null|           null|
|       36|           History|       36|        History|
|       18|             Drama| 

In [32]:
# Check further: look for ids present in both datasets whose genre names differ.
# A null test has to use isNotNull(); comparing a column with `!= None` evaluates
# to NULL for every row, so such a filter returns no rows regardless of the data.
genre_comparison.where(
    F.col("movies_id").isNotNull() & (F.col("movies_genre") != F.col("series_genre"))
).show()

# An empty result means the genre ids are consistent between the series and
# movies datasets, which supports the decision to normalize the genre table.

+---------+------------+---------+------------+
|series_id|series_genre|movies_id|movies_genre|
+---------+------------+---------+------------+
+---------+------------+---------+------------+



In [37]:
# Preview the series genres under generic column names.
genre_dimension_preview = (
    df_series_gen
    .withColumnRenamed("series_id", "id")
    .withColumnRenamed("series_genre", "genre")
)
genre_dimension_preview.show(5)

# The table below will be used as the genre dimension table.

+-----+------------------+
|   id|             genre|
+-----+------------------+
|10766|              Soap|
|10759|Action & Adventure|
|   27|            Horror|
|  878|   Science Fiction|
|   53|          Thriller|
+-----+------------------+
only showing top 5 rows



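Check `production_companies` uniqueness. The result shows that the relation between `production_companies.id` and `production_companies.name` is consistent, because there is no `id` with a different `name`. The check below still flags two `id`s (215 and 6689) as having more than one distinct record; for `id` 6689 the records differ in `logo_path`, not in `name`.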

In [12]:
# Preview the movies dataset (show() prints the first 20 rows by default).
df_movies.show()

+-----+--------------------+---------------------+-------+--------------------+--------------------+------+----------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+--------+-------+-----------------------+--------+--------------------+--------------------+-----+------------+----------+
|adult|       backdrop_path|belongs_to_collection| budget|              genres|            homepage|    id|   imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date| revenue|runtime|       spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|
+-----+--------------------+---------------------+-------+--------------------+--------------------+------+----------+-----------------+--------------------+--------------------+----------+--------------------+--------------

In [None]:
def explode_array_column(dataframe: DataFrame, column: str, new_column: str) -> DataFrame:
    """Return one row per element of the array column ``column``.

    Rows whose array is empty are filtered out before exploding. The result
    has a single column, ``new_column``, holding the exploded element.
    """
    # Parentheses replace the backslash continuations; the original chain ended
    # with a dangling "\" that continued onto a whitespace-only line.
    return (
        dataframe.select(F.col(column))
        .where(F.size(F.col(column)) > 0)
        .withColumn(new_column, F.explode(F.col(column)))
        .select(F.col(new_column))
    )


def check_array_struct_uniqueness(dataframe: DataFrame, column: str, id_key: str):
    """Show the ``id_key`` values that map to more than one distinct struct.

    The array column ``column`` is exploded, the struct fields are flattened
    into columns and de-duplicated; every ``id_key`` that still occurs more
    than once is printed with its count. Nothing is returned.
    """
    exploded = explode_array_column(dataframe, column, "temp")
    distinct_structs = exploded.select(F.col("temp.*")).distinct()
    occurrences = distinct_structs.groupBy(id_key).count()
    occurrences.where(F.col("count") > 1).show()

In [None]:
# Run the uniqueness check on each array-of-struct column, paired with the field
# that identifies its elements.
for array_column, key_field in [
    ("production_companies", "id"),
    ("genres", "id"),
    ("spoken_languages", "iso_639_1"),
    ("production_countries", "iso_3166_1"),
]:
    check_array_struct_uniqueness(df_movies, array_column, key_field)

+----+-----+
|  id|count|
+----+-----+
| 215|    2|
|6689|    2|
+----+-----+

+---+-----+
| id|count|
+---+-----+
+---+-----+

+---------+-----+
|iso_639_1|count|
+---------+-----+
+---------+-----+

+----------+-----+
|iso_3166_1|count|
+----------+-----+
+----------+-----+



In [None]:
# Show every occurrence of company id 6689, one of the ids reported by the
# uniqueness check.
company_rows = explode_array_column(df_movies, "production_companies", "companies").select("companies.*")
company_rows.where(F.col("id") == 6689).show()

+----+--------------------+-----+--------------+
|  id|           logo_path| name|origin_country|
+----+--------------------+-----+--------------+
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/vaKMJDLIZCeqhGC5...|SHAFT|            JP|
|6689|/i6J2W84TzUf59PdM...|SHAFT|            JP|
|6689|/i6J2W84TzUf59

In [38]:
# Persist the movies dataset as Parquet. "overwrite" lets the cell be re-run;
# the default save mode raises an error when the target path already exists.
df_movies.write.mode("overwrite").parquet("./movies.parquet")

In [39]:
# Interactive Google Cloud login, run before the gsutil upload that follows.
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=4Tbew4LMS0L3dkpjqYrIjjWYCLVBQQ&prompt=consent&access_type=offline&code_challenge=c3f2wKUyJK1GdLANq2XODdOsKCTldS5K05RLPYPNZXQ&code_challenge_method=S256

Enter verification code: <redacted>

You are now logged in as [afahmi13@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [42]:
# Recursively copy the local Parquet directory to the gs://de-porto/qoala bucket path.
!gsutil -m cp -r ./movies.parquet gs://de-porto/qoala

Copying file://./movies.parquet/.part-00002-c0f48c51-29e2-4949-92fb-1f353658cae7-c000.snappy.parquet.crc [Content-Type=application/octet-stream]...
Copying file://./movies.parquet/part-00000-c0f48c51-29e2-4949-92fb-1f353658cae7-c000.snappy.parquet [Content-Type=application/octet-stream]...
/ [0/10 files][    0.0 B/142.0 MiB]   0% Done                                   / [0/10 files][    0.0 B/142.0 MiB]   0% Done                                   Copying file://./movies.parquet/.part-00001-c0f48c51-29e2-4949-92fb-1f353658cae7-c000.snappy.parquet.crc [Content-Type=application/octet-stream]...
/ [0/10 files][    0.0 B/142.0 MiB]   0% Done                                   Copying file://./movies.parquet/_SUCCESS [Content-Type=application/octet-stream]...
Copying file://./movies.parquet/.part-00000-c0f48c51-29e2-4949-92fb-1f353658cae7-c000.snappy.parquet.crc [Content-Type=application/octet-stream]...
/ [0/10 files][    0.0 B/142.0 MiB]   0% Done                                   Copyi

### Join files to a single file

In [None]:
from os import listdir
from os.path import isfile, join
import json

In [None]:
# Names of the regular files directly inside the extracted series folder.
series_path = "/content/series/series"
series_files = list(filter(lambda name: isfile(join(series_path, name)), listdir(series_path)))

In [None]:
# Write every series file, followed by a newline, into one combined file.
# The file is opened with "w" instead of "a+": the combine_all_file loop earlier
# in the notebook writes to the same path, and appending here would add a second
# copy of every record. The context manager closes the file even on errors.
with open("/content/series_joined.json", "w") as join_series_file:
    for series_file in series_files:
        with open(join(series_path, series_file)) as sf:
            payload = sf.read()
            join_series_file.write(payload+'\n')

In [None]:
# Names of the regular files directly inside the extracted movies folder.
movies_path = "/content/movies/movies"
movies_files = list(filter(lambda name: isfile(join(movies_path, name)), listdir(movies_path)))

In [None]:
# Write every movie file, followed by a newline, into one combined file.
# The file is opened with "w" instead of "a+": the combine_all_file loop earlier
# in the notebook writes to the same path, and appending here would add a second
# copy of every record. The context manager closes the file even on errors.
with open("/content/movies_joined.json", "w") as join_movies_file:
    for movies_file in movies_files:
        with open(join(movies_path, movies_file)) as mf:
            payload = mf.read()
            join_movies_file.write(payload+'\n')

In [None]:
# Copy both combined files to the Drive folder that the Extract section reads from.
!cp /content/movies_joined.json /content/drive/MyDrive/datasets/tmdb/
!cp /content/series_joined.json /content/drive/MyDrive/datasets/tmdb/

In [None]:
# List /content to check that the combined files exist and see their sizes.
!ls -l /content

total 594384
drwx------ 5 root root      4096 Feb 10 07:18 drive
drwxr-xr-x 3 root root      4096 Feb 10 07:38 movies
-rw-r--r-- 1 root root 475254159 Feb 10 09:38 movies_joined.json
drwxr-xr-x 1 root root      4096 Feb  1 14:32 sample_data
drwxr-xr-x 3 root root      4096 Feb 10 07:40 series
-rw-r--r-- 1 root root 133373315 Feb 10 09:32 series_joined.json


In [None]:
# Interactive Google Cloud login, run before the gsutil uploads that follow.
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=oI9FufrvgWrWQTQR0EoT05d98K6H5g&prompt=consent&access_type=offline&code_challenge=1hheCwrBwkB2CuyrgqMsR_gyJ2Nr0DoakmJ5GDlLx04&code_challenge_method=S256

Enter verification code: <redacted>

You are now logged in as [afahmi13@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


In [None]:
# Upload the combined movies JSON to the raw_data path of the bucket.
!gsutil cp /content/movies_joined.json gs://de-porto/qoala/raw_data

Copying file:///content/movies_joined.json [Content-Type=application/json]...
/ [0 files][    0.0 B/453.2 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

-
Operation completed over 1 objects/453.2 MiB.                                    


In [None]:
# Upload the combined series JSON to the raw_data path of the bucket.
!gsutil cp /content/series_joined.json gs://de-porto/qoala/raw_data

Copying file:///content/series_joined.json [Content-Type=application/json]...
/
Operation completed over 1 objects/127.2 MiB.                                    


In [None]:
# Mount Google Drive again, forcing a remount of /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

TIMEOUT: ignored

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType, TimestampType

In [None]:
# Spark session for the equipment / IoT cells below.
# NOTE(review): `spark` is reassigned here; getOrCreate() may hand back a session
# that is already running — TODO confirm the settings below take effect then.
spark = (
    SparkSession.builder
    .appName("relearn")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

In [None]:
# Load the equipment table from the Parquet file on Drive.
df_eq = spark.read.parquet("/content/drive/MyDrive/datasets/relearn_spark/equipment.parquet")

In [None]:
# Preview the first five equipment rows.
df_eq.show(5)

+---+---------+-----------+----+
| id|     type|      brand|year|
+---+---------+-----------+----+
|  2|Bulldozer|       Sany|2016|
|  4|Bulldozer|      Volvo|2019|
|  1|Bulldozer|    Komatsu|2018|
|  5|Bulldozer|    Komatsu|2019|
|  3|Bulldozer|Caterpillar|2018|
+---+---------+-----------+----+
only showing top 5 rows



In [None]:
# Explicit schema for the IoT CSV as (column name, Spark type) pairs.
iot_columns = [
    ("timestamp", TimestampType()),
    ("id", IntegerType()),
    ("type", StringType()),
    ("brand", StringType()),
    ("year", DateType()),
    ("location", StringType()),
    ("fuel_level", FloatType()),
]
# Every field is declared with nullable=False; the schema printed by the next
# cell nevertheless reports nullable = true for all of them.
schema = StructType([StructField(name, data_type, False) for name, data_type in iot_columns])
df_iot = spark.read.csv(
    "/content/drive/MyDrive/datasets/relearn_spark/iot.csv",
    header=True,
    schema=schema,
)

In [None]:
# Print the schema Spark applied to the CSV, then preview the first five rows.
df_iot.printSchema()
df_iot.show(5)

root
 |-- timestamp: timestamp (nullable = true)
 |-- id: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- year: date (nullable = true)
 |-- location: string (nullable = true)
 |-- fuel_level: float (nullable = true)

+-------------------+---+----------+-----------+----------+--------------------+----------+
|          timestamp| id|      type|      brand|      year|            location|fuel_level|
+-------------------+---+----------+-----------+----------+--------------------+----------+
|2022-01-25 13:15:29| 13|Dump Truck|       Hino|2017-01-01|POINT(107.3225083...|  99.31521|
|2022-01-25 13:16:26| 16|Dump Truck|       Fuso|2018-01-01|POINT(107.3192117...|  98.88088|
|2022-01-25 13:15:29| 12|Dump Truck|       Fuso|2018-01-01|POINT(107.3213810...|  99.57359|
|2022-01-25 13:18:10| 11|Dump Truck|       Hino|2015-01-01|POINT(107.3152080...|  96.41994|
|2022-01-25 13:16:44|  7| Excavator|Caterpillar|2015-01-01|POINT(107.3199178...|  9