In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import col, explode_outer, lit

import re

In [2]:
# Stop a session left over from an earlier run of this cell so that the
# builder below hands back a fresh one.
if 'spark' in globals() or 'spark' in locals():
    spark.stop()

spark = (
    SparkSession.builder
    .appName("Criando tabelas prata")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/03 19:05:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/03 19:05:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [3]:
def get_df_datatypes(df):
    """Map each top-level column of ``df`` to the class name of its data type.

    Args:
        df: Object exposing ``schema.fields``, where every field has a
            ``name`` and a ``dataType`` attribute (e.g. a PySpark DataFrame).

    Returns:
        dict: ``{column_name: type_name}`` in schema order, where
        ``type_name`` is the class name of the field's data type, such as
        ``'StructType'``, ``'ArrayType'`` or ``'LongType'``.
    """
    # Use the class name directly instead of regex-parsing str(dataType):
    # that text is not guaranteed to contain "(", in which case a regex
    # search for "<name>(" finds nothing and the lookup crashes.
    return {field.name: type(field.dataType).__name__ for field in df.schema.fields}

In [4]:
def unnest_struct_array(df, verbose = False):
    """Recursively flatten struct columns and explode array columns of ``df``.

    Each pass looks at the top-level columns only: a struct column is replaced
    by one ``<column>_<field>`` column per field, and an array column is
    replaced by its ``explode_outer`` (one row per element, keeping rows whose
    array is null or empty). The function then calls itself on the result
    until no struct or array column is left.

    Note that exploding several array columns on the same DataFrame multiplies
    the rows of one array by the rows of the other.

    Args:
        df: DataFrame to flatten.
        verbose: When truthy, print progress messages for every pass and the
            final schema.

    Returns:
        The DataFrame with no top-level struct or array columns.
    """
    columns_manifest = get_df_datatypes(df)
    cols = list(columns_manifest.keys())
    present_types = set(columns_manifest.values())

    # Base case: nothing left to flatten or explode.
    if 'StructType' not in present_types and 'ArrayType' not in present_types:
        if verbose:
            print('INFO: Não há colunas StructType nem Array')
            df.printSchema()
        return df

    if verbose:
        print(f'INFO: Colunas a serem avaliadas {cols}')

    for column in cols:

        if verbose:
            print(f'INFO: Avaliando a coluna {column}')

        if columns_manifest[column] == 'StructType':
            if verbose:
                print(f"INFO: {column} é um campo StructType")
                print(f"INFO: Mapeando objetos dentro do struct {column}")
            struct_fields = [field.name for field in df.select(f'{column}.*').schema.fields]

            if verbose:
                print(f"INFO: Abrindo Struct {column} nos campos {struct_fields}")

            # Append the flattened fields after the existing columns, then
            # remove the struct column they came from.
            df = df.select(
                    ['*'] +
                    [col(f'{column}.{struct_field}').alias(f'{column}_{struct_field}') for struct_field in struct_fields]
            ).drop(column)

        elif columns_manifest[column] == 'ArrayType':
            if verbose:
                print(f"INFO: {column} é um campo ArrayType")
                print(f"INFO: Executando explode_outer() do campo {column}")

            df = df.withColumn(column, explode_outer(column))

        else:
            if verbose:
                print(f"INFO: {column} não é um campo StructType ou ArrayType")

    if verbose:
        print('INFO: RESETANDO O LOOP')

    # Flattening may have surfaced nested structs/arrays, so run another pass.
    # Forward `verbose`; without it the logging stops after the first pass.
    return unnest_struct_array(df = df, verbose = verbose)

# Tabela TBL_SILVER_POKEMONS

In [5]:
# Load the bronze Pokemon table from the catalog and print its (nested) schema.
pokemons_df = spark.read.table('pokeapi.tbl_bronze_pokemons')
pokemons_df.printSchema()

Hive Session ID = 39e43022-ba1c-454a-bce0-639af3955ba2


root
 |-- abilities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ability: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |-- is_hidden: boolean (nullable = true)
 |    |    |-- slot: long (nullable = true)
 |-- base_experience: long (nullable = true)
 |-- forms: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- game_indices: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- game_index: long (nullable = true)
 |    |    |-- version: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |-- height: long (nullable = true)
 |-- held_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- item: stru

In [6]:
def unstruct(df):
    """Flatten every top-level struct column of ``df`` until none remain.

    A struct column ``s`` with fields ``a`` and ``b`` is replaced by the
    columns ``s_a`` and ``s_b``, appended after the existing columns. Structs
    nested inside other structs are handled by repeating the pass. Array
    columns (including arrays of structs) are left untouched.

    Args:
        df: DataFrame to flatten.

    Returns:
        The DataFrame with no top-level struct columns.
    """
    while True:
        type_by_column = get_df_datatypes(df)

        # Done as soon as a pass finds no struct column.
        if 'StructType' not in type_by_column.values():
            return df

        for name, type_name in type_by_column.items():
            if type_name != 'StructType':
                continue

            nested_names = [field.name for field in df.select(f'{name}.*').schema.fields]
            flattened = [
                col(f'{name}.{nested}').alias(f'{name}_{nested}')
                for nested in nested_names
            ]

            # Keep all current columns, add the flattened fields, drop the struct.
            df = df.select(['*'] + flattened).drop(name)

In [7]:
def disarray(df, primary_key, prefix):
    """Split every top-level array column of ``df`` into its own DataFrame.

    For each array column ``c`` a DataFrame holding the key column(s) plus the
    ``explode_outer`` of ``c`` (one row per element, keeping rows whose array
    is null or empty) is stored in the returned dictionary under the key
    ``'<prefix>_<c>_df'``, and ``c`` is dropped from ``df``.

    Args:
        df: DataFrame to split.
        primary_key: Name of the key column copied into every exploded
            DataFrame. A list or tuple of names is also accepted, which lets a
            child table keep more than one identifying column.
        prefix: Text prepended to every dictionary key.

    Returns:
        tuple: ``(df_without_arrays, dims_dict)``. When ``df`` has no array
        column, ``dims_dict`` is empty and ``df`` is returned unchanged, so
        callers can always unpack two values.
    """
    if isinstance(primary_key, (list, tuple)):
        key_columns = [str(key) for key in primary_key]
    else:
        key_columns = [f'{primary_key}']

    dims_dict = {}

    for column, type_name in get_df_datatypes(df).items():
        if type_name != 'ArrayType':
            continue

        dims_dict[f'{prefix}_{column}_df'] = df.select(
            *key_columns,
            explode_outer(column).alias(column),
        )
        df = df.drop(column)

    # Always a pair: the old no-array path returned the bare DataFrame, which
    # broke `df, dims = disarray(...)` at the call sites.
    return df, dims_dict

In [8]:
# Flatten the top-level struct columns into <struct>_<field> columns.
# unstruct() does not touch array columns; those are split off separately.
pokemons_df = unstruct(df = pokemons_df)
pokemons_df.printSchema()

root
 |-- abilities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ability: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |-- is_hidden: boolean (nullable = true)
 |    |    |-- slot: long (nullable = true)
 |-- base_experience: long (nullable = true)
 |-- forms: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- game_indices: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- game_index: long (nullable = true)
 |    |    |-- version: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |-- height: long (nullable = true)
 |-- held_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- item: stru

In [9]:
# Move every array column into its own exploded DataFrame keyed by 'id'
# (collected in dims_dict as 'pokemon_<column>_df'); pokemons_df keeps the rest.
# NOTE: not idempotent - pokemons_df is overwritten, so a second run no longer
# sees the array columns and cannot rebuild dims_dict. Re-run from the load cell.
pokemons_df, dims_dict = disarray(df = pokemons_df, primary_key = 'id', prefix = 'pokemon')

In [10]:
pokemons_df.printSchema()

root
 |-- base_experience: long (nullable = true)
 |-- height: long (nullable = true)
 |-- id: long (nullable = true)
 |-- is_default: boolean (nullable = true)
 |-- location_area_encounters: string (nullable = true)
 |-- name: string (nullable = true)
 |-- order: long (nullable = true)
 |-- weight: long (nullable = true)
 |-- species_name: string (nullable = true)
 |-- species_url: string (nullable = true)
 |-- sprites_back_default: string (nullable = true)
 |-- sprites_back_female: string (nullable = true)
 |-- sprites_back_shiny: string (nullable = true)
 |-- sprites_back_shiny_female: string (nullable = true)
 |-- sprites_front_default: string (nullable = true)
 |-- sprites_front_female: string (nullable = true)
 |-- sprites_front_shiny: string (nullable = true)
 |-- sprites_front_shiny_female: string (nullable = true)
 |-- sprites_other_dream_world_front_default: string (nullable = true)
 |-- sprites_other_dream_world_front_female: string (nullable = true)
 |-- sprites_other_home_

In [11]:
# Keep only the columns of the silver Pokemon table, ordered by id, and preview them.
pokemons_df = (
    pokemons_df
    .select(
        'id', 'name', 'weight', 'height', 'order',
        'base_experience', 'is_default',
        'species_name', 'species_url',
    )
    .orderBy('id')
)
pokemons_df.show()

[Stage 0:>                                                        (0 + 24) / 24]

+---+----------+------+------+-----+---------------+----------+------------+--------------------+
| id|      name|weight|height|order|base_experience|is_default|species_name|         species_url|
+---+----------+------+------+-----+---------------+----------+------------+--------------------+
|  1| bulbasaur|    69|     7|    1|             64|      true|   bulbasaur|https://pokeapi.c...|
|  2|   ivysaur|   130|    10|    2|            142|      true|     ivysaur|https://pokeapi.c...|
|  3|  venusaur|  1000|    20|    3|            263|      true|    venusaur|https://pokeapi.c...|
|  4|charmander|    85|     6|    5|             62|      true|  charmander|https://pokeapi.c...|
|  5|charmeleon|   190|    11|    6|            142|      true|  charmeleon|https://pokeapi.c...|
|  6| charizard|   905|    17|    7|            267|      true|   charizard|https://pokeapi.c...|
|  7|  squirtle|    90|     5|   10|             63|      true|    squirtle|https://pokeapi.c...|
|  8| wartortle|   2

                                                                                

In [12]:
pokemons_df.count() #validando

                                                                                

1010

In [13]:
# Persist the silver Pokemon table as Parquet at the given S3 path.
# No explicit DROP TABLE is issued here; the write uses mode('overwrite').
(
    pokemons_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemons/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemons')
)

                                                                                

In [14]:
dims_dict

{'pokemon_abilities_df': DataFrame[id: bigint, abilities: struct<ability:struct<name:string,url:string>,is_hidden:boolean,slot:bigint>],
 'pokemon_forms_df': DataFrame[id: bigint, forms: struct<name:string,url:string>],
 'pokemon_game_indices_df': DataFrame[id: bigint, game_indices: struct<game_index:bigint,version:struct<name:string,url:string>>],
 'pokemon_held_items_df': DataFrame[id: bigint, held_items: struct<item:struct<name:string,url:string>,version_details:array<struct<rarity:bigint,version:struct<name:string,url:string>>>>],
 'pokemon_moves_df': DataFrame[id: bigint, moves: struct<move:struct<name:string,url:string>,version_group_details:array<struct<level_learned_at:bigint,move_learn_method:struct<name:string,url:string>,version_group:struct<name:string,url:string>>>>],
 'pokemon_past_types_df': DataFrame[id: bigint, past_types: struct<generation:struct<name:string,url:string>,types:array<struct<slot:bigint,type:struct<name:string,url:string>>>>],
 'pokemon_stats_df': DataFr

In [15]:
# Expose every exploded DataFrame as a notebook variable named after its
# dictionary key (e.g. pokemon_abilities_df). Updating the namespace directly
# avoids building and exec()-ing source code out of the key strings.
globals().update(dims_dict)

In [16]:
pokemon_abilities_df.show(5)

+---+--------------------+
| id|           abilities|
+---+--------------------+
|151|{{synchronize, ht...|
|150|{{pressure, https...|
|150|{{unnerve, https:...|
|113|{{natural-cure, h...|
|113|{{serene-grace, h...|
+---+--------------------+
only showing top 5 rows



In [17]:
pokemon_abilities_df = unstruct(pokemon_abilities_df)
pokemon_abilities_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- abilities_is_hidden: boolean (nullable = true)
 |-- abilities_slot: long (nullable = true)
 |-- abilities_ability_name: string (nullable = true)
 |-- abilities_ability_url: string (nullable = true)



In [18]:
pokemon_abilities_df = pokemon_abilities_df.orderBy('id')

pokemon_abilities_df.show(5, truncate = False)



+---+-------------------+--------------+----------------------+-------------------------------------+
|id |abilities_is_hidden|abilities_slot|abilities_ability_name|abilities_ability_url                |
+---+-------------------+--------------+----------------------+-------------------------------------+
|1  |false              |1             |overgrow              |https://pokeapi.co/api/v2/ability/65/|
|1  |true               |3             |chlorophyll           |https://pokeapi.co/api/v2/ability/34/|
|2  |true               |3             |chlorophyll           |https://pokeapi.co/api/v2/ability/34/|
|2  |false              |1             |overgrow              |https://pokeapi.co/api/v2/ability/65/|
|3  |false              |1             |overgrow              |https://pokeapi.co/api/v2/ability/65/|
+---+-------------------+--------------+----------------------+-------------------------------------+
only showing top 5 rows



                                                                                

In [19]:
# Drop the catalog entry if it exists, then rewrite the abilities table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_abilities")
(
    pokemon_abilities_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_abilities/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_abilities')
)

                                                                                

In [20]:
pokemon_forms_df.show(5)
pokemon_forms_df.printSchema()

+---+--------------------+
| id|               forms|
+---+--------------------+
|151|{mew, https://pok...|
|150|{mewtwo, https://...|
|113|{chansey, https:/...|
|149|{dragonite, https...|
|122|{mr-mime, https:/...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- forms: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- url: string (nullable = true)



In [21]:
pokemon_forms_df = unstruct(pokemon_forms_df)
pokemon_forms_df.show(5)
pokemon_forms_df.printSchema()

+---+----------+--------------------+
| id|forms_name|           forms_url|
+---+----------+--------------------+
|151|       mew|https://pokeapi.c...|
|150|    mewtwo|https://pokeapi.c...|
|113|   chansey|https://pokeapi.c...|
|149| dragonite|https://pokeapi.c...|
|122|   mr-mime|https://pokeapi.c...|
+---+----------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- forms_name: string (nullable = true)
 |-- forms_url: string (nullable = true)



In [22]:
# Drop the catalog entry if it exists, then rewrite the forms table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_forms")
(
    pokemon_forms_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_forms/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_forms')
)

                                                                                

In [23]:
pokemon_game_indices_df.show(5)
pokemon_game_indices_df.printSchema()

+---+--------------------+
| id|        game_indices|
+---+--------------------+
|151|{21, {red, https:...|
|151|{21, {blue, https...|
|151|{21, {yellow, htt...|
|151|{151, {gold, http...|
|151|{151, {silver, ht...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- game_indices: struct (nullable = true)
 |    |-- game_index: long (nullable = true)
 |    |-- version: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [24]:
pokemon_game_indices_df = unstruct(pokemon_game_indices_df)
pokemon_game_indices_df.show(5)
pokemon_game_indices_df.printSchema()

+---+-----------------------+-------------------------+------------------------+
| id|game_indices_game_index|game_indices_version_name|game_indices_version_url|
+---+-----------------------+-------------------------+------------------------+
|151|                     21|                      red|    https://pokeapi.c...|
|151|                     21|                     blue|    https://pokeapi.c...|
|151|                     21|                   yellow|    https://pokeapi.c...|
|151|                    151|                     gold|    https://pokeapi.c...|
|151|                    151|                   silver|    https://pokeapi.c...|
+---+-----------------------+-------------------------+------------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- game_indices_game_index: long (nullable = true)
 |-- game_indices_version_name: string (nullable = true)
 |-- game_indices_version_url: string (nullable = true)



In [25]:
# Drop the catalog entry if it exists, then rewrite the game-indices table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_game_indices")
(
    pokemon_game_indices_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_game_indices/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_game_indices')
)

                                                                                

In [26]:
pokemon_held_items_df.show(5)
pokemon_held_items_df.printSchema()

+---+--------------------+
| id|          held_items|
+---+--------------------+
|151|{{lum-berry, http...|
|150|                null|
|113|{{oval-stone, htt...|
|113|{{lucky-egg, http...|
|113|{{lucky-punch, ht...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- held_items: struct (nullable = true)
 |    |-- item: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- version_details: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- rarity: long (nullable = true)
 |    |    |    |-- version: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)



In [27]:
pokemon_held_items_df = unstruct(pokemon_held_items_df)
pokemon_held_items_df.show(5)
pokemon_held_items_df.printSchema()

+---+--------------------------+--------------------+--------------------+
| id|held_items_version_details|held_items_item_name| held_items_item_url|
+---+--------------------------+--------------------+--------------------+
|151|      [{100, {ruby, htt...|           lum-berry|https://pokeapi.c...|
|150|                      null|                null|                null|
|113|      [{50, {diamond, h...|          oval-stone|https://pokeapi.c...|
|113|      [{5, {ruby, https...|           lucky-egg|https://pokeapi.c...|
|113|      [{50, {black, htt...|         lucky-punch|https://pokeapi.c...|
+---+--------------------------+--------------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- held_items_version_details: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rarity: long (nullable = true)
 |    |    |-- version: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |

In [28]:
# Move the nested held_items_version_details array into its own exploded
# DataFrame (returned in pokemon_held_items_dims_dict) and drop it from the
# held-items frame.
# NOTE(review): the child frame keeps only 'id', so its rows cannot be tied
# back to a specific held item - confirm this is intended.
pokemon_held_items_df, pokemon_held_items_dims_dict = disarray(pokemon_held_items_df, primary_key = 'id', prefix = 'pokemon')

In [29]:
pokemon_held_items_df.show(5)
pokemon_held_items_df.printSchema()

+---+--------------------+--------------------+
| id|held_items_item_name| held_items_item_url|
+---+--------------------+--------------------+
|151|           lum-berry|https://pokeapi.c...|
|150|                null|                null|
|113|          oval-stone|https://pokeapi.c...|
|113|           lucky-egg|https://pokeapi.c...|
|113|         lucky-punch|https://pokeapi.c...|
+---+--------------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- held_items_item_name: string (nullable = true)
 |-- held_items_item_url: string (nullable = true)



In [30]:
# Drop the catalog entry if it exists, then rewrite the held-items table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_held_items")
(
    pokemon_held_items_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_held_items/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_held_items')
)

                                                                                

In [31]:
pokemon_held_items_dims_dict

{'pokemon_held_items_version_details_df': DataFrame[id: bigint, held_items_version_details: struct<rarity:bigint,version:struct<name:string,url:string>>]}

In [32]:
pokemon_held_items_version_details_df = pokemon_held_items_dims_dict['pokemon_held_items_version_details_df']
pokemon_held_items_version_details_df.show(5)
pokemon_held_items_version_details_df.printSchema()

+---+--------------------------+
| id|held_items_version_details|
+---+--------------------------+
|151|      {100, {ruby, http...|
|151|      {100, {sapphire, ...|
|151|      {100, {emerald, h...|
|151|      {100, {diamond, h...|
|151|      {100, {pearl, htt...|
+---+--------------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- held_items_version_details: struct (nullable = true)
 |    |-- rarity: long (nullable = true)
 |    |-- version: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [33]:
pokemon_held_items_version_details_df = unstruct(pokemon_held_items_version_details_df)
pokemon_held_items_version_details_df.show(5)
pokemon_held_items_version_details_df.printSchema()

+---+---------------------------------+---------------------------------------+--------------------------------------+
| id|held_items_version_details_rarity|held_items_version_details_version_name|held_items_version_details_version_url|
+---+---------------------------------+---------------------------------------+--------------------------------------+
|151|                              100|                                   ruby|                  https://pokeapi.c...|
|151|                              100|                               sapphire|                  https://pokeapi.c...|
|151|                              100|                                emerald|                  https://pokeapi.c...|
|151|                              100|                                diamond|                  https://pokeapi.c...|
|151|                              100|                                  pearl|                  https://pokeapi.c...|
+---+---------------------------------+---------

In [34]:
# Drop the catalog entry if it exists, then rewrite the held-items
# version-details table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_held_items_version_details")
(
    pokemon_held_items_version_details_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_held_items_version_details/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_held_items_version_details')
)

# Show which tables are now registered in the pokeapi database.
spark.catalog.listTables(dbName='pokeapi')

                                                                                

[Table(name='tbl_bronze_evolutions', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_bronze_pokemons', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_silver_pokemon_abilities', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_silver_pokemon_forms', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_silver_pokemon_game_indices', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_silver_pokemon_held_items', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_silver_pokemon_held_items_version_details', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='tbl_silver_pokemons', database='pokeapi', description=None, tableType='EXTERNAL', isTemporary=False)]

In [35]:
pokemon_moves_df.show(5)
pokemon_moves_df.printSchema()

+---+--------------------+
| id|               moves|
+---+--------------------+
|151|{{pound, https://...|
|151|{{mega-punch, htt...|
|151|{{pay-day, https:...|
|151|{{fire-punch, htt...|
|151|{{ice-punch, http...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- moves: struct (nullable = true)
 |    |-- move: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- version_group_details: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- level_learned_at: long (nullable = true)
 |    |    |    |-- move_learn_method: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- version_group: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)



In [36]:
pokemon_moves_df = unstruct(pokemon_moves_df)
pokemon_moves_df.show(5)
pokemon_moves_df.printSchema()

+---+---------------------------+---------------+--------------------+
| id|moves_version_group_details|moves_move_name|      moves_move_url|
+---+---------------------------+---------------+--------------------+
|151|       [{1, {level-up, h...|          pound|https://pokeapi.c...|
|151|       [{20, {level-up, ...|     mega-punch|https://pokeapi.c...|
|151|       [{0, {machine, ht...|        pay-day|https://pokeapi.c...|
|151|       [{0, {machine, ht...|     fire-punch|https://pokeapi.c...|
|151|       [{0, {machine, ht...|      ice-punch|https://pokeapi.c...|
+---+---------------------------+---------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- moves_version_group_details: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- level_learned_at: long (nullable = true)
 |    |    |-- move_learn_method: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- url: st

In [37]:
# Move the nested moves_version_group_details array into its own exploded
# DataFrame (returned in pokemon_moves_dims_dict) and drop it from the moves frame.
# NOTE(review): the child frame keeps only 'id', so its rows cannot be tied
# back to a specific move - confirm this is intended.
pokemon_moves_df, pokemon_moves_dims_dict = disarray(pokemon_moves_df, prefix = 'pokemon', primary_key = 'id')

In [38]:
pokemon_moves_df.show(5)
pokemon_moves_df.printSchema()

+---+---------------+--------------------+
| id|moves_move_name|      moves_move_url|
+---+---------------+--------------------+
|151|          pound|https://pokeapi.c...|
|151|     mega-punch|https://pokeapi.c...|
|151|        pay-day|https://pokeapi.c...|
|151|     fire-punch|https://pokeapi.c...|
|151|      ice-punch|https://pokeapi.c...|
+---+---------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- moves_move_name: string (nullable = true)
 |-- moves_move_url: string (nullable = true)



In [39]:
# Drop the catalog entry if it exists, then rewrite the moves table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_moves")
(
    pokemon_moves_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_moves/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_moves')
)

                                                                                

In [40]:
pokemon_moves_dims_dict

{'pokemon_moves_version_group_details_df': DataFrame[id: bigint, moves_version_group_details: struct<level_learned_at:bigint,move_learn_method:struct<name:string,url:string>,version_group:struct<name:string,url:string>>]}

In [41]:
pokemon_moves_version_group_details_df = pokemon_moves_dims_dict['pokemon_moves_version_group_details_df']
pokemon_moves_version_group_details_df.show(5)
pokemon_moves_version_group_details_df.printSchema()

+---+---------------------------+
| id|moves_version_group_details|
+---+---------------------------+
|151|       {1, {level-up, ht...|
|151|       {1, {level-up, ht...|
|151|       {1, {level-up, ht...|
|151|       {1, {level-up, ht...|
|151|       {1, {level-up, ht...|
+---+---------------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- moves_version_group_details: struct (nullable = true)
 |    |-- level_learned_at: long (nullable = true)
 |    |-- move_learn_method: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- version_group: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [42]:
pokemon_moves_version_group_details_df = unstruct(pokemon_moves_version_group_details_df)
pokemon_moves_version_group_details_df.show(5)
pokemon_moves_version_group_details_df.printSchema()

+---+--------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------------+---------------------------------------------+
| id|moves_version_group_details_level_learned_at|moves_version_group_details_move_learn_method_name|moves_version_group_details_move_learn_method_url|moves_version_group_details_version_group_name|moves_version_group_details_version_group_url|
+---+--------------------------------------------+--------------------------------------------------+-------------------------------------------------+----------------------------------------------+---------------------------------------------+
|151|                                           1|                                          level-up|                             https://pokeapi.c...|                                      red-blue|                         https://pokeapi.c...|
|151|               

In [43]:
# Drop the catalog entry if it exists, then rewrite the moves
# version-group-details table as Parquet.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_moves_version_group_details")
(
    pokemon_moves_version_group_details_df.write
    .format('parquet')
    .option("path", "s3a://datalake/silver/pokemon_moves_version_group_details/")
    .mode('overwrite')
    .saveAsTable('pokeapi.tbl_silver_pokemon_moves_version_group_details')
)

                                                                                

In [44]:
dims_dict

{'pokemon_abilities_df': DataFrame[id: bigint, abilities: struct<ability:struct<name:string,url:string>,is_hidden:boolean,slot:bigint>],
 'pokemon_forms_df': DataFrame[id: bigint, forms: struct<name:string,url:string>],
 'pokemon_game_indices_df': DataFrame[id: bigint, game_indices: struct<game_index:bigint,version:struct<name:string,url:string>>],
 'pokemon_held_items_df': DataFrame[id: bigint, held_items: struct<item:struct<name:string,url:string>,version_details:array<struct<rarity:bigint,version:struct<name:string,url:string>>>>],
 'pokemon_moves_df': DataFrame[id: bigint, moves: struct<move:struct<name:string,url:string>,version_group_details:array<struct<level_learned_at:bigint,move_learn_method:struct<name:string,url:string>,version_group:struct<name:string,url:string>>>>],
 'pokemon_past_types_df': DataFrame[id: bigint, past_types: struct<generation:struct<name:string,url:string>,types:array<struct<slot:bigint,type:struct<name:string,url:string>>>>],
 'pokemon_stats_df': DataFr

In [45]:
pokemon_past_types_df.show(5)
pokemon_past_types_df.printSchema()

+---+--------------------+
| id|          past_types|
+---+--------------------+
|151|                null|
|150|                null|
|113|                null|
|149|                null|
|122|{{generation-v, h...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- past_types: struct (nullable = true)
 |    |-- generation: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- types: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- slot: long (nullable = true)
 |    |    |    |-- type: struct (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)



In [46]:
pokemon_past_types_df = unstruct(pokemon_past_types_df)
pokemon_past_types_df.show(5)
pokemon_past_types_df.printSchema()

+---+--------------------+--------------------------+-------------------------+
| id|    past_types_types|past_types_generation_name|past_types_generation_url|
+---+--------------------+--------------------------+-------------------------+
|151|                null|                      null|                     null|
|150|                null|                      null|                     null|
|113|                null|                      null|                     null|
|149|                null|                      null|                     null|
|122|[{1, {psychic, ht...|              generation-v|     https://pokeapi.c...|
+---+--------------------+--------------------------+-------------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- past_types_types: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- slot: long (nullable = true)
 |    |    |-- type: struct (nullable = true)
 |    |    |    |-- name: string 

In [47]:
# Split array columns out of the frame with `disarray` (defined earlier in the notebook).
# It returns the remaining parent frame plus a dict of child frames; the recorded outputs
# of the following cells show the child keyed as 'pokemon_past_types_types_df' and
# carrying the `id` column.
# NOTE(review): the input name is rebound, so a second run receives the frame that
# no longer has the array column.
pokemon_past_types_df, pokemon_past_types_dims_dict = disarray(
    pokemon_past_types_df,
    prefix='pokemon',
    primary_key='id',
)

In [48]:
# Check the parent frame after the array split: sample rows, then the now-flat schema.
pokemon_past_types_df.show(n=5)
pokemon_past_types_df.printSchema()

+---+--------------------------+-------------------------+
| id|past_types_generation_name|past_types_generation_url|
+---+--------------------------+-------------------------+
|151|                      null|                     null|
|150|                      null|                     null|
|113|                      null|                     null|
|149|                      null|                     null|
|122|              generation-v|     https://pokeapi.c...|
+---+--------------------------+-------------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- past_types_generation_name: string (nullable = true)
 |-- past_types_generation_url: string (nullable = true)



In [49]:
# Persist the flattened past-types frame as a silver table.
# The catalog entry is dropped first (if present), then the Parquet files at the
# S3A path are overwritten and the table is registered again via saveAsTable.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_past_types")
(
    pokemon_past_types_df.write
    .format('parquet')
    .mode('overwrite')
    .option("path", "s3a://datalake/silver/pokemon_past_types/")
    .saveAsTable('pokeapi.tbl_silver_pokemon_past_types')
)

                                                                                

In [50]:
# Display the dict of child frames returned by `disarray` (key -> DataFrame).
pokemon_past_types_dims_dict

{'pokemon_past_types_types_df': DataFrame[id: bigint, past_types_types: struct<slot:bigint,type:struct<name:string,url:string>>]}

In [51]:
# Pull the `past_types_types` child frame out of the dict returned by `disarray`
# and inspect it: five sample rows, then its schema.
pokemon_past_types_types_df = pokemon_past_types_dims_dict[
    'pokemon_past_types_types_df'
]
pokemon_past_types_types_df.show(n=5)
pokemon_past_types_types_df.printSchema()

+---+--------------------+
| id|    past_types_types|
+---+--------------------+
|151|                null|
|150|                null|
|113|                null|
|149|                null|
|122|{1, {psychic, htt...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- past_types_types: struct (nullable = true)
 |    |-- slot: long (nullable = true)
 |    |-- type: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [52]:
# Flatten the child frame's struct column with `unstruct` (defined earlier in the notebook).
# The recorded output shows the result as `past_types_types_slot`,
# `past_types_types_type_name` and `past_types_types_type_url` next to `id`.
# NOTE(review): the input name is rebound, so a re-run feeds the flattened frame back in.
pokemon_past_types_types_df = unstruct(pokemon_past_types_types_df)
pokemon_past_types_types_df.show(n=5)
pokemon_past_types_types_df.printSchema()

+---+---------------------+--------------------------+-------------------------+
| id|past_types_types_slot|past_types_types_type_name|past_types_types_type_url|
+---+---------------------+--------------------------+-------------------------+
|151|                 null|                      null|                     null|
|150|                 null|                      null|                     null|
|113|                 null|                      null|                     null|
|149|                 null|                      null|                     null|
|122|                    1|                   psychic|     https://pokeapi.c...|
+---+---------------------+--------------------------+-------------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- past_types_types_slot: long (nullable = true)
 |-- past_types_types_type_name: string (nullable = true)
 |-- past_types_types_type_url: string (nullable = true)



In [53]:
# Persist the flattened past-types child frame as its own silver table.
# The catalog entry is dropped first (if present), then the Parquet files at the
# S3A path are overwritten and the table is registered again via saveAsTable.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_past_types_types")
(
    pokemon_past_types_types_df.write
    .format('parquet')
    .mode('overwrite')
    .option("path", "s3a://datalake/silver/pokemon_past_types_types/")
    .saveAsTable('pokeapi.tbl_silver_pokemon_past_types_types')
)

                                                                                

In [54]:
# Inspect the stats frame before flattening: five sample rows, then the nested schema.
pokemon_stats_df.show(n=5)
pokemon_stats_df.printSchema()

+---+--------------------+
| id|               stats|
+---+--------------------+
|151|{100, 3, {hp, htt...|
|151|{100, 0, {attack,...|
|151|{100, 0, {defense...|
|151|{100, 0, {special...|
|151|{100, 0, {special...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- stats: struct (nullable = true)
 |    |-- base_stat: long (nullable = true)
 |    |-- effort: long (nullable = true)
 |    |-- stat: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [55]:
# Flatten the `stats` struct with `unstruct` (defined earlier in the notebook).
# The recorded output shows the result as `stats_base_stat`, `stats_effort`,
# `stats_stat_name` and `stats_stat_url` next to `id`.
# NOTE(review): the input name is rebound, so a re-run feeds the flattened frame back in.
pokemon_stats_df = unstruct(pokemon_stats_df)
pokemon_stats_df.show(n=5)
pokemon_stats_df.printSchema()

+---+---------------+------------+---------------+--------------------+
| id|stats_base_stat|stats_effort|stats_stat_name|      stats_stat_url|
+---+---------------+------------+---------------+--------------------+
|151|            100|           3|             hp|https://pokeapi.c...|
|151|            100|           0|         attack|https://pokeapi.c...|
|151|            100|           0|        defense|https://pokeapi.c...|
|151|            100|           0| special-attack|https://pokeapi.c...|
|151|            100|           0|special-defense|https://pokeapi.c...|
+---+---------------+------------+---------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- stats_base_stat: long (nullable = true)
 |-- stats_effort: long (nullable = true)
 |-- stats_stat_name: string (nullable = true)
 |-- stats_stat_url: string (nullable = true)



In [56]:
# Persist the flattened stats frame as a silver table.
# The catalog entry is dropped first (if present), then the Parquet files at the
# S3A path are overwritten and the table is registered again via saveAsTable.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_stats")
(
    pokemon_stats_df.write
    .format('parquet')
    .mode('overwrite')
    .option("path", "s3a://datalake/silver/pokemon_stats/")
    .saveAsTable('pokeapi.tbl_silver_pokemon_stats')
)

                                                                                

In [57]:
# Inspect the types frame before flattening: five sample rows, then the nested schema.
pokemon_types_df.show(n=5)
pokemon_types_df.printSchema()

+---+--------------------+
| id|               types|
+---+--------------------+
|151|{1, {psychic, htt...|
|150|{1, {psychic, htt...|
|113|{1, {normal, http...|
|149|{1, {dragon, http...|
|149|{2, {flying, http...|
+---+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- types: struct (nullable = true)
 |    |-- slot: long (nullable = true)
 |    |-- type: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- url: string (nullable = true)



In [58]:
# Flatten the `types` struct with `unstruct` (defined earlier in the notebook).
# The recorded output shows the result as `types_slot`, `types_type_name`
# and `types_type_url` next to `id`.
# NOTE(review): the input name is rebound, so a re-run feeds the flattened frame back in.
pokemon_types_df = unstruct(pokemon_types_df)
pokemon_types_df.show(n=5)
pokemon_types_df.printSchema()

+---+----------+---------------+--------------------+
| id|types_slot|types_type_name|      types_type_url|
+---+----------+---------------+--------------------+
|151|         1|        psychic|https://pokeapi.c...|
|150|         1|        psychic|https://pokeapi.c...|
|113|         1|         normal|https://pokeapi.c...|
|149|         1|         dragon|https://pokeapi.c...|
|149|         2|         flying|https://pokeapi.c...|
+---+----------+---------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- types_slot: long (nullable = true)
 |-- types_type_name: string (nullable = true)
 |-- types_type_url: string (nullable = true)



In [59]:
# Persist the flattened types frame as a silver table.
# The catalog entry is dropped first (if present), then the Parquet files at the
# S3A path are overwritten and the table is registered again via saveAsTable.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_pokemon_types")
(
    pokemon_types_df.write
    .format('parquet')
    .mode('overwrite')
    .option("path", "s3a://datalake/silver/pokemon_types/")
    .saveAsTable('pokeapi.tbl_silver_pokemon_types')
)

                                                                                

# Tabela TBL_SILVER_EVOLUTIONS

In [60]:
# Load the bronze evolutions table from the catalog and inspect it:
# five sample rows, then the (deeply nested) schema.
evolutions_df = spark.read.table('pokeapi.tbl_bronze_evolutions')
evolutions_df.show(n=5)
evolutions_df.printSchema()

+-----------------+--------------------+---+
|baby_trigger_item|               chain| id|
+-----------------+--------------------+---+
|             null|{[], [{[{null, nu...| 67|
|             null|{[], [{[{null, nu...| 34|
|             null|{[], [{[{null, nu...|135|
|             null|{[], [{[{null, nu...| 33|
|             null|{[], [{[{null, nu...|147|
+-----------------+--------------------+---+
only showing top 5 rows

root
 |-- baby_trigger_item: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- chain: struct (nullable = true)
 |    |-- evolution_details: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- evolves_to: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- evolution_details: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- gender: long (nullable = true)
 |    

In [61]:
# Flatten every struct and array column of the evolution chain with
# `unnest_struct_array` (defined near the top of the notebook).
# NOTE(review): the input name is rebound, so a re-run feeds the flattened frame back in.
evolutions_df = unnest_struct_array(evolutions_df)
# The row preview (`evolutions_df.show(5)`) is disabled in this cell; only the flattened schema is printed.
evolutions_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- baby_trigger_item_name: string (nullable = true)
 |-- baby_trigger_item_url: string (nullable = true)
 |-- chain_evolution_details: string (nullable = true)
 |-- chain_is_baby: boolean (nullable = true)
 |-- chain_species_name: string (nullable = true)
 |-- chain_species_url: string (nullable = true)
 |-- chain_evolves_to_is_baby: boolean (nullable = true)
 |-- chain_evolves_to_species_name: string (nullable = true)
 |-- chain_evolves_to_species_url: string (nullable = true)
 |-- chain_evolves_to_evolution_details_gender: long (nullable = true)
 |-- chain_evolves_to_evolution_details_min_affection: long (nullable = true)
 |-- chain_evolves_to_evolution_details_min_beauty: long (nullable = true)
 |-- chain_evolves_to_evolution_details_min_happiness: long (nullable = true)
 |-- chain_evolves_to_evolution_details_min_level: long (nullable = true)
 |-- chain_evolves_to_evolution_details_needs_overworld_rain: boolean (nullable = true)
 |-- chain_evo

In [62]:
# Level 1 of the chain: the base species and the requirements to reach its first evolution.
# Each (source column, output alias) pair below becomes one renamed column, in this order;
# `evolution_level` is a constant 1. Duplicate rows are removed with distinct().
evolutions_df_1 = evolutions_df.select(
    lit(1).alias('evolution_level'),
    *[
        col(source_column).alias(output_name)
        for source_column, output_name in [
            ('chain_species_name', 'name'),
            ('chain_is_baby', 'is_baby'),
            ('chain_evolves_to_species_name', 'evolves_to'),
            ('chain_evolves_to_evolution_details_trigger_name', 'trigger'),
            ('chain_evolves_to_evolution_details_min_level', 'level'),
            ('chain_evolves_to_evolution_details_gender', 'gender'),
            ('chain_evolves_to_evolution_details_held_item_name', 'hold_item'),
            ('chain_evolves_to_evolution_details_item_name', 'use_item'),
            ('chain_evolves_to_evolution_details_min_beauty', 'beauty_req'),
            ('chain_evolves_to_evolution_details_min_happiness', 'happiness_req'),
            ('chain_evolves_to_evolution_details_min_affection', 'affection_req'),
            ('chain_evolves_to_evolution_details_needs_overworld_rain', 'is_raining'),
            ('chain_evolves_to_evolution_details_relative_physical_stats', 'stats'),
            ('chain_evolves_to_evolution_details_turn_upside_down', 'turn_upside_down'),
            ('chain_evolves_to_evolution_details_time_of_day', 'time_of_day'),
            ('chain_evolves_to_evolution_details_trade_species_name', 'trade'),
            ('chain_evolves_to_evolution_details_party_species_name', 'party_species'),
            ('chain_evolves_to_evolution_details_party_type_name', 'party_type'),
            ('chain_evolves_to_evolution_details_location_name', 'location'),
            ('chain_evolves_to_evolution_details_known_move_name', 'known_move'),
            ('chain_evolves_to_evolution_details_known_move_type_name', 'move_type'),
        ]
    ],
).distinct()
evolutions_df_1.show(n=5)
evolutions_df_1.printSchema()

[Stage 56:==>                                                     (1 + 23) / 24]

+---------------+-------+-------+----------+--------+-----+------+---------+-----------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+-------------+----------+---------+
|evolution_level|   name|is_baby|evolves_to| trigger|level|gender|hold_item|   use_item|beauty_req|happiness_req|affection_req|is_raining|stats|turn_upside_down|time_of_day|trade|party_species|party_type|     location|known_move|move_type|
+---------------+-------+-------+----------+--------+-----+------+---------+-----------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+-------------+----------+---------+
|              1|porygon|  false|  porygon2|   trade| null|  null| up-grade|       null|      null|         null|         null|     false| null|           false|           | null|         null|      null|         null|      null|     null|
|              1|poliwag|  false| poliwh

                                                                                

In [63]:
# Level 2 of the chain: the first evolution and the requirements to reach the second one.
# Each (source column, output alias) pair below becomes one renamed column, in the same
# order as the level-1 frame; `evolution_level` is a constant 2. Duplicates are removed.
# NOTE(review): the sources for trade / party_species / party_type / move_type have no
# `_name` suffix, unlike their level-1 counterparts — presumably these fields were not
# flattened as structs at this depth; confirm against the flattened schema.
evolutions_df_2 = evolutions_df.select(
    lit(2).alias('evolution_level'),
    *[
        col(source_column).alias(output_name)
        for source_column, output_name in [
            ('chain_evolves_to_species_name', 'name'),
            ('chain_evolves_to_is_baby', 'is_baby'),
            ('chain_evolves_to_evolves_to_species_name', 'evolves_to'),
            ('chain_evolves_to_evolves_to_evolution_details_trigger_name', 'trigger'),
            ('chain_evolves_to_evolves_to_evolution_details_min_level', 'level'),
            ('chain_evolves_to_evolves_to_evolution_details_gender', 'gender'),
            ('chain_evolves_to_evolves_to_evolution_details_held_item_name', 'hold_item'),
            ('chain_evolves_to_evolves_to_evolution_details_item_name', 'use_item'),
            ('chain_evolves_to_evolves_to_evolution_details_min_beauty', 'beauty_req'),
            ('chain_evolves_to_evolves_to_evolution_details_min_happiness', 'happiness_req'),
            ('chain_evolves_to_evolves_to_evolution_details_min_affection', 'affection_req'),
            ('chain_evolves_to_evolves_to_evolution_details_needs_overworld_rain', 'is_raining'),
            ('chain_evolves_to_evolves_to_evolution_details_relative_physical_stats', 'stats'),
            ('chain_evolves_to_evolves_to_evolution_details_turn_upside_down', 'turn_upside_down'),
            ('chain_evolves_to_evolves_to_evolution_details_time_of_day', 'time_of_day'),
            ('chain_evolves_to_evolves_to_evolution_details_trade_species', 'trade'),
            ('chain_evolves_to_evolves_to_evolution_details_party_species', 'party_species'),
            ('chain_evolves_to_evolves_to_evolution_details_party_type', 'party_type'),
            ('chain_evolves_to_evolves_to_evolution_details_location_name', 'location'),
            ('chain_evolves_to_evolves_to_evolution_details_known_move_name', 'known_move'),
            ('chain_evolves_to_evolves_to_evolution_details_known_move_type', 'move_type'),
        ]
    ],
).distinct()
evolutions_df_2.show(n=5)
evolutions_df_2.printSchema()



+---------------+----------+-------+----------+--------+-----+------+---------+-----------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|evolution_level|      name|is_baby|evolves_to| trigger|level|gender|hold_item|   use_item|beauty_req|happiness_req|affection_req|is_raining|stats|turn_upside_down|time_of_day|trade|party_species|party_type|location|known_move|move_type|
+---------------+----------+-------+----------+--------+-----+------+---------+-----------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|              2|   lampent|  false|chandelure|use-item| null|  null|     null| dusk-stone|      null|         null|         null|     false| null|           false|           | null|         null|      null|    null|      null|     null|
|              2|    rhydon|  false| rhyperior| 

                                                                                

In [64]:
# Level 3 of the chain: the second evolution. No requirement columns are read at this
# depth, so every requirement is a null placeholder, listed in the same order and under
# the same aliases as the level-1 and level-2 frames. `evolution_level` is a constant 3
# and duplicate rows are removed with distinct().
# `lit` and `col` come from the notebook's import cell; the duplicate mid-notebook
# import of `lit` that used to open this cell has been removed.
evolutions_df_3 = evolutions_df.select(
    lit(3).alias('evolution_level'),
    col('chain_evolves_to_evolves_to_species_name').alias('name'),
    col('chain_evolves_to_evolves_to_is_baby').alias('is_baby'),
    col('chain_evolves_to_evolves_to_evolves_to').alias('evolves_to'),
    lit(None).alias('trigger'),
    lit(None).alias('level'),
    lit(None).alias('gender'),
    lit(None).alias('hold_item'),
    lit(None).alias('use_item'),
    lit(None).alias('beauty_req'),
    lit(None).alias('happiness_req'),
    lit(None).alias('affection_req'),
    # The two flag columns are explicitly typed as boolean; the other placeholders stay untyped nulls.
    lit(None).cast('boolean').alias('is_raining'),
    lit(None).alias('stats'),
    lit(None).cast('boolean').alias('turn_upside_down'),
    lit(None).alias('time_of_day'),
    lit(None).alias('trade'),
    lit(None).alias('party_species'),
    lit(None).alias('party_type'),
    lit(None).alias('location'),
    lit(None).alias('known_move'),
    lit(None).alias('move_type')
).distinct()
evolutions_df_3.show(5)
evolutions_df_3.printSchema()



+---------------+----------+-------+----------+-------+-----+------+---------+--------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|evolution_level|      name|is_baby|evolves_to|trigger|level|gender|hold_item|use_item|beauty_req|happiness_req|affection_req|is_raining|stats|turn_upside_down|time_of_day|trade|party_species|party_type|location|known_move|move_type|
+---------------+----------+-------+----------+-------+-----+------+---------+--------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|              3| aegislash|  false|      null|   null| null|  null|     null|    null|      null|         null|         null|      null| null|            null|       null| null|         null|      null|    null|      null|     null|
|              3| mamoswine|  false|      null|   null| null|  n

                                                                                

In [65]:
# Stack the three per-level frames into a single evolution table and drop duplicate rows.
# unionByName matches columns by their alias rather than by position, so reordering the
# select list of any one level can no longer silently shift values into the wrong column.
# All three frames expose the same set of aliases, so the result keeps the column order
# of evolutions_df_1.
evolution_df_all = (
    evolutions_df_1
    .unionByName(evolutions_df_2)
    .unionByName(evolutions_df_3)
    .distinct()
)
evolution_df_all.show()



+---------------+-----------+-------+----------+--------+-----+------+---------+--------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|evolution_level|       name|is_baby|evolves_to| trigger|level|gender|hold_item|use_item|beauty_req|happiness_req|affection_req|is_raining|stats|turn_upside_down|time_of_day|trade|party_species|party_type|location|known_move|move_type|
+---------------+-----------+-------+----------+--------+-----+------+---------+--------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|              2|   espathra|  false|      null|    null| null|  null|     null|    null|      null|         null|         null|      null| null|            null|       null| null|         null|      null|    null|      null|     null|
|              2|    bayleef|  false|  meganium|level-up

                                                                                

In [66]:
# Load the silver pokemons table from the catalog and inspect it:
# five sample rows, then the schema.
pokemons_df = spark.read.table('pokeapi.tbl_silver_pokemons')
pokemons_df.show(n=5)
pokemons_df.printSchema()

+---+----------+------+------+-----+---------------+----------+------------+--------------------+
| id|      name|weight|height|order|base_experience|is_default|species_name|         species_url|
+---+----------+------+------+-----+---------------+----------+------------+--------------------+
|  1| bulbasaur|    69|     7|    1|             64|      true|   bulbasaur|https://pokeapi.c...|
|  2|   ivysaur|   130|    10|    2|            142|      true|     ivysaur|https://pokeapi.c...|
|  3|  venusaur|  1000|    20|    3|            263|      true|    venusaur|https://pokeapi.c...|
|  4|charmander|    85|     6|    5|             62|      true|  charmander|https://pokeapi.c...|
|  5|charmeleon|   190|    11|    6|            142|      true|  charmeleon|https://pokeapi.c...|
+---+----------+------+------+-----+---------------+----------+------------+--------------------+
only showing top 5 rows

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- weight: long (n

In [67]:
# Attach the pokemon `id` to each evolution row. The left join keeps every pokemon,
# including those with no matching row in evolution_df_all.
# NOTE(review): the join key is the pokemon `name`; pokemons_df also carries
# `species_name` — confirm which of the two matches the names used in the evolution chain.
evolutions_final_df = (
    pokemons_df
    .select('id', 'name')
    .join(evolution_df_all, on='name', how='left')
)
evolutions_final_df.show(n=5)
evolutions_final_df.printSchema()

                                                                                

+----------+---+---------------+-------+----------+--------+-----+------+---------+--------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
|      name| id|evolution_level|is_baby|evolves_to| trigger|level|gender|hold_item|use_item|beauty_req|happiness_req|affection_req|is_raining|stats|turn_upside_down|time_of_day|trade|party_species|party_type|location|known_move|move_type|
+----------+---+---------------+-------+----------+--------+-----+------+---------+--------+----------+-------------+-------------+----------+-----+----------------+-----------+-----+-------------+----------+--------+----------+---------+
| bulbasaur|  1|              1|  false|   ivysaur|level-up|   16|  null|     null|    null|      null|         null|         null|     false| null|           false|           | null|         null|      null|    null|      null|     null|
|   ivysaur|  2|              2|  false|  ve

In [68]:
# Persist the joined evolutions frame as a silver table.
# The catalog entry is dropped first (if present), then the Parquet files at the
# S3A path are overwritten and the table is registered again via saveAsTable.
spark.sql("DROP TABLE IF EXISTS pokeapi.tbl_silver_evolutions")
(
    evolutions_final_df.write
    .format('parquet')
    .mode('overwrite')
    .option("path", "s3a://datalake/silver/evolutions/")
    .saveAsTable('pokeapi.tbl_silver_evolutions')
)

                                                                                

In [69]:
# List every table registered in the `pokeapi` database, without truncating long names.
(
    spark
    .sql("SHOW TABLES IN pokeapi")
    .show(truncate=False)
)

+---------+----------------------------------------------+-----------+
|namespace|tableName                                     |isTemporary|
+---------+----------------------------------------------+-----------+
|pokeapi  |tbl_bronze_evolutions                         |false      |
|pokeapi  |tbl_bronze_pokemons                           |false      |
|pokeapi  |tbl_silver_evolutions                         |false      |
|pokeapi  |tbl_silver_pokemon_abilities                  |false      |
|pokeapi  |tbl_silver_pokemon_forms                      |false      |
|pokeapi  |tbl_silver_pokemon_game_indices               |false      |
|pokeapi  |tbl_silver_pokemon_held_items                 |false      |
|pokeapi  |tbl_silver_pokemon_held_items_version_details |false      |
|pokeapi  |tbl_silver_pokemon_moves                      |false      |
|pokeapi  |tbl_silver_pokemon_moves_version_group_details|false      |
|pokeapi  |tbl_silver_pokemon_past_types                 |false      |
|pokea

In [70]:
# Shut down the Spark session now that all silver tables have been written.
spark.stop()