# Convert Json to PySpark Schema

Este projeto tem o objetivo de realizar a conversão de uma estrutura (JSON/dict) para um schema (StructType) que será utilizado na criação de um DataFrame.<br>

A estrutura definida para a conversão é similar à estrutura do JSON original, porém, ao invés de conter o valor do atributo, ele é substituído pela tipagem desejada.<br>

<pre>
================================================================================
<b>Spark Type</b>    |Json Type        |<b>Spark Type</b>    |Json Type
================================================================================
<b>DataType</b>      |data             |<b>NullType</b>      |void
<b>StringType</b>    |string           |<b>BinaryType</b>    |binary
<b>BooleanType</b>   |boolean          |<b>DateType</b>      |date
<b>TimestampType</b> |timestamp        |<b>DecimalType</b>   |decimal
<b>DoubleType</b>    |double           |<b>FloatType</b>     |float
<b>ByteType</b>      |byte             |<b>IntegerType</b>   |integer
<b>LongType</b>      |long             |<b>ShortType</b>     |short
<b>ArrayType</b>     |array            |<b>MapType</b>       |map
<b>StructType</b>    |struct
</pre>


In [1]:
from IPython import display as ipy_display
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import types as T
import requests
import json

In [2]:
# Build (or reuse) the Spark session shared by the rest of the notebook.
# The options set below are: dynamic partition overwrite, case-sensitive
# column resolution and eager evaluation of DataFrames in the REPL.
spark = (
    SparkSession.builder
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .config("spark.sql.caseSensitive", True)
    .config("spark.sql.repl.eagerEval.enabled", True)
    .appName("catch_pokemons")
    .getOrCreate()
)

In [3]:
def display(df, n=5):
    """Render the first ``n`` rows of a Spark DataFrame as an HTML table.

    Args:
        df: Spark DataFrame to preview.
        n: Maximum number of rows to show (default 5).

    Returns:
        An ``IPython.display.HTML`` object containing the table, rendered
        without the pandas index column.
    """
    preview = df.limit(n).toPandas()
    table_html = preview.to_html(index=False)
    return ipy_display.HTML(table_html)

In [4]:
# Lookup from a Spark type's JSON name (the value returned by ``typeName()``)
# to the name of the class exported by ``pyspark.sql.types``.
#
# The previous loop referenced an undefined variable inside ``eval`` and hid
# the resulting NameError behind a bare ``except``, so the mapping was always
# empty. ``getattr`` resolves the class directly and needs no ``eval``.
SPARK_TYPES = {}
for type_class_name in T.__all__:
    type_class = getattr(T, type_class_name, None)
    try:
        SPARK_TYPES[type_class.typeName()] = type_class_name
    except (AttributeError, TypeError):
        # Best effort: skip exported names that have no ``typeName`` or whose
        # ``typeName`` cannot be called on the class itself.
        continue

In [5]:
def dict_to_spark_schema(input_data, parent_key=None, output_schema=None):
    """Convert a type template (dict / list / type name) into a Spark JSON schema.

    The template mirrors the shape of the JSON document, but every leaf holds
    the desired type name (e.g. ``"string"``) instead of a value:

    * ``dict``  -> struct; one nullable field per key, in insertion order.
    * ``list``  -> array; its element(s) describe the element type.
    * other     -> returned unchanged (expected to be a type name).

    Args:
        input_data: Template to convert.
        parent_key: Unused; kept so existing callers keep working.
        output_schema: Optional schema dict to extend in place. When empty or
            omitted, a fresh empty struct schema is created.

    Returns:
        A dict in the layout built here (``type``/``fields`` for structs,
        ``type``/``elementType``/``containsNull`` for arrays), or
        ``input_data`` itself for leaves.

    Raises:
        ValueError: If a list is empty (element type unknown) or its elements
            describe different types.
    """
    if isinstance(input_data, dict):
        if not output_schema:
            output_schema = {"fields": [], "type": "struct"}
        for key, value in input_data.items():
            output_schema["fields"].append({
                "metadata": {},
                "name": key,
                "nullable": True,
                "type": dict_to_spark_schema(value, key),
            })
        return output_schema

    if isinstance(input_data, list):
        if not input_data:
            # Previously this silently produced an empty *struct* schema.
            raise ValueError(
                "Cannot derive an array element type from an empty list"
            )
        element_type = dict_to_spark_schema(input_data[0])
        # Every element is converted independently; the old loop reused the
        # partially built schema and failed with KeyError on the second item.
        for extra in input_data[1:]:
            if dict_to_spark_schema(extra) != element_type:
                raise ValueError(
                    "All elements of a list must describe the same type"
                )
        if not output_schema:
            output_schema = {"type": "struct"}
        output_schema.pop("fields", None)
        output_schema.update({
            "containsNull": True,
            "elementType": element_type,
            "type": "array",
        })
        return output_schema

    # Leaf: the template already holds the type name.
    return input_data

def get_spark_struct_object(input_data):
    """Build a Spark ``StructType`` from a type template.

    Args:
        input_data: Template accepted by ``dict_to_spark_schema``.

    Returns:
        The result of ``T.StructType.fromJson`` applied to the converted
        template.
    """
    return T.StructType.fromJson(dict_to_spark_schema(input_data))

In [6]:
def get_schemas(name):
    """Return the type template registered under ``name``.

    Args:
        name: Schema identifier; currently only ``"pokemon"`` is defined.

    Returns:
        A nested dict/list template whose leaves are Spark type names.

    Raises:
        KeyError: If ``name`` is not a registered schema. The previous
            implementation ignored ``name`` and always returned the
            ``"pokemon"`` template.
    """
    map_schemas = {
        "pokemon": {
            "abilities": [{
              "ability": {"name": "string", "url": "string"},
              "is_hidden": "boolean",
              "slot": "integer"
            }],
            "base_experience": "integer",
            "forms": [{"name": "string", "url": "string"}],
            "height": "integer",
            "id": "integer",
            "is_default": "boolean",
            "location_area_encounters": "string",
            "name": "string",
            "order": "integer",
            "species": {"name": "string", "url": "string"},
            "types": [{
              "slot": "integer",
              "type": {"name": "string", "url": "string"}
            }],
            "weight": "integer"
        }
    }

    try:
        return map_schemas[name]
    except KeyError:
        raise KeyError(
            f"Unknown schema {name!r}; available schemas: {sorted(map_schemas)}"
        ) from None

def catch_pokemons(total, limit=10, timeout=30):
    """Download the first ``total`` pokemons from the PokeAPI.

    The listing endpoint is read page by page; each listed pokemon is then
    fetched from its own URL.

    Args:
        total: Number of pokemons to fetch.
        limit: Page size used for the listing endpoint (default 10).
        timeout: Seconds to wait for each HTTP response (default 30), so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with one decoded JSON document per pokemon.

    Raises:
        requests.HTTPError: If any request returns an error status.
    """
    pokemons = []
    # One session for all calls so the connection to the API host is reused.
    with requests.Session() as session:
        for offset in range(0, total, limit):
            # The last page may be shorter than ``limit``.
            page_size = min(limit, total - offset)
            response = session.get(
                f'https://pokeapi.co/api/v2/pokemon/?limit={page_size}&offset={offset}',
                timeout=timeout,
            )
            response.raise_for_status()
            for item in response.json().get('results', []):
                print(item['url'])
                detail = session.get(item['url'], timeout=timeout)
                # Fail loudly instead of storing an error payload as a pokemon.
                detail.raise_for_status()
                pokemons.append(detail.json())
    return pokemons

def pokeapi_spark(total_pokemons):
    """Fetch pokemons from the PokeAPI and load them into a typed DataFrame.

    Args:
        total_pokemons: Number of pokemons to download.

    Returns:
        A Spark DataFrame created from the downloaded documents using the
        struct built from the ``"pokemon"`` template.
    """
    # Build the schema first so a template problem surfaces before any download.
    schema = get_spark_struct_object(get_schemas("pokemon"))
    records = catch_pokemons(total_pokemons)
    return spark.createDataFrame(records, schema)

In [7]:
# Download 10 pokemons and load them into a DataFrame; each fetched URL is printed below.
df_pokemons = pokeapi_spark(10)

https://pokeapi.co/api/v2/pokemon/1/
https://pokeapi.co/api/v2/pokemon/2/
https://pokeapi.co/api/v2/pokemon/3/
https://pokeapi.co/api/v2/pokemon/4/
https://pokeapi.co/api/v2/pokemon/5/
https://pokeapi.co/api/v2/pokemon/6/
https://pokeapi.co/api/v2/pokemon/7/
https://pokeapi.co/api/v2/pokemon/8/
https://pokeapi.co/api/v2/pokemon/9/
https://pokeapi.co/api/v2/pokemon/10/


In [8]:
# Preview up to 10 rows of the DataFrame as an HTML table.
display(df_pokemons, n=10)

abilities,base_experience,forms,height,id,is_default,location_area_encounters,name,order,species,types,weight
"[((overgrow, https://pokeapi.co/api/v2/ability/65/), False, 1), ((chlorophyll, https://pokeapi.co/api/v2/ability/34/), True, 3)]",64,"[(bulbasaur, https://pokeapi.co/api/v2/pokemon-form/1/)]",7,1,True,https://pokeapi.co/api/v2/pokemon/1/encounters,bulbasaur,1,"(bulbasaur, https://pokeapi.co/api/v2/pokemon-species/1/)","[(1, (grass, https://pokeapi.co/api/v2/type/12/)), (2, (poison, https://pokeapi.co/api/v2/type/4/))]",69
"[((overgrow, https://pokeapi.co/api/v2/ability/65/), False, 1), ((chlorophyll, https://pokeapi.co/api/v2/ability/34/), True, 3)]",142,"[(ivysaur, https://pokeapi.co/api/v2/pokemon-form/2/)]",10,2,True,https://pokeapi.co/api/v2/pokemon/2/encounters,ivysaur,2,"(ivysaur, https://pokeapi.co/api/v2/pokemon-species/2/)","[(1, (grass, https://pokeapi.co/api/v2/type/12/)), (2, (poison, https://pokeapi.co/api/v2/type/4/))]",130
"[((overgrow, https://pokeapi.co/api/v2/ability/65/), False, 1), ((chlorophyll, https://pokeapi.co/api/v2/ability/34/), True, 3)]",236,"[(venusaur, https://pokeapi.co/api/v2/pokemon-form/3/)]",20,3,True,https://pokeapi.co/api/v2/pokemon/3/encounters,venusaur,3,"(venusaur, https://pokeapi.co/api/v2/pokemon-species/3/)","[(1, (grass, https://pokeapi.co/api/v2/type/12/)), (2, (poison, https://pokeapi.co/api/v2/type/4/))]",1000
"[((blaze, https://pokeapi.co/api/v2/ability/66/), False, 1), ((solar-power, https://pokeapi.co/api/v2/ability/94/), True, 3)]",62,"[(charmander, https://pokeapi.co/api/v2/pokemon-form/4/)]",6,4,True,https://pokeapi.co/api/v2/pokemon/4/encounters,charmander,5,"(charmander, https://pokeapi.co/api/v2/pokemon-species/4/)","[(1, (fire, https://pokeapi.co/api/v2/type/10/))]",85
"[((blaze, https://pokeapi.co/api/v2/ability/66/), False, 1), ((solar-power, https://pokeapi.co/api/v2/ability/94/), True, 3)]",142,"[(charmeleon, https://pokeapi.co/api/v2/pokemon-form/5/)]",11,5,True,https://pokeapi.co/api/v2/pokemon/5/encounters,charmeleon,6,"(charmeleon, https://pokeapi.co/api/v2/pokemon-species/5/)","[(1, (fire, https://pokeapi.co/api/v2/type/10/))]",190
"[((blaze, https://pokeapi.co/api/v2/ability/66/), False, 1), ((solar-power, https://pokeapi.co/api/v2/ability/94/), True, 3)]",240,"[(charizard, https://pokeapi.co/api/v2/pokemon-form/6/)]",17,6,True,https://pokeapi.co/api/v2/pokemon/6/encounters,charizard,7,"(charizard, https://pokeapi.co/api/v2/pokemon-species/6/)","[(1, (fire, https://pokeapi.co/api/v2/type/10/)), (2, (flying, https://pokeapi.co/api/v2/type/3/))]",905
"[((torrent, https://pokeapi.co/api/v2/ability/67/), False, 1), ((rain-dish, https://pokeapi.co/api/v2/ability/44/), True, 3)]",63,"[(squirtle, https://pokeapi.co/api/v2/pokemon-form/7/)]",5,7,True,https://pokeapi.co/api/v2/pokemon/7/encounters,squirtle,10,"(squirtle, https://pokeapi.co/api/v2/pokemon-species/7/)","[(1, (water, https://pokeapi.co/api/v2/type/11/))]",90
"[((torrent, https://pokeapi.co/api/v2/ability/67/), False, 1), ((rain-dish, https://pokeapi.co/api/v2/ability/44/), True, 3)]",142,"[(wartortle, https://pokeapi.co/api/v2/pokemon-form/8/)]",10,8,True,https://pokeapi.co/api/v2/pokemon/8/encounters,wartortle,11,"(wartortle, https://pokeapi.co/api/v2/pokemon-species/8/)","[(1, (water, https://pokeapi.co/api/v2/type/11/))]",225
"[((torrent, https://pokeapi.co/api/v2/ability/67/), False, 1), ((rain-dish, https://pokeapi.co/api/v2/ability/44/), True, 3)]",239,"[(blastoise, https://pokeapi.co/api/v2/pokemon-form/9/)]",16,9,True,https://pokeapi.co/api/v2/pokemon/9/encounters,blastoise,12,"(blastoise, https://pokeapi.co/api/v2/pokemon-species/9/)","[(1, (water, https://pokeapi.co/api/v2/type/11/))]",855
"[((shield-dust, https://pokeapi.co/api/v2/ability/19/), False, 1), ((run-away, https://pokeapi.co/api/v2/ability/50/), True, 3)]",39,"[(caterpie, https://pokeapi.co/api/v2/pokemon-form/10/)]",3,10,True,https://pokeapi.co/api/v2/pokemon/10/encounters,caterpie,14,"(caterpie, https://pokeapi.co/api/v2/pokemon-species/10/)","[(1, (bug, https://pokeapi.co/api/v2/type/7/))]",29


In [9]:
# Flatten the nested `types` array into two columns holding the type names,
# next to a few scalar attributes.
df_pokemons.select(
    "id",
    "name",
    # Name of the first entry of the `types` array.
    df_pokemons.types[0].type['name'].alias('type_1'),
    # Name of the second entry; shown empty in the output below for pokemons with a single type.
    df_pokemons.types[1].type['name'].alias('type_2'),
    "base_experience",
    "weight",
    "height"
)


id,name,type_1,type_2,base_experience,weight,height
1,bulbasaur,grass,poison,64,69,7
2,ivysaur,grass,poison,142,130,10
3,venusaur,grass,poison,236,1000,20
4,charmander,fire,,62,85,6
5,charmeleon,fire,,142,190,11
6,charizard,fire,flying,240,905,17
7,squirtle,water,,63,90,5
8,wartortle,water,,142,225,10
9,blastoise,water,,239,855,16
10,caterpie,bug,,39,29,3
