# Manipulação de Dados - Parte 1

##### Seleção
##### Filtros
##### Agregação
##### Ordenação

# Configurações Iniciais

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

# Reuse the active Spark session if one already exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Import da Base

In [2]:
# Load the Pokémon JSON file into a DataFrame. No explicit schema is passed,
# so the column types are whatever the JSON reader infers from the file.
dfPokemon = spark.read.json('/home/jovyan/files/pokemons.json')

# Conhecendo a Base

In [9]:
# Preview the first 10 rows without truncating long cell values.
dfPokemon.show(n=10, truncate=False)
# Alternative inspection of column names and types:
# dfPokemon.printSchema()

+------+-----------+---------------------------------------------------------+---+----------+----+-----+
|altura|experiencia|formas                                                   |id |nome      |peso|type |
+------+-----------+---------------------------------------------------------+---+----------+----+-----+
|7     |64         |[{bulbasaur, https://pokeapi.co/api/v2/pokemon-form/1/}] |1  |bulbasaur |69  |grass|
|10    |142        |[{ivysaur, https://pokeapi.co/api/v2/pokemon-form/2/}]   |2  |ivysaur   |130 |grass|
|20    |263        |[{venusaur, https://pokeapi.co/api/v2/pokemon-form/3/}]  |3  |venusaur  |1000|grass|
|6     |62         |[{charmander, https://pokeapi.co/api/v2/pokemon-form/4/}]|4  |charmander|85  |fire |
|11    |142        |[{charmeleon, https://pokeapi.co/api/v2/pokemon-form/5/}]|5  |charmeleon|190 |fire |
|17    |267        |[{charizard, https://pokeapi.co/api/v2/pokemon-form/6/}] |6  |charizard |905 |fire |
|5     |63         |[{squirtle, https://pokeapi.co/api/

# Exemplos

### Seleção

In [24]:
# Selection demo: the same column referenced three ways (fn.col, attribute
# access, plain string) plus '*', followed by drop / select / withColumn.
(
    dfPokemon
    .select(
        fn.col('id').alias('id_01'),
        dfPokemon.id.alias('id_02'),
        'id',
        '*',
    )
    .drop('type', 'id')
    .select('id_01', 'altura', 'nome')
    # Derived column: upper-cased copy of 'nome'.
    .withColumn('nome_maiusculo', fn.upper(fn.col('nome')))
    .show(3)
)

+-----+------+---------+--------------+
|id_01|altura|     nome|nome_maiusculo|
+-----+------+---------+--------------+
|    1|     7|bulbasaur|     BULBASAUR|
|    2|    10|  ivysaur|       IVYSAUR|
|    3|    20| venusaur|      VENUSAUR|
+-----+------+---------+--------------+
only showing top 3 rows



### Filtro

In [42]:
# Filter demo: keep only the rows whose 'altura' is below 20.
(
    dfPokemon
    .where(fn.col('altura') < 20)
    # Alternative combining two conditions with OR (|):
    # .where((fn.col('peso') < 50) | (fn.col('experiencia') > 10))
    .show(10)
)

+------+-----------+--------------------+---+----------+----+-----+
|altura|experiencia|              formas| id|      nome|peso| type|
+------+-----------+--------------------+---+----------+----+-----+
|     7|         64|[{bulbasaur, http...|  1| bulbasaur|  69|grass|
|    10|        142|[{ivysaur, https:...|  2|   ivysaur| 130|grass|
|     6|         62|[{charmander, htt...|  4|charmander|  85| fire|
|    11|        142|[{charmeleon, htt...|  5|charmeleon| 190| fire|
|    17|        267|[{charizard, http...|  6| charizard| 905| fire|
|     5|         63|[{squirtle, https...|  7|  squirtle|  90|water|
|    10|        142|[{wartortle, http...|  8| wartortle| 225|water|
|    16|        265|[{blastoise, http...|  9| blastoise| 855|water|
|     3|         39|[{caterpie, https...| 10|  caterpie|  29|  bug|
|     7|         72|[{metapod, https:...| 11|   metapod|  99|  bug|
+------+-----------+--------------------+---+----------+----+-----+
only showing top 10 rows



### Agregação

In [43]:
# Aggregation demo: for each distinct 'altura', sum 'peso' and count the
# rows that have a non-null 'id'.
(
    dfPokemon
    .groupBy('altura')
    .agg(
        fn.sum(fn.col('peso')).alias('peso_total'),
        fn.count(fn.col('id')).alias('qtd_total'),
    )
    .show(3)
)

+------+----------+---------+
|altura|peso_total|qtd_total|
+------+----------+---------+
|    26|      4075|        2|
|    29|     13400|        3|
|    65|     10300|        3|
+------+----------+---------+
only showing top 3 rows



### Ordenação

In [53]:
# Ordering demo: sort by 'altura' ascending, breaking ties by 'peso' descending.
dfPokemon.orderBy(fn.col('altura').asc(), fn.desc('peso')).show(3)

# Alternative example (disabled): ordering an aggregated result.
# (dfPokemon
#  .groupBy('altura')
#  .agg(fn.sum('peso').alias('PESO'))
#  .orderBy('altura')
#  .show()
# )

+------+-----------+--------------------+-----+------------------+----+-------+
|altura|experiencia|              formas|   id|              nome|peso|   type|
+------+-----------+--------------------+-----+------------------+----+-------+
|     1|        140|[{cosmoem, https:...|  790|           cosmoem|9999|psychic|
|     1|       null|[{gimmighoul-roam...|10263|gimmighoul-roaming|  10|  ghost|
|     1|         64|[{joltik, https:/...|  595|            joltik|   6|    bug|
+------+-----------+--------------------+-----+------------------+----+-------+
only showing top 3 rows



# Exercícios

### Exercício 01

In [57]:
# How many pokemons exist in total?
# A single aggregation answers it: count('id') counts the rows with a non-null
# id, and count_distinct('id') counts the unique ids. Equal values mean no id
# is repeated.
dfPokemon.select(fn.count('id'), fn.count_distinct('id')).show()

+---------+------------------+
|count(id)|count(DISTINCT id)|
+---------+------------------+
|     1281|              1281|
+---------+------------------+



### Exercício 02

In [58]:
# How much do all the pokemons weigh together?
# Global aggregate: the sum of the 'peso' column over every row.
dfPokemon.agg(fn.sum('peso')).show()

+---------+
|sum(peso)|
+---------+
|  1254704|
+---------+



### Exercício 03

In [60]:
# Return the pokemons that have no experience.
# "No experience" is taken as either a missing (null) value OR a value below 1;
# the null check is explicit because a comparison against null never matches.
(
    dfPokemon
    .where(
        fn.col('experiencia').isNull()
        | (fn.col('experiencia') < 1)
    )
    .show()
)

+------+-----------+--------------------+---+------------------+----+--------+
|altura|experiencia|              formas| id|              nome|peso|    type|
+------+-----------+--------------------+---+------------------+----+--------+
|    18|       null|[{wyrdeer, https:...|899|           wyrdeer| 951|  normal|
|    18|       null|[{kleavor, https:...|900|           kleavor| 890|     bug|
|    24|       null|[{ursaluna, https...|901|          ursaluna|2900|  ground|
|    30|       null|[{basculegion-mal...|902|  basculegion-male|1100|   water|
|    13|       null|[{sneasler, https...|903|          sneasler| 430|fighting|
|    25|       null|[{overqwil, https...|904|          overqwil| 605|    dark|
|    16|       null|[{enamorus-incarn...|905|enamorus-incarnate| 480|   fairy|
|     4|       null|[{sprigatito, htt...|906|        sprigatito|  41|   grass|
|     9|       null|[{floragato, http...|907|         floragato| 122|   grass|
|    15|       null|[{meowscarada, ht...|908|       

### Exercício 04

In [71]:
# Return the heaviest pokemon(s).
# Step 1: pull the maximum 'peso' to the driver as a plain Python value.
# NOTE(review): PokeAPI documents weight in hectograms; if this file keeps the
# raw API values, the "kg" label below is off by a factor of 10 — confirm.
max_peso = dfPokemon.agg(fn.max('peso')).head()[0]
print(f'Maior peso: {max_peso}kg')

# Step 2: keep every row that ties for that maximum (only 3 are displayed).
(
    dfPokemon
    .where(fn.col('peso') == max_peso)
    .show(3)
)

Maior peso: 10000kg
+------+-----------+--------------------+-----+--------------+-----+-----+
|altura|experiencia|              formas|   id|          nome| peso| type|
+------+-----------+--------------------+-----+--------------+-----+-----+
|   240|        236|[{venusaur-gmax, ...|10195| venusaur-gmax|10000|grass|
|   280|        240|[{charizard-gmax,...|10196|charizard-gmax|10000| fire|
|   250|        239|[{blastoise-gmax,...|10197|blastoise-gmax|10000|water|
+------+-----------+--------------------+-----+--------------+-----+-----+
only showing top 3 rows



### Exercício 05

In [73]:
# Return the shortest pokemon(s).
# Step 1: pull the minimum 'altura' to the driver as a plain Python value.
# NOTE(review): PokeAPI documents height in decimetres; if this file keeps the
# raw API values, the "m" label below is off by a factor of 10 — confirm.
menor_altura = dfPokemon.agg(fn.min('altura')).collect()[0][0]
print(f'Menor altura: {menor_altura}m')

# Step 2: keep every row that ties for that minimum.
(
    dfPokemon
    .where(fn.col('altura') == menor_altura)
    .show()
)

Menor altura: 1m
+------+-----------+--------------------+-----+------------------+----+-------+
|altura|experiencia|              formas|   id|              nome|peso|   type|
+------+-----------+--------------------+-----+------------------+----+-------+
|     1|         64|[{joltik, https:/...|  595|            joltik|   6|    bug|
|     1|         61|[{flabebe-red, ht...|  669|           flabebe|   1|  fairy|
|     1|         61|[{cutiefly, https...|  742|          cutiefly|   2|    bug|
|     1|        170|[{comfey, https:/...|  764|            comfey|   3|  fairy|
|     1|        140|[{cosmoem, https:...|  790|           cosmoem|9999|psychic|
|     1|         62|[{sinistea-phony,...|  854|          sinistea|   2|  ghost|
|     1|       null|[{gimmighoul-roam...|10263|gimmighoul-roaming|  10|  ghost|
+------+-----------+--------------------+-----+------------------+----+-------+



### Exercício 06

In [83]:
# How many pokemons have more than 200 experience points? And which are they?

# Rows with a null 'experiencia' never satisfy the comparison, so they are excluded.
df_200_mais = dfPokemon.where(fn.col('experiencia') > 200)

# How many.
df_200_mais.agg(fn.count('id')).show()

# Which ones (lowest experience first, only 3 displayed).
(
    df_200_mais
    .select('nome', 'experiencia')
    .sort('experiencia')
    .show(3)
)

+---------+
|count(id)|
+---------+
|      289|
+---------+

+--------------+-----------+
|          nome|experiencia|
+--------------+-----------+
|manectric-mega|        201|
|   glalie-mega|        203|
|  lopunny-mega|        203|
+--------------+-----------+
only showing top 3 rows



### Exercício 07

In [90]:
# How many pokemons have 'altura' above 20 and 'peso' below 1000? And which are they?

# Both conditions must hold, so they are combined into a single AND predicate.
df_20_1000 = dfPokemon.where(
    (fn.col('altura') > 20) & (fn.col('peso') < 1000)
)

# How many.
df_20_1000.agg(fn.count('id')).show()

# Which ones (ordered by height, then weight; only 3 displayed).
(
    df_20_1000
    .select('nome', 'altura', 'peso')
    .sort('altura', 'peso')
    .show(3)
)

+---------+
|count(id)|
+---------+
|       29|
+---------+

+---------------+------+----+
|           nome|altura|peso|
+---------------+------+----+
|      hatterene|    21|  51|
|aerodactyl-mega|    21| 790|
|     eelektross|    21| 805|
+---------------+------+----+
only showing top 3 rows



### Exercício 08

In [98]:
# Return the pokemons that have more than one form.

# Materialise the length of the 'formas' array as its own column...
df_size = dfPokemon.withColumn(
    'tamanho_lista_forma',
    fn.size(fn.col('formas')),
)

# ...then keep the rows where that length exceeds 1.
df_size.where(df_size['tamanho_lista_forma'] > 1).show()

+------+-----------+--------------------+---+----------+----+--------+-------------------+
|altura|experiencia|              formas| id|      nome|peso|    type|tamanho_lista_forma|
+------+-----------+--------------------+---+----------+----+--------+-------------------+
|     3|         41|[{pichu, https://...|172|     pichu|  20|electric|                  2|
|     5|        118|[{unown-a, https:...|201|     unown|  50| psychic|                 28|
|     2|         45|[{burmy-plant, ht...|412|     burmy|  34|     bug|                  3|
|     9|        148|[{mothim-plant, h...|414|    mothim| 233|     bug|                  3|
|     5|        158|[{cherrim-overcas...|421|   cherrim|  93|   grass|                  2|
|     3|         65|[{shellos-west, h...|422|   shellos|  63|   water|                  2|
|     9|        166|[{gastrodon-west,...|423| gastrodon| 299|   water|                  2|
|    32|        324|[{arceus-normal, ...|493|    arceus|3200|  normal|                 19|

In [102]:
# Same question, showing only the identifying columns. The row limit passed to
# show() is the size of the whole DataFrame, so no matching row is cut off.
(
    dfPokemon
    .select('id', 'nome', 'formas')
    .where(fn.size(fn.col('formas')) > 1)
    .show(n=dfPokemon.count())
)

+---+-----------+--------------------+
| id|       nome|              formas|
+---+-----------+--------------------+
|172|      pichu|[{pichu, https://...|
|201|      unown|[{unown-a, https:...|
|412|      burmy|[{burmy-plant, ht...|
|414|     mothim|[{mothim-plant, h...|
|421|    cherrim|[{cherrim-overcas...|
|422|    shellos|[{shellos-west, h...|
|423|  gastrodon|[{gastrodon-west,...|
|493|     arceus|[{arceus-normal, ...|
|585|   deerling|[{deerling-spring...|
|586|   sawsbuck|[{sawsbuck-spring...|
|649|   genesect|[{genesect, https...|
|664| scatterbug|[{scatterbug-icy-...|
|665|     spewpa|[{spewpa-icy-snow...|
|666|   vivillon|[{vivillon-meadow...|
|669|    flabebe|[{flabebe-red, ht...|
|670|    floette|[{floette-red, ht...|
|671|    florges|[{florges-red, ht...|
|676|    furfrou|[{furfrou-natural...|
|716|    xerneas|[{xerneas-active,...|
|773|   silvally|[{silvally-normal...|
|854|   sinistea|[{sinistea-phony,...|
|855|polteageist|[{polteageist-pho...|
|869|   alcremie|[{alcrem

In [104]:
# Same question again, filtering directly on the array length and keeping the
# result in a named DataFrame.
pokemons_com_mais_de_uma_forma = dfPokemon.where(fn.size('formas') > 1)

pokemons_com_mais_de_uma_forma.show()

+------+-----------+--------------------+---+----------+----+--------+
|altura|experiencia|              formas| id|      nome|peso|    type|
+------+-----------+--------------------+---+----------+----+--------+
|     3|         41|[{pichu, https://...|172|     pichu|  20|electric|
|     5|        118|[{unown-a, https:...|201|     unown|  50| psychic|
|     2|         45|[{burmy-plant, ht...|412|     burmy|  34|     bug|
|     9|        148|[{mothim-plant, h...|414|    mothim| 233|     bug|
|     5|        158|[{cherrim-overcas...|421|   cherrim|  93|   grass|
|     3|         65|[{shellos-west, h...|422|   shellos|  63|   water|
|     9|        166|[{gastrodon-west,...|423| gastrodon| 299|   water|
|    32|        324|[{arceus-normal, ...|493|    arceus|3200|  normal|
|     6|         67|[{deerling-spring...|585|  deerling| 195|  normal|
|    19|        166|[{sawsbuck-spring...|586|  sawsbuck| 925|  normal|
|    15|        300|[{genesect, https...|649|  genesect| 825|     bug|
|     