## Importando Libs

In [1]:
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql import SparkSession

### Criação da Sessão do Spark

In [2]:
# Build the notebook's Spark session: application name "Igti Hadoop", with
# Kryo configured as the serializer. getOrCreate() hands back an already
# running session when one exists instead of starting a second one.
spark = SparkSession.builder.appName("Igti Hadoop").config(
    "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
).getOrCreate()

In [3]:
# Bare expression as the last line of the cell: the notebook renders the
# session object's repr so the reader can confirm the session is up.
spark

### Lendo os dados do HDFS

- Definindo o Schema do DataFrame

In [4]:
# Schema for the employees data, written as a comma-separated list of
# "column_name type" pairs. It is handed to the reader's .schema() call so the
# column types are declared explicitly rather than inferred.
schema = """
        emp_no int,
        first_name string,
        last_name string,
        gender string
        """

In [6]:
# Load the raw employees dataset (parquet files in the HDFS raw zone) using the
# explicitly declared schema.
df = (
    spark.read
    .schema(schema)
    .parquet("hdfs://namenode:8020/user/root/raw_zone/employees")
)

- Verificando Schema do DataFrame

In [7]:
# List the (column name, type) pairs of the loaded DataFrame to confirm that
# the declared schema was applied.
df.dtypes

[('emp_no', 'int'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string')]

## Processamento Simples

- Geração de um novo DataFrame com a quantidade de funcionários por sexo

In [8]:
# One row per gender with the count of emp_no values in that group.
# Note: .alias("amount") renames the count column (it is applied to the
# column expression inside agg(), not to the resulting DataFrame).
df_aggregate = df.groupBy("gender").agg(
    f.count("emp_no").alias("amount")
)

In [9]:
# Print the aggregated result as a text table (one row per gender).
df_aggregate.show()

+------+------+
|gender|amount|
+------+------+
|     F|120051|
|     M|179973|
+------+------+



In [10]:
# Inspect the aggregated DataFrame's column types; the count column comes back
# as bigint.
df_aggregate.dtypes

[('gender', 'string'), ('amount', 'bigint')]

### Escrevendo os dados do DataFrame em formato parquet no HDFS

In [11]:
# Persist the per-gender counts as parquet in the HDFS processing zone.
# mode("overwrite") replaces whatever already exists at the target path, so
# re-running this cell does not fail on an existing directory.
(
    df_aggregate
    .write
    .mode("overwrite")
    .format("parquet")
    .save("hdfs://namenode:8020/user/root/processing_zone/employees")
)