In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window as W

In [2]:
spark = (SparkSession.builder
        .master("local")
        .appName("PysparkSQL_ES")
        .config("spark.ui.port", "4051")
        .getOrCreate()
        )

In [3]:
spark

In [9]:
df = (spark.read.format("parquet")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("gs://projeto_final_2021/parquet_to_bq/ES/part-00000-9bdc9996-951c-4c96-b1ea-1bedfa62eee1-c000.snappy.parquet")
  .createOrReplaceTempView("es")
     )

In [10]:
spark.sql('DESCRIBE es').show(100)

+--------------+---------+-------+
|      col_name|data_type|comment|
+--------------+---------+-------+
|          nome|   string|   null|
|      situacao|   string|   null|
|         cargo|   string|   null|
|           Mes|  tinyint|   null|
|           ano| smallint|   null|
|salarioLiquido|    float|   null|
|        estado|   string|   null|
+--------------+---------+-------+



In [34]:
spark.sql("SELECT * FROM es").show()

+--------------------+--------+--------------------+---+----+--------------+------+
|                nome|situacao|               cargo|Mes| ano|salarioLiquido|estado|
+--------------------+--------+--------------------+---+----+--------------+------+
|Adelimar C. Caeta...|   ATIVO|              PROF V|  1|2020|       2575.09|    ES|
|Adelina de Jesus ...|   ATIVO| CONSELHEIRO TUTELAR|  1|2020|       6401.63|    ES|
|Ademir Nogueira Lyra|   ATIVO|GR 1002 REF I - S...|  1|2020|        860.45|    ES|
|Adenice Sanson de...|   ATIVO|              PROF V|  1|2020|       2938.73|    ES|
|Adriana de F. R. ...|   ATIVO|              PROF V|  1|2020|       3319.12|    ES|
|Adriana de Medeir...|   ATIVO|              PROF V|  1|2020|       5664.27|    ES|
|Adriana Garcia M....|   ATIVO|              PROF V|  1|2020|       3666.43|    ES|
|Adriana J. Ferrei...|   ATIVO|GR 1010 REF I - S...|  1|2020|       1315.93|    ES|
|ADRIANA TEIXEIRA ...|   ATIVO|GR 1092 REF I - S...|  1|2020|        1314.1|

In [6]:
#Quantidade de servidores por cargo

spark.sql('''SELECT cargo, COUNT(nome) AS Qtd_servidores FROM es GROUP BY cargo''').show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------+
|               cargo|Qtd_servidores|
+--------------------+--------------+
|GR 1034 REF I - S...|            26|
|         Pensionista|         16847|
|GR 1024 REF I - S...|           584|
|GR 1016 REF I - S...|            16|
|GR 1018 REF I - S...|            79|
|GR 1020 REF I - S...|            46|
|                   7|            48|
|Classe NF I - Pad...|            76|
|GR 1018 REF I - S...|           448|
|                 CC1|            81|
|GR 1090 REF I - S...|            21|
|GR 1020 REF I - S...|            21|
|GR 1092 REF I - S...|            17|
| GR I SUB A - SEQ. 4|             5|
|GR 1014 REF I - S...|           317|
|GR 1028 REF I - S...|          6176|
|                  11|            20|
|Classe AF II - Pa...|            36|
|GR 1002 REF I - S...|           978|
|GUARDA MUNICIPAL ...|           780|
+--------------------+--------------+
only showing top 20 rows



                                                                                

In [11]:
# Verificar ano, cargo e salario maior que 10000 no ano de 2021
spark.sql("""
SELECT
    ano, cargo, salarioLiquido 
FROM 
    es
WHERE
    salarioLiquido > 10000 AND ano = 2021

""").show(30)

+----+--------------------+--------------+
| ano|               cargo|salarioLiquido|
+----+--------------------+--------------+
|2021|Classe NF III - P...|      15227.35|
|2021|Classe NF II - Pa...|       11227.6|
|2021|GR 1014 REF I - S...|      10724.59|
|2021|Classe NF II - Pa...|      16654.94|
|2021|  PROCURADOR NIVEL I|      14600.08|
|2021|          Aposentado|      10931.72|
|2021|Classe NF I - Pad...|      11658.31|
|2021|       SUBSECRETÁRIO|      11685.98|
|2021|       SUBSECRETÁRIO|      10044.49|
|2021|Classe AF III - P...|      11945.73|
|2021| PROCURADOR NIVEL VI|      14338.91|
|2021|GR 1014 REF I - S...|      10737.49|
|2021|          Aposentado|      12533.09|
|2021|              PROF V|      10713.61|
|2021|Classe AF III - P...|      22233.93|
|2021|GR 1018 REF I - S...|      11462.63|
|2021|Classe AF III - P...|      11411.75|
|2021|             PROF VI|      10340.27|
|2021|              PROF V|      11840.51|
|2021|              PROF V|      10006.58|
|2021|Class

In [12]:
#Média salarial dos servidores no cargo de professor durante o decorrer do ano de 2020
spark.sql(
"""
SELECT
    cargo,
    ROUND(AVG(salarioLiquido),2) AS media_salario,
    ano,
    mes 
FROM
    es
WHERE
    cargo LIKE "%PROF%" AND ano == 2020
GROUP BY
    mes, ano, cargo
ORDER BY
    mes
"""
).show(truncate=50)

+--------+-------------+----+---+
|   cargo|media_salario| ano|mes|
+--------+-------------+----+---+
| PROF IV|      2000.99|2020|  1|
|  PROF I|      2119.61|2020|  1|
|PROF VII|      4036.03|2020|  1|
| PROF VI|       2999.7|2020|  1|
| PROF II|      2317.45|2020|  1|
|  PROF V|      2409.09|2020|  1|
| PROF VI|      2750.52|2020|  2|
|PROF VII|      3658.14|2020|  2|
|  PROF I|      1678.09|2020|  2|
| PROF IV|      1767.16|2020|  2|
| PROF II|      1826.56|2020|  2|
|  PROF V|       2023.4|2020|  2|
|PROF VII|      4053.19|2020|  3|
| PROF VI|      2785.37|2020|  3|
| PROF II|      2121.49|2020|  3|
|  PROF I|      1941.23|2020|  3|
|  PROF V|      2352.87|2020|  3|
| PROF IV|      1952.06|2020|  3|
| PROF VI|      2986.67|2020|  4|
| PROF IV|      2091.45|2020|  4|
+--------+-------------+----+---+
only showing top 20 rows



In [14]:
#Top 5 menores médias salariais por cargo 
spark.sql("SELECT cargo, ROUND(AVG(salarioLiquido),2) AS media_salario_cargo FROM es GROUP BY cargo ORDER BY media_salario_cargo limit 5").show()

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------+-------------------+
|               cargo|media_salario_cargo|
+--------------------+-------------------+
|GR 437 REF I - Se...|             563.11|
|GR 500 REF I - Se...|             645.23|
| Estagiário Superior|             696.01|
|GR 486 REF I - Se...|             741.44|
|GR 1003 REF I - S...|             893.98|
+--------------------+-------------------+



                                                                                

In [17]:
#Descobrir o número total de registros de cada ano
spark.sql("select ano, count(situacao) from es group by ano").show() 

+----+---------------+
| ano|count(situacao)|
+----+---------------+
|2020|         185090|
|2021|          93175|
+----+---------------+



In [37]:
#média salarial de servidores ativos no ano de 2021.
spark.sql(
"""
SELECT
    cargo,
    ROUND(AVG(salarioLiquido),2) AS media_salario,
    ano,
    mes,
    situacao
FROM
    es
WHERE
    situacao = "ATIVO" AND ano = 2021
GROUP BY
    mes, ano, cargo, situacao
ORDER BY
    mes
"""
).show()

+--------------------+-------------+----+---+--------+
|               cargo|media_salario| ano|mes|situacao|
+--------------------+-------------+----+---+--------+
|GR 1002 REF I - S...|      4835.47|2021|  1|   ATIVO|
|GR 1024 REF I - S...|      1548.73|2021|  1|   ATIVO|
|GR 1090 REF I - S...|      2370.12|2021|  1|   ATIVO|
|GR 1016 REF I - S...|      1464.41|2021|  1|   ATIVO|
|GR 1034 REF I - S...|      3321.12|2021|  1|   ATIVO|
|                   8|      3236.15|2021|  1|   ATIVO|
|GR 1002 REF I - S...|      2035.17|2021|  1|   ATIVO|
|GR 1032 REF I - S...|      4877.03|2021|  1|   ATIVO|
|GUARDA MUNICIPAL ...|       3104.6|2021|  1|   ATIVO|
|GR 586 REF I - Se...|      3189.96|2021|  1|   ATIVO|
|Classe NF III - P...|      8235.19|2021|  1|   ATIVO|
|Classe NF III - P...|      4820.02|2021|  1|   ATIVO|
|GUARDA MUNICIPAL ...|      4170.62|2021|  1|   ATIVO|
|                   9|      7541.87|2021|  1|   ATIVO|
|GR 1014 REF I - S...|      1815.32|2021|  1|   ATIVO|
|GR 1002 R