# Importando bibliotecas

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window as W
from pyspark.sql.types import *

# Criando Spark session

In [2]:
# Create (or reuse) the notebook's Spark session: local master, application
# name 'PIB_IPCA', Spark UI requested on port 4050 (Spark tries the next
# ports when 4050 is already taken).
spark = (
    SparkSession.builder
    .master('local')
    .appName('PIB_IPCA')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/23 20:04:36 WARN org.apache.spark.util.Utils: Service 'SparkUI' could not bind on port 4050. Attempting port 4051.
21/11/23 20:04:36 WARN org.apache.spark.util.Utils: Service 'SparkUI' could not bind on port 4051. Attempting port 4052.


# Tabela PIB Nacional

In [3]:
# Load the national PIB (GDP) table from the project bucket (Parquet file)
df_pib = (
    spark.read
    .format('parquet')
    .load('gs://projeto_final_2021/pandas_to_parquet/PIB_parquet/df_PIB.snappy')
)

                                                                                

In [4]:
# Print the PIB dataframe schema: column names, data types and nullability
df_pib.printSchema()

root
 |-- PIB: float (nullable = true)
 |-- ano: short (nullable = true)
 |-- mes: byte (nullable = true)



In [5]:
# Preview the PIB table (show() prints only the first 20 rows by default)
df_pib.show()

                                                                                

+--------+----+---+
|     PIB| ano|mes|
+--------+----+---+
|716186.5|2021|  9|
|727121.7|2021|  8|
|727346.1|2021|  7|
|711254.2|2021|  6|
|714313.1|2021|  5|
|717796.8|2021|  4|
|730633.7|2021|  3|
|670296.5|2021|  2|
|647092.5|2021|  1|
|680491.7|2020| 12|
|662599.5|2020| 11|
|660409.1|2020| 10|
|636933.0|2020|  9|
|625101.4|2020|  8|
|629700.9|2020|  7|
|591537.3|2020|  6|
|561628.4|2020|  5|
|555387.4|2020|  4|
|616335.3|2020|  3|
|609810.8|2020|  2|
+--------+----+---+
only showing top 20 rows



## Ordenando valores na tabela PIB para inserção da coluna ID

In [6]:
# Put the columns in (mes, ano, PIB) order
df_pib = df_pib.select('mes', 'ano', 'PIB')
# Remove the extra month (September 2021) so the PIB rows can be related to
# the IPCA table, then sort chronologically by year and month
df_pib_sort = (
    df_pib
    .where(~((col('ano') == 2021) & (col('mes') == 9)))
    .sort('ano', 'mes')
)
# Display up to 100 rows of the result
df_pib_sort.show(100)

+---+----+--------+
|mes| ano|     PIB|
+---+----+--------+
|  1|2020|617747.2|
|  2|2020|609810.8|
|  3|2020|616335.3|
|  4|2020|555387.4|
|  5|2020|561628.4|
|  6|2020|591537.3|
|  7|2020|629700.9|
|  8|2020|625101.4|
|  9|2020|636933.0|
| 10|2020|660409.1|
| 11|2020|662599.5|
| 12|2020|680491.7|
|  1|2021|647092.5|
|  2|2021|670296.5|
|  3|2021|730633.7|
|  4|2021|717796.8|
|  5|2021|714313.1|
|  6|2021|711254.2|
|  7|2021|727346.1|
|  8|2021|727121.7|
+---+----+--------+



In [7]:
# Window with an ordering (year, then month) but no partitionBy: every row
# belongs to the same window, which is why Spark warns that all data is moved
# to a single partition.
w = W.orderBy('ano', 'mes')
# Number the rows chronologically (1, 2, 3, ...) and expose that as 'id'
df_pib_sortId = df_pib_sort.select('*', row_number().over(w).alias('id'))

In [8]:
# Preview the PIB table with the new sequential 'id' column
df_pib_sortId.show()

21/11/23 20:04:47 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:47 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:48 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:48 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:48 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+---+----+--------+---+
|mes| ano|     PIB| id|
+---+----+--------+---+
|  1|2020|617747.2|  1|
|  2|2020|609810.8|  2|
|  3|2020|616335.3|  3|
|  4|2020|555387.4|  4|
|  5|2020|561628.4|  5|
|  6|2020|591537.3|  6|
|  7|2020|629700.9|  7|
|  8|2020|625101.4|  8|
|  9|2020|636933.0|  9|
| 10|2020|660409.1| 10|
| 11|2020|662599.5| 11|
| 12|2020|680491.7| 12|
|  1|2021|647092.5| 13|
|  2|2021|670296.5| 14|
|  3|2021|730633.7| 15|
|  4|2021|717796.8| 16|
|  5|2021|714313.1| 17|
|  6|2021|711254.2| 18|
|  7|2021|727346.1| 19|
|  8|2021|727121.7| 20|
+---+----+--------+---+



# IPCA Estadual

In [9]:
# Load the regional IPCA table from the project bucket (Parquet file)
df_ipca_est = (
    spark.read
    .format('parquet')
    .load('gs://projeto_final_2021/pandas_to_parquet/IPCA_parquet/df_ipca_estadual.snappy')
)

In [10]:
# Print the IPCA dataframe schema: column names, data types and nullability
df_ipca_est.printSchema()

root
 |-- Mês: string (nullable = true)
 |-- Brasil: float (nullable = true)
 |-- Belém_PA: float (nullable = true)
 |-- Fortaleza_CE: float (nullable = true)
 |-- Recife_PE: float (nullable = true)
 |-- Salvador_BA: float (nullable = true)
 |-- Belo_Horizonte_MG: float (nullable = true)
 |-- Grande_Vitória_ES: float (nullable = true)
 |-- Rio_de_Janeiro_RJ: float (nullable = true)
 |-- São_Paulo_SP: float (nullable = true)
 |-- Curitiba_PR: float (nullable = true)
 |-- Porto_Alegre_RS: float (nullable = true)



In [11]:
# Preview the rows of the regional IPCA table (this cell shows data, not the schema)
df_ipca_est.show()

+--------------+------+--------+------------+---------+-----------+-----------------+-----------------+-----------------+------------+-----------+---------------+
|           Mês|Brasil|Belém_PA|Fortaleza_CE|Recife_PE|Salvador_BA|Belo_Horizonte_MG|Grande_Vitória_ES|Rio_de_Janeiro_RJ|São_Paulo_SP|Curitiba_PR|Porto_Alegre_RS|
+--------------+------+--------+------------+---------+-----------+-----------------+-----------------+-----------------+------------+-----------+---------------+
|  janeiro 2020|  0.21|    0.39|        0.28|      0.3|       0.34|              0.2|             0.29|             0.05|        0.33|       0.05|           0.17|
|fevereiro 2020|  0.25|    0.21|         0.8|     0.38|       0.16|              0.5|             0.33|            -0.02|        0.23|       0.08|           0.16|
|    março 2020|  0.07|   -0.16|        0.21|     0.31|       0.17|             0.05|             0.12|             0.46|        0.09|       0.13|          -0.32|
|    abril 2020| -0.31

In [12]:
# Columns that must remain in the dataframe
cols_to_keep = {'Mês', 'Brasil', 'Belo_Horizonte_MG', 'Grande_Vitória_ES', 'Rio_de_Janeiro_RJ', 'São_Paulo_SP'}
# Every other column present in the IPCA dataframe is scheduled for removal
cols_to_drop = [name for name in df_ipca_est.columns if name not in cols_to_keep]
# Drop the unwanted columns
df_ipca_4est = df_ipca_est.drop(*cols_to_drop)

In [13]:
# Preview up to 100 rows of the reduced IPCA table
df_ipca_4est.show(100)

+--------------+------+-----------------+-----------------+-----------------+------------+
|           Mês|Brasil|Belo_Horizonte_MG|Grande_Vitória_ES|Rio_de_Janeiro_RJ|São_Paulo_SP|
+--------------+------+-----------------+-----------------+-----------------+------------+
|  janeiro 2020|  0.21|              0.2|             0.29|             0.05|        0.33|
|fevereiro 2020|  0.25|              0.5|             0.33|            -0.02|        0.23|
|    março 2020|  0.07|             0.05|             0.12|             0.46|        0.09|
|    abril 2020| -0.31|            -0.21|            -0.09|             0.18|       -0.37|
|     maio 2020| -0.38|             -0.6|            -0.48|            -0.28|       -0.28|
|    junho 2020|  0.26|             0.05|             0.56|            -0.01|        0.29|
|    julho 2020|  0.36|             0.39|             0.21|             0.24|        0.24|
|   agosto 2020|  0.24|             0.21|            -0.03|            -0.13|        0.31|

In [14]:
# Derive 'mes' (month number) and 'ano' (year) from the 'Mês' label, which
# looks like 'janeiro 2020', then discard the original label column.
month_numbers = {
    'janeiro': 1,
    'fevereiro': 2,
    'março': 3,
    'abril': 4,
    'maio': 5,
    'junho': 6,
    'julho': 7,
    'agosto': 8,
    'setembro': 9,
    'outubro': 10,
    'novembro': 11,
    'dezembro': 12,
}
label_parts = split(col('Mês'), ' ')
month_name = label_parts.getItem(0)

# Build one when() branch per month name; unknown names fall through to null.
month_expr = None
for month_label, month_number in month_numbers.items():
    matches = month_name == month_label
    month_expr = when(matches, month_number) if month_expr is None else month_expr.when(matches, month_number)
month_expr = month_expr.otherwise(None)

df_ipca_4est = (
    df_ipca_4est
    .withColumn('mes', month_expr)
    # 'ano' is kept exactly as produced by split(), i.e. as a string
    .withColumn('ano', label_parts.getItem(1))
    # The original label column is no longer needed
    .drop('Mês')
)

In [15]:
# Preview the IPCA table with the derived 'mes' and 'ano' columns
df_ipca_4est.show()

+------+-----------------+-----------------+-----------------+------------+---+----+
|Brasil|Belo_Horizonte_MG|Grande_Vitória_ES|Rio_de_Janeiro_RJ|São_Paulo_SP|mes| ano|
+------+-----------------+-----------------+-----------------+------------+---+----+
|  0.21|              0.2|             0.29|             0.05|        0.33|  1|2020|
|  0.25|              0.5|             0.33|            -0.02|        0.23|  2|2020|
|  0.07|             0.05|             0.12|             0.46|        0.09|  3|2020|
| -0.31|            -0.21|            -0.09|             0.18|       -0.37|  4|2020|
| -0.38|             -0.6|            -0.48|            -0.28|       -0.28|  5|2020|
|  0.26|             0.05|             0.56|            -0.01|        0.29|  6|2020|
|  0.36|             0.39|             0.21|             0.24|        0.24|  7|2020|
|  0.24|             0.21|            -0.03|            -0.13|        0.31|  8|2020|
|  0.64|             0.76|             0.83|             0.62|   

In [16]:
# Window with an ordering (year, then month) but no partitionBy: every row
# belongs to the same window, which is why Spark warns that all data is moved
# to a single partition.
w1 = W.orderBy('ano', 'mes')
# Number the rows chronologically (1, 2, 3, ...) and expose that as 'id'
df_ipca_4estId = df_ipca_4est.select('*', row_number().over(w1).alias('id'))

In [17]:
# Preview the IPCA table with the new sequential 'id' column
df_ipca_4estId.show()

21/11/23 20:04:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:50 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+-----------------+-----------------+-----------------+------------+---+----+---+
|Brasil|Belo_Horizonte_MG|Grande_Vitória_ES|Rio_de_Janeiro_RJ|São_Paulo_SP|mes| ano| id|
+------+-----------------+-----------------+-----------------+------------+---+----+---+
|  0.21|              0.2|             0.29|             0.05|        0.33|  1|2020|  1|
|  0.25|              0.5|             0.33|            -0.02|        0.23|  2|2020|  2|
|  0.07|             0.05|             0.12|             0.46|        0.09|  3|2020|  3|
| -0.31|            -0.21|            -0.09|             0.18|       -0.37|  4|2020|  4|
| -0.38|             -0.6|            -0.48|            -0.28|       -0.28|  5|2020|  5|
|  0.26|             0.05|             0.56|            -0.01|        0.29|  6|2020|  6|
|  0.36|             0.39|             0.21|             0.24|        0.24|  7|2020|  7|
|  0.24|             0.21|            -0.03|            -0.13|        0.31|  8|2020|  8|
|  0.64|             

In [18]:
# Keep 'id' first followed by the IPCA value columns; 'mes' and 'ano' are not
# selected, so they are dropped from this dataframe.
ipca_columns = ['id', 'Brasil', 'Belo_Horizonte_MG', 'Grande_Vitória_ES', 'Rio_de_Janeiro_RJ', 'São_Paulo_SP']
df_ipca_4estId = df_ipca_4estId.select(*ipca_columns)
# Preview the result
df_ipca_4estId.show()

21/11/23 20:04:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---+------+-----------------+-----------------+-----------------+------------+
| id|Brasil|Belo_Horizonte_MG|Grande_Vitória_ES|Rio_de_Janeiro_RJ|São_Paulo_SP|
+---+------+-----------------+-----------------+-----------------+------------+
|  1|  0.21|              0.2|             0.29|             0.05|        0.33|
|  2|  0.25|              0.5|             0.33|            -0.02|        0.23|
|  3|  0.07|             0.05|             0.12|             0.46|        0.09|
|  4| -0.31|            -0.21|            -0.09|             0.18|       -0.37|
|  5| -0.38|             -0.6|            -0.48|            -0.28|       -0.28|
|  6|  0.26|             0.05|             0.56|            -0.01|        0.29|
|  7|  0.36|             0.39|             0.21|             0.24|        0.24|
|  8|  0.24|             0.21|            -0.03|            -0.13|        0.31|
|  9|  0.64|             0.76|             0.83|             0.62|        0.44|
| 10|  0.86|             1.08|          

# Agrupando IPCA estadual e PIB nacional

In [19]:
# Combine the PIB and IPCA tables with an inner join on the sequential 'id'.
# NOTE(review): both ids come from independent row_number() calls ordered by
# (ano, mes), so rows only line up if both tables cover exactly the same
# months - confirm whenever the source data changes.
df_ipca_pib = df_pib_sortId.join(df_ipca_4estId, 'id', 'inner')

In [20]:
# Preview the joined PIB + IPCA table
df_ipca_pib.show()

21/11/23 20:04:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:51 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+---+---+----+--------+------+-----------------+-----------------+-----------------+------------+
| id|mes| ano|     PIB|Brasil|Belo_Horizonte_MG|Grande_Vitória_ES|Rio_de_Janeiro_RJ|São_Paulo_SP|
+---+---+----+--------+------+-----------------+-----------------+-----------------+------------+
|  1|  1|2020|617747.2|  0.21|              0.2|             0.29|             0.05|        0.33|
|  2|  2|2020|609810.8|  0.25|              0.5|             0.33|            -0.02|        0.23|
|  3|  3|2020|616335.3|  0.07|             0.05|             0.12|             0.46|        0.09|
|  4|  4|2020|555387.4| -0.31|            -0.21|            -0.09|             0.18|       -0.37|
|  5|  5|2020|561628.4| -0.38|             -0.6|            -0.48|            -0.28|       -0.28|
|  6|  6|2020|591537.3|  0.26|             0.05|             0.56|            -0.01|        0.29|
|  7|  7|2020|629700.9|  0.36|             0.39|             0.21|             0.24|        0.24|
|  8|  8|2020|625101

21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [21]:
# Give the columns clearer names: lower-case 'pib' and an 'ipca_' prefix for
# each IPCA series.
renamed_columns = {
    'PIB': 'pib',
    'Brasil': 'ipca_Brasil',
    'Belo_Horizonte_MG': 'ipca_MG',
    'Grande_Vitória_ES': 'ipca_ES',
    'Rio_de_Janeiro_RJ': 'ipca_RJ',
    'São_Paulo_SP': 'ipca_SP',
}
for old_name, new_name in renamed_columns.items():
    df_ipca_pib = df_ipca_pib.withColumnRenamed(old_name, new_name)

In [22]:
# Preview the final table with the renamed columns
df_ipca_pib.show()

21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:52 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc

+---+---+----+--------+-----------+-------+-------+-------+-------+
| id|mes| ano|     pib|ipca_Brasil|ipca_MG|ipca_ES|ipca_RJ|ipca_SP|
+---+---+----+--------+-----------+-------+-------+-------+-------+
|  1|  1|2020|617747.2|       0.21|    0.2|   0.29|   0.05|   0.33|
|  2|  2|2020|609810.8|       0.25|    0.5|   0.33|  -0.02|   0.23|
|  3|  3|2020|616335.3|       0.07|   0.05|   0.12|   0.46|   0.09|
|  4|  4|2020|555387.4|      -0.31|  -0.21|  -0.09|   0.18|  -0.37|
|  5|  5|2020|561628.4|      -0.38|   -0.6|  -0.48|  -0.28|  -0.28|
|  6|  6|2020|591537.3|       0.26|   0.05|   0.56|  -0.01|   0.29|
|  7|  7|2020|629700.9|       0.36|   0.39|   0.21|   0.24|   0.24|
|  8|  8|2020|625101.4|       0.24|   0.21|  -0.03|  -0.13|   0.31|
|  9|  9|2020|636933.0|       0.64|   0.76|   0.83|   0.62|   0.44|
| 10| 10|2020|660409.1|       0.86|   1.08|   0.91|   0.59|   0.89|
| 11| 11|2020|662599.5|       0.89|   0.95|   0.97|   0.69|   1.04|
| 12| 12|2020|680491.7|       1.35|   1.53|   1.

21/11/23 20:04:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:04:53 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


# Salvar o arquivo

In [24]:
# Persist the joined PIB/IPCA table to the project bucket as Parquet.
# Parquet files carry their own schema, so no CSV-style 'header' or
# 'inferschema' options are passed (they have no effect on a Parquet write).
# mode("overwrite") replaces any previous output at this path, so the notebook
# can be re-run from a fresh kernel without failing on an existing directory.
(df_ipca_pib.write
 .format("parquet")
 .mode("overwrite")
 .save("gs://projeto_final_2021/parquet_to_bq/ipca_pib_nacional/")
)

21/11/23 20:06:40 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:06:40 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:06:41 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:06:41 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/11/23 20:06:41 WARN org.apache.spark.sql.execution.window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performanc