In [1]:
import pandas as pd
import numpy as np
import zipfile
import requests
import os
import glob
from os import chdir, getcwd, listdir
from io import BytesIO

from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession

#-------------------------------
# bibliotecas webscraping
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()

# Jars required for the s3a:// filesystem (AWS SDK pieces + hadoop-aws).
# Kept as a list and joined with bare commas so the `spark.jars` value
# carries no embedded newlines or indentation.
s3a_jars = [
    "/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar",
    "/home/jovyan/jars/hadoop-aws-3.2.2.jar",
]

# Object-store connection settings. Each value can be overridden through an
# environment variable (S3A_ENDPOINT, S3A_ACCESS_KEY, S3A_SECRET_KEY); the
# fallbacks are the values this notebook has always used, so it keeps
# running unchanged when nothing is exported.
# NOTE(review): the fallback key/secret are still committed in this file;
# export the variables and drop the defaults before using a shared store.
s3a_endpoint = os.environ.get("S3A_ENDPOINT", "http://minio:9000")
s3a_access_key = os.environ.get("S3A_ACCESS_KEY", "aulafia")
s3a_secret_key = os.environ.get("S3A_SECRET_KEY", "aulafia@123")

# Spark session configured to read/write s3a:// paths against the endpoint
# above, using path-style access and static key/secret credentials.
spark = (SparkSession.builder
         .config("spark.jars", ",".join(s3a_jars))
         .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
         .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
         .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

# Page that is scraped for the download link of the INEP archive.
url = "https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/indicadores-educacionais/indicadores-de-fluxo-da-educacao-superior/2010-2019"
# Name of the working folder; also reused as the dataset name in the s3a paths.
folder = "indicadores_fluxo_educacao_superior"
# Local base directory; the trailing slash matters because other cells
# concatenate `path + folder` directly.
path = "/tmp/"

In [2]:
# Stage: scrape the INEP page for the archive link, download the ZIP into
# memory, extract it under `path + folder` and list the extracted files.

# Separator line printed above and below every stage message.
rule_line = '============ ============ ============ ============ ============ ============ ============ ============ ============'
# Directory that receives the extracted archive (same location every run).
extract_dir = path + folder

print(rule_line, '============ Criando diretório para armazenar o conteúdo do INEP ============', rule_line, sep='\n')
# Create the directory where the archive is actually extracted. Creating
# bare `folder` would resolve against the current working directory, which
# this cell changes further down, so a re-run produced a nested stray folder.
os.makedirs(extract_dir, exist_ok=True)

print(rule_line, '============ Efetuando busca webscraping da URL de download do arquivo ZIP ============', rule_line, sep='\n')
conexao = urllib3.PoolManager()
retorno = conexao.request('GET', url)
# Fail early on an HTTP error instead of parsing an error page.
if retorno.status >= 400:
    raise RuntimeError(
        "Could not fetch the INEP page (HTTP {status}): {url}".format(status=retorno.status, url=url)
    )

pagina = BeautifulSoup(retorno.data, "html.parser")

# Collect the targets of every anchor tagged `external-link`, skipping
# anchors that carry no href at all.
dado = [
    link.get('href')
    for link in pagina.find_all('a', class_='external-link')
    if link.get('href')
]
if not dado:
    raise RuntimeError(
        "No 'external-link' anchor found on the INEP page; the page layout may have changed: " + url
    )

# The first external link on the page is taken as the ZIP download URL.
url_download = str(dado[0])

print(rule_line, '============ Efetuando download do arquivo Zip ============', rule_line, sep='\n')
# NOTE(review): certificate verification is deliberately disabled here (the
# original comment reads "temporarily disable SSL verification"). The
# download is therefore not protected against tampering; re-enable
# `verify` once the server's certificate chain validates.
# The timeout keeps an unresponsive server from hanging the kernel forever.
response = requests.get(url_download, verify=False, timeout=120)
# Abort on 4xx/5xx so an HTML error body is never handed to ZipFile.
response.raise_for_status()
filebytes = BytesIO(response.content)

print(rule_line, '============ Descompactando arquivo Zip no Lake ============', rule_line, sep='\n')

# The context manager closes the archive handle once extraction finishes.
with zipfile.ZipFile(filebytes) as myzip:
    myzip.extractall(extract_dir)
print(rule_line, '============ Descompactação efetuada ============', rule_line, sep='\n')

# Switch into the extraction directory and list what was extracted.
chdir(extract_dir)
print(getcwd())
for c in listdir():
    print(c)

/tmp/indicadores_fluxo_educacao_superior
indicadores_trajetoria_educacao_superior_2010_2019.xlsx
Dicionário_acompanhamento_trajetória.docx
md5_indicadores_trajetoria_educacao_superior_2010_2019.txt
indicadores_trajetoria_educacao_superior_2010_2019.ods


In [3]:
# arquivo = url_download.split('/')[-1]
# print(arquivo)
# type(arquivo)

In [4]:
# pip install openpyxl

In [5]:
# targetPattern = r"/tmp/indicadores_fluxo_educacao_superior/*.xlsx"
# file = str(glob.glob(targetPattern)).replace("['","").replace("']","")
# print(file)

In [6]:
# Stage: locate the extracted XLSX workbook and load the indicator sheet
# into a pandas DataFrame.

# Separator line printed above and below every stage message.
rule_line = '============ ============ ============ ============ ============ ============ ============ ============ ============'

print(rule_line, '============ Efetuando Busca do nome do Arquivo XLSX ============', rule_line, sep='\n')

targetPattern = r"{path}{folder}/*.xlsx".format(path=path, folder=folder)
# Use the glob result as a list instead of string-mangling its repr, which
# only yields a usable path when there is exactly one match.
xlsx_matches = sorted(glob.glob(targetPattern))
if not xlsx_matches:
    raise FileNotFoundError("No .xlsx file matches " + targetPattern)
if len(xlsx_matches) > 1:
    raise RuntimeError(
        "Expected exactly one .xlsx file for {pattern}, found: {found}".format(
            pattern=targetPattern, found=xlsx_matches
        )
    )
file = xlsx_matches[0]
print("Arquivo:", file)

print(rule_line, '============ Carregando Dataframe com os dados do Excel ============', rule_line, sep='\n')

# Row window kept from the sheet (0-based row indices): every row before
# `first_kept_row` and after `last_kept_row` is skipped, so the first kept
# row becomes the header.
# NOTE(review): both bounds are specific to the 2010-2019 workbook layout;
# confirm them if the source file is ever replaced.
first_kept_row = 8
last_kept_row = 259238

df = pd.read_excel(
    file,
    usecols='A:AE',
    skiprows=lambda x: x < first_kept_row or x > last_kept_row,
    sheet_name='INDICADORES_TRAJETORIA',
)

Arquivo: /tmp/indicadores_fluxo_educacao_superior/indicadores_trajetoria_educacao_superior_2010_2019.xlsx


In [7]:
# df.head()

In [8]:
# Stage: hand the pandas DataFrame over to Spark.
rule_line = '============ ============ ============ ============ ============ ============ ============ ============ ============'
print(rule_line, '============ Convertendo Dataframe para SPARK ============', rule_line, sep='\n')

# Spark infers the schema from the pandas frame.
# For debugging, inspect the result with sparkDF.printSchema() / sparkDF.show().
sparkDF = spark.createDataFrame(data=df)



In [9]:
# Stage: persist the Spark DataFrame as Parquet in the object store.
# NOTE(review): the message below says BRONZE while the target bucket is
# `landing` — confirm which layer is intended.
rule_line = '============ ============ ============ ============ ============ ============ ============ ============ ============'
print(rule_line, '============ Gravando Dados na Camada BRONZE  ============', rule_line, sep='\n')

# 'overwrite' replaces whatever already exists at the destination.
parquet_writer = sparkDF.write.format('parquet').mode('overwrite')
parquet_writer.save('s3a://landing/indicadores_fluxo_educacao_superior')



In [10]:
# sparkDF.printSchema()

In [11]:
# Stage: read the Parquet dataset back from the object store.
rule_line = '============ ============ ============ ============ ============ ============ ============ ============ ============'
print(rule_line, '============ Carregando Dataframe a partir do Arquivo Parquet ============', rule_line, sep='\n')

parquet_reader = spark.read.format('parquet')
dfIndicador = parquet_reader.load('s3a://landing/indicadores_fluxo_educacao_superior')



In [12]:
# Stage: display a small sample as evidence that the write succeeded.
rule_line = '============ ============ ============ ============ ============ ============ ============ ============ ============'
print(rule_line, '============ Evidência de Gravação ============', rule_line, sep='\n')

# First five rows, with column values printed in full (no truncation).
dfIndicador.show(n=5, truncate=False)

+------+-----------------------------------+---------------------------+------------------------+--------+--------+---------+-----+------------+-----------------+--------------------+--------------+--------------+------------------+---------------------------------+---------------+-----------------+-----------------------+---------------------+-----------------------+----------------------------+--------------+--------------+-------------+--------------+-----------+------------------+------------------+------------------+------------------+------------------+
|CO_IES|NO_IES                             |TP_CATEGORIA_ADMINISTRATIVA|TP_ORGANIZACAO_ACADEMICA|CO_CURSO|NO_CURSO|CO_REGIAO|CO_UF|CO_MUNICIPIO|TP_GRAU_ACADEMICO|TP_MODALIDADE_ENSINO|CO_CINE_ROTULO|NO_CINE_ROTULO|CO_CINE_AREA_GERAL|NO_CINE_AREA_GERAL               |NU_ANO_INGRESSO|NU_ANO_REFERENCIA|NU_PRAZO_INTEGRALIZACAO|NU_ANO_INTEGRALIZACAO|NU_PRAZO_ACOMPANHAMENTO|NU_ANO_MAXIMO_ACOMPANHAMENTO|QT_INGRESSANTE|QT_PERMANENCIA|QT_CO

In [15]:
# Count the rows read back from the Parquet dataset and report the total.
total_registros = dfIndicador.count()
print("Total de registros Carregados no Arquivo Parquet:", total_registros)

Total de registros Carregados no Arquivo Parquet: 259230
