In [5]:
import datetime
import json
import os
import time
from io import StringIO

import pandas as pd
from fastapi import HTTPException
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [115]:
# First year requested from the site.
anoInicio = 1970
# Used as an exclusive upper bound: every loop in this notebook iterates
# range(anoInicio, anoTermino), so the current calendar year itself is
# neither scraped nor converted.
anoTermino = datetime.date.today().year

In [43]:
def save_html_content(name:str, content:str, ano:int = None, subopt_name:str = None):
    """Write ``content`` to an HTML file under ``../arquivos``.

    The file name depends on which optional arguments are supplied:

    * ``{name}.html`` when neither ``ano`` nor ``subopt_name`` is given;
    * ``{name}_{ano}.html`` when only ``ano`` is given;
    * ``{name}_{subopt_name}_{ano}.html`` when both are given.

    Args:
        name: Base name of the file (no extension).
        content: Text written to the file, encoded as UTF-8.
        ano: Optional year appended to the file name.
        subopt_name: Optional sub-option label; only valid together with ``ano``.

    Raises:
        ValueError: If ``subopt_name`` is given without ``ano``. There is no
            file-name pattern for that combination.
    """
    if ano is None and subopt_name is not None:
        raise ValueError("subopt_name can only be used together with ano")

    if ano is None:
        arquivo = f"../arquivos/{name}.html"
    elif subopt_name is None:
        arquivo = f"../arquivos/{name}_{ano}.html"
    else:
        arquivo = f"../arquivos/{name}_{subopt_name}_{ano}.html"

    # Create the target folder on first use so a fresh checkout does not fail
    # with FileNotFoundError on the very first page.
    os.makedirs(os.path.dirname(arquivo), exist_ok=True)

    with open(arquivo, "w", encoding='utf-8') as file:
        file.write(content)

In [54]:
def read_html_content(name:str, ano:int = None, subopt_name:str = None):
    """Return the text of an HTML file stored under ``../arquivos``.

    The file name is resolved from the arguments:

    * ``{name}.html`` when neither ``ano`` nor ``subopt_name`` is given;
    * ``{name}_{ano}.html`` when only ``ano`` is given;
    * ``{name}_{subopt_name}_{ano}.html`` when both are given.

    Args:
        name: Base name of the file (no extension).
        ano: Optional year that is part of the file name.
        subopt_name: Optional sub-option label; only valid together with ``ano``.

    Returns:
        The file contents as a ``str`` (decoded as UTF-8).

    Raises:
        ValueError: If ``subopt_name`` is given without ``ano``.
        FileNotFoundError: If the resolved file does not exist.
    """
    if ano is None and subopt_name is not None:
        raise ValueError("subopt_name can only be used together with ano")

    if ano is None:
        arquivo = f"../arquivos/{name}.html"
    elif subopt_name is None:
        arquivo = f"../arquivos/{name}_{ano}.html"
    else:
        arquivo = f"../arquivos/{name}_{subopt_name}_{ano}.html"

    with open(arquivo, "r", encoding='utf-8') as file:
        # Read inside the ``with`` block: returning the file object itself
        # would hand the caller an already-closed handle.
        return file.read()

In [19]:
options = Options()
# Run the browser without a UI
options.add_argument("--headless")
# Disable the sandbox
options.add_argument("--no-sandbox")
# Disable the use of /dev/shm
options.add_argument("--disable-dev-shm-usage")

# Start Chrome; this module-level `driver` is the one Scrap() uses.
driver = webdriver.Chrome(options=options)

In [23]:
def getContent(driver:"webdriver.Chrome", url:str, wait_seconds:float = 2) -> str:
    """Load ``url`` in the browser and return the resulting page source.

    Args:
        driver: Selenium driver used to open the page. Only ``get`` and
            ``page_source`` are used.
        url: Address to load.
        wait_seconds: Pause between issuing the request and reading the page
            source. Defaults to 2 seconds; presumably this gives the page time
            to finish loading. Pass 0 to skip the pause.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url=url)
    # time.sleep rejects negative values, so only sleep for a positive delay.
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.page_source

In [47]:
def Scrap(opt:int, opt_name:str, anoInicio:int = None, anoTermino:int = None, subopts=None):
    """Download pages of one site option and store them as HTML files.

    Three modes are supported, selected by the optional arguments:

    * no ``anoInicio`` and no ``subopts``: one page
      (``?opcao=opt_0{opt}``), saved as ``{opt_name}.html``;
    * ``anoInicio``/``anoTermino`` without ``subopts``: one page per year
      (``?ano={ano}&opcao=opt_0{opt}``), saved as ``{opt_name}_{ano}.html``;
    * ``anoInicio``/``anoTermino`` with ``subopts``: one page per sub-option
      and year (``...&subopcao=subopt_0{value}``), saved as
      ``{opt_name}_{subopt_name}_{ano}.html``.

    Pages are fetched with the module-level ``driver`` through ``getContent``
    and written with ``save_html_content``.

    Args:
        opt: Option number placed in the ``opcao=opt_0N`` query parameter.
        opt_name: Base name for the saved files.
        anoInicio: First year to fetch (inclusive).
        anoTermino: End of the year range (exclusive, as in ``range``).
        subopts: Mapping of sub-option label (used in the file name) to the
            number placed in the ``subopcao=subopt_0N`` query parameter.

    Raises:
        ValueError: If ``subopts`` is given without ``anoInicio`` (previously
            this silently did nothing), or if ``anoInicio`` is given without
            ``anoTermino``.
    """
    base_url = "http://vitibrasil.cnpuv.embrapa.br/index.php"

    if anoInicio is None:
        if subopts is not None:
            raise ValueError("subopts requires anoInicio and anoTermino")
        url = f"{base_url}?opcao=opt_0{opt}"
        html_content = getContent(driver=driver, url=url)
        save_html_content(opt_name, html_content)
        return

    if anoTermino is None:
        raise ValueError("anoTermino is required when anoInicio is given")

    if subopts is None:
        print(f"inicio:{anoInicio} - termino{anoTermino}")
        for ano in range(anoInicio, anoTermino):
            print(f"ano->{ano}")
            url = f"{base_url}?ano={ano}&opcao=opt_0{opt}"
            html_content = getContent(driver=driver, url=url)
            save_html_content(opt_name, html_content, ano=ano)
        return

    for subopt_name, subopt_value in subopts.items():
        for ano in range(anoInicio, anoTermino):
            url = f"{base_url}?ano={ano}&opcao=opt_0{opt}&subopcao=subopt_0{subopt_value}"
            html_content = getContent(driver=driver, url=url)
            save_html_content(opt_name, html_content, ano=ano, subopt_name=subopt_name)

In [32]:
# Option 1 is fetched without year or sub-option: a single page saved as
# ../arquivos/apresentacao.html.
Scrap(opt=1, opt_name="apresentacao")

In [37]:
# Option 2 is fetched once per year in range(anoInicio, anoTermino); each page
# is saved as ../arquivos/producao_{ano}.html.
Scrap(opt=2, opt_name="producao", anoInicio=anoInicio, anoTermino=anoTermino)

inicio:1970 - termino1973
ano->1970
ano->1971
ano->1972


In [49]:
# Sub-options of option 3: the key becomes part of the saved file name and the
# value fills the subopcao=subopt_0N query parameter.
subopts = {
    "Viníferas": "1",
    "Americanas e híbridas": "2",
    "Uvas de mesa": "3",
    "Sem classificação": "4",
}

Scrap(
    opt=3,
    opt_name="processamento",
    anoInicio=anoInicio,
    anoTermino=anoTermino,
    subopts=subopts,
)

In [50]:
# Option 4 is fetched once per year in range(anoInicio, anoTermino); each page
# is saved as ../arquivos/comercializacao_{ano}.html.
Scrap(opt=4, opt_name="comercializacao", anoInicio=anoInicio, anoTermino=anoTermino)

inicio:1970 - termino1973
ano->1970
ano->1971
ano->1972


In [51]:
# Sub-options of option 5: the key becomes part of the saved file name and the
# value fills the subopcao=subopt_0N query parameter.
subopts = {
    "Vinhos de mesa": "1",
    "Espumantes": "2",
    "Uvas frescas": "3",
    "Uvas passas": "4",
    "Suco de uva": "5",
}
Scrap(
    opt=5,
    opt_name="importacao",
    anoInicio=anoInicio,
    anoTermino=anoTermino,
    subopts=subopts,
)

In [52]:
# Sub-options of option 6: the key becomes part of the saved file name and the
# value fills the subopcao=subopt_0N query parameter.
subopts = {
    "Vinhos de mesa": "1",
    "Espumantes": "2",
    "Uvas frescas": "3",
    "Suco de uva": "4",
}
Scrap(
    opt=6,
    opt_name="exportacao",
    anoInicio=anoInicio,
    anoTermino=anoTermino,
    subopts=subopts,
)

In [22]:
# Option 7 is fetched without year or sub-option: a single page saved as
# ../arquivos/publicacao.html.
Scrap(opt=7, opt_name="publicacao")

In [53]:
# Close the driver. Scrap() relies on the module-level `driver`, so a new
# driver must be created before any further scraping.
driver.quit()

In [130]:
def CreateParquetFile(name:str, ano:int, table_elements:list, subopt_name:str = None):
    """Convert HTML ``<table>`` elements to a single parquet file under ``../data``.

    Each element is serialised back to HTML, parsed with ``pandas.read_html``
    (first table only), and the resulting frames are concatenated row-wise
    with a fresh index. The output is written to ``{name}_{ano}.parquet``, or
    ``{name}_{subopt_name}_{ano}.parquet`` when ``subopt_name`` is given.

    Args:
        name: Base name of the output file.
        ano: Year that is part of the output file name.
        table_elements: lxml table elements, e.g. the list returned by
            ``tree.xpath(...)``.
        subopt_name: Optional sub-option label included in the file name.

    Raises:
        ValueError: If ``table_elements`` is empty (nothing to convert), or if
            ``pandas.read_html`` finds no table in an element.
    """
    if len(table_elements) == 0:
        # pd.concat would raise an unspecific "No objects to concatenate";
        # fail early with enough context to identify the offending page.
        raise ValueError(
            f"no tables to convert for name={name!r}, ano={ano!r}, subopt_name={subopt_name!r}"
        )

    # One DataFrame per table element
    dfs = []
    for table in table_elements:
        # Serialise the table element back to HTML text
        table_html = etree.tostring(table, pretty_print=True, encoding='unicode')

        # Parse the table HTML into a DataFrame with pandas
        dfs.append(pd.read_html(StringIO(table_html))[0])

    # Combine all DataFrames into a single one
    combined_df = pd.concat(dfs, ignore_index=True)

    if subopt_name is None:
        arquivo_parquet = f"../data/{name}_{ano}.parquet"
    else:
        arquivo_parquet = f"../data/{name}_{subopt_name}_{ano}.parquet"

    # Create the output folder on first use so the write does not fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(arquivo_parquet), exist_ok=True)

    combined_df.to_parquet(arquivo_parquet, engine='pyarrow')

In [129]:
def ProcessHtmlFileToParquet(name:str, ano:int, subopt_name:str = None):
    """Extract the data tables from a saved HTML page and store them as parquet.

    Reads ``../arquivos/{name}_{ano}.html`` (or
    ``../arquivos/{name}_{subopt_name}_{ano}.html`` when ``subopt_name`` is
    given), selects every ``<table>`` whose ``class`` attribute contains both
    ``tb_base`` and ``tb_dados``, and passes them to ``CreateParquetFile``.

    Args:
        name: Base name shared by the HTML input and the parquet output.
        ano: Year that is part of both file names.
        subopt_name: Optional sub-option label that is part of both file names.

    Raises:
        FileNotFoundError: If the HTML file does not exist.
    """
    if subopt_name is None:
        arquivo_html = f"../arquivos/{name}_{ano}.html"
    else:
        arquivo_html = f"../arquivos/{name}_{subopt_name}_{ano}.html"

    # Only the read needs the file handle; release it before parsing and
    # before the parquet conversion.
    with open(arquivo_html, "r", encoding="utf-8") as htm_file:
        html_content = htm_file.read()

    parser = etree.HTMLParser()
    tree = etree.fromstring(html_content, parser)
    table_elements = tree.xpath('//table[contains(@class, "tb_base") and contains(@class, "tb_dados")]')
    CreateParquetFile(name=name, ano=ano, table_elements=table_elements, subopt_name=subopt_name)
        

In [126]:
# Convert every saved producao_{ano}.html page to parquet, over the same
# year range that was scraped.
name = "producao"
for ano in range(anoInicio, anoTermino):
    ProcessHtmlFileToParquet(name=name, ano=ano)

In [131]:
# Convert every saved processamento_{subopt}_{ano}.html page to parquet.
# Only the dict keys are used here; they must match the labels used when
# the pages were scraped, since they are part of the file names.
name = "processamento"
subopts = {
    "Viníferas": "1",
    "Americanas e híbridas": "2",
    "Uvas de mesa": "3",
    "Sem classificação": "4",
}
for subopt_name, subopt_value in subopts.items():
    for ano in range(anoInicio, anoTermino):
        ProcessHtmlFileToParquet(
            name=name,
            ano=ano,
            subopt_name=subopt_name,
        )

In [132]:
# Convert every saved comercializacao_{ano}.html page to parquet, over the
# same year range that was scraped.
name = "comercializacao"
for ano in range(anoInicio, anoTermino):
    ProcessHtmlFileToParquet(name=name, ano=ano)

In [133]:
# Convert every saved importacao_{subopt}_{ano}.html page to parquet.
# Only the dict keys are used here; they must match the labels used when
# the pages were scraped, since they are part of the file names.
name = "importacao"
subopts = {
    "Vinhos de mesa": "1",
    "Espumantes": "2",
    "Uvas frescas": "3",
    "Uvas passas": "4",
    "Suco de uva": "5",
}
for subopt_name, subopt_value in subopts.items():
    for ano in range(anoInicio, anoTermino):
        ProcessHtmlFileToParquet(
            name=name,
            ano=ano,
            subopt_name=subopt_name,
        )

In [None]:
# Convert every saved exportacao_{subopt}_{ano}.html page to parquet.
# Only the dict keys are used here; they must match the labels used when
# the pages were scraped, since they are part of the file names.
name = "exportacao"
subopts = {
    "Vinhos de mesa": "1",
    "Espumantes": "2",
    "Uvas frescas": "3",
    "Suco de uva": "4",
}
for subopt_name, subopt_value in subopts.items():
    for ano in range(anoInicio, anoTermino):
        ProcessHtmlFileToParquet(
            name=name,
            ano=ano,
            subopt_name=subopt_name,
        )