# Testando o codigo extraction

In [1]:
#Importando as bibliotecas
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [22]:
#Criando os intervalos de tempo para o DataFrame
START_YEAR = 1970
END_YEAR = datetime.now().year - 1

In [3]:
# URLs base para cada tipo de tabela
URL_TEMPLATES = [
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_02",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_03&subopcao=subopt_01",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_03&subopcao=subopt_02",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_03&subopcao=subopt_03",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_03&subopcao=subopt_04",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_04",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_05&subopcao=subopt_01",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_05&subopcao=subopt_02",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_05&subopcao=subopt_03",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_05&subopcao=subopt_04",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_05&subopcao=subopt_05",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_06&subopcao=subopt_01",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_06&subopcao=subopt_02",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_06&subopcao=subopt_03",
    "http://vitibrasil.cnpuv.embrapa.br/index.php?ano={year}&opcao=opt_06&subopcao=subopt_04"
]

In [4]:
#Verifica a disponibilidade da url
def fetch_page_content(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.content

In [5]:
#Web scraping da tabela 
def parse_table_content(content):
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find('table', class_='tb_base tb_dados')
    headers = [header.text.strip() for header in table.find_all('th')]
    rows = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')[1:]
    ]
    return headers, rows

In [6]:
#Extrai a tabela para cada ano
def extract_table_data(url, year):
    try:
        content = fetch_page_content(url)
        headers, rows = parse_table_content(content)
        df = pd.DataFrame(rows, columns=headers)
        df['Ano'] = year
        return df
    except Exception as e:
        print(f"Erro ao extrair dados do ano {year}: {e}")
        return pd.DataFrame()

In [7]:
#Extrai todas as tabelas dentro do intervalo de anos definido
def extract_table_all_data(url_template, start_year, end_year):
    all_data = pd.DataFrame()
    for year in range(start_year, end_year + 1):
        url = url_template.format(year=year)
        year_data = extract_table_data(url, year)
        if not year_data.empty:
            all_data = pd.concat([all_data, year_data], ignore_index=True)
            print(f"Dados do ano {year} extraídos com sucesso.")
    return all_data

In [8]:
#Realiza o pivot do DataFrame e formatação dos dados
def pivot_dataframe(df, columns):
    if not all(col in df.columns for col in columns):
        raise ValueError("Uma ou mais colunas especificadas não estão presentes no DataFrame")
    for col in columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df_pivot = df.pivot_table(index='Ano', columns=columns[0], values=columns[1:], aggfunc='sum')
    df_pivot.columns = ['_'.join(col).strip() for col in df_pivot.columns.values]
    df_pivot.reset_index(inplace=True)
    return df_pivot

In [9]:
#Pega os headers da tabela, para ser usado como coluna no pivot_dataframe
def get_table_headers(url, end_year):
    content = fetch_page_content(url.format(year=end_year))
    headers, _ = parse_table_content(content)
    return headers

In [10]:
#Pega o titulo da tabela e cria o nome do arquivo CSV com o datetime do dia da extração
def get_filename_from_page(end_year, url):
    content = fetch_page_content(url.format(year=end_year))
    soup = BeautifulSoup(content, 'html.parser')
    p_element = soup.find('p', {'class': 'text_center'})
    if p_element:
        filename_base = p_element.text.strip().replace(f' [{end_year}]', '').replace(' ', '_').replace(',', '')
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'{filename_base}_{timestamp}.csv'
        return filename
    return None

In [26]:
#Função para salvar o DataFrame em um CSV
def save_to_csv(df, filename):
    df.to_csv(filename, index=False)

### Sequencia de execução das funções para gerar o DataFrame final

In [25]:
all_data = extract_table_all_data(URL_TEMPLATES[0], START_YEAR, END_YEAR)
columns = get_table_headers(URL_TEMPLATES[0], END_YEAR)
df_pivot = pivot_dataframe(all_data, columns)
df_pivot

Dados do ano 1970 extraídos com sucesso.
Dados do ano 1971 extraídos com sucesso.
Dados do ano 1972 extraídos com sucesso.
Dados do ano 1973 extraídos com sucesso.
Dados do ano 1974 extraídos com sucesso.
Dados do ano 1975 extraídos com sucesso.
Dados do ano 1976 extraídos com sucesso.
Dados do ano 1977 extraídos com sucesso.
Dados do ano 1978 extraídos com sucesso.
Dados do ano 1979 extraídos com sucesso.
Dados do ano 1980 extraídos com sucesso.
Dados do ano 1981 extraídos com sucesso.
Dados do ano 1982 extraídos com sucesso.
Dados do ano 1983 extraídos com sucesso.
Dados do ano 1984 extraídos com sucesso.
Dados do ano 1985 extraídos com sucesso.
Dados do ano 1986 extraídos com sucesso.
Dados do ano 1987 extraídos com sucesso.
Dados do ano 1988 extraídos com sucesso.
Dados do ano 1989 extraídos com sucesso.
Dados do ano 1990 extraídos com sucesso.
Dados do ano 1991 extraídos com sucesso.
Dados do ano 1992 extraídos com sucesso.
Dados do ano 1993 extraídos com sucesso.
Dados do ano 199

Unnamed: 0,Ano,Quantidade (L.)_Bagaceira,Quantidade (L.)_Base Champenoise champanha,Quantidade (L.)_Base Charmat champanha,Quantidade (L.)_Base espumante,Quantidade (L.)_Base espumante moscatel,Quantidade (L.)_Bebida de uva,Quantidade (L.)_Borra líquida,Quantidade (L.)_Borra seca,Quantidade (L.)_Branco,...,Quantidade (L.)_Tinto,Quantidade (L.)_Total,Quantidade (L.)_VINHO DE MESA,Quantidade (L.)_VINHO FINO DE MESA (VINIFERA),Quantidade (L.)_Vinagre,Quantidade (L.)_Vinho Composto,Quantidade (L.)_Vinho acidificado,Quantidade (L.)_Vinho leve,Quantidade (L.)_Vinho licoroso,Quantidade (L.)_Vinho orgânico
0,1970,961.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,748.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1971,913.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,21.5,0.0,0.0,0.0,0.0,0.0
2,1972,755.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.2,0.0,0.0,0.0,0.0,0.0
3,1973,661.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,243.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1974,424.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,35.3,0.0,0.0,0.0,0.0,0.0
5,1975,576.098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1976,705.912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,288.0,0.0,0.0,0.0,0.0,0.0
7,1977,828.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1978,698.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1979,938.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Execução para salvar todos os DataFrames extraidos a partir do Web Scraping

In [None]:
# Função principal para extrair dados de todos os URLs
def extract_and_save_all_data(url_templates, start_year, end_year):
    for url_template in url_templates:
        all_data = extract_table_all_data(url_template, start_year, end_year)
        if not all_data.empty:
            headers = get_table_headers(url_template, end_year)
            filename = get_filename_from_page(end_year, url_template)
            all_data = pivot_dataframe(all_data, headers)
            if filename:
                save_to_csv(all_data, filename)
                print(f"Dados salvos em {filename}")
# Executa a extração de dados
extract_and_save_all_data(URL_TEMPLATES, START_YEAR, END_YEAR)