In [None]:
import requests
from google.cloud import storage
import json
import os
import re
import chardet

def strip_tags(html_text):
    clean_text = re.sub(r"<[^>]*>", "", html_text)
    return clean_text

def save_offset_to_file(offset, filename):
    with open(filename, "w") as file:
        file.write(str(offset))

def load_offset_from_file(filename):
    if not os.path.exists(filename):
        return 0

    with open(filename, "r") as file:
        return int(file.read().strip())

def extract_data_from_api(api_url, offset, limit=100):
    try:
        url = f"{api_url}&offset={offset}&limit={limit}"
        response = requests.get(url)
        response.raise_for_status()  # Verificar se a solicitação foi bem-sucedida

        # Determinar a codificação correta usando o módulo chardet
        encoding = chardet.detect(response.content)["encoding"]
        projects = json.loads(response.content.decode(encoding))["_embedded"]["projetos"]

        # Limpar o conteúdo dos objetos
        for project in projects:
            if "campo_de_texto" in project:
                project["campo_de_texto"] = strip_tags(project["campo_de_texto"])

        return projects
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")
        return None

def upload_to_google_cloud_storage(bucket_name, data, filename, key_path):
    try:
        client = storage.Client.from_service_account_json(key_path)
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(filename)

        # Preparar os dados como linhas separadas no arquivo JSON
        data_json_lines = "\n".join(json.dumps(obj, ensure_ascii=False) for obj in data)

        # Fazer o upload dos dados no formato UTF-8
        blob.upload_from_string(data_json_lines.encode("utf-8"), content_type="application/json; charset=utf-8")
        print(f"Data uploaded to gs://{bucket_name}/{filename}")
        return True
    except Exception as e:
        print(f"Error during data upload: {e}")
        return False

if __name__ == "__main__":
    api_url = "http://api.salic.cultura.gov.br/v1/propostas/?format=json"  # Obter todos os projetos
    bucket_name = "salic_dados"  # Substitua pelo nome do seu bucket
    filename = "propostas.json"  # Nome do arquivo no bucket
    key_path = "/content/salic-web-37918cc40c59.json"  # Caminho para a chave de acesso JSON
    offset_file = "last_offset.txt"  # Nome do arquivo para armazenar o último offset

    last_offset = load_offset_from_file(offset_file)
    projects_limit_per_batch = 100  # Definindo o limite de projetos por lote (100 linhas)
    # total_data_limit = 500  # Limite total de linhas a serem extraídas

    all_data = []
    # while len(all_data) < total_data_limit:
    while True:  # Removido o limite de dados no loop
        data = extract_data_from_api(api_url, last_offset, limit=projects_limit_per_batch)
        if not data:
            break

        all_data.extend(data)
        last_offset += projects_limit_per_batch
        save_offset_to_file(last_offset, offset_file)

    # Limitar os dados ao total_data_limit, se necessário
    all_data = all_data[:total_data_limit]

    if all_data:
        # Enviar todos os dados (incluindo os existentes) para o Google Cloud Storage em um único arquivo JSON
        if upload_to_google_cloud_storage(bucket_name, all_data, filename, key_path):
            print("Data uploaded successfully.")
        else:
            print("Data upload failed. Check the error messages for more information.")

    print("All data has been extracted and uploaded.")

Error during API request: 405 Client Error: METHOD NOT ALLOWED for url: http://api.salic.cultura.gov.br/v1/projetos/?format=json&offset=0&limit=1000
All data has been extracted and uploaded.


In [1]:
import requests
from google.cloud import storage
import json
import os
import re
import chardet

def strip_tags(html_text):
    clean_text = re.sub(r"<[^>]*>", "", html_text)
    return clean_text

def save_offset_to_file(offset, filename):
    with open(filename, "w") as file:
        file.write(str(offset))

def load_offset_from_file(filename):
    if not os.path.exists(filename):
        return 0

    with open(filename, "r") as file:
        return int(file.read().strip())

def extract_data_from_api(api_url, offset, limit=100):
    try:
        url = f"{api_url}&offset={offset}&limit={limit}"
        response = requests.get(url)
        response.raise_for_status()  # Verificar se a solicitação foi bem-sucedida

        # Codificações a serem tentadas em ordem de probabilidade
        encodings_to_try = ["utf-8", "iso-8859-1", "windows-1252"]

        for encoding in encodings_to_try:
            try:
                projects = json.loads(response.content.decode(encoding))["_embedded"]["propostas"]
                break
            except UnicodeDecodeError:
                continue

        # Limpar o conteúdo dos objetos
        for project in projects:
            if "campo_de_texto" in project:
                project["campo_de_texto"] = strip_tags(project["campo_de_texto"])

        return projects
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")
        return None

def upload_to_google_cloud_storage(bucket_name, data, filename, key_path):
    try:
        client = storage.Client.from_service_account_json(key_path)
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(filename)

        # Preparar os dados como linhas separadas no arquivo JSON
        data_json_lines = "\n".join(json.dumps(obj, ensure_ascii=False) for obj in data)

        # Fazer o upload dos dados no formato UTF-8
        blob.upload_from_string(data_json_lines.encode("utf-8"), content_type="application/json; charset=utf-8")
        print(f"Data uploaded to gs://{bucket_name}/{filename}")
        return True
    except Exception as e:
        print(f"Error during data upload: {e}")
        return False

if __name__ == "__main__":
    api_url = "http://api.salic.cultura.gov.br/v1/propostas/?format=json"  # Obter todos os projetos
    bucket_name = "salic_dados"  # Substitua pelo nome do seu bucket
    filename = "propostas.json"  # Nome do arquivo no bucket
    key_path = "/content/salic-web-37918cc40c59.json"  # Caminho para a chave de acesso JSON
    offset_file = "last_offset.txt"  # Nome do arquivo para armazenar o último offset

    last_offset = load_offset_from_file(offset_file)
    projects_limit_per_batch = 100  # Definindo o limite de projetos por lote (100 linhas)
    # total_data_limit = 500  # Limite total de linhas a serem extraídas

    all_data = []
    # while len(all_data) < total_data_limit:
    while True:  # Removido o limite de dados no loop
        data = extract_data_from_api(api_url, last_offset, limit=projects_limit_per_batch)
        if not data:
            break

        all_data.extend(data)
        last_offset += projects_limit_per_batch
        save_offset_to_file(last_offset, offset_file)

    # Limitar os dados ao total_data_limit, se necessário
    # all_data = all_data[:total_data_limit]

    if all_data:
        # Enviar todos os dados (incluindo os existentes) para o Google Cloud Storage em um único arquivo JSON
        if upload_to_google_cloud_storage(bucket_name, all_data, filename, key_path):
            print("Data uploaded successfully.")
        else:
            print("Data upload failed. Check the error messages for more information.")

    print("All data has been extracted and uploaded.")

Error during API request: 404 Client Error: NOT FOUND for url: http://api.salic.cultura.gov.br/v1/propostas/?format=json&offset=324500&limit=100
Data uploaded to gs://salic_dados/propostas.json
Data uploaded successfully.
All data has been extracted and uploaded.
