# In [1]:
import requests
from google.cloud import storage
import csv
import json
import os
import re
import chardet

def strip_tags(html_text):
    """Return *html_text* with every ``<...>`` span removed.

    A span starts at ``<`` and ends at the next ``>``; the text between
    spans is kept unchanged. A ``<`` with no following ``>`` is left as is.
    """
    tag_pattern = re.compile(r"<[^>]*>")
    return tag_pattern.sub("", html_text)

def extract_data_from_link(link, timeout=30):
    """Fetch *link* and return the records found under ``_embedded.doacoes``.

    The response body is decoded by trying a short list of encodings in
    order, parsed as JSON, and the ``campo_de_texto`` field of each record
    (when present and a string) has its HTML tags removed via ``strip_tags``.

    Args:
        link: URL to request.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the run indefinitely.

    Returns:
        The list of records, or ``[]`` when the request fails, the body is
        not valid JSON, or the expected ``_embedded``/``doacoes`` structure
        is missing. A message is printed in each failure case.
    """
    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Raise for 4xx/5xx responses
        raw_body = response.content
    except requests.exceptions.RequestException as e:
        print(f"Error during API request for link {link}: {e}")
        return []

    # Encodings to try, most likely first. iso-8859-1 accepts any byte
    # sequence, so in practice decoding always succeeds by the second entry;
    # the order is kept as it was to preserve the decoded output.
    encodings_to_try = ("utf-8", "iso-8859-1", "windows-1252")
    text = None
    for encoding in encodings_to_try:
        try:
            text = raw_body.decode(encoding)
            break
        except UnicodeDecodeError:
            continue
    if text is None:
        print(f"Could not decode response body for link {link}")
        return []

    try:
        data = json.loads(text)["_embedded"]["doacoes"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print(f"Unexpected response payload for link {link}: {e!r}")
        return []

    if not isinstance(data, list):
        print(f"Unexpected response payload for link {link}: 'doacoes' is not a list")
        return []

    # Strip HTML markup from the free-text field; skip records where the
    # field is absent or not a string (e.g. null in the JSON).
    for item in data:
        if isinstance(item, dict) and isinstance(item.get("campo_de_texto"), str):
            item["campo_de_texto"] = strip_tags(item["campo_de_texto"])

    return data

def upload_to_google_cloud_storage(bucket_name, data, filename, key_path):
    """Upload *data* to a bucket as newline-separated JSON objects.

    Args:
        bucket_name: Name of the destination bucket.
        data: Sequence of JSON-serialisable objects; each becomes one line.
        filename: Name of the blob to write inside the bucket.
        key_path: Path to the service-account JSON key used to build the client.

    Returns:
        ``True`` when the upload completed, ``False`` if any exception was
        raised (the error is printed, not re-raised).
    """
    try:
        gcs_client = storage.Client.from_service_account_json(key_path)
        target_blob = gcs_client.bucket(bucket_name).blob(filename)

        # One JSON document per line; non-ASCII characters are kept as-is.
        serialized_records = [json.dumps(record, ensure_ascii=False) for record in data]
        payload = "\n".join(serialized_records).encode("utf-8")

        # The bytes are sent already encoded as UTF-8.
        target_blob.upload_from_string(
            payload,
            content_type="application/json; charset=utf-8",
        )
        print(f"Data from {len(data)} links uploaded to gs://{bucket_name}/{filename}")
    except Exception as upload_error:
        print(f"Error during data upload: {upload_error}")
        return False
    else:
        return True

# Script entry point: read one link per CSV row, fetch the records behind
# each link, and upload everything to a single object in the bucket.
if __name__ == "__main__":
    input_csv_path = "/content/doacoes.csv"  # Replace with the path to the CSV containing the links
    bucket_name = "salic_dados"  # Replace with the name of your bucket
    filename = "doacoes.json"  # Name of the file in the bucket
    key_path = "/content/salic-web-37918cc40c59.json"  # Path to the JSON access key

    all_data = []
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise become part of the first link.
    with open(input_csv_path, "r", newline="", encoding="utf-8-sig") as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines yield an empty row; skip them instead of failing on row[0].
            if not row or not row[0].strip():
                continue
            link = row[0].strip()
            data = extract_data_from_link(link)
            all_data.extend(data)

    if not all_data:
        print("No data was extracted; nothing to upload.")
    # Send all records to Google Cloud Storage as a single file.
    elif upload_to_google_cloud_storage(bucket_name, all_data, filename, key_path):
        print("Data uploaded successfully.")
        # Only claim completion when the upload actually succeeded.
        print("All data has been extracted and uploaded.")
    else:
        print("Data upload failed. Check the error messages for more information.")

# Notebook cell output: FileNotFoundError: ignored