## Libs

In [36]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import time
import pyarrow as pa

## Data Ingestion (using web scraping)

In [None]:
# Fetch the dataset index page and parse it so the download links can be
# extracted from its anchors.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed as the index.
response = requests.get('https://nijianmo.github.io/amazon/index.html', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Directory that receives the downloaded files; a no-op when it already exists.
os.makedirs("amazon_reviews", exist_ok=True)

In [None]:
# Walk every anchor of the parsed index page and download the
# `Electronics_5.json.gz` file into the `amazon_reviews` directory.
for link in soup.find_all('a'):
    href = link.get('href')

    # Skip anchors without a target and anything that is not a gzipped JSON file.
    if not href or not href.endswith(".json.gz"):
        continue

    file_name = href.split('/')[-1]
    if file_name != 'Electronics_5.json.gz':
        continue

    output_path = os.path.join("amazon_reviews", file_name)

    # NOTE(review): verify=False disables TLS certificate verification, so the
    # download is not protected against tampering. It is kept as in the
    # original; confirm whether the host's certificate validates and drop it.
    with requests.get(href, stream=True, verify=False, timeout=60) as r:
        # Fail loudly instead of saving an HTTP error body as a .json.gz file.
        r.raise_for_status()

        with open(output_path, 'wb') as f:
            # Stream in 1 MiB pieces so the file never has to fit in memory.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print(f"{file_name} salvo com sucesso!\n")

## Reading the dataset in chunks (as DataFrames)

In [32]:
# Open the gzipped JSON Lines file lazily: because `chunksize` is set,
# pandas returns an iterator that yields DataFrames of up to 100,000 rows
# rather than loading the whole file at once.
chunks = pd.read_json(
    "amazon_reviews/Electronics_5.json.gz",
    chunksize=100_000,
    compression="gzip",
    lines=True,
)

## Saving to a Delta table

In [42]:
# Filesystem location of the Delta table written by the loop below.
path = "delta_tables/amazon_electronics"

# True when something already exists at `path`; the write loop uses this flag
# to choose between appending and overwriting on its first write.
tabela_criada = os.path.exists(path)

In [None]:
# Columns kept in the Delta table, mapped to the pandas dtype each one is
# cast to before writing.
colunas_desejadas = {
    'image': 'string',
    'overall': 'float64',
    'vote': 'string',
    'verified': 'boolean',
    'reviewTime': 'string',
    'reviewerID': 'string',
    'asin': 'string',
    'reviewerName': 'string',
    'reviewText': 'string',
    'summary': 'string',
    'unixReviewTime': 'int64',
    'style': 'object',
}

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()

for i, chunk in enumerate(chunks):
    print(f"Escrevendo chunk {i}...")

    # Keep only the wanted columns, in a fixed order; a column missing from
    # this chunk is created and filled with missing values.
    df_chunk = chunk.reindex(columns=list(colunas_desejadas))

    for col, tipo in colunas_desejadas.items():
        try:
            df_chunk[col] = df_chunk[col].astype(tipo)
        except (ValueError, TypeError) as exc:
            # Best-effort cast, as before: the column keeps its current dtype,
            # but the failure is now reported instead of silently ignored, so
            # dtype drift between chunks is visible.
            print(
                f"Aviso: não foi possível converter a coluna '{col}' "
                f"para {tipo} no chunk {i}: {exc}"
            )

    # The first write overwrites unless something already existed at `path`;
    # every later chunk is appended.
    write_deltalake(
        path,
        df_chunk,
        mode="append" if tabela_criada else "overwrite",
    )

    tabela_criada = True

end = time.perf_counter()
print(f"Delta com Pandas escrita em {end - start:.2f} segundos 💽")
