# 🥇 Camada Gold — Análise Exploratória dos Dados Agregados
Exploração da camada analítica: contagem de breweries por tipo e localização.

In [1]:
import pandas as pd
import os

# Path to the Gold-layer aggregate; it is relative, so it resolves against
# the directory the notebook is run from.
GOLD_PATH = '../data/gold/breweries_aggregated.parquet'

# Load the aggregated table using the pyarrow parquet engine.
df = pd.read_parquet(GOLD_PATH, engine='pyarrow')

# Quick overview: dimensions, column names, the overall brewery total, and
# the number of distinct values in each grouping column.
# The labels stay in Portuguese; their accented characters had been
# mis-decoded (mojibake) and are restored here so the printed text is readable.
print(f'Shape: {df.shape}')
print(f'Colunas: {df.columns.tolist()}')
print(f'\nTotal de breweries: {df["brewery_count"].sum()}')
print(f'Países únicos: {df["country"].nunique()}')
print(f'Estados únicos: {df["state"].nunique()}')
print(f'Tipos únicos: {df["brewery_type"].nunique()}')

Shape: (447, 4)
Colunas: ['brewery_type', 'country', 'state', 'brewery_count']

Total de breweries: 9110
Países únicos: 19
Estados únicos: 136
Tipos únicos: 14


In [2]:
# Show the whole aggregated table; as the last expression of the cell,
# the notebook renders it as the cell output.
df

Unnamed: 0,brewery_type,country,state,brewery_count
0,micro,Australia,ACT,7
1,large,Australia,NSW,3
2,micro,Australia,NSW,141
3,micro,Australia,NT,3
4,large,Australia,QLD,2
...,...,...,...,...
442,brewpub,United States,Wyoming,16
443,closed,United States,Wyoming,6
444,micro,United States,Wyoming,24
445,regional,United States,Wyoming,1


In [3]:
# Total breweries per type: sum brewery_count within each brewery_type,
# then rank the types from the largest total to the smallest.
type_totals = df.groupby('brewery_type')['brewery_count'].sum()
by_type = type_totals.sort_values(ascending=False)

# Header and table are emitted on separate lines, index name included.
print(
    '=== Total de breweries por tipo ===',
    by_type.to_string(),
    sep='\n',
)

=== Total de breweries por tipo ===
brewery_type
micro         4845
brewpub       2574
planning       646
closed         348
regional       222
contract       183
large          112
proprietor      65
taproom         45
bar             37
nano            22
cidery           7
beergarden       3
location         1


In [4]:
# Total breweries per country: sum brewery_count within each country and
# order the result from the largest total to the smallest.
by_country = df.groupby('country')['brewery_count'].sum().sort_values(ascending=False)

# The header keeps its Portuguese wording; the accented character in
# "país" had been mis-decoded (mojibake) and is restored here.
print('=== Total de breweries por país ===')
print(by_country.to_string())

=== Total de breweries por país ===
country
United States    8034
Australia         514
Canada            119
South Africa      104
Ireland            70
England            62
South Korea        61
Poland             34
Singapore          33
Austria            15
Portugal           14
Scotland           10
Germany            10
Sweden             10
Japan              10
Italy               4
France              3
Isle of Man         2
Ukraine             1


In [5]:
# Top 15 states by number of breweries.
# NOTE(review): the grouping key is `state` alone, so two countries that
# share a state name would be summed together -- confirm this is intended.
state_totals = df.groupby('state')['brewery_count'].sum()
ranked_states = state_totals.sort_values(ascending=False)
by_state = ranked_states.head(15)

# Header and table are emitted on separate lines, index name included.
print(
    '=== Top 15 estados com mais breweries ===',
    by_state.to_string(),
    sep='\n',
)

=== Top 15 estados com mais breweries ===
state
California        919
Washington        498
Colorado          449
New York          419
Michigan          375
Texas             352
Pennsylvania      345
Florida           312
North Carolina    311
Ohio              303
Oregon            295
Illinois          257
Virginia          255
Wisconsin         225
Minnesota         183


In [6]:
# Cross-tabulation of brewery type (rows) by country (columns).
# Each cell is the summed brewery_count for that pair; combinations that do
# not occur in the data are filled with 0 instead of NaN.
pivot = df.pivot_table(
    values='brewery_count',
    index='brewery_type',
    columns='country',
    aggfunc='sum',
    fill_value=0,
)

# The header keeps its Portuguese wording; the accented character in
# "país" had been mis-decoded (mojibake) and is restored here.
print('=== Breweries por tipo x país ===')

# Last expression of the cell: the notebook renders the pivot table.
pivot

=== Breweries por tipo x país ===


country,Australia,Austria,Canada,England,France,Germany,Ireland,Isle of Man,Italy,Japan,Poland,Portugal,Scotland,Singapore,South Africa,South Korea,Sweden,Ukraine,United States
brewery_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
bar,0,2,0,0,0,5,0,0,0,0,0,0,0,28,0,0,0,0,2
beergarden,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0
brewpub,0,1,44,10,0,0,9,0,0,6,11,7,0,4,44,56,0,0,2382
cidery,0,0,6,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
closed,0,0,12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,335
contract,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,183
large,22,10,0,1,0,2,3,0,1,0,0,0,0,1,0,0,0,0,72
location,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
micro,491,0,54,40,3,0,49,2,1,4,20,7,9,0,59,5,10,1,4090
nano,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,19


In [8]:
# Data quality check on the aggregated table: count missing values, count
# groups whose brewery_count is zero, and report the minimum, maximum and
# mean of brewery_count.
# The labels stay in Portuguese; the accented characters in "mínimo",
# "máximo" and "médio" had been mis-decoded (mojibake) and are restored here.
print('=== Data Quality Check ===')
# isnull().sum() counts nulls per column; the second sum() totals them.
print(f'Valores nulos: {df.isnull().sum().sum()}')
print(f'Grupos com brewery_count = 0: {(df["brewery_count"] == 0).sum()}')
print(f'brewery_count mínimo: {df["brewery_count"].min()}')
print(f'brewery_count máximo: {df["brewery_count"].max()}')
print(f'brewery_count médio: {df["brewery_count"].mean():.2f}')

=== Data Quality Check ===
Valores nulos: 0
Grupos com brewery_count = 0: 0
brewery_count mínimo: 1
brewery_count máximo: 466
brewery_count médio: 20.38
