# Data Inspection, Cleaning, and EDA
Dataset: Open Brewery DB (collected via `scripts/collect.py`).

Deliverables covered:
- Data collection process
- Dataset overview
- Inspection summary and issues
- Before vs after cleaning comparison
- EDA insights and visuals


In [None]:
%config InlineBackend.figure_format = 'retina'
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from src.cleaning import missing_table, clean_breweries
from src.eda import plot_missing, plot_numeric_distributions, plot_categoricals, plot_correlations

# Locations of the raw, interim and processed artefacts. These are relative
# paths, so they are resolved against the current working directory.
_DATA_DIR = Path('..') / 'data'
RAW_PATH = _DATA_DIR / 'raw' / 'breweries.json'
INTERIM_PATH = _DATA_DIR / 'interim' / 'breweries_inspected.parquet'
CLEAN_PATH = _DATA_DIR / 'processed' / 'breweries_clean.parquet'


## 1. Load raw data

In [None]:
# Load the raw brewery records into a DataFrame.
# Fail early with an actionable message when the file is absent, instead of
# surfacing an opaque reader error. RAW_PATH is relative, so the check also
# catches the notebook being run from an unexpected working directory.
if not RAW_PATH.is_file():
    raise FileNotFoundError(
        f"Raw dataset not found at {RAW_PATH.resolve()}. "
        "Collect it first (see scripts/collect.py) and make sure the notebook "
        "is run from the directory the relative data paths expect."
    )

# NOTE(review): pd.read_json infers dtypes by default, so numeric-looking
# string fields may be coerced to numbers — confirm this is acceptable for
# identifier-like columns.
raw = pd.read_json(RAW_PATH)

# Preview the first rows.
raw.head()

## 2. Dataset overview

In [None]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
raw.info()

In [None]:
# Summary statistics, one row per column: numeric columns first, then
# object-dtype (text) columns.
numeric_part = raw.select_dtypes(include='number')
object_part = raw.select_dtypes(include='object')
desc_num = numeric_part.describe().transpose()
desc_cat = object_part.describe().transpose()

# Show the full numeric summary alongside the first rows of the text summary.
(desc_num, desc_cat.head())

## 3. Inspection: missingness, duplicates, outliers, invalid values

In [None]:
# Build the missingness summary with the project helper from src.cleaning.
# NOTE(review): the helper's output layout and ordering are defined in
# src/cleaning.py and are not visible here — confirm before relying on them.
missing_tbl = missing_table(raw)
# Show the first ten rows of the summary.
missing_tbl.head(10)

In [None]:
# Count duplicated records two ways: rows identical across every column, and
# rows that repeat on the identifying columns only.
identity_cols = ['id', 'name', 'street', 'city', 'state']
full_row_dupes = raw.duplicated()
identity_dupes = raw.duplicated(subset=identity_cols)
dup_count = full_row_dupes.sum()
dup_subset = identity_dupes.sum()
(dup_count, dup_subset)

In [None]:
# Draw the missingness figure for the raw data using the project helper from
# src.eda. NOTE(review): the chart type is defined in src/eda.py — not visible here.
plot_missing(raw)
# Render the current matplotlib figure(s).
plt.show()

## 4. Cleaning

In [None]:
# Apply the project's cleaning routine (src.cleaning.clean_breweries) to the
# raw frame. NOTE(review): the individual cleaning steps live in
# src/cleaning.py and are not visible from this notebook.
clean = clean_breweries(raw)
# Preview the first rows of the cleaned result.
clean.head()

In [None]:
# Persist both snapshots as Parquet (without the index), creating the target
# folders first if they do not exist yet.
# NOTE(review): the interim file is written from the unmodified `raw` frame.
outputs = [(INTERIM_PATH, raw), (CLEAN_PATH, clean)]
for target, _ in outputs:
    target.parent.mkdir(parents=True, exist_ok=True)
for target, frame in outputs:
    frame.to_parquet(target, index=False)

# Row counts before and after cleaning.
(len(raw), len(clean))

## 5. Before vs After comparison

In [None]:
# Numeric summary statistics for the raw and cleaned frames, placed side by
# side under top-level column labels 'raw' and 'clean'.
numeric_stats = {
    label: frame.select_dtypes(include='number').describe().T
    for label, frame in (('raw', raw), ('clean', clean))
}
stats_before = numeric_stats['raw']
stats_after = numeric_stats['clean']
compare = pd.concat(numeric_stats, axis=1)
compare.head()

In [None]:
# Percentage of missing values per column, before and after cleaning,
# ordered so the columns with the most missing raw data come first.
raw_missing_pct = raw.isna().mean() * 100
clean_missing_pct = clean.isna().mean() * 100
missing_compare = pd.DataFrame({
    'raw_missing_pct': raw_missing_pct,
    'clean_missing_pct': clean_missing_pct,
})
missing_compare = missing_compare.sort_values('raw_missing_pct', ascending=False)
missing_compare.head(10)

## 6. EDA

In [None]:
# Distribution plots for the coordinate columns of the cleaned data, drawn by
# the project helper from src.eda.
coordinate_cols = ['latitude', 'longitude']
plot_numeric_distributions(clean, num_cols=coordinate_cols)
plt.show()

In [None]:
# Categorical plots for the cleaned data, drawn by the project helper from
# src.eda with top_n=15.
categorical_cols = ['brewery_type', 'state', 'country']
max_categories = 15
plot_categoricals(clean, cat_cols=categorical_cols, top_n=max_categories)
plt.show()

In [None]:
# Draw the correlation figure for the cleaned data using the project helper
# from src.eda. NOTE(review): which columns and which correlation method are
# used is decided inside src/eda.py — not visible here.
plot_correlations(clean)
# Render the current matplotlib figure(s).
plt.show()

### Insights (to be filled in after running the notebook)
- Bullet 1
- Bullet 2
- Bullet 3

### Next steps
- Potential modeling or further data enrichment ideas.
