# 01 - EDA Básica | Basic EDA

PT-BR: Este notebook demonstra uma Análise Exploratória de Dados (EDA) usando dados sintéticos, cobrindo estatísticas, distribuições, correlações, dados faltantes e outliers. Inclui exemplos de pré-processamento e gráficos com matplotlib/seaborn/plotly.

EN: This notebook demonstrates Exploratory Data Analysis (EDA) with synthetic data, covering statistics, distributions, correlations, missing values, and outliers. It includes preprocessing examples and matplotlib/seaborn/plotly plots.

In [None]:
"""
PT-BR: Imports e configuração inicial.
EN: Imports and initial setup.
"""
import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Make the repository's ``src`` directory importable when the notebook is
# started from a working directory that contains it.
ROOT = Path.cwd()
SRC = ROOT / 'src'
# Notebook cells are often re-run: without the membership check every run
# would prepend another copy of the same entry to sys.path.
if SRC.exists() and str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Optional project utilities: ``validate_missing_values`` is None when the
# helper cannot be loaded, and later cells check for that.
try:
    from dsworkflows.data.preprocess import validate_missing_values  # type: ignore
except ImportError:
    # Package not installed / not on the path: expected, stay silent.
    validate_missing_values = None
except Exception as exc:
    # Package found but failed while importing: remain best-effort so the
    # notebook still runs, but do not hide the reason.
    print(f'dsworkflows utilities could not be loaded: {exc!r}')
    validate_missing_values = None

%config InlineBackend.figure_format = 'retina'
sns.set(style='whitegrid', context='notebook')


## 1. Geração de Dados Sintéticos | Synthetic Data Generation

In [None]:
def generate_synthetic_df(n_samples=600, n_features=8, n_informative=5, n_classes=3, random_state=42,
                          *, nan_frac=0.05, outlier_frac=0.01, outlier_scale=8):
    """
    Generate a synthetic classification DataFrame with injected NaNs and outliers.

    Features come from ``make_classification`` (no redundant or repeated
    features) and are named ``feat_0`` ... ``feat_{n_features - 1}``; the class
    label is stored in the ``target`` column.

    Parameters
    ----------
    n_samples, n_features, n_informative, n_classes, random_state
        Forwarded to ``make_classification``. ``random_state`` also seeds the
        NumPy generator that picks the corrupted rows and columns.
    nan_frac : float, keyword-only, default 0.05
        Fraction of rows set to NaN in one randomly chosen feature column.
    outlier_frac : float, keyword-only, default 0.01
        Fraction of rows multiplied by ``outlier_scale`` in one randomly
        chosen feature column.
    outlier_scale : float, keyword-only, default 8
        Multiplier applied to the rows selected as outliers.

    Returns
    -------
    pandas.DataFrame
        The feature columns plus ``target``.

    Raises
    ------
    ValueError
        If ``nan_frac`` or ``outlier_frac`` is outside ``[0, 1]``.
    """
    for name, frac in (('nan_frac', nan_frac), ('outlier_frac', outlier_frac)):
        if not 0 <= frac <= 1:
            raise ValueError(f'{name} must be in [0, 1], got {frac!r}')

    X, y = make_classification(
        n_samples=n_samples, n_features=n_features, n_informative=n_informative,
        n_redundant=0, n_repeated=0, n_classes=n_classes, random_state=random_state
    )
    cols = [f'feat_{i}' for i in range(n_features)]
    df = pd.DataFrame(X, columns=cols)
    df['target'] = y

    # Inject some NaNs and outliers. The order of the random draws is kept
    # fixed (rows, column, rows, column) so the default arguments keep
    # producing the same frame for a given seed.
    rng = np.random.default_rng(random_state)
    nan_idx = rng.choice(df.index, size=int(nan_frac * len(df)), replace=False)
    nan_col = rng.choice(cols)
    df.loc[nan_idx, nan_col] = np.nan

    out_idx = rng.choice(df.index, size=int(outlier_frac * len(df)), replace=False)
    out_col = rng.choice(cols)
    # The outlier column is drawn independently and may coincide with the NaN
    # column; cells that are already NaN stay NaN after scaling.
    df.loc[out_idx, out_col] *= outlier_scale
    return df

# Build the working frame with the default settings and preview its first rows.
df = generate_synthetic_df()
df.head(n=5)


## 2. Estatísticas e Distribuições | Stats and Distributions

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()


In [None]:
# Feature columns are the ones carrying the ``feat_`` prefix.
num_cols = [c for c in df.columns if c.startswith('feat_')]

# Size the grid from the number of features so that every column gets a
# histogram; a fixed 2x3 grid would silently drop everything after the sixth.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(num_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(4 * n_grid_cols, 3.5 * n_grid_rows),
    squeeze=False,  # always a 2-D array, so ravel() works for any grid shape
)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, num_cols):
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(col)
# Hide the unused panels left over in the last row.
for ax in flat_axes[len(num_cols):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()


In [None]:
# Interactive Plotly distributions for the first 2 feature columns
# (one figure per column, 40 bins each).
for col in num_cols[:2]:
    px.histogram(df, x=col, nbins=40, title=f'Distribution of {col}').show()


## 3. Correlação | Correlation

In [None]:
# Pairwise correlation of the feature columns, drawn as an un-annotated heatmap.
corr = df.loc[:, num_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, cmap='viridis', annot=False, ax=ax_corr)
ax_corr.set_title('Matriz de Correlação | Correlation Matrix')
plt.show()


## 4. Dados Faltantes | Missing Values

In [None]:
# Share of missing entries per column, largest first.
missing_rate = df.isna().mean()
missing_rate = missing_rate.sort_values(ascending=False)
missing_rate


In [None]:
# Bar chart of the per-column NaN proportion held in ``missing_rate``.
ax_missing = sns.barplot(x=missing_rate.index, y=missing_rate.values)
ax_missing.set_ylabel('Proporção de NaNs | NaN Proportion')
ax_missing.set_title('Dados Faltantes | Missing Values')
# Tilt the column names so they do not overlap.
plt.xticks(rotation=45)
plt.show()


## 5. Outliers (IQR)

In [None]:
def iqr_outlier_mask(s: pd.Series, k: float = 1.5) -> pd.Series:
    """
    Flag the values of ``s`` that fall outside the IQR fences.

    The fences are ``Q1 - k*IQR`` and ``Q3 + k*IQR``, where ``Q1`` and ``Q3``
    are the 0.25 and 0.75 quantiles of ``s`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    s : pd.Series
        Values to check.
    k : float, default 1.5
        Fence distance expressed as a multiple of the IQR.

    Returns
    -------
    pd.Series
        Boolean mask with the same index as ``s``; True where the value is
        strictly below the lower fence or strictly above the upper fence.
    """
    first_q = s.quantile(0.25)
    third_q = s.quantile(0.75)
    margin = k * (third_q - first_q)
    return s.lt(first_q - margin) | s.gt(third_q + margin)

# Count IQR outliers per feature (NaNs are dropped before the check) and
# list the features from most to fewest outliers.
outlier_counts = {}
for col in num_cols:
    outlier_counts[col] = iqr_outlier_mask(df[col].dropna()).sum()
pd.Series(outlier_counts).sort_values(ascending=False)


## 6. Pré-processamento | Preprocessing

In [None]:
# Feature matrix and label vector, then a stratified 75/25 train/test split.
X = df.loc[:, num_cols].copy()
y = df.loc[:, 'target'].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Numeric preprocessing: median imputation followed by standard scaling,
# applied to the feature columns.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer([('num', numeric_transformer, num_cols)])

# Full pipeline: preprocessing + logistic regression, fitted on the train split.
clf = Pipeline([
    ('pre', preprocessor),
    ('model', LogisticRegression(max_iter=1000)),
])
clf.fit(X_train, y_train)

# Report the score on both splits and the per-class metrics on the test split.
print('Train score:', clf.score(X_train, y_train))
print('Test score:', clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


## 7. Validação do Projeto | Project Validation

In [None]:
"""
PT-BR: Se utilitários do projeto estiverem disponíveis, demonstre validações.
EN: If project utilities are available, demonstrate validations.
"""
# Run the optional project helper only when it was imported successfully.
if validate_missing_values is None:
    print('Project validation utilities not available in this environment.')
else:
    # Example: check that the NaN rate does not exceed 10% per column
    # (semantics of max_missing_rate are defined by the project helper).
    report = validate_missing_values(df[num_cols], max_missing_rate=0.10)
    display(report)


## 8. Visualização Bivariada | Bivariate Visualization

In [None]:
# Box plot of the first feature, one box per class label.
fig_box, ax_box = plt.subplots(figsize=(6, 4))
sns.boxplot(x=y, y=X[num_cols[0]], ax=ax_box)
ax_box.set_title(f'Boxplot {num_cols[0]} por classe | by class')
plt.show()

# Interactive scatter of the first two features; the target is cast to str
# before being passed as the colour variable.
class_labels = df['target'].astype(str)
scatter_fig = px.scatter(
    df,
    x=num_cols[0],
    y=num_cols[1],
    color=class_labels,
    title='Scatter (Plotly) colored by target',
)
scatter_fig.show()


## 9. Exportar Artefatos | Export Artifacts

In [None]:
# Write the synthetic dataset (without the index column) to
# data/processed/, creating the directory tree when it is missing.
out_dir = Path('data/processed')
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / 'synthetic_sample.csv'
df.to_csv(out_path, index=False)
print('Saved:', out_path)


---
PT-BR: Fim do notebook. Veja os próximos notebooks para engenharia de features e modelagem.

EN: End of notebook. See the next notebooks for feature engineering and modeling.