# 01 - Exploratory Data Analysis and Preprocessing

This notebook covers:
1. Dataset description and origin
2. Data loading
3. Initial data exploration
4. Data quality assessment (missing values, duplicates)
5. Exploratory Data Analysis (distributions, correlations, outliers, categorical features)
6. Data preprocessing for unsupervised learning (missing values, outliers, encoding, scaling, saving)
7. Summary of findings and next steps

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add parent directory to path
sys.path.append('..')

from src.config_loader import load_config, get_absolute_path
from src.data_loading import DataLoader
from src.preprocessing import DataPreprocessor

# Plot appearance: seaborn's white grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Pandas display limits: never truncate columns, show at most 100 rows
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 100,
}.items():
    pd.set_option(option_name, option_value)

print("✓ Setup complete!")

## 1. Dataset Description

### Origin and Context
**Dataset Name**: [Fill in dataset name]  
**Source**: [Describe where the data comes from]  
**Domain**: [Business/Government/Health/Environment]  
**Collection Method**: [How was the data collected?]  
**Time Period**: [When was the data collected?]  

### Purpose
[Describe the purpose of this analysis and what questions you want to answer]

### Expected Structure
- **Rows**: ≥1000 samples
- **Columns**: ≥10 variables
- **Variable Types**: Mix of numerical and potentially categorical features

## 2. Data Loading

In [None]:
# Read the project configuration and hand it to the loader
config = load_config()
loader = DataLoader(config)

# A missing input file is reported instead of raised: `data` is set to None
# and every later cell checks `data is not None` before doing any work.
try:
    data = loader.load_data()
except FileNotFoundError:
    print("Error: Please place 'base_historica.csv' in the data/raw/ directory")
    data = None
else:
    print("✓ Data loaded successfully!")
    print(f"Shape: {data.shape}")
    print(f"Columns: {list(data.columns)}")

## 3. Initial Data Exploration

In [None]:
if data is not None:
    # Size of the dataset
    row_count, column_count = data.shape

    print("Dataset Overview:")
    print("=" * 50)
    print(f"Number of rows: {row_count}")
    print(f"Number of columns: {column_count}")

    # Preview of the first records (rich HTML rendering)
    print("\nFirst few rows:")
    display(data.head())

    # Column dtypes as inferred at load time
    print("\nData Types:")
    print(data.dtypes)

    # Summary statistics as produced by DataFrame.describe()
    print("\nBasic Statistics:")
    display(data.describe())

## 4. Data Quality Assessment

### 4.1 Missing Values Analysis

In [None]:
if data is not None:
    # Per-column null counts and their share of all rows, in percent
    missing_values = data.isnull().sum()
    missing_percentage = missing_values.div(len(data)).mul(100)

    missing_df = pd.DataFrame(
        {'Missing Count': missing_values, 'Percentage': missing_percentage}
    ).sort_values('Percentage', ascending=False)

    # Only the columns that actually contain nulls
    missing_data = missing_df[missing_df['Missing Count'] > 0]

    print("Missing Values Summary:")
    display(missing_data)

    if missing_data.empty:
        print("✓ No missing values found!")
    else:
        # Horizontal bars, one per affected column
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.barh(missing_data.index, missing_data['Percentage'])
        ax.set_xlabel('Percentage of Missing Values')
        ax.set_title('Missing Values by Column')
        fig.tight_layout()
        plt.show()

### 4.2 Duplicates Check

In [None]:
if data is not None:
    # Rows that are exact repeats of an earlier row
    n_duplicates = data.duplicated().sum()
    print(f"Number of duplicate rows: {n_duplicates}")

    # The share is only reported when at least one duplicate exists
    if n_duplicates > 0:
        duplicate_share = n_duplicates / len(data) * 100
        print(f"Percentage: {duplicate_share:.2f}%")

## 5. Exploratory Data Analysis (EDA)

### 5.1 Numerical Features Distribution

In [None]:
if data is not None:
    # Identify numerical columns (also reused by the correlation and box-plot cells)
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

    print(f"Number of numerical features: {len(numerical_cols)}")
    print(f"Numerical features: {numerical_cols}")

    if numerical_cols:
        # Grid of histograms, three per row; ceil-divide to get the row count
        n_cols = 3
        n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

        # squeeze=False makes plt.subplots always return a 2-D array of Axes.
        # Without it a single-row grid comes back as a 1-D array, and wrapping
        # that in a list (the previous approach) yields one element that is an
        # ndarray rather than an Axes, so plotting fails for 1-3 columns.
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False
        )
        axes = axes.flatten()

        # One histogram per numerical column
        for ax, col in zip(axes, numerical_cols):
            data[col].hist(bins=30, ax=ax, edgecolor='black')
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        # Hide the unused panels in the last row
        for ax in axes[len(numerical_cols):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

### 5.2 Correlation Analysis

In [None]:
if data is not None and len(numerical_cols) > 0:
    # Pairwise correlation between the numerical columns (DataFrame.corr defaults)
    correlation = data[numerical_cols].corr()

    # Annotated heatmap with a diverging palette centred on zero
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation, **heatmap_options)
    plt.title('Correlation Heatmap of Numerical Features')
    plt.tight_layout()
    plt.show()

    # Walk the upper triangle only, so each pair is reported once and the
    # diagonal (self-correlation) is skipped.
    print("\nHighly Correlated Feature Pairs (|correlation| > 0.7):")
    n_features = len(correlation.columns)
    high_corr = [
        (correlation.columns[i], correlation.columns[j], correlation.iloc[i, j])
        for i in range(n_features)
        for j in range(i + 1, n_features)
        if abs(correlation.iloc[i, j]) > 0.7
    ]

    if not high_corr:
        print("  No highly correlated pairs found")
    else:
        for feat1, feat2, corr in high_corr:
            print(f"  {feat1} <-> {feat2}: {corr:.3f}")

### 5.3 Outlier Detection

In [None]:
if data is not None and len(numerical_cols) > 0:
    # Grid of box plots, three per row; ceil-divide to get the row count
    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

    # squeeze=False makes plt.subplots always return a 2-D array of Axes.
    # Without it a single-row grid comes back as a 1-D array, and wrapping
    # that in a list (the previous approach) yields one element that is an
    # ndarray rather than an Axes, so plotting fails for 1-3 columns.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    # One box plot per numerical column
    for ax, col in zip(axes, numerical_cols):
        data.boxplot(column=col, ax=ax)
        ax.set_title(f'Box Plot of {col}')
        ax.set_ylabel(col)

    # Hide the unused panels in the last row
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

### 5.4 Categorical Features Analysis

In [None]:
if data is not None:
    # Columns stored as object or pandas category dtype
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Number of categorical features: {len(categorical_cols)}")
    print(f"Categorical features: {categorical_cols}")

    # Cardinality of every categorical column; the full frequency table is
    # printed only for columns with at most 10 distinct values.
    for col in categorical_cols:
        n_unique = data[col].nunique()
        print(f"\n{col}: {n_unique} unique values")
        if n_unique <= 10:
            print(data[col].value_counts())

## 6. Data Preprocessing

### 6.1 Handle Missing Values

In [None]:
if data is not None:
    # The same preprocessor instance is reused by the outlier, encoding and
    # scaling cells that follow.
    preprocessor = DataPreprocessor(scaling_method='standard')

    # Operate on a copy so the raw `data` frame is left as loaded.
    # NOTE(review): presumably 'median' is the imputation strategy and 0.5 a
    # missing-ratio cut-off — confirm against DataPreprocessor.
    missing_value_options = {'strategy': 'median', 'threshold': 0.5}
    data_clean = preprocessor.handle_missing_values(
        data.copy(), **missing_value_options
    )

    print("✓ Missing values handled")
    print(f"Shape after handling missing values: {data_clean.shape}")

### 6.2 Remove Outliers

In [None]:
if data is not None:
    # Outlier removal with the 'iqr' method, applied to a copy of the cleaned frame.
    # NOTE(review): threshold=3.0 is presumably the IQR multiplier — confirm
    # against DataPreprocessor.remove_outliers.
    outlier_options = {'method': 'iqr', 'threshold': 3.0}
    data_no_outliers = preprocessor.remove_outliers(
        data_clean.copy(), **outlier_options
    )

    print("✓ Outliers removed")
    print(f"Shape after removing outliers: {data_no_outliers.shape}")

### 6.3 Encode Categorical Variables

In [None]:
if data is not None:
    # Encode categorical columns with the 'onehot' method, on a copy of the
    # outlier-filtered frame.
    frame_to_encode = data_no_outliers.copy()
    data_encoded = preprocessor.encode_categorical(frame_to_encode, method='onehot')

    print("✓ Categorical variables encoded")
    print(f"Shape after encoding: {data_encoded.shape}")

### 6.4 Scale Features

In [None]:
if data is not None:
    # Scale a copy of the encoded frame.
    # NOTE(review): fit=True presumably fits the scaler on this data before
    # transforming — confirm against DataPreprocessor.scale_features.
    frame_to_scale = data_encoded.copy()
    data_scaled = preprocessor.scale_features(frame_to_scale, fit=True)

    print("✓ Features scaled")
    print(f"Final shape: {data_scaled.shape}")

### 6.5 Save Processed Data

In [None]:
if data is not None:
    # Resolve the destination for the processed dataset
    output_path = get_absolute_path('data/processed/processed_data.csv')

    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists (no-op when it is already there).
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # The row index is not written to the file
    data_scaled.to_csv(output_path, index=False)

    # True when every column of the saved frame has a numeric dtype
    numeric_column_count = data_scaled.select_dtypes(include=[np.number]).shape[1]

    print(f"✓ Processed data saved to: {output_path}")
    print("\nFinal dataset ready for unsupervised learning:")
    print(f"  - Rows: {data_scaled.shape[0]}")
    print(f"  - Features: {data_scaled.shape[1]}")
    print(f"  - All numerical: {numeric_column_count == data_scaled.shape[1]}")

## 7. Summary

### Key Findings from EDA:
1. [Finding 1]
2. [Finding 2]
3. [Finding 3]

### Preprocessing Steps Applied:
- Missing values: [strategy]
- Outliers: [method and threshold]
- Categorical encoding: [method]
- Feature scaling: [method]

### Next Steps:
- Proceed to dimensionality reduction (Notebook 02)
- Apply PCA and UMAP
- Visualize data in reduced dimensions