In [14]:
# %% [markdown]
# # Data Ingestion Analysis
#
# This notebook processes the raw datasets using the `data_ingestion` module and analyzes the results.

In [15]:
# %%
import logging
import os
import sys
from pathlib import Path

import pandas as pd
from IPython.display import display

In [16]:

# %%
# Make the project code importable from the notebook's working directory.
# This notebook imports ``source.data_ingestion`` / ``source.utils.*``; that
# package-qualified form only resolves when the directory *containing*
# ``source`` is on ``sys.path``, so the parent directory is added as well.
# The original ``../source`` entry is kept so any flat imports keep working.
for _extra_path in (os.path.abspath("../source"), os.path.abspath("..")):
    # Skip entries that are already present so re-running the cell does not
    # pile up duplicate search-path entries.
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

In [17]:
# %%
# Import necessary functions
from source.data_ingestion import ingest_data
from source.utils.config_loader import load_config
from source.utils.logger import setup_logger

In [18]:
# %%
# %% [markdown]
# ## Load Configuration

In [21]:

# %%
# Resolve the settings file relative to the notebook's working directory and load it.
config_path = Path("../config/settings.yml").resolve()
config = load_config(config_path)

# A None result is treated as "configuration could not be loaded".
if config is None:
    logging.error("Failed to load configuration. Terminating notebook.")
    # Raise a descriptive error instead of calling sys.exit(1): inside a kernel
    # SystemExit surfaces as an opaque "To exit: use 'exit'..." message, while
    # a regular exception stops the run and names the file that failed.
    raise RuntimeError(f"Failed to load configuration from {config_path}")

# Directories come from the "paths" section of the config; the log directory
# falls back to ../04-logs when "logs_dir" is not configured.
RAW_DIR = Path(config["paths"]["raw_dir"]).resolve()
LOG_DIR = Path(config["paths"].get("logs_dir", "../04-logs")).resolve()

# Create the log directory before handing a file path inside it to the logger.
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Notebook-specific logger; the level is read from the optional "logging"
# section and defaults to INFO.
logger = setup_logger(
    name="data_ingestion_notebook",
    log_file=LOG_DIR / "data_ingestion_notebook.log",
    log_level=config.get("logging", {}).get("level", "INFO").upper()
)

logger.info("=== Data Ingestion Notebook Initialized ===")

[32m2025-01-10 18:46:29,302 - data_ingestion_notebook - INFO - === Data Ingestion Notebook Initialized ===[0m


In [22]:

# %%
# %% [markdown]
# ## Perform Data Ingestion

In [31]:
# %%
# Run the ingestion step over the raw-data directory resolved from the config.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: unsupported operand type(s) for +: 'PosixPath' and 'str'".
# Presumably a Path is concatenated with `+` somewhere inside
# `source.data_ingestion` -- confirm there (and whether it expects str paths).
ingest_data(
    raw_dir=RAW_DIR,
)

[32m2025-01-10 18:53:24,340 - data_ingestion - INFO - === Starting data ingestion process ===[0m
[32m2025-01-10 18:53:24,343 - data_ingestion - INFO - Found 515 CSV files in /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/data/raw[0m
[32m2025-01-10 18:53:24,344 - data_ingestion - INFO - Processing file: epa_so2_virginia_2023.csv[0m
[32m2025-01-10 18:53:24,352 - data_ingestion - INFO - Deduplicated epa_so2_virginia_2023.csv: (2877, 21) -> (2877, 21)[0m


TypeError: unsupported operand type(s) for +: 'PosixPath' and 'str'

In [None]:

# %%
# %% [markdown]
# ## Load and Display Combined Data

In [None]:
# %%
# Location of the combined dataset inside the configured processed directory.
processed_dir = Path(config["paths"]["processed_dir"]).resolve()
combined_file = processed_dir / "epa_long_preprocessed.csv"

# Always (re)bind df_combined to an empty frame first. Without this, a frame
# left in the kernel by an earlier run would be silently reused when the file
# is missing or fails to load; the later cells' `.empty` checks treat the
# empty frame as "no data available".
df_combined = pd.DataFrame()

if combined_file.exists():
    logger.info(f"Loading combined data from {combined_file}")
    df_combined = pd.read_csv(combined_file)
    # Preview only the first rows rather than dumping the whole frame.
    display(df_combined.head())
else:
    logger.warning(f"Combined data file {combined_file} does not exist.")

In [None]:
# %%
# %% [markdown]
# ## Summary Statistics

In [None]:
# %%
# Describe every column of the combined frame (one row per column after the
# transpose); when no non-empty frame is in the namespace, log a warning instead.
if 'df_combined' not in locals() or df_combined.empty:
    logger.warning("No combined data available for summary statistics.")
else:
    summary = df_combined.describe(include='all').T
    display(summary)

In [None]:
# %%
# %% [markdown]
# ## Visualization

In [None]:
# %%
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots are rendered inline
%matplotlib inline

In [None]:
if 'df_combined' in locals() and not df_combined.empty:
    # Row counts are taken from the metadata JSON in the configured metadata
    # directory (presumably one record per processed file -- TODO confirm).
    metadata_path = Path(config["paths"]["metadata_dir"]) / "processed_files.json"
    try:
        metadata = pd.read_json(metadata_path)
        plt.figure(figsize=(10, 6))
        # histplot draws on the current axes and returns them, so the title
        # and axis labels are applied directly to the returned Axes.
        sns.histplot(metadata['rows_count'], bins=30, kde=True).set(
            title="Distribution of Row Counts per File",
            xlabel="Row Count",
            ylabel="Frequency",
        )
        plt.show()
    except Exception as exc:
        # Best effort: a missing or malformed metadata file is logged, not raised.
        logger.error(f"Failed to load metadata for visualization: {exc}")

In [None]:
# Missing values in required columns.
# This cell carries its own data-availability guard so it is valid Python on
# its own; previously it began with indented code and a dangling `else:` that
# belonged to the `if` in the preceding cell.
if 'df_combined' in locals() and not df_combined.empty:
    required_columns = config["data_check"].get("required_columns", [])

    # Only columns that exist in the frame can be selected; report the others
    # instead of failing with a KeyError.
    present_columns = [col for col in required_columns if col in df_combined.columns]
    absent_columns = [col for col in required_columns if col not in df_combined.columns]
    if absent_columns:
        logger.warning(f"Required columns absent from combined data: {absent_columns}")

    if present_columns:
        missing_counts = df_combined[present_columns].isnull().sum()
        plt.figure(figsize=(12, 6))
        # One count per column: a bar plot shows the values, whereas a box
        # plot of a single observation per category collapses to a flat line.
        sns.barplot(x=missing_counts.index, y=missing_counts.values)
        plt.title("Missing Values in Required Columns")
        plt.xlabel("Columns")
        plt.ylabel("Number of Missing Values")
        plt.xticks(rotation=45)
        plt.show()
else:
    logger.warning("No combined data available for visualization.")