In [8]:
# %% [markdown]
# # Data Ingestion Analysis
#
# Bu notebook, `data_ingestion` modülünü kullanarak ham veri setlerini işler ve sonuçları analiz eder.

In [9]:
# %%
import logging
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path
from IPython.display import display


In [10]:
# %%
sys.path.append(os.path.abspath("../source"))

In [11]:
# %%
# Import necessary functions
from source.data_ingestion import merge_pollutants
from source.data_ingestion import ingest_data
from source.utils.config_loader import load_config
from source.utils.logger import setup_logger

In [12]:
# %%
# %% [markdown]
# ## Load Configuration

In [13]:
# %%
config_path = Path("../config/settings.yml").resolve()
config = load_config(config_path)

if config is None:
    logging.error("Failed to load configuration. Terminating notebook.")
    sys.exit(1)

RAW_DIR = Path(config["paths"]["raw_dir"]).resolve()
LOG_DIR = Path(config["paths"].get("logs_dir", "../04-logs")).resolve()

LOG_DIR.mkdir(parents=True, exist_ok=True)

logger = setup_logger(
    name="data_ingestion_notebook",
    log_file=LOG_DIR / "data_ingestion_notebook.log",
    log_level=config.get("logging", {}).get("level", "INFO").upper()
)

logger.info("=== Data Ingestion Notebook Initialized ===")
print(f"Raw data directory: {RAW_DIR}")
print(f"Log directory: {LOG_DIR}")

[32m2025-01-14 19:38:43,156 - data_ingestion_notebook - INFO - === Data Ingestion Notebook Initialized ===[0m


Raw data directory: /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/data/raw
Log directory: /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/logs


In [8]:
# %%
# %% [markdown]
# ## Perform Data Ingestion

In [None]:
# %%
df_final = merge_pollutants()  # merge_pollutants() outer join işlemini yapıp final df'yi döndürür.
if df_final is not None and not df_final.empty:
    print("Merged DataFrame preview:")
    display(df_final.head(10))
    print("Merged DataFrame shape:", df_final.shape)
else:
    print("Merged DataFrame is empty or merge process failed.")

In [None]:
# %%
if df_final is not None and not df_final.empty:
    summary = df_final.describe(include='all').transpose()
    print("Summary statistics:")
    display(summary)

    # Örneğin tarih dağılımı histogramı
    if "Date" in df_final.columns:
        plt.figure(figsize=(10,6))
        df_final["Date"] = pd.to_datetime(df_final["Date"], errors="coerce")
        df_final["Year"] = df_final["Date"].dt.year
        sns.countplot(data=df_final, x="Year")
        plt.title("Distribution by Year")
        plt.show()

    # Kirletici kolonlarının boşluk oranı
    pollutant_cols = [
        "Daily Max 1-hour SO2 Concentration",
        "Daily Max 8-hour CO Concentration",
        "Daily Max 1-hour NO2 Concentration",
        "Daily Max 8-hour Ozone Concentration",
        "Daily Mean PM2.5 Concentration"
    ]
    missing_counts = df_final[pollutant_cols].isnull().sum()
    plt.figure(figsize=(10,5))
    sns.barplot(x=missing_counts.index, y=missing_counts.values)
    plt.title("Missing Counts in Pollutant Columns")
    plt.xlabel("Pollutant Column")
    plt.ylabel("Missing Count")
    plt.xticks(rotation=45)
    plt.show()
else:
    print("No data available for visualization.")

# %% [markdown]
# ## 5. Log Analizi
#
# Log dosyasını açarak, ingestion sırasında oluşan hata/uyarı sayısını inceleyebilirsiniz.

# %%
log_file_path = os.path.join(LOG_DIR, "data_ingestion.log")
if os.path.exists(log_file_path):
    with open(log_file_path, "r", encoding="utf-8") as f:
        logs = f.readlines()
    print(f"Data ingestion log file contains {len(logs)} lines. Last 20 lines:")
    for line in logs[-20:]:
        print(line.strip())
else:
    print("No data ingestion log file found.")

# %% [markdown]
# ## 6. Sonraki Adımlar
#
# - Gerekirse, outer join sonrasında verinin eksik (NaN) kısımlarını doldurmak için `missing_handle.py` modülü geliştirilebilir.
# - Ayrıca, veritabanı entegrasyonu gibi ileri aşamalar planlanabilir.
#
# İyi çalışmalar!

In [9]:
# %%
ingest_data(raw_dir=RAW_DIR)

[32m2025-01-14 19:25:14,155 - data_ingestion - INFO - === Starting multi-pollutant ingestion with outer join ===[0m
[32m2025-01-14 19:25:14,159 - data_ingestion - INFO - Found 515 .csv files in /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/data/raw[0m
[32m2025-01-14 19:25:14,160 - data_ingestion - INFO - Processing file epa_so2_virginia_2023.csv[0m
[32m2025-01-14 19:25:14,178 - data_ingestion - INFO - Deduplicated => epa_so2_virginia_2023.csv: (2877, 21) => (2877, 21)[0m
[32m2025-01-14 19:25:14,180 - data_ingestion - INFO - File epa_so2_virginia_2023.csv => poll=so2, 2877 rows, done.[0m
[32m2025-01-14 19:25:14,194 - data_ingestion - INFO - Saved interim => /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/data/interim/interim_epa_so2_virginia_2023.csv[0m
[32m2025-01-14 19:25:14,195 - data_ingestion - INFO - Archived => /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/data/archive/20250114_192514_epa_so2_virginia_2023.csv[0m
[32m2025-01-14 19:25:14,195

In [14]:

# %%
# %% [markdown]
# ## Load and Display Combined Data

In [15]:
# %%
processed_dir = Path(config["paths"]["processed_dir"]).resolve()
combined_file = processed_dir / "epa_long_5pollutants.csv"

if combined_file.exists():
    logger.info(f"Loading combined data from {combined_file}")
    df_combined = pd.read_csv(combined_file)
    display(df_combined.head())
    print(f"Combined DataFrame shape: {df_combined.shape}")
else:
    logger.warning(f"Combined data file {combined_file} does not exist.")
    df_combined = pd.DataFrame()

[32m2025-01-14 19:38:53,567 - data_ingestion_notebook - INFO - Loading combined data from /Users/dogaaydin/PycharmProjects/Geo_Sentiment_Climate/data/processed/epa_long_5pollutants.csv[0m


Unnamed: 0,POC,CBSA Code,Site Longitude,Source,Date,Local Site Name,County,Site Latitude,Daily Obs Count,Percent Complete,...,AQS Parameter Description_pm25,CBSA Name_pm25,Daily Mean PM2.5 Concentration_pm25,County FIPS Code_pm25,AQS Parameter Code_pm25,State FIPS Code_pm25,Units_pm25,State_pm25,Daily AQI Value_pm25,Method Code_pm25
0,,,,,01/01/2022,,,,,,...,,,,,,,,,,
1,,,,,01/01/2022,,,,,,...,PM2.5 - Local Conditions,"Gadsden, AL",6.0,55.0,88101.0,1.0,ug/m3 LC,Alabama,33.0,170.0
2,2.0,13820.0,-86.815,AQS,01/01/2022,North Birmingham,Jefferson,33.553056,24.0,100.0,...,,,,,,,,,,
3,1.0,13820.0,-86.915,AQS,01/01/2022,Fairfield,Jefferson,33.485556,24.0,100.0,...,,,,,,,,,,
4,,,,,01/01/2022,,,,,,...,Acceptable PM2.5 AQI & Speciation Mass,"Birmingham-Hoover, AL",6.5,73.0,88502.0,1.0,ug/m3 LC,Alabama,36.0,701.0


Combined DataFrame shape: (1713200, 102)


In [16]:
# %%
# %% [markdown]
# ## Summary Statistics

In [17]:
# %%
if not df_combined.empty:
    summary = df_combined.describe(include='all').transpose()
    display(summary)
else:
    logger.warning("No combined data available for summary statistics.")

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
POC,448876.0,,,,1.415233,1.393485,1.0,1.0,1.0,1.0,9.0
CBSA Code,412828.0,,,,29938.777433,11468.470086,10420.0,19340.0,31080.0,40140.0,49660.0
Site Longitude,448876.0,,,,-92.621632,17.589261,-159.36624,-101.7418,-88.53393,-80.6539,-66.141683
Source,448876,1,AQS,448876,,,,,,,
Date,1713200,730,06/17/2023,2941,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
State FIPS Code_pm25,1056142.0,,,,28.156297,15.891539,1.0,15.0,27.0,42.0,78.0
Units_pm25,1056142,1,ug/m3 LC,1056142,,,,,,,
State_pm25,1056142,47,California,128229,,,,,,,
Daily AQI Value_pm25,1056142.0,,,,38.342746,20.090087,0.0,24.0,37.0,52.0,1435.0


In [13]:
# %%
# %% [markdown]
# ## Visualization

In [18]:
# %%
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots are rendered inline
%matplotlib inline

In [19]:
if not df_combined.empty:
    # 5.1 Metadata'daki rows_count dağılımı
    metadata_path = Path(config["paths"]["metadata_dir"]) / "processed_files.json"
    try:
        metadata = pd.read_json(metadata_path)
        plt.figure(figsize=(10, 6))
        sns.histplot(metadata['rows_count'], bins=30, kde=True)
        plt.title("Distribution of Row Counts per File")
        plt.xlabel("Row Count")
        plt.ylabel("Frequency")
        plt.show()
    except Exception as e:
        logger.error(f"Failed to load metadata for visualization: {e}")

[31m2025-01-14 19:39:11,736 - data_ingestion_notebook - ERROR - Failed to load metadata for visualization: 'rows_count'[0m


In [19]:
    # 5.2 Eksik Değerler (sadece ingestion'da 'common_columns' eklenenler)
    # Eğer isterseniz config data_check veya columns altından required kolonları tekrar çekip inceleyebilirsiniz.
    # Örnek:
    required_columns_global = config["data_check"].get("required_columns", [])
    if required_columns_global:
        missing_counts = df_combined[required_columns_global].isnull().sum()
        plt.figure(figsize=(12, 6))
        sns.barplot(x=missing_counts.index, y=missing_counts.values)
        plt.title("Missing Values in Global Required Columns")
        plt.xlabel("Columns")
        plt.ylabel("Number of Missing Values")
        plt.xticks(rotation=45)
        plt.show()
else:
    logger.warning("No combined data available for visualization.")

IndentationError: expected an indented block after 'else' statement on line 14 (3439071381.py, line 15)

In [17]:
# %% [markdown]
# ## 6. Log Analizi
#
# Log'da ingestion sırasında oluşan uyarı/hata mesajlarına göz atabiliriz.

# %%
log_file = LOG_DIR / "data_ingestion.log"
if log_file.exists():
    with open(log_file, "r", encoding="utf-8") as f:
        logs = f.readlines()
    print(f"Log file contains {len(logs)} lines.")
    # Son 20 satırı yazdıralım
    for line in logs[-20:]:
        print(line.strip())
else:
    print(f"No log file found at {log_file}")

Log file contains 5670 lines.
"file_name": "epa_no2_georgia_2023.csv",
"file_hash": "70d6aac1323e0f25bdc6b2f042d73d69",
"rows_count": 1065,
"timestamp": "2025-01-14T19:25:33.486531"
},
{
"file_name": "epa_no2_ohio_2023.csv",
"file_hash": "5aa8b98664e974fb02f3da3108a86dba",
"rows_count": 2381,
"timestamp": "2025-01-14T19:25:33.500591"
},
{
"file_name": "epa_no2_vermont_2023.csv",
"file_hash": "2c6821177d9368570aaf6471ae6e5113",
"rows_count": 351,
"timestamp": "2025-01-14T19:25:33.515333"
}
]
}
2025-01-14 19:26:10,001 - data_ingestion - INFO - === Multi-pollutant ingestion completed ===


In [None]:
# %% [markdown]
# ## 7. Sonraki Adımlar
#
# - `missing_handle.py` veya data_preprocessing aşamasında, cross-file "Site ID" doldurma gibi gelişmiş işlemler yapılabilir.
# - EDA (exploratory data analysis) notebook'u ile istatistik ve görselleştirme adımı genişletilebilir.
# - Modelleme öncesi veri temizleme/dönüştürme adımları planlanabilir.
#
# İyi çalışmalar!