In [None]:
# Import standard-library and third-party modules
import os
import re
import sys
import glob
import logging
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib plots inside the notebook output:
%matplotlib inline

In [5]:
# %%
# Make the project's local code importable from this notebook.
# The later imports use the package-qualified form `source.<module>`, which only
# resolves when the directory that CONTAINS `source` is on sys.path. The original
# "../source" entry is kept as well so bare-module imports keep working.
# Each path is added only once so re-running this cell does not grow sys.path.
for _local_path in (os.path.abspath(".."), os.path.abspath("../source")):
    if _local_path not in sys.path:
        sys.path.append(_local_path)

In [None]:
# %%
from source.data_check import check_raw_data
from source.utils.logger import setup_logger
from source.config.config_loader import load_config

In [None]:
# %%
# Load the project settings and derive the directories used by the rest of the notebook.
config_path = os.path.join(os.path.abspath("../config"), "settings.yml")
config = load_config(config_path)

if config is None:
    logging.error("Failed to load configuration. Terminating notebook.")
    # Raise a regular exception instead of calling sys.exit(1): it stops the
    # cell with an ordinary traceback that names the offending config path.
    raise RuntimeError(f"Failed to load configuration from {config_path}")

RAW_DIR = os.path.abspath(config["paths"]["raw_dir"])
# Resolve the log directory to an absolute path, like RAW_DIR, so later cells
# that join onto LOG_DIR do not depend on the working directory at that time.
LOG_DIR = os.path.abspath(config["paths"].get("logs_dir", "../logs"))

os.makedirs(LOG_DIR, exist_ok=True)

In [None]:
# Create the notebook logger (file under LOG_DIR, level taken from the
# "logging" section of the config with "INFO" as fallback), then report the
# directories in use.
_log_file = os.path.join(LOG_DIR, "data_check_notebook.log")
_log_level = config.get("logging", {}).get("level", "INFO").upper()
logger = setup_logger(name="data_check_notebook", log_file=_log_file, log_level=_log_level)

logger.info("=== Data Check Notebook Initialized ===")
print(f"Log directory is set to: {LOG_DIR}")
print(f"Raw data directory is set to: {RAW_DIR}")

In [None]:
# %%
# Run the project's raw-data check against the configured raw data directory.
# NOTE(review): presumably this is what produces the data_check_report_*.csv
# files that a later cell loads — confirm in source/data_check.py.
check_raw_data(raw_dir=RAW_DIR)

In [None]:
# %%
# Find the latest data-check report in the processed directory and load it
# into df_report (an empty DataFrame when no report exists).
processed_dir = os.path.abspath(config["paths"]["processed_dir"])
report_files = glob.glob(os.path.join(processed_dir, "data_check_report_*.csv"))

# Reset first so a failed load cannot leave a stale frame from an earlier run;
# downstream cells check for None before using df_report.
df_report = None
if not report_files:
    logger.warning("No report files found in the processed directory.")
    df_report = pd.DataFrame()
    print("No data check reports found.")
else:
    # Use modification time: os.path.getctime is the metadata-change time on
    # POSIX (not creation time), so it is not a reliable "newest file" key
    # across platforms.
    latest_report = max(report_files, key=os.path.getmtime)
    logger.info(f"Loading the latest report: {latest_report}")
    print(f"Loading the latest report: {latest_report}")
    df_report = pd.read_csv(latest_report)
    display(df_report.head(10))

In [None]:
# %%
# Show descriptive statistics (one row per report column) followed by a short
# preview of the loaded report; print a notice when there is nothing to show.
if df_report is None or df_report.empty:
    print("DataFrame is empty. No summary available.")
else:
    summary = df_report.describe(include='all').T
    display(summary)
    print("Preview of the Data Check Report:")
    display(df_report.head())

In [None]:
# %%
# Draw one 10-bin histogram (with KDE overlay) per count column of the report.
# Nothing is drawn when no report was loaded.
if not (df_report is None or df_report.empty):
    histogram_specs = (
        ("row_count", "Distribution of Row Counts", "Row Count"),
        ("col_count", "Distribution of Column Counts", "Column Count"),
    )
    for column_name, plot_title, x_label in histogram_specs:
        plt.figure(figsize=(10, 6))
        sns.histplot(df_report[column_name], bins=10, kde=True)
        plt.title(plot_title)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")
        plt.show()

In [None]:
# Ensure plots are rendered inline.
# NOTE(review): the same magic is already issued in an earlier cell of this
# notebook, so this cell looks redundant — confirm and consider removing it.
%matplotlib inline

In [None]:
# Coarse-binned histograms of the row and column counts in the report.
# Guarded like the other plotting cells: when no report file was found,
# df_report is an empty DataFrame and indexing 'row_count' / 'col_count'
# would raise KeyError and abort a full notebook run.
if (
    df_report is not None
    and not df_report.empty
    and {"row_count", "col_count"}.issubset(df_report.columns)
):
    # Histogram of row counts
    plt.figure(figsize=(10, 6))
    sns.histplot(df_report['row_count'], bins=5, kde=True)
    plt.title("Distribution of Row Counts")
    plt.xlabel("Row Count")
    plt.ylabel("Frequency")

    # Histogram of column counts
    plt.figure(figsize=(10, 6))
    sns.histplot(df_report['col_count'], bins=2, kde=True)
    plt.title("Distribution of Column Counts")
    plt.xlabel("Column Count")
    plt.ylabel("Frequency")

    # A single show() renders both figures created above.
    plt.show()
else:
    print("No 'row_count'/'col_count' data in df_report. Skipping histograms.")

In [None]:
# %%
# Count report rows per pollutant, split by test outcome when available.
if df_report is not None and not df_report.empty and "pollutant" in df_report.columns:
    # Only split bars by 'test_pass' when the report actually has that column;
    # passing a missing column name as hue would make seaborn raise. The
    # heatmap cell performs the same column check before using 'test_pass'.
    hue_column = "test_pass" if "test_pass" in df_report.columns else None

    plt.figure(figsize=(8, 5))
    sns.countplot(data=df_report, x="pollutant", hue=hue_column)
    plt.title(
        "Files by Pollutant (Test Pass or Fail)" if hue_column else "Files by Pollutant"
    )
    plt.xlabel("Pollutant")
    plt.ylabel("Number of Files")
    plt.show()
else:
    print("No 'pollutant' column or empty df_report to analyze pollutant distribution.")

In [None]:
# %% [markdown]
# ### 6.2 Quick Check: Missing Column Analysis
#
# In some projects, a setting such as `quick_check_limit` is used to collect extra log detail for the first few files.
# If the `notes` column contains a “Missing columns: […]” message, we can capture it with a regex to identify which columns are missing.

In [None]:
# %%
# Count how often each column is reported missing in the `notes` of the first
# `quick_check_limit` report rows, and plot the frequencies.
if df_report is not None and not df_report.empty and "notes" in df_report.columns:
    # Number of leading report rows to inspect (defaults to 5).
    quick_n = config.get("data_check", {}).get("quick_check_limit", 5)
    quick_check_df = df_report.head(quick_n).copy()

    # Replace missing notes BEFORE casting to str: astype(str) turns NaN into
    # the literal text "nan", after which fillna('') has nothing left to fill.
    quick_check_df['notes'] = quick_check_df['notes'].fillna('').astype(str)

    # Capture the bracket contents of every "Missing columns: [...]" message.
    # Rows without a match yield an empty list, which explode() turns into NaN
    # and dropna() removes.
    missing_values = quick_check_df['notes'].apply(
        lambda x: re.findall(r"Missing columns: \[(.*?)\]", x)
    ).explode().dropna()

    # Split each captured group into individual column names.
    # - Surrounding quotes are stripped in case the note embeds a list repr
    #   such as ['a', 'b'] (assumption about the note format — confirm against
    #   the code that writes `notes`); names without quotes are unaffected.
    # - Empty names (e.g. from "Missing columns: []") are dropped so they are
    #   not counted as a column called "".
    missing_values = missing_values.apply(
        lambda x: [
            name
            for name in (col.strip().strip("'\"") for col in x.split(','))
            if name
        ]
    )
    missing_columns_series = missing_values.explode().value_counts()

    if not missing_columns_series.empty:
        plt.figure(figsize=(10, 6))
        sns.barplot(x=missing_columns_series.values, y=missing_columns_series.index)
        plt.title("Missing Columns Frequency (Quick Check)")
        plt.xlabel("Count")
        plt.ylabel("Column Names")
        plt.show()
    else:
        print("No missing columns found in quick check files.")
else:
    print("No 'notes' column or empty df_report. Skipping quick check analysis.")



In [None]:

# %%
# Heatmap of per-file mean row count, column count and pass flag (1/0).
# Skipped with a message when there is no report data or no 'test_pass' column.
if df_report is None or df_report.empty:
    print("No data in df_report to create a heatmap.")
elif "test_pass" not in df_report.columns:
    print("'test_pass' column not found in df_report.")
else:
    # Work on a copy with the pass flag mapped by truthiness to 1 / 0.
    pivot_df = df_report.assign(
        test_pass_numeric=df_report['test_pass'].apply(lambda flag: int(bool(flag)))
    )

    heatmap_columns = ['row_count', 'col_count', 'test_pass_numeric']
    pivot_table = pivot_df.pivot_table(
        index='file_path',
        values=heatmap_columns,
        aggfunc='mean'
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(pivot_table, annot=True, cmap="YlGnBu")
    plt.title("Heatmap of Row/Col Counts & Test Pass Status by File")
    plt.show()

In [None]:
# %%
# Report how many lines the notebook's log file currently contains.
log_file_path = os.path.join(LOG_DIR, "data_check_notebook.log")
if not os.path.exists(log_file_path):
    print("No log file found to analyze.")
else:
    with open(log_file_path, "r", encoding="utf-8") as log_handle:
        logs = list(log_handle)
    print(f"Log file contains {len(logs)} lines.")