## 🧩 Ongoing Data Quality Monitoring

**Objective:**  
To simulate automated data quality monitoring by computing and logging metrics (completeness, duplicates, timeliness) over time.

**Approach:**  
- The script loads the cleaned dataset and calculates key DQ metrics.  
- Each run appends a new entry to `data_quality_log.csv`.  
- Threshold-based alerts detect anomalies (e.g., low completeness or high duplicate rate).  
- The same script can be scheduled using a task scheduler (e.g., cron, Airflow, or Windows Task Scheduler).


In [4]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import smtplib
from email.mime.text import MIMEText

# --- Paths ---
# The base directory can be overridden with the DQ_BASE_DIR environment
# variable so the same script can run under a scheduler on another machine;
# when the variable is unset the original local path is used.
BASE_DIR = os.environ.get(
    "DQ_BASE_DIR",
    "/Users/deepti.gautam/Documents/Scrapers/Notebook/Analysis/Assignment/Firmable/data/processed",
)
DATA_PATH = os.path.join(BASE_DIR, "news_events_cleaned.csv")
LOG_PATH = os.path.join(BASE_DIR, "data_quality_log.csv")

# --- Load data ---
# Raises FileNotFoundError if the cleaned dataset is not present at DATA_PATH.
df = pd.read_csv(DATA_PATH)

# --- Helper Functions ---
def get_completeness(df):
    """Return the completeness of *df* as a percentage, rounded to 2 decimals.

    Completeness is 100 minus the average, across columns, of each column's
    share of missing values (as reported by ``DataFrame.isna``).
    """
    missing_share_by_column = df.isna().mean()
    overall_missing_pct = missing_share_by_column.mean() * 100
    return round(100 - overall_missing_pct, 2)

def get_duplicate_rate(df):
    """Return the percentage of rows flagged as duplicates, rounded to 2 decimals.

    A row counts as a duplicate when ``DataFrame.duplicated`` marks it, i.e.
    it repeats an earlier row across all columns.
    """
    duplicate_rows = df.duplicated().sum()
    total_rows = len(df)
    return round(duplicate_rows / total_rows * 100, 2)

def get_timeliness(df, cutoff="2024-01-01"):
    """Return the percentage of rows whose ``found_at`` is later than *cutoff*.

    ``found_at`` is parsed as UTC on a local copy, so the caller's frame is
    not modified. Values that cannot be parsed become NaT and are counted as
    not recent.

    Args:
        df: DataFrame to inspect.
        cutoff: Any value accepted by ``pd.Timestamp``. A naive value is
            interpreted as UTC. Only rows strictly after it count as recent.

    Returns:
        The share of recent rows as a percentage rounded to 2 decimals, or
        NaN when the frame has no ``found_at`` column or no rows.
    """
    if "found_at" not in df.columns:
        return np.nan

    total_rows = len(df)
    if total_rows == 0:
        # Nothing to measure; avoid dividing by zero.
        return np.nan

    # Parse into a separate Series instead of overwriting df["found_at"].
    found_at = pd.to_datetime(df["found_at"], errors="coerce", utc=True)

    cutoff_ts = pd.Timestamp(cutoff)
    if cutoff_ts.tzinfo is None:
        cutoff_ts = cutoff_ts.tz_localize("UTC")
    else:
        cutoff_ts = cutoff_ts.tz_convert("UTC")

    # Comparisons against NaT are False, so unparseable rows are not recent.
    recent_rows = int((found_at > cutoff_ts).sum())
    return round(recent_rows / total_rows * 100, 2)

# --- Compute Metrics ---
# One record per run: when it ran (local wall-clock time), how many rows the
# dataset had, and the three quality percentages. Entries are filled in the
# order they will appear as columns in the log.
metrics = {}
metrics["timestamp"] = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
metrics["rows"] = len(df)
metrics["completeness_%"] = get_completeness(df)
metrics["duplicate_rate_%"] = get_duplicate_rate(df)
metrics["timeliness_%"] = get_timeliness(df)

# --- Log Results ---
# Append this run as a single row instead of re-reading and rewriting the
# whole log: earlier rows are left exactly as written and the cost of a run
# does not grow with the log's size.
metrics_row = pd.DataFrame([metrics])

# Write the header only when starting a new log (file missing or zero bytes).
needs_header = not os.path.exists(LOG_PATH) or os.path.getsize(LOG_PATH) == 0
metrics_row.to_csv(LOG_PATH, mode="a", header=needs_header, index=False)

print("‚úÖ Data Quality Log Updated:")
print(metrics_row)

# --- Alerting ---
# Completeness is a minimum (alert when below); duplicate rate is a maximum
# (alert when above).
THRESHOLDS = {
    "completeness_%": 90,
    "duplicate_rate_%": 5
}

alerts = []

# A NaN metric compares False against any threshold, so without this check a
# metric that could not be computed would be reported as healthy.
for metric_name in THRESHOLDS:
    if pd.isna(metrics[metric_name]):
        alerts.append(f"{metric_name} could not be computed (NaN)")

if metrics["completeness_%"] < THRESHOLDS["completeness_%"]:
    alerts.append(f"‚ö†Ô∏è Completeness dropped below {THRESHOLDS['completeness_%']}%")
if metrics["duplicate_rate_%"] > THRESHOLDS["duplicate_rate_%"]:
    alerts.append(f"‚ö†Ô∏è Duplicate rate exceeded {THRESHOLDS['duplicate_rate_%']}%")

if alerts:
    # All triggered alerts are reported together, one per line.
    alert_msg = "\n".join(alerts)
    print("üö® ALERT:", alert_msg)

else:
    print("‚úÖ All metrics within acceptable thresholds.")


‚úÖ Data Quality Log Updated:
             timestamp    rows  completeness_%  duplicate_rate_%  timeliness_%
0  2025-11-11 15:47:03  612910           92.35               0.0         17.73
‚úÖ All metrics within acceptable thresholds.
