In [1]:
import json
import pandas as pd
import sqlite3
from pathlib import Path
from datetime import datetime
import plotly.graph_objects as go

In [2]:
cleaned_folder = Path('/Users/abhishekkumar/Desktop/data_quality_project/cleaned_data')
db_path = Path('/Users/abhishekkumar/Desktop/data_quality_project/data_quality.db')

In [3]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [4]:
# Create tables if they don't exist
cursor.execute("""
CREATE TABLE IF NOT EXISTS data_quality_metrics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file_name TEXT,
    metric_date DATE,
    total_rows INTEGER,
    missing_rows INTEGER,
    duplicate_rows INTEGER,
    invalid_headlines INTEGER,
    future_dates INTEGER
)
""")
conn.commit()

In [None]:
# ---------- Recompute DQ metrics ----------
dq_metrics_list = []

for file_path in cleaned_folder.glob('*.jsonl'):
    file_name = file_path.name
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]
    except Exception as e:
        print(f"Error reading {file_name}: {e}")
        continue

    total_rows = len(data)
    missing_rows = sum(1 for d in data if not d.get('data') or not d.get('included'))
    duplicate_rows = total_rows - len({json.dumps(d) for d in data})
    invalid_headlines = sum(1 if (not bool(str(d.get('data')).strip()) if d.get('data') is not None else True) else 0 for d in data)
    future_dates = 0  # optional, add if you have date fields

    # Insert into DB
    cursor.execute("""
        INSERT INTO data_quality_metrics
        (file_name, metric_date, total_rows, missing_rows, duplicate_rows, invalid_headlines, future_dates)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """, (file_name, datetime.today().date(), total_rows, missing_rows, duplicate_rows, invalid_headlines, future_dates))
    dq_metrics_list.append({
        'file_name': file_name,
        'total_rows': total_rows,
        'missing_rows': missing_rows,
        'duplicate_rows': duplicate_rows,
        'invalid_headlines': invalid_headlines,
        'future_dates': future_dates
    })

conn.commit()

  cursor.execute("""
