In [53]:
import sqlite3
import json
import pandas as pd
import os
from datetime import datetime

In [54]:
# Project root. Defaults to the original author's local path, but can be
# overridden with the DATA_QUALITY_PROJECT_DIR environment variable so the
# notebook can run on another machine without editing this cell.
project_dir = os.environ.get(
    'DATA_QUALITY_PROJECT_DIR',
    '/Users/abhishekkumar/Desktop/data_quality_project',
)

# Folder containing cleaned JSONL files
cleaned_folder = os.path.join(project_dir, 'cleaned_data')
# SQLite database file that receives the loaded rows and the DQ metrics
db_path = os.path.join(project_dir, 'data_quality.db')

In [55]:
# Open the SQLite database at db_path and take a cursor from it.
# Both `conn` and `cursor` are reused by the cells below.
conn = sqlite3.connect(database=db_path)
cursor = conn.cursor()

In [56]:
# ----------------- Create tables -----------------
# DDL for the two tables used by this notebook, executed in order.
# Both statements use CREATE TABLE IF NOT EXISTS, so re-running this cell
# against a database that already has the tables does not raise.
#   news_events          - one row per JSONL record; 'data' and 'included' as TEXT
#   data_quality_metrics - one row per loaded file with its row/quality counts
table_ddl_statements = (
    """
CREATE TABLE IF NOT EXISTS news_events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    data TEXT,
    included TEXT
);
""",
    """
CREATE TABLE IF NOT EXISTS data_quality_metrics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file_name TEXT,
    metric_date DATE,
    total_rows INTEGER,
    missing_rows INTEGER,
    duplicate_rows INTEGER,
    invalid_headlines INTEGER,
    future_dates INTEGER
);
""",
)

for ddl in table_ddl_statements:
    cursor.execute(ddl)

conn.commit()

In [57]:
# ----------------- Helper: flatten JSON -----------------
def flatten_json(file_path):
    """Load a JSONL file into a two-column DataFrame of JSON text.

    Every non-blank line of the file is parsed as one JSON object. Its
    'data' and 'included' values are re-serialized with ``json.dumps`` so
    each DataFrame cell holds a JSON string.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded JSONL file (one JSON object per line).

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with columns ``'data'`` and ``'included'``.
        An empty, column-less DataFrame is returned when the file contains
        no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.

    Notes
    -----
    A key that is absent from a record is serialized as the string
    ``'null'`` (``json.dumps(None)``), not as a missing value.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a trailing empty line)
            # are not records; skip them instead of failing in json.loads.
            if not line.strip():
                continue
            d = json.loads(line)
            records.append({
                'data': json.dumps(d.get('data')),
                'included': json.dumps(d.get('included')),
            })

    if not records:
        return pd.DataFrame()

    # Build the frame once from all records rather than concatenating
    # one single-row DataFrame per line.
    return pd.DataFrame(records, columns=['data', 'included'])


In [58]:
# ----------------- Load files -----------------
# For every .jsonl file in cleaned_folder: append its records to news_events
# and record one row of simple data-quality counts in data_quality_metrics.
#
# NOTE(review): both writes are plain appends, so re-running this cell adds
# the same events and metric rows again; nothing here de-duplicates.

# os.listdir returns entries in arbitrary order; sort for a reproducible load order.
all_files = sorted(f for f in os.listdir(cleaned_folder) if f.endswith('.jsonl'))

for file_name in all_files:
    file_path = os.path.join(cleaned_folder, file_name)
    df = flatten_json(file_path)
    
    if df.empty:
        print(f"No relevant data in {file_name}, skipping.")
        continue
    
    # Insert news_events rows
    df.to_sql('news_events', conn, if_exists='append', index=False)
    
    # ----------------- Compute simple DQ metrics -----------------
    # pandas reductions return numpy integer scalars. sqlite3 does not treat
    # those as Python ints (they would be bound as BLOBs), so cast explicitly.
    total_rows = int(df.shape[0])
    total_missing = int(df.isnull().sum().sum())
    duplicate_rows = int(df.duplicated().sum())
    
    future_dates = 0
    invalid_headlines = 0
    # Only compute if you have parsed data further, for now keep 0
    
    cursor.execute("""
    INSERT INTO data_quality_metrics
    (file_name, metric_date, total_rows, missing_rows, duplicate_rows, invalid_headlines, future_dates)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    """, (
        file_name,
        # Pass the ISO 'YYYY-MM-DD' string explicitly instead of relying on
        # sqlite3's default date adapter (deprecated since Python 3.12);
        # the stored text is the same.
        datetime.today().date().isoformat(),
        total_rows,
        total_missing,
        duplicate_rows,
        invalid_headlines,
        future_dates
    ))

conn.commit()

In [59]:
# Sanity check: report how many rows each table now holds.
# Each COUNT(*) query yields a single one-column row, unpacked directly.
(news_count,) = cursor.execute("SELECT COUNT(*) FROM news_events").fetchone()
(dq_count,) = cursor.execute("SELECT COUNT(*) FROM data_quality_metrics").fetchone()

print(
    "✅ Phase 5 Complete!",
    f"News Events Rows: {news_count}",
    f"DQ Metrics Rows: {dq_count}",
    sep="\n",
)

# Optional: close connection
conn.close()


✅ Phase 5 Complete!
News Events Rows: 671717
DQ Metrics Rows: 26
