In [16]:
# ======================================================================
# CELL 1: Imports and Configuration
# ======================================================================
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import json
from pathlib import Path
from dotenv import load_dotenv
from google.cloud import storage

# Load environment variables
script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
project_root = os.path.dirname(script_dir) if '__file__' in globals() else os.path.dirname(os.getcwd())
env_path = os.path.join(project_root, ".env")
load_dotenv(dotenv_path=env_path)

GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME")
GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
BRONZE_NEWS_PATH = "bronze/news/stock_news_api"

if not GCS_BUCKET_NAME:
    raise ValueError("GCS_BUCKET_NAME not found in .env file")
if not GCP_PROJECT_ID:
    raise ValueError("GCP_PROJECT_ID not found in .env file")

# Resolve credentials path if it's relative
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if credentials_path and not os.path.isabs(credentials_path):
    credentials_path = os.path.join(project_root, credentials_path)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# Create validation_reports directory
validation_reports_dir = os.path.join(project_root, "validation_reports")
os.makedirs(validation_reports_dir, exist_ok=True)

print("=" * 70)
print("NEWS DATA VALIDATION")
print("=" * 70)
print(f"Project Root: {project_root}")
print(f"GCS Bucket: {GCS_BUCKET_NAME}")
print(f"GCP Project: {GCP_PROJECT_ID}")
print(f"Validation Reports: {validation_reports_dir}")
print("=" * 70)


NEWS DATA VALIDATION
Project Root: /Users/evancallaghan/data_portfolio/data_engineering/stock_x_sentiment
GCS Bucket: stock_sentiment_pipeline
GCP Project: solid-coral-469323-i0
Validation Reports: /Users/evancallaghan/data_portfolio/data_engineering/stock_x_sentiment/validation_reports


In [17]:
# ======================================================================
# CELL 2: Load News Data from GCS Bronze Layer
# ======================================================================
print("=" * 70)
print("LOADING DATA FROM GCS")
print("=" * 70)

# Initialize GCS client
storage_client = storage.Client(project=GCP_PROJECT_ID)
bucket = storage_client.bucket(GCS_BUCKET_NAME)

# List all parquet files
print("Finding parquet files...")
blobs = bucket.list_blobs(prefix=BRONZE_NEWS_PATH)
parquet_files = [blob.name for blob in blobs if blob.name.endswith('.parquet')]

print(f"Found {len(parquet_files)} parquet files")

# Download and load files
dfs = []
for file_path in parquet_files:
    try:
        blob = bucket.blob(file_path)
        # Download to memory
        content = blob.download_as_bytes()
        # Read from bytes
        import io
        df_temp = pd.read_parquet(io.BytesIO(content))
        dfs.append(df_temp)
    except Exception as e:
        print(f"⚠️  Error loading {file_path}: {e}")
        continue

if not dfs:
    raise ValueError("No parquet files were successfully loaded")

# Combine all DataFrames
print("Combining DataFrames...")
df = pd.concat(dfs, ignore_index=True)

print(f"✅ Loaded {len(df):,} articles")
print(f"Columns: {list(df.columns)}")
print(f"\nSample data:")
print(df.head(3))


LOADING DATA FROM GCS
Finding parquet files...
Found 15 parquet files
Combining DataFrames...
✅ Loaded 23,359 articles
Columns: ['news_url', 'image_url', 'title', 'text', 'source_name', 'date', 'topics', 'sentiment', 'type', 'tickers', 'search_source', 'ticker', 'company_name', 'query_date', 'endpoint_used', 'search_terms_used']

Sample data:
                                            news_url  \
0  https://www.reuters.com/technology/artificial-...   
1  https://nypost.com/2024/11/12/business/apple-r...   
2  https://www.cnbc.com/2024/11/12/apple-wont-lau...   

                                           image_url  \
0  https://cdn.snapi.dev/images/v1/0/u/7/aapl28-2...   
1  https://cdn.snapi.dev/images/v1/p/c/b/aapl8-26...   
2  https://cdn.snapi.dev/images/v1/c/0/t/aapl30-2...   

                                               title  \
0  Apple to announce AI wall tablet as soon as Ma...   
1  Apple developing an iPad-like AI device that c...   
2  Apple won't launch a smart ring, s

In [18]:
# ======================================================================
# CELL 3: Run Validation Checks
# ======================================================================
print("=" * 70)
print("RUNNING VALIDATION CHECKS")
print("=" * 70)

validation_results = {
    "validation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "data_source": "news",
    "total_records": len(df),
    "checks": {}
}

# 1. Check for null dates
print("\n1. Checking for null dates...")
date_columns = ['date', 'query_date', 'article_date', 'query_date_parsed', 'date_key']
null_date_checks = {}

for col in date_columns:
    if col in df.columns:
        null_count = df[col].isnull().sum()
        null_date_checks[col] = {
            "null_count": int(null_count),
            "null_percentage": float(null_count / len(df) * 100) if len(df) > 0 else 0.0,
            "passed": null_count == 0
        }
        status = "✅ PASS" if null_count == 0 else f"⚠️  FAIL ({null_count} nulls)"
        print(f"   {col}: {status}")

validation_results["checks"]["null_dates"] = null_date_checks

# 2. Check date continuity (for query_date)
# Note: For news data, gaps are expected (weekends, holidays, no news days)
# We'll flag only large gaps (> 7 days) as potential issues
print("\n2. Checking date continuity...")
if 'query_date' in df.columns:
    # Convert to datetime if string
    if df['query_date'].dtype == 'object':
        df['query_date'] = pd.to_datetime(df['query_date'], errors='coerce')
    
    # Get date range
    min_date = df['query_date'].min()
    max_date = df['query_date'].max()
    
    # Check for large gaps (group by ticker and date)
    # Only flag gaps > 7 days (more than a week)
    continuity_issues = []
    all_gaps = []
    for ticker in df['ticker'].unique() if 'ticker' in df.columns else [None]:
        ticker_df = df[df['ticker'] == ticker] if ticker else df
        dates = ticker_df['query_date'].dropna().dt.date.unique()
        dates_sorted = sorted(dates)
        
        if len(dates_sorted) > 1:
            # Check for gaps larger than 7 days
            for i in range(len(dates_sorted) - 1):
                gap = (dates_sorted[i+1] - dates_sorted[i]).days
                all_gaps.append(gap)
                if gap > 7:  # Flag gaps larger than a week
                    continuity_issues.append({
                        "ticker": ticker,
                        "gap_start": str(dates_sorted[i]),
                        "gap_end": str(dates_sorted[i+1]),
                        "gap_days": gap
                    })
    
    validation_results["checks"]["date_continuity"] = {
        "min_date": str(min_date) if pd.notna(min_date) else None,
        "max_date": str(max_date) if pd.notna(max_date) else None,
        "date_range_days": (max_date - min_date).days if pd.notna(min_date) and pd.notna(max_date) else None,
        "total_gaps_1day": len([g for g in all_gaps if g > 1]),
        "total_gaps_7day": len(continuity_issues),
        "continuity_issues": continuity_issues,
        "passed": len(continuity_issues) == 0  # Only fail on gaps > 7 days
    }
    
    status = "✅ PASS" if len(continuity_issues) == 0 else f"⚠️  WARNING ({len(continuity_issues)} large gaps > 7 days)"
    print(f"   Date range: {min_date} to {max_date}")
    print(f"   Total gaps > 1 day: {len([g for g in all_gaps if g > 1])} (expected for news data)")
    print(f"   Large gaps > 7 days: {len(continuity_issues)}")
    print(f"   Continuity: {status}")
    if continuity_issues:
        print(f"   Note: Large gaps may indicate missing data collection periods")

# 3. Check for duplicates
print("\n3. Checking for duplicates...")
duplicate_checks = {}

# Primary check: duplicate (ticker, query_date, url) combinations
# This is the most reliable check since URLs should be unique
# Use 'news_url' if 'url' doesn't exist
url_col = 'url' if 'url' in df.columns else ('news_url' if 'news_url' in df.columns else None)
if url_col and 'ticker' in df.columns and 'query_date' in df.columns:
    duplicates = df.duplicated(subset=['ticker', 'query_date', url_col], keep=False)
    duplicate_count = duplicates.sum()
    
    # Get more details about duplicates
    duplicate_details = {}
    if duplicate_count > 0:
        duplicate_df = df[duplicates].copy()
        # Count unique duplicate groups
        duplicate_groups = duplicate_df.groupby(['ticker', 'query_date', url_col]).size()
        duplicate_details = {
            "unique_duplicate_groups": int(len(duplicate_groups)),
            "avg_duplicates_per_group": float(duplicate_groups.mean()) if len(duplicate_groups) > 0 else 0.0,
            "max_duplicates_in_group": int(duplicate_groups.max()) if len(duplicate_groups) > 0 else 0,
            "sample_duplicates": []
        }
        
        # Show sample of duplicate groups
        sample_groups = duplicate_groups.head(5)
        for (ticker, date, url), count in sample_groups.items():
            duplicate_details["sample_duplicates"].append({
                "ticker": str(ticker),
                "query_date": str(date),
                "url": str(url)[:100] + "..." if len(str(url)) > 100 else str(url),
                "count": int(count)
            })
    
    duplicate_checks["ticker_date_url"] = {
        "duplicate_count": int(duplicate_count),
        "duplicate_details": duplicate_details if duplicate_count > 0 else None,
        "passed": True,  # Don't fail - duplicates will be handled in transformation
        "note": "Duplicates are expected in Bronze layer and will be deduplicated during transformation"
    }
    status = "✅ PASS" if duplicate_count == 0 else f"ℹ️  INFO ({duplicate_count} duplicate records - will be deduplicated in transformation)"
    print(f"   (ticker, query_date, {url_col}): {status}")
    if duplicate_count > 0:
        print(f"      Unique duplicate groups: {duplicate_details.get('unique_duplicate_groups', 0)}")
        print(f"      Avg duplicates per group: {duplicate_details.get('avg_duplicates_per_group', 0):.1f}")
        print(f"      Max duplicates in a group: {duplicate_details.get('max_duplicates_in_group', 0)}")
        print(f"      Note: Duplicates are expected (same URL found via different search terms)")
        print(f"            They will be removed during the transformation step")
elif not url_col:
    print(f"   ⚠️  WARNING: No URL column found (neither 'url' nor 'news_url')")
    duplicate_checks["ticker_date_url"] = {
        "duplicate_count": 0,
        "passed": True,
        "note": "URL column not found - skipping duplicate check"
    }

# Secondary check: duplicate (ticker, query_date, title) combinations
# This is informational - same titles can appear legitimately on different days
if all(col in df.columns for col in ['ticker', 'query_date', 'title']):
    duplicates = df.duplicated(subset=['ticker', 'query_date', 'title'], keep=False)
    duplicate_count = duplicates.sum()
    duplicate_checks["ticker_date_title"] = {
        "duplicate_count": int(duplicate_count),
        "passed": True,  # Don't fail on this - it's informational
        "note": "Same title on same date may be legitimate (e.g., republished articles)"
    }
    status = f"ℹ️  INFO ({duplicate_count} duplicates - may be legitimate)"
    print(f"   (ticker, query_date, title): {status}")

validation_results["checks"]["duplicates"] = duplicate_checks

# 4. Check expected column types
print("\n4. Checking column types...")
expected_types = {
    'ticker': 'object',  # string
    'query_date': 'datetime64[ns]',
    'title': 'object',  # string
    'text': 'object',  # string (nullable)
    'url': 'object',  # string (nullable)
    'date_key': 'object'  # string
}

type_checks = {}
for col, expected_type in expected_types.items():
    if col in df.columns:
        actual_type = str(df[col].dtype)
        # Normalize type comparison
        type_match = (
            (expected_type == 'object' and actual_type == 'object') or
            (expected_type == 'datetime64[ns]' and 'datetime' in actual_type) or
            actual_type == expected_type
        )
        type_checks[col] = {
            "expected": expected_type,
            "actual": actual_type,
            "passed": type_match
        }
        status = "✅ PASS" if type_match else f"⚠️  FAIL (expected {expected_type}, got {actual_type})"
        print(f"   {col}: {status}")

validation_results["checks"]["column_types"] = type_checks

# 5. Check required fields (title should not be empty)
print("\n5. Checking required fields...")
required_field_checks = {}

if 'title' in df.columns:
    empty_titles = (df['title'].isnull() | (df['title'].astype(str).str.strip() == '')).sum()
    required_field_checks["title"] = {
        "empty_count": int(empty_titles),
        "empty_percentage": float(empty_titles / len(df) * 100) if len(df) > 0 else 0.0,
        "passed": empty_titles == 0
    }
    status = "✅ PASS" if empty_titles == 0 else f"⚠️  FAIL ({empty_titles} empty titles)"
    print(f"   title: {status}")

if 'ticker' in df.columns:
    null_tickers = df['ticker'].isnull().sum()
    required_field_checks["ticker"] = {
        "null_count": int(null_tickers),
        "null_percentage": float(null_tickers / len(df) * 100) if len(df) > 0 else 0.0,
        "passed": null_tickers == 0
    }
    status = "✅ PASS" if null_tickers == 0 else f"⚠️  FAIL ({null_tickers} null tickers)"
    print(f"   ticker: {status}")

validation_results["checks"]["required_fields"] = required_field_checks

# 6. Data quality summary
print("\n6. Data quality summary...")
unique_tickers = df['ticker'].nunique() if 'ticker' in df.columns else 0
unique_dates = df['query_date'].nunique() if 'query_date' in df.columns else 0

validation_results["summary"] = {
    "unique_tickers": int(unique_tickers),
    "unique_dates": int(unique_dates),
    "articles_per_ticker_avg": float(len(df) / unique_tickers) if unique_tickers > 0 else 0.0
}

print(f"   Unique tickers: {unique_tickers}")
print(f"   Unique dates: {unique_dates}")
print(f"   Avg articles per ticker: {len(df) / unique_tickers:.1f}" if unique_tickers > 0 else "   Avg articles per ticker: N/A")

# Overall validation status
# Only fail on critical checks: null dates and required fields
# Duplicates are expected in Bronze layer and will be handled in transformation
critical_checks = []

# Add null date checks
if "null_dates" in validation_results["checks"]:
    for col_check in validation_results["checks"]["null_dates"].values():
        if isinstance(col_check, dict):
            critical_checks.append(col_check)

# Add required field checks
if "required_fields" in validation_results["checks"]:
    for field_check in validation_results["checks"]["required_fields"].values():
        if isinstance(field_check, dict):
            critical_checks.append(field_check)

# Note: URL duplicates are NOT critical - they're expected and will be deduplicated in transformation

all_passed = all(
    check.get("passed", False) 
    for check in critical_checks
    if isinstance(check, dict) and check.get("passed") is not None
)

validation_results["overall_status"] = "PASS" if all_passed else "FAIL"
print(f"\n{'='*70}")
print(f"OVERALL VALIDATION STATUS: {validation_results['overall_status']}")
print(f"{'='*70}")
print("Note: Date continuity gaps and duplicates are informational only")
print("      (Duplicates will be handled during transformation)")
print("      Critical checks: null dates, required fields")


RUNNING VALIDATION CHECKS

1. Checking for null dates...
   date: ✅ PASS
   query_date: ✅ PASS

2. Checking date continuity...
   Date range: 2024-11-12 00:00:00 to 2025-11-12 00:00:00
   Total gaps > 1 day: 589 (expected for news data)
   Large gaps > 7 days: 5
   Note: Large gaps may indicate missing data collection periods

3. Checking for duplicates...
   (ticker, query_date, news_url): ℹ️  INFO (1994 duplicate records - will be deduplicated in transformation)
      Unique duplicate groups: 993
      Avg duplicates per group: 2.0
      Max duplicates in a group: 3
      Note: Duplicates are expected (same URL found via different search terms)
            They will be removed during the transformation step
   (ticker, query_date, title): ℹ️  INFO (1994 duplicates - may be legitimate)

4. Checking column types...
   ticker: ✅ PASS
   query_date: ✅ PASS
   title: ✅ PASS
   text: ✅ PASS

5. Checking required fields...
   title: ✅ PASS
   ticker: ✅ PASS

6. Data quality summary...
   Un

In [19]:
# ======================================================================
# CELL 4: Save Validation Report
# ======================================================================
print("=" * 70)
print("SAVING VALIDATION REPORT")
print("=" * 70)

# Generate report filename with current date
report_date = datetime.now().strftime("%Y%m%d")
report_filename = f"{report_date}_news.json"
report_path = os.path.join(validation_reports_dir, report_filename)

# Save validation report
with open(report_path, 'w') as f:
    json.dump(validation_results, f, indent=2, default=str)

print(f"✅ Validation report saved to: {report_path}")
print(f"\nReport Summary:")
print(f"  Total Records: {validation_results['total_records']:,}")
print(f"  Overall Status: {validation_results['overall_status']}")
print(f"  Checks Performed: {len(validation_results['checks'])}")

# Display validation results
print(f"\n{'='*70}")
print("VALIDATION REPORT")
print(f"{'='*70}")
print(json.dumps(validation_results, indent=2, default=str))


SAVING VALIDATION REPORT
✅ Validation report saved to: /Users/evancallaghan/data_portfolio/data_engineering/stock_x_sentiment/validation_reports/20251114_news.json

Report Summary:
  Total Records: 23,359
  Overall Status: PASS
  Checks Performed: 5

VALIDATION REPORT
{
  "validation_date": "2025-11-14 08:28:25",
  "data_source": "news",
  "total_records": 23359,
  "checks": {
    "null_dates": {
      "date": {
        "null_count": 0,
        "null_percentage": 0.0,
        "passed": "True"
      },
      "query_date": {
        "null_count": 0,
        "null_percentage": 0.0,
        "passed": "True"
      }
    },
    "date_continuity": {
      "min_date": "2024-11-12 00:00:00",
      "max_date": "2025-11-12 00:00:00",
      "date_range_days": 365,
      "total_gaps_1day": 589,
      "total_gaps_7day": 5,
      "continuity_issues": [
        {
          "ticker": "ORCL",
          "gap_start": "2025-04-16",
          "gap_end": "2025-04-25",
          "gap_days": 9
        },
     