# Teradata DBQL Summary Table History Report

This notebook demonstrates how to retrieve DBQL summary table history from Teradata PDCR data using the `PDCRInfoReport` class.

**Report Parameters:**
- User filter: `%` (all users)
- Time range: Last 3 years
- Data source: `PDCRINFO.DBQLSummaryTbl_Hst`

**Focus:**
- Top 10 CPU users
- CPU usage trends over time

## 1. Import Required Libraries

Import necessary libraries for PDCR reporting and data analysis.

In [None]:
import logging
import sys
from datetime import date, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add src to path for imports
sys.path.insert(0, str(Path.cwd()))

# Import the reporting module
from src.reports import PDCRInfoReport
from src.connection import TeradataConnectionError

# Configure logging: INFO level, records formatted as
# "<timestamp> - <logger name> - <level> - <message>"
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

print("✓ Libraries imported successfully!")

## 2. Configure Date Range

Calculate the date range for the last 3 years of data.

In [None]:
# Reporting window: ends yesterday and reaches back LOOKBACK_YEARS calendar years.
LOOKBACK_YEARS = 3

end_date = date.today() - timedelta(days=1)  # Yesterday

# Step back by calendar years rather than by a fixed 365-day multiple, so that
# leap days inside the window do not shift the start date.
try:
    start_date = end_date.replace(year=end_date.year - LOOKBACK_YEARS)
except ValueError:
    # end_date is Feb 29 and the target year has no Feb 29: fall back to Feb 28
    start_date = end_date.replace(year=end_date.year - LOOKBACK_YEARS, day=28)

# User filter pattern
user_pattern = "%"

print("Date Range:")
print(f"  Start Date: {start_date}")
print(f"  End Date:   {end_date}")
print(f"  User Pattern: {user_pattern}")
# +1 because both the start and the end date are counted
print(f"  Days: {(end_date - start_date).days + 1}")

## 3. Initialize PDCR Report Generator

Create an instance of the `PDCRInfoReport` class to access PDCR data.

In [None]:
try:
    # Build the report generator; later cells use the `report` name
    report = PDCRInfoReport()
    print("✓ PDCRInfoReport initialized successfully")

    # Show which environments the connection manager knows about
    environments = report.conn_mgr.list_environments()
    print(f"✓ Available environments: {environments}")

except TeradataConnectionError as conn_error:
    # Setup checklist printed after the error, one line per entry
    setup_hints = (
        "\nPlease ensure:",
        "1. td_env.yaml file exists in the project root",
        "2. Copy td_env.yaml.template to td_env.yaml",
        "3. Update credentials for your test/prod environments",
    )
    print(f"✗ Connection Error: {conn_error}")
    for hint in setup_hints:
        print(hint)

## 4. Retrieve DBQL Summary Table History Data

Query `PDCRINFO.DBQLSummaryTbl_Hst` for all users over the last 3 years.

In [None]:
try:
    # Fetch the DBQL summary history for the configured window and user pattern
    df = report.get_DBQLSummaryTable_History(
        env_name='test',  # Change to 'prod' for production data
        start_date=start_date,
        end_date=end_date,
        user_name=user_pattern,
    )

    row_count = len(df)
    print(f"✓ Retrieved {row_count:,} rows from PDCRINFO.DBQLSummaryTbl_Hst")
    print(f"\nDataFrame Shape: {df.shape}")

    # deep=True also measures the contents of object columns; bytes -> MB
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory Usage: {memory_mb:.2f} MB")

except Exception as fetch_error:
    # Any failure leaves df as None so that later cells take their "no data" branch
    print(f"✗ Error retrieving DBQL summary data: {fetch_error}")
    df = None

## 5. Display Sample Data

Preview the first few rows to understand the data structure.

In [None]:
# Guard first: nothing to preview when retrieval failed or returned no rows
if df is None or df.empty:
    print("No data available to display.")
else:
    print("First 10 rows:")
    preview_rows = df.head(10)
    display(preview_rows)

    print("\nColumn Data Types:")
    print(df.dtypes)

## 6. Data Summary Statistics

Summarize the retrieved data: date coverage, user and query counts, AMP/parser/total CPU time, and I/O volume.

In [None]:
# Guard first: skip the report when retrieval failed or returned no rows
if df is None or df.empty:
    print("No data available for analysis.")
else:
    def _print_cpu_stats(header, column):
        """Print the header, then total/mean/median/max of df[column]."""
        print(header)
        values = df[column]
        print(f"  Total:   {values.sum():,.2f}")
        print(f"  Mean:    {values.mean():,.2f}")
        print(f"  Median:  {values.median():,.2f}")
        print(f"  Max:     {values.max():,.2f}")

    banner = "=" * 80
    print(banner)
    print("DBQL SUMMARY TABLE STATISTICS")
    print(banner)

    # Date range covered by the retrieved rows
    print("\nDate Range:")
    log_dates = df['LogDate']
    print(f"  First Log Date: {log_dates.min()}")
    print(f"  Last Log Date:  {log_dates.max()}")
    print(f"  Unique Dates:   {log_dates.nunique()}")

    # User coverage
    print("\nUser Coverage:")
    print(f"  Unique Users:    {df['UserName'].nunique()}")
    print(f"  Total Queries:   {df['QueryCount'].sum():,.0f}")

    # CPU usage statistics
    _print_cpu_stats("\nAMP CPU Time (seconds):", 'AMPCPUTime')
    _print_cpu_stats("\nParser CPU Time (seconds):", 'ParserCPUTime')

    # Total CPU = AMP + parser; the column is added to df itself,
    # so later cells can reuse it
    df['TotalCPUTime'] = df['AMPCPUTime'] + df['ParserCPUTime']
    _print_cpu_stats("\nTotal CPU Time (seconds):", 'TotalCPUTime')

    # I/O statistics
    print("\nI/O Statistics:")
    print(f"  Total I/O Count: {df['TotalIOCount'].sum():,.0f}")
    # KB -> GB
    print(f"  Total I/O (GB):  {df['TotalIOInKB'].sum() / 1024**2:,.2f}")

## 7. Top 10 CPU Users

Identify the top 10 users by total CPU time consumption.

In [None]:
if df is not None and not df.empty:
    # Unit conversions used for the readable columns below
    SECONDS_PER_HOUR = 3600
    KB_PER_GB = 1024**2

    # Derive TotalCPUTime when an earlier cell has not already added it
    if 'TotalCPUTime' not in df.columns:
        df['TotalCPUTime'] = df['AMPCPUTime'] + df['ParserCPUTime']

    # One row per user with summed metrics, heaviest CPU consumers first.
    # `user_summary` is reused by the plotting and export cells.
    user_summary = (
        df.groupby('UserName')
        .agg({
            'TotalCPUTime': 'sum',
            'AMPCPUTime': 'sum',
            'ParserCPUTime': 'sum',
            'QueryCount': 'sum',
            'TotalIOCount': 'sum',
            'TotalIOInKB': 'sum'
        })
        .reset_index()
        .sort_values('TotalCPUTime', ascending=False)
    )

    # Convert to hours / GB for readability
    user_summary['TotalCPUTime_Hours'] = user_summary['TotalCPUTime'] / SECONDS_PER_HOUR
    user_summary['AMPCPUTime_Hours'] = user_summary['AMPCPUTime'] / SECONDS_PER_HOUR
    user_summary['ParserCPUTime_Hours'] = user_summary['ParserCPUTime'] / SECONDS_PER_HOUR
    user_summary['TotalIOInKB_GB'] = user_summary['TotalIOInKB'] / KB_PER_GB

    top_10_summary = user_summary.head(10)

    print("\nTop 10 Users by Total CPU Time:")
    print("=" * 120)
    display(top_10_summary[[
        'UserName', 'TotalCPUTime_Hours', 'AMPCPUTime_Hours', 'ParserCPUTime_Hours',
        'QueryCount', 'TotalIOInKB_GB'
    ]])

    # Share of the overall CPU time consumed by the top 10
    total_cpu = user_summary['TotalCPUTime'].sum()
    top_10_cpu = top_10_summary['TotalCPUTime'].sum()
    if total_cpu > 0:
        print(f"\nTop 10 users consume {top_10_cpu/total_cpu*100:.1f}% of total CPU time")
    else:
        # Avoid dividing by zero when no CPU time was recorded at all
        print("\nTotal CPU time is zero; the top 10 share cannot be computed")
else:
    print("No data available for user ranking.")

## 8. Visualize Top 10 CPU Users

Bar chart showing the top 10 users by CPU consumption.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

if df is not None and not df.empty:
    # Top 10 rows of user_summary (built and sorted by the "Top 10 CPU Users" cell)
    top_users = user_summary.head(10)

    fig, ax = plt.subplots(figsize=(12, 6))
    # seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the x
    # variable keeps one colour per bar. dodge=False keeps full-width bars on
    # seaborn versions that would otherwise split bars by hue level.
    sns.barplot(
        data=top_users,
        x='UserName',
        y='TotalCPUTime_Hours',
        hue='UserName',
        palette='rocket',
        dodge=False,
        ax=ax
    )
    # The hue legend only repeats the x tick labels; drop it when seaborn drew one
    bar_legend = ax.get_legend()
    if bar_legend is not None:
        bar_legend.remove()

    ax.set_title('Top 10 Users by Total CPU Time (Hours)', fontsize=14, fontweight='bold')
    ax.set_xlabel('User Name', fontsize=12)
    ax.set_ylabel('Total CPU Time (Hours)', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
else:
    print("No data available for visualization.")

## 9. CPU Usage Per Day Over Period

Plot total CPU usage aggregated by day over the entire time period.

In [None]:
import matplotlib.pyplot as plt

if df is not None and not df.empty:
    # Derive TotalCPUTime when an earlier cell has not already added it
    if 'TotalCPUTime' not in df.columns:
        df['TotalCPUTime'] = df['AMPCPUTime'] + df['ParserCPUTime']

    # One row per LogDate with the summed CPU seconds, plus an hours column.
    # `daily_cpu` is also written out by the export cell.
    daily_cpu = df.groupby('LogDate')['TotalCPUTime'].sum().reset_index()
    daily_cpu['TotalCPUTime_Hours'] = daily_cpu['TotalCPUTime'] / 3600

    # Line chart of CPU hours per day
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.plot(
        daily_cpu['LogDate'],
        daily_cpu['TotalCPUTime_Hours'],
        marker='o',
        linewidth=2,
        markersize=4,
        color='#2E86AB',
    )
    ax.set_title('Total CPU Usage Per Day Over Time', fontsize=14, fontweight='bold')
    ax.set_xlabel('Log Date', fontsize=12)
    ax.set_ylabel('Total CPU Time (Hours)', fontsize=12)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    daily_hours = daily_cpu['TotalCPUTime_Hours']
    print("\nDaily CPU Statistics:")
    print(f"  Average CPU per day: {daily_hours.mean():,.2f} hours")

    # idxmax / idxmin give the row label of the extreme day; look up its date
    peak_date = daily_cpu.loc[daily_hours.idxmax(), 'LogDate']
    print(f"  Peak CPU day:        {daily_hours.max():,.2f} hours on {peak_date}")
    lowest_date = daily_cpu.loc[daily_hours.idxmin(), 'LogDate']
    print(f"  Minimum CPU day:     {daily_hours.min():,.2f} hours on {lowest_date}")

## 10. CPU Usage Distribution by Top Users

Pie chart showing each top 10 user's share of total CPU time, with all remaining users grouped into a single "Others" slice.

In [None]:
import matplotlib.pyplot as plt

if df is not None and not df.empty:
    # Top N users get their own slice; everyone else is folded into "Others"
    top_n = 10
    top_users = user_summary.head(top_n)
    # user_summary was re-sorted by CPU time, so its index labels are not in
    # row order; .iloc makes the positional "rows after the top N" explicit
    other_cpu = user_summary['TotalCPUTime'].iloc[top_n:].sum()

    # Create pie data
    pie_data = top_users[['UserName', 'TotalCPUTime']].copy()
    if other_cpu > 0:
        # Only add "Others" when it has something to show; a zero slice would
        # just add a "0.0%" label on top of its neighbours
        pie_data = pd.concat(
            [
                pie_data,
                pd.DataFrame({'UserName': ['Others'], 'TotalCPUTime': [other_cpu]})
            ],
            ignore_index=True
        )

    if pie_data['TotalCPUTime'].sum() > 0:
        fig, ax = plt.subplots(figsize=(10, 8))
        ax.pie(
            pie_data['TotalCPUTime'],
            labels=pie_data['UserName'],
            autopct='%1.1f%%',
            startangle=140
        )
        ax.set_title('CPU Usage Distribution by User', fontsize=14, fontweight='bold')
        plt.show()
    else:
        # Shares of a zero total are undefined, so there is nothing meaningful to draw
        print("Total CPU time is zero; no distribution to plot.")
else:
    print("No data available for distribution analysis.")

## 11. Top Users CPU Trends Over Time

Line plot showing CPU usage trends for top 6 users over time.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Guard first: nothing to plot when retrieval failed or returned no rows
if df is None or df.empty:
    print("No data available for trend analysis.")
else:
    # Names of the six heaviest CPU consumers (user_summary is sorted by CPU time)
    top_6_users = user_summary.head(6)['UserName'].tolist()

    # Work on a copy restricted to those users so df itself is left untouched
    is_top_user = df['UserName'].isin(top_6_users)
    df_top = df[is_top_user].copy()

    if 'TotalCPUTime' not in df_top.columns:
        df_top['TotalCPUTime'] = df_top['AMPCPUTime'] + df_top['ParserCPUTime']

    # CPU seconds per (date, user), plus an hours column for plotting
    user_daily = df_top.groupby(['LogDate', 'UserName'])['TotalCPUTime'].sum().reset_index()
    user_daily['TotalCPUTime_Hours'] = user_daily['TotalCPUTime'] / 3600

    # One line per user, drawn in ranking order so the legend follows the ranking
    fig, ax = plt.subplots(figsize=(16, 8))
    for user in top_6_users:
        user_data = user_daily[user_daily['UserName'] == user]
        ax.plot(
            user_data['LogDate'],
            user_data['TotalCPUTime_Hours'],
            marker='o',
            linewidth=2,
            label=user,
            alpha=0.7,
        )

    ax.set_title('CPU Usage Trends for Top 6 Users Over Time', fontsize=14, fontweight='bold')
    ax.set_xlabel('Log Date', fontsize=12)
    ax.set_ylabel('Total CPU Time (Hours)', fontsize=12)
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## 12. Query Count Analysis

Rank users by total query count: a table of the top 20 users and a bar chart of the top 10.

In [None]:
if df is not None and not df.empty:
    # Total queries per user, busiest users first
    query_summary = (
        df.groupby('UserName')['QueryCount']
        .sum()
        .reset_index()
        .sort_values('QueryCount', ascending=False)
    )

    print("\nTop 20 Users by Query Count:")
    print("=" * 80)
    display(query_summary.head(20))

    # Plot top 10 by query count
    fig, ax = plt.subplots(figsize=(12, 6))
    # seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the x
    # variable keeps one colour per bar. dodge=False keeps full-width bars on
    # seaborn versions that would otherwise split bars by hue level.
    sns.barplot(
        data=query_summary.head(10),
        x='UserName',
        y='QueryCount',
        hue='UserName',
        palette='viridis',
        dodge=False,
        ax=ax
    )
    # The hue legend only repeats the x tick labels; drop it when seaborn drew one
    query_legend = ax.get_legend()
    if query_legend is not None:
        query_legend.remove()

    ax.set_title('Top 10 Users by Query Count', fontsize=14, fontweight='bold')
    ax.set_xlabel('User Name', fontsize=12)
    ax.set_ylabel('Query Count', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
else:
    print("No data available for query analysis.")

## 13. Export Results to CSV (Optional)

Save the results to CSV files for further analysis or reporting.

In [None]:
# Guard first: nothing to write when retrieval failed or returned no rows
if df is None or df.empty:
    print("No data to export.")
else:
    # All files go into ./output, created on first use
    output_dir = Path('output')
    output_dir.mkdir(exist_ok=True)

    # Full row-level data
    filename = f"dbql_summary_{start_date}_{end_date}.csv"
    output_path = output_dir / filename
    df.to_csv(output_path, index=False)
    print(f"✓ Full data exported to: {output_path}")
    print(f"  Rows: {len(df):,}")
    # Size of the file just written, bytes -> MB
    exported_mb = output_path.stat().st_size / 1024**2
    print(f"  File size: {exported_mb:.2f} MB")

    # Per-user aggregate (built by the "Top 10 CPU Users" cell)
    user_summary_path = output_dir / f"dbql_user_summary_{start_date}_{end_date}.csv"
    user_summary.to_csv(user_summary_path, index=False)
    print(f"\n✓ User summary exported to: {user_summary_path}")

    # Per-day CPU totals (built by the "CPU Usage Per Day" cell)
    daily_cpu_path = output_dir / f"dbql_daily_cpu_{start_date}_{end_date}.csv"
    daily_cpu.to_csv(daily_cpu_path, index=False)
    print(f"✓ Daily CPU data exported to: {daily_cpu_path}")

## 14. Close Connections

Properly clean up database connections when done.

In [None]:
try:
    # `report` only exists if the initialization cell succeeded
    if 'report' not in locals():
        print("No report instance to close")
    else:
        report.close()
        print("✓ All database connections closed successfully")
except Exception as close_error:
    # Report the failure instead of raising, so the notebook still finishes
    print(f"✗ Error closing connections: {close_error}")