# Teradata SpoolSpace History Report

This notebook demonstrates how to retrieve spool space usage history from Teradata PDCR data using the `PDCRInfoReport` class.

**Report Parameters:**
- Database filter: `DWP01%` (all databases starting with DWP01)
- Time range: Last 3 years
- Data source: `PDCRINFO.SpoolSpace_Hst`

## 1. Import Required Libraries

Import necessary libraries for PDCR reporting and data analysis.

In [None]:
import logging
import sys
from pathlib import Path
from datetime import date, timedelta
import pandas as pd

# Add src to path for imports
# The current working directory is placed first on sys.path so that the
# `src` package below can be resolved.
# NOTE(review): this assumes the notebook is started from the project root — confirm.
sys.path.insert(0, str(Path.cwd()))

# Import the reporting module
from src.reports import PDCRInfoReport
from src.connection import TeradataConnectionError

# Configure logging
# Root logger at INFO; each record shows timestamp, logger name, level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

print("✓ Libraries imported successfully!")

## 2. Configure Date Range

Calculate the date range for the last 3 years of data.

In [None]:
# Reporting window: the last LOOKBACK_YEARS calendar years, ending yesterday.
LOOKBACK_YEARS = 3

end_date = date.today() - timedelta(days=1)  # Yesterday

# Step back whole calendar years. A fixed 3*365-day offset lands one day late
# whenever the window contains a Feb 29.
try:
    start_date = end_date.replace(year=end_date.year - LOOKBACK_YEARS)
except ValueError:
    # end_date is Feb 29 and the target year has no such day: fall back to Feb 28.
    start_date = end_date.replace(year=end_date.year - LOOKBACK_YEARS, day=28)

# Database filter pattern
database_pattern = "DWP01%"

print("Date Range:")
print(f"  Start Date: {start_date}")
print(f"  End Date:   {end_date}")
print(f"  Database Pattern: {database_pattern}")
# +1 because both the start and the end date are counted.
print(f"  Days: {(end_date - start_date).days + 1}")

## 3. Initialize PDCR Report Generator

Create an instance of the `PDCRInfoReport` class to access PDCR data.

In [None]:
# Build the report generator and list the environments it knows about.
# A connection error is reported together with setup hints instead of a traceback.
try:
    report = PDCRInfoReport()
    print("✓ PDCRInfoReport initialized successfully")

    # Environments are listed through the report's connection manager
    environments = report.conn_mgr.list_environments()
    print(f"✓ Available environments: {environments}")

except TeradataConnectionError as conn_err:
    print(f"✗ Connection Error: {conn_err}")
    print("\nPlease ensure:")
    for setup_hint in (
        "1. td_env.yaml file exists in the project root",
        "2. Copy td_env.yaml.template to td_env.yaml",
        "3. Update credentials for your test/prod environments",
    ):
        print(setup_hint)

## 4. Retrieve SpoolSpace History Data

Query `PDCRINFO.SpoolSpace_Hst` for all databases starting with `DWP01%` over the last 3 years.

In [None]:
# Fetch the spool history for the configured window and database pattern.
# On any failure the error is printed and `df` is set to None so that the
# following cells can detect the missing data.
try:
    history_query = dict(
        env_name='test',  # Change to 'prod' for production data
        start_date=start_date,
        end_date=end_date,
        database_name=database_pattern,
    )
    df = report.get_spoolspace_history(**history_query)

    print(f"✓ Retrieved {len(df):,} rows from PDCRINFO.SpoolSpace_Hst")
    print(f"\nDataFrame Shape: {df.shape}")
    # deep=True so that object columns are measured by content, not pointer size
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

except Exception as fetch_err:
    print(f"✗ Error retrieving spoolspace data: {fetch_err}")
    df = None

## 5. Display Sample Data

Preview the first few rows to understand the data structure.

In [None]:
# Show the first rows and the column dtypes, or say that there is nothing to show.
if df is None or df.empty:
    print("No data available to display.")
else:
    print("First 10 rows:")
    display(df.head(10))

    print("\nColumn Data Types:")
    print(df.dtypes)

## 6. Data Summary Statistics

Analyze the spoolspace usage across all retrieved data.

In [None]:
# Print an overall text summary: date coverage, database coverage,
# usage statistics for the three spool columns, and skew.
if df is None or df.empty:
    print("No data available for analysis.")
else:
    banner = "=" * 80
    print(banner)
    print("SPOOLSPACE SUMMARY STATISTICS")
    print(banner)

    # Date range
    print("\nDate Range:")
    log_dates = df['LogDate']
    print(f"  First Log Date: {log_dates.min()}")
    print(f"  Last Log Date:  {log_dates.max()}")
    print(f"  Unique Dates:   {log_dates.nunique()}")

    # Database coverage
    print("\nDatabase Coverage:")
    print(f"  Unique Databases: {df['DatabaseName'].nunique()}")
    print(f"  Unique Accounts:  {df['AccountName'].nunique()}")

    # Space usage statistics: each value is divided by 1024**3 for display in GB
    gb_divisor = 1024**3
    usage_sections = (
        ("\nCurrent Spool Usage (GB):", 'CURRENTSPOOL'),
        ("\nPeak Spool Usage (GB):", 'PEAKSPOOL'),
        ("\nMax Spool Usage (GB):", 'MAXSPOOL'),
    )
    for section_heading, column_name in usage_sections:
        print(section_heading)
        column_values = df[column_name]
        print(f"  Total:   {column_values.sum() / gb_divisor:,.2f}")
        print(f"  Mean:    {column_values.mean() / gb_divisor:,.2f}")
        print(f"  Median:  {column_values.median() / gb_divisor:,.2f}")
        print(f"  Max:     {column_values.max() / gb_divisor:,.2f}")

    # Skew statistics
    print("\nSkew Statistics:")
    skew_values = df['CURRENTSPOOLSKEW']
    print(f"  Avg Current Skew: {skew_values.mean():.2f}%")
    print(f"  Max Current Skew: {skew_values.max():.2f}%")

## 7. Top Databases by Current Spool Usage

Identify the databases with the highest current spool space usage, based on each database's most recent log date.

In [None]:
# Rank databases by current spool usage, using one row per database
# (the row at that database's latest LogDate).
if df is None or df.empty:
    print("No data available for database ranking.")
else:
    # Get the most recent data for each database
    # NOTE(review): idxmax keeps a single row per database; if a database has
    # several rows on its latest LogDate only one of them is used — confirm.
    latest_row_labels = df.groupby('DatabaseName')['LogDate'].idxmax()
    latest_data = df.loc[latest_row_labels]

    # Keep the 20 largest by current spool usage
    ranking_columns = [
        'DatabaseName', 'AccountName',
        'CURRENTSPOOL', 'PEAKSPOOL', 'MAXSPOOL', 'CURRENTSPOOLSKEW'
    ]
    top_dbs = latest_data.nlargest(20, 'CURRENTSPOOL')[ranking_columns].copy()

    # Convert to GB for readability
    for raw_column in ('CURRENTSPOOL', 'PEAKSPOOL', 'MAXSPOOL'):
        top_dbs[f'{raw_column}_GB'] = top_dbs[raw_column] / 1024**3

    print("\nTop 20 Databases by Current Spool Space Usage:")
    print("=" * 120)
    display_columns = [
        'DatabaseName', 'CURRENTSPOOL_GB', 'PEAKSPOOL_GB', 'MAXSPOOL_GB',
        'CURRENTSPOOLSKEW'
    ]
    ranked_view = top_dbs[display_columns].sort_values('CURRENTSPOOL_GB', ascending=False)
    display(ranked_view)

## 8. Database-Level Aggregation

Summarize each database's most recent spool usage (the row at its latest log date), sorted by current spool usage, and report the combined total.

In [None]:
# Per-database snapshot of the latest spool figures plus a grand total.
if df is not None and not df.empty:
    # Get most recent data: one row per database, taken at its latest LogDate
    latest_data = df.loc[df.groupby('DatabaseName')['LogDate'].idxmax()]

    # Aggregate by database
    db_summary = latest_data.groupby('DatabaseName').agg({
        'CURRENTSPOOL': 'first',
        'PEAKSPOOL': 'first',
        'MAXSPOOL': 'first',
        'CURRENTSPOOLSKEW': 'first'
    }).round(2)

    # Convert to GB (rounded to two decimals for display only)
    db_summary['CURRENTSPOOL_GB'] = (db_summary['CURRENTSPOOL'] / 1024**3).round(2)
    db_summary['PEAKSPOOL_GB'] = (db_summary['PEAKSPOOL'] / 1024**3).round(2)
    db_summary['MAXSPOOL_GB'] = (db_summary['MAXSPOOL'] / 1024**3).round(2)

    # Sort by current usage
    db_summary = db_summary.sort_values('CURRENTSPOOL_GB', ascending=False)

    print("\nDatabase-Level Spool Space Usage Summary (Latest):")
    print("=" * 100)
    display(db_summary[['CURRENTSPOOL_GB', 'PEAKSPOOL_GB', 'MAXSPOOL_GB', 'CURRENTSPOOLSKEW']])

    # Total from the unconverted column, rounded once by the format spec.
    # Summing the per-database GB values rounded to 0.01 would count every
    # database below 0.005 GB as zero and accumulate rounding error.
    total_current_spool_gb = db_summary['CURRENTSPOOL'].sum() / 1024**3
    print(f"\nTotal Spool Space Across All DWP01% Databases: {total_current_spool_gb:,.2f} GB")
else:
    print("No data available for database aggregation.")

## 9. Visualize Database Spool Usage

Visualize the top databases by current spool space usage using a bar chart.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the ten databases with the largest current spool usage,
# using each database's row at its latest LogDate.
if df is not None and not df.empty:
    # Get most recent data
    latest_data = df.loc[df.groupby('DatabaseName')['LogDate'].idxmax()]

    # Get top 10 databases
    top_dbs = latest_data.nlargest(10, 'CURRENTSPOOL')[['DatabaseName', 'CURRENTSPOOL']].copy()
    top_dbs['CURRENTSPOOL_GB'] = top_dbs['CURRENTSPOOL'] / 1024**3

    # Plot on an explicit figure/axes pair rather than the implicit pyplot state
    fig, ax = plt.subplots(figsize=(12, 6))
    # NOTE(review): newer seaborn releases may warn when `palette` is passed
    # without `hue` — confirm against the installed seaborn version.
    sns.barplot(
        data=top_dbs,
        x='DatabaseName',
        y='CURRENTSPOOL_GB',
        palette='viridis',
        ax=ax
    )
    ax.set_title('Top 10 Databases by Current Spool Space Usage (GB)')
    ax.set_xlabel('Database Name')
    ax.set_ylabel('Current Spool Space (GB)')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()
else:
    # Matches the other analysis cells, which report when there is nothing to show
    print("No data available for visualization.")

## 10. Spool Usage Trends Over Time

Plot total spool usage for all databases over the 3-year period.

In [None]:
# Plot total spool usage over time
import matplotlib.pyplot as plt

# Line chart of CURRENTSPOOL summed over all retrieved databases per LogDate.
if df is not None and not df.empty:
    df_databases_over_time = df.groupby(['LogDate'])['CURRENTSPOOL'].sum().reset_index()

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(
        df_databases_over_time['LogDate'],
        df_databases_over_time['CURRENTSPOOL'] / 1024**3,  # shown in GB
        marker='o',
        linewidth=2
    )
    ax.set_title('Total Current Spool Space Usage Over 3 Years')
    ax.set_xlabel('Log Date')
    ax.set_ylabel('Total Current Spool Space (GB)')
    ax.grid()
    fig.tight_layout()
    plt.show()
else:
    # Matches the other analysis cells, which report when there is nothing to show
    print("No data available for trend plot.")

## 11. Database Spool Usage Distribution

This pie chart shows how the latest current spool usage is split among the top 5 databases, with any remaining databases grouped as "Other".

In [None]:
# Plot as pie chart per database usage
import matplotlib.pyplot as plt

# Share of the latest current spool usage held by the top databases;
# everything outside the top `top_n` is folded into a single "Other" slice.
if df is not None and not df.empty:
    df_latest = df.loc[df.groupby('DatabaseName')['LogDate'].idxmax()]
    df_db_usage = df_latest.groupby('DatabaseName')['CURRENTSPOOL'].sum().reset_index()
    df_db_usage = df_db_usage.sort_values('CURRENTSPOOL', ascending=False)

    top_n = 5
    df_top = df_db_usage.head(top_n)
    # Explicit positional slice: after sorting, the index labels are no longer in row order.
    other_total = df_db_usage['CURRENTSPOOL'].iloc[top_n:].sum()

    if other_total > 0:
        df_other = pd.DataFrame({
            'DatabaseName': ['Other'],
            'CURRENTSPOOL': [other_total]
        })
        df_pie = pd.concat([df_top, df_other], ignore_index=True)
    else:
        # No remainder (at most top_n databases, or the rest use no spool):
        # skip the slice instead of drawing an empty 0.0% "Other" wedge.
        df_pie = df_top

    if df_pie['CURRENTSPOOL'].sum() > 0:
        plt.figure(figsize=(10, 8))
        plt.pie(
            df_pie['CURRENTSPOOL'],
            labels=df_pie['DatabaseName'],
            autopct='%1.1f%%',
            startangle=140
        )
        plt.title('Current Spool Space Usage by Database (Latest)')
        plt.show()
    else:
        # A pie needs a positive total; with all-zero usage the fractions are undefined.
        print("No current spool usage recorded in the latest data; nothing to plot.")
else:
    print("No data available for spool distribution.")

## 12. Top Database Spool Analysis

Analyze spool usage trends for the top 6 databases over time.

In [None]:
# Plot the spool usage of the top 6 databases over time in subplots
import matplotlib.pyplot as plt
import numpy as np

# One panel per database (up to six, chosen by latest current spool usage),
# each showing current and peak spool plus a linear trend of current spool.
if df is not None and not df.empty:
    df_latest = df.loc[df.groupby('DatabaseName')['LogDate'].idxmax()]
    top_dbs = df_latest.nlargest(6, 'CURRENTSPOOL')['DatabaseName'].tolist()

    fig, axes = plt.subplots(3, 2, figsize=(16, 12))
    axes = axes.flatten()

    for i, db_name in enumerate(top_dbs):
        ax = axes[i]
        # Rows of this database in date order; the fresh 0..n-1 index is the x axis
        df_db = df[df['DatabaseName'] == db_name].sort_values('LogDate').reset_index(drop=True)

        # Plot the data
        ax.plot(df_db.index, df_db['CURRENTSPOOL'] / 1024**3, marker='o', label='Current Spool', linewidth=2)
        ax.plot(df_db.index, df_db['PEAKSPOOL'] / 1024**3, marker='s', label='Peak Spool', linewidth=2, alpha=0.7)

        # Add a regression line for trend (a line needs at least two points)
        if len(df_db) > 1:
            z = np.polyfit(df_db.index, df_db['CURRENTSPOOL'] / 1024**3, 1)
            p = np.poly1d(z)
            ax.plot(df_db.index, p(df_db.index), "r--", alpha=0.7, label='Trend', linewidth=2)

        # Set x-axis labels to show dates: roughly five evenly spaced ticks,
        # computed once and reused for both positions and labels
        tick_step = max(1, len(df_db) // 5)
        tick_positions = df_db.index[::tick_step]
        ax.set_xticks(tick_positions)
        ax.set_xticklabels([str(d) for d in df_db.loc[tick_positions, 'LogDate']], rotation=45)

        ax.set_title(f'{db_name} - Spool Usage Over Time')
        ax.set_xlabel('Log Date')
        ax.set_ylabel('Spool Space (GB)')
        ax.grid()
        ax.legend()

    # With fewer than six databases the remaining panels would show as empty axes
    for unused_ax in axes[len(top_dbs):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No data available for spool analysis.")

## 13. Export Results to CSV (Optional)

Save the results to CSV files for further analysis or reporting.

In [None]:
# Write the retrieved rows to output/<name>.csv, where the name carries the date range.
if df is None or df.empty:
    print("No data to export.")
else:
    # Create output directory if it doesn't exist
    output_dir = Path('output')
    output_dir.mkdir(exist_ok=True)

    # Generate filename with date range
    filename = f"spoolspace_dwp01_{start_date}_{end_date}.csv"
    output_path = output_dir / filename

    # Save to CSV without the DataFrame index column
    df.to_csv(output_path, index=False)

    print(f"✓ Data exported to: {output_path}")
    print(f"  Rows: {len(df):,}")
    # Size is read back from the file that was just written
    print(f"  File size: {output_path.stat().st_size / 1024**2:.2f} MB")

## 14. Close Connections

Properly clean up database connections when done.

In [None]:
# Close the report's connections if a report object was created in this session.
# Any error while closing is printed rather than raised.
try:
    if 'report' not in locals():
        print("No report instance to close")
    else:
        report.close()
        print("✓ All database connections closed successfully")
except Exception as close_err:
    print(f"✗ Error closing connections: {close_err}")