# Cost Analysis Basics

This notebook introduces the fundamentals of LLM cost analysis using the LLM-Cost-Ops platform.

## Learning Objectives
- Load and explore cost data from the API
- Perform basic aggregations and statistics
- Create time-series visualizations
- Compare costs across models and projects

## Prerequisites
```bash
pip install pandas matplotlib requests seaborn numpy
```

In [None]:
# Import required libraries
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
# Suppress every Python warning for the rest of the kernel session so that
# cell output stays uncluttered.
# NOTE(review): this is a blanket filter — it also hides deprecation and
# runtime warnings raised by the analysis cells below; consider narrowing it
# to specific categories once the noisy ones are known.
warnings.filterwarnings('ignore')

# Set visualization style
# NOTE(review): the 'seaborn-v0_8-*' style names are presumably only available
# in newer matplotlib releases — confirm the minimum supported version.
plt.style.use('seaborn-v0_8-darkgrid')
# Default colour cycle used by plots that do not pass an explicit colour.
sns.set_palette('husl')
%matplotlib inline

## 1. Configuration and Authentication

In [None]:
# API Configuration
# Connection settings are read from the environment so that a real API key
# never has to be typed into (and saved or committed with) the notebook:
#   LLM_COST_OPS_BASE_URL - base URL of your instance's API
#   LLM_COST_OPS_API_KEY  - bearer token used to authenticate
# When a variable is not set, the original placeholder value is used, so the
# notebook behaves exactly as before in an unconfigured environment.
BASE_URL = os.environ.get('LLM_COST_OPS_BASE_URL', 'http://localhost:3000/api')
API_KEY = os.environ.get('LLM_COST_OPS_API_KEY', 'your-api-key-here')

# Set up headers
# Sent with every request made through api_request().
HEADERS = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json'
}

def api_request(endpoint, method='GET', data=None, timeout=30):
    """Send one request to the API and return the decoded JSON body.

    Args:
        endpoint: Path relative to ``BASE_URL`` (for example
            ``'cost-tracking'``). A leading slash is tolerated.
        method: HTTP verb, ``'GET'`` or ``'POST'`` (case-insensitive).
        data: For GET, sent as query-string parameters; for POST, sent as
            the JSON request body. May be ``None``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would hang the notebook cell forever.

    Returns:
        The parsed JSON response.

    Raises:
        ValueError: If ``method`` is neither GET nor POST.
        requests.HTTPError: If the server responds with an error status
            (raised by ``response.raise_for_status()``).
    """
    verb = str(method).upper()
    # Validate before doing any work: previously an unsupported verb fell
    # through both branches and failed with an UnboundLocalError on `response`.
    if verb not in ('GET', 'POST'):
        raise ValueError(f"Unsupported HTTP method: {method!r} (expected 'GET' or 'POST')")

    # Normalise the slashes on both sides so neither 'http://host/api/' nor
    # '/cost-tracking' produces a double slash in the final URL.
    url = f"{BASE_URL.rstrip('/')}/{str(endpoint).lstrip('/')}"

    if verb == 'GET':
        response = requests.get(url, headers=HEADERS, params=data, timeout=timeout)
    else:
        response = requests.post(url, headers=HEADERS, json=data, timeout=timeout)

    response.raise_for_status()
    return response.json()

print("Configuration complete!")

## 2. Load Cost Data

We'll fetch cost tracking data from the API and load it into a pandas DataFrame.

In [None]:
# Fetch cost tracking data for the last 30 days
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

params = {
    'start_date': start_date.isoformat(),
    'end_date': end_date.isoformat(),
    'limit': 1000
}

# Fetch data
cost_data = api_request('cost-tracking', method='GET', data=params)

# Convert to DataFrame
df = pd.DataFrame(cost_data['data'])

# Stop here with a readable message when nothing came back: an empty frame
# has no columns, so the timestamp conversion below would otherwise fail with
# an unhelpful KeyError: 'timestamp'.
if df.empty:
    raise ValueError(
        f"No cost records returned for {start_date.date()} to {end_date.date()}; "
        "check BASE_URL, API_KEY and the date range."
    )

# A result that fills the requested limit suggests more records may exist
# than were returned, in which case every total below would be understated.
# (print is used rather than warnings.warn because the notebook silences
# warnings globally.)
# NOTE(review): assumes the API caps the result at `limit` — confirm whether
# it paginates and fetch further pages if so.
if len(df) >= params['limit']:
    print(f"Warning: received {len(df)} records, which reaches the request limit of "
          f"{params['limit']}; the data may be incomplete.")

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Calendar date of each record, used as the grouping key for daily aggregates.
df['date'] = df['timestamp'].dt.date

print(f"Loaded {len(df)} cost records")
df.head()

## 3. Data Exploration

Let's explore the structure and summary statistics of our cost data.

In [None]:
# Display data info
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\nColumn Names:")
print(df.columns.tolist())

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the cost and
# token columns only.
print("Cost Summary Statistics:")
stat_columns = ['total_cost', 'input_tokens', 'output_tokens']
print(df.loc[:, stat_columns].describe())

In [None]:
# How many distinct models, projects and users appear in the data
model_column = df['model']
print(
    f"Unique Models: {df['model'].nunique()}",
    f"Models: {df['model'].unique().tolist()}",
    sep="\n",
)
print(f"\nUnique Projects: {df['project_id'].nunique()}")
print(f"\nUnique Users: {df['user_id'].nunique()}")

## 4. Basic Aggregations

Compute key metrics and aggregations.

In [None]:
# Overall spend and token usage across every loaded record
total_cost = df['total_cost'].sum()

# Sum each token column separately so the integer dtype of the counts is kept.
token_sums = {column: df[column].sum() for column in ('input_tokens', 'output_tokens')}
total_input_tokens = token_sums['input_tokens']
total_output_tokens = token_sums['output_tokens']
total_tokens = total_input_tokens + total_output_tokens

print(f"Total Cost: ${total_cost:,.2f}")
print(f"Total Tokens: {total_tokens:,}")
print(f"  Input Tokens: {total_input_tokens:,}")
print(f"  Output Tokens: {total_output_tokens:,}")
print(f"\nAverage Cost per Request: ${df['total_cost'].mean():.4f}")
print(f"Average Cost per 1K Tokens: ${(total_cost / total_tokens * 1000):.4f}")

In [None]:
# Per-model totals: spend, token usage and number of requests.
# Named aggregation produces the final column labels directly, in the same
# order as before.
model_costs = (
    df.groupby('model')
    .agg(**{
        'Total Cost': ('total_cost', 'sum'),
        'Input Tokens': ('input_tokens', 'sum'),
        'Output Tokens': ('output_tokens', 'sum'),
        'Request Count': ('request_id', 'count'),
    })
    .round(4)
)

# Average spend per request, derived from the (rounded) totals above.
avg_cost_per_request = model_costs['Total Cost'].div(model_costs['Request Count'])
model_costs['Avg Cost/Request'] = avg_cost_per_request.round(4)

# Most expensive model first.
model_costs = model_costs.sort_values('Total Cost', ascending=False)

print("Cost Analysis by Model:")
print(model_costs)

In [None]:
# Daily cost trends
# One row per calendar date: total spend and number of requests that day.
daily_costs = df.groupby('date').agg({
    'total_cost': 'sum',
    'request_id': 'count'
})

daily_costs.columns = ['Total Cost', 'Request Count']

# Derive the per-request average from the UNROUNDED daily total. Rounding the
# total to cents first (as was done previously) distorts an average that is
# reported to four decimals — a low-volume day costing $0.004 would show an
# average of $0.0000.
daily_costs['Avg Cost/Request'] = (daily_costs['Total Cost'] / daily_costs['Request Count']).round(4)

# Only now round the displayed total to cents.
daily_costs['Total Cost'] = daily_costs['Total Cost'].round(2)

print("Daily Cost Summary (Last 7 Days):")
print(daily_costs.tail(7))

## 5. Time-Series Visualization

Create visualizations to understand cost trends over time.

In [None]:
# Daily trends: total cost (top panel) and request count (bottom panel)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

# One entry per panel: axes, column to draw, title, y-axis label, marker and
# any colour override (the first panel keeps the default palette colour).
panels = [
    (ax1, 'Total Cost', 'Daily Total Cost Trend', 'Total Cost ($)', 'o', {}),
    (ax2, 'Request Count', 'Daily Request Count', 'Number of Requests', 's', {'color': 'coral'}),
]

for axis, column, title, y_label, marker, colour in panels:
    series = daily_costs[column]
    series.plot(ax=axis, marker=marker, linewidth=2, markersize=6, **colour)
    axis.set_title(title, fontsize=14, fontweight='bold')
    axis.set_xlabel('Date', fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)
    axis.grid(True, alpha=0.3)
    # Shade the area under the line to emphasise volume.
    axis.fill_between(daily_costs.index, series, alpha=0.3, **colour)

plt.tight_layout()
plt.show()

In [None]:
# Spend by hour of day
# Tag each record with the hour component of its timestamp, then aggregate
# total / count / mean cost per hour.
df['hour'] = df['timestamp'].dt.hour
hourly_pattern = df.groupby('hour')['total_cost'].agg(['sum', 'count', 'mean'])

title_font = {'fontsize': 14, 'fontweight': 'bold'}
label_font = {'fontsize': 12}

fig, ax = plt.subplots(figsize=(14, 6))
x = hourly_pattern.index
hourly_totals = hourly_pattern['sum']
ax.bar(x, hourly_totals, alpha=0.7, label='Total Cost')

ax.set_title('Cost Distribution by Hour of Day', **title_font)
ax.set_xlabel('Hour of Day', **label_font)
ax.set_ylabel('Total Cost ($)', **label_font)

# Show a tick for every hour, including hours with no recorded requests.
ax.set_xticks(range(24))
ax.grid(True, alpha=0.3, axis='y')
ax.legend()

plt.tight_layout()
plt.show()

## 6. Comparative Analysis

Compare costs across different dimensions.

In [None]:
# Model comparison: share of spend (pie) next to absolute spend (bars)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
pie_ax, bar_ax = axes

title_font = {'fontsize': 14, 'fontweight': 'bold'}
label_font = {'fontsize': 12}

# Total spend per model, largest first; reused by the summary report.
spend_by_model = df.groupby('model')['total_cost'].sum()
model_cost_share = spend_by_model.sort_values(ascending=False)

# Left panel: each model's percentage of total spend
pie_ax.pie(model_cost_share.values, labels=model_cost_share.index, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Cost Share by Model', **title_font)

# Right panel: absolute spend per model as horizontal bars
model_cost_share.plot.barh(ax=bar_ax, color='skyblue')
bar_ax.set_title('Total Cost by Model', **title_font)
bar_ax.set_xlabel('Total Cost ($)', **label_font)
bar_ax.set_ylabel('Model', **label_font)
bar_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

In [None]:
# Token efficiency comparison
# Built from the group-by's own aggregations instead of groupby().apply()
# with a lambda. apply() runs Python code once per group, hands the lambda the
# grouping column as well (behaviour recent pandas releases warn about), and
# turned every result — including the request count — into a float.
by_model = df.groupby('model')
model_efficiency = pd.DataFrame({
    # Input and output tokens are summed separately and then added, exactly
    # as before, so a missing value in one column does not discard the other.
    'total_tokens': by_model['input_tokens'].sum() + by_model['output_tokens'].sum(),
    'total_cost': by_model['total_cost'].sum(),
    # Number of records per model (same as len() of each group).
    'requests': by_model.size(),
})

# Cost normalised to 1,000 tokens, and average request size in tokens.
model_efficiency['cost_per_1k_tokens'] = (model_efficiency['total_cost'] / model_efficiency['total_tokens'] * 1000).round(4)
model_efficiency['tokens_per_request'] = (model_efficiency['total_tokens'] / model_efficiency['requests']).round(0)

print("Model Efficiency Metrics:")
# Cheapest model per 1K tokens first.
print(model_efficiency.sort_values('cost_per_1k_tokens'))

In [None]:
# Scatter of request cost against request size, one colour per model
fig, ax = plt.subplots(figsize=(12, 7))

# Per-request token total (input + output), stored on the frame for plotting.
df['total_tokens'] = df['input_tokens'].add(df['output_tokens'])

# Draw each model as its own series so it gets a legend entry and a colour.
for model in df['model'].unique():
    is_model = df['model'] == model
    model_df = df[is_model]
    ax.scatter(
        model_df['total_tokens'],
        model_df['total_cost'],
        alpha=0.6,
        s=50,
        label=model,
    )

title_font = {'fontsize': 14, 'fontweight': 'bold'}
label_font = {'fontsize': 12}
ax.set_title('Cost vs. Total Tokens by Model', **title_font)
ax.set_xlabel('Total Tokens', **label_font)
ax.set_ylabel('Cost ($)', **label_font)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Cost Distribution Analysis

In [None]:
# Distribution of per-request cost: overall histogram and per-model box plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
hist_ax, box_ax = axes

title_font = {'fontsize': 14, 'fontweight': 'bold'}
label_font = {'fontsize': 12}

# Left panel: histogram with mean and median marked as dashed lines
hist_ax.hist(df['total_cost'], bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Distribution of Request Costs', **title_font)
hist_ax.set_xlabel('Cost ($)', **label_font)
hist_ax.set_ylabel('Frequency', **label_font)
hist_ax.axvline(
    df['total_cost'].mean(),
    color='red',
    linestyle='--',
    label=f'Mean: ${df["total_cost"].mean():.4f}',
)
hist_ax.axvline(
    df['total_cost'].median(),
    color='green',
    linestyle='--',
    label=f'Median: ${df["total_cost"].median():.4f}',
)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3, axis='y')

# Right panel: one box per model
df.boxplot(column='total_cost', by='model', ax=box_ax)
box_ax.set_title('Cost Distribution by Model', **title_font)
box_ax.set_xlabel('Model', **label_font)
box_ax.set_ylabel('Cost ($)', **label_font)
# pandas adds a figure-level "grouped by" title for by= box plots; blank it.
plt.suptitle('')

plt.tight_layout()
plt.show()

## 8. Summary Report

Generate a comprehensive summary report.

In [None]:
# Plain-text summary report built from the values computed in earlier cells
def _print_section(title):
    """Print a blank line, then the section title framed by two dashed rules."""
    print("\n" + "-"*60)
    print(title)
    print("-"*60)


# Report banner and analysis window
print("="*60)
print("LLM COST ANALYSIS SUMMARY REPORT")
print("="*60)
print(f"\nAnalysis Period: {start_date.date()} to {end_date.date()}")
print(f"Total Days: {(end_date - start_date).days}")

_print_section("OVERALL METRICS")
print(f"Total Cost: ${total_cost:,.2f}")
print(f"Total Requests: {len(df):,}")
print(f"Total Tokens: {total_tokens:,}")
print(f"Average Cost per Request: ${df['total_cost'].mean():.4f}")
print(f"Average Cost per 1K Tokens: ${(total_cost / total_tokens * 1000):.4f}")
# These two averages divide by the length of the requested window, not by the
# number of days that actually have data.
print(f"\nDaily Average Cost: ${total_cost / (end_date - start_date).days:.2f}")
print(f"Daily Average Requests: {len(df) / (end_date - start_date).days:.0f}")

_print_section("TOP 3 MOST EXPENSIVE MODELS")
# model_cost_share is already sorted by descending spend.
top_models = model_cost_share.iloc[:3]
for idx, (model, cost) in enumerate(top_models.items(), 1):
    pct = (cost / total_cost) * 100
    print(f"{idx}. {model}: ${cost:.2f} ({pct:.1f}%)")

_print_section("COST TRENDS")
print(f"Highest Daily Cost: ${daily_costs['Total Cost'].max():.2f} on {daily_costs['Total Cost'].idxmax()}")
print(f"Lowest Daily Cost: ${daily_costs['Total Cost'].min():.2f} on {daily_costs['Total Cost'].idxmin()}")
print(f"Average Daily Cost: ${daily_costs['Total Cost'].mean():.2f}")
print(f"Standard Deviation: ${daily_costs['Total Cost'].std():.2f}")

_print_section("RECOMMENDATIONS")

# Recommendations are derived from the data rather than hard-coded
most_expensive_model = model_cost_share.idxmax()
print(f"1. Review usage of {most_expensive_model} (highest cost contributor)")
print(f"2. Consider consolidating requests during peak hours ({hourly_pattern['sum'].idxmax()}:00)")

# Only suggest switching models when per-token prices actually differ
# between models (spread above the 0.001 threshold).
price_per_1k = model_efficiency['cost_per_1k_tokens']
if price_per_1k.std() > 0.001:
    cheapest_model = price_per_1k.idxmin()
    print(f"3. Evaluate switching to {cheapest_model} for cost savings where appropriate")

print("\n" + "="*60)

## 9. Export Results

Export analysis results for further use.

In [None]:
# Export to CSV
output_dir = 'cost_analysis_output'
# Create the target directory if needed; re-running the cell is harmless.
os.makedirs(output_dir, exist_ok=True)

# One entry per output file: file name, frame to write, extra to_csv options.
# The aggregated frames keep their index (date / model) because it carries
# the grouping key; the raw records have no meaningful index, so it is dropped.
exports = [
    ('daily_costs.csv', daily_costs, {}),
    ('model_costs.csv', model_costs, {}),
    ('model_efficiency.csv', model_efficiency, {}),
    ('raw_cost_data.csv', df, {'index': False}),
]

for filename, frame, csv_options in exports:
    path = f'{output_dir}/{filename}'
    frame.to_csv(path, **csv_options)
    print(f"Saved: {path}")

print(f"\nAll results exported to '{output_dir}/' directory")

## Next Steps

Now that you've mastered the basics, continue to:
- **02_advanced_analytics.ipynb** - Learn statistical analysis and forecasting
- **03_cost_optimization.ipynb** - Explore cost optimization strategies
- **04_custom_reports.ipynb** - Build custom reporting dashboards
- **05_ml_forecasting.ipynb** - Apply machine learning for cost predictions

## Additional Resources
- [API Documentation](../../../api/)
- [Cost Optimization Guide](../../../guides/cost-optimization.md)
- [Best Practices](../../../best-practices/)