In [None]:
import pandas as pd
import re

# Read the log file and create a DataFrame with one row per log entry
# Example of log:
# 2025-02-14 13:08:12,338 - INFO - Received prompt: 
# Consider this SKELETON_QUERY and these TABLES_SCHEMA:
# 
# <SKELETON_QUERY>
# SELECT
# ...
# 2025-02-14 14:12:09,757  - INFO - Prompt model|llama3|Ollama:
# 2025-02-14 14:12:09,768 - INFO - Response model|llama3|Ollama:
# Consider this SKELETON_QUERY and these TABLES_SCHEMA:
# 
# <SKELETON_QUERY>
# ...


# Location of the application log analysed by this cell.
LOG_PATH = '../output/batch_009/app.log'

# Approximate token count: one token per four characters of message text.
CHARS_PER_TOKEN = 4

# Prices in dollars per million tokens (o1 pricing used for the estimate).
PRICE_PER_MILLION_TOKENS = {'Prompt model': 15, 'Response model': 60}

# Header of a log record: "YYYY-MM-DD HH:MM:SS,mmm - LEVEL - <rest of line>".
# Any upper-case level is accepted and the whitespace around the dashes may
# repeat, so every timestamped line starts a new entry instead of being glued
# onto the previous entry's message (which would inflate its length).
log_pattern = re.compile(
    r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2},\d{3})\s+-\s+([A-Z]+)\s+-\s+(.*)'
)


def parse_log_lines(lines):
    """Group raw log lines into one dict per log entry.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A list of dicts with keys 'timestamp', 'level', 'type' and 'message'.
        'type' is the text before the first colon of the header line (empty
        when the header has no colon). Non-blank lines without a timestamp
        header are appended to the message of the preceding entry; lines seen
        before the first header are ignored.
    """
    entries = []
    entry = None
    for line in lines:
        header = log_pattern.match(line)
        if header:
            timestamp, level, rest = header.groups()
            entry_type, colon, message = rest.partition(':')
            if not colon:
                # No "<type>:" prefix; keep the whole text as the message.
                entry_type, message = '', rest
            entry = {
                'timestamp': timestamp,
                'level': level,
                'type': entry_type,
                'message': message.strip(),
            }
            # Appended right away; continuation lines mutate this same dict.
            entries.append(entry)
        elif entry is not None and line.strip():
            entry['message'] += '\n' + line.strip()
    return entries


def build_log_frame(entries):
    """Turn parsed entries into a DataFrame with derived columns.

    Adds 'model' and 'platform' (split from "type|model|platform", empty
    string when absent), 'message_len' (characters) and 'tokens' (estimate).
    """
    # Explicit columns keep the frame usable when the log has no entries.
    frame = pd.DataFrame(entries, columns=['timestamp', 'level', 'type', 'message'])

    # The log uses a comma before the milliseconds; stating the format avoids
    # per-element format guessing.
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], format='%Y-%m-%d %H:%M:%S,%f')

    # n=2 caps the split at three pieces and reindex guarantees three columns,
    # so the assignment works even when no row (or a row with extra pipes)
    # is present.
    parts = (
        frame['type'].str.split('|', n=2, expand=True)
        .reindex(columns=range(3))
        .astype(object)
    )
    frame['type'] = parts[0]
    frame['model'] = parts[1].fillna('')
    frame['platform'] = parts[2].fillna('')

    frame['message_len'] = frame['message'].str.len()
    frame['tokens'] = frame['message_len'] / CHARS_PER_TOKEN
    return frame


def summarize_costs(frame):
    """Aggregate tokens, characters, message count and cost per priced type.

    Rows whose 'type' has no entry in PRICE_PER_MILLION_TOKENS are ignored.
    The returned frame ends with a 'Total' row.
    """
    priced = frame[frame['type'].isin(list(PRICE_PER_MILLION_TOKENS))]
    grouped = (
        priced.groupby('type')
        .agg({'tokens': 'sum', 'message_len': 'sum', 'message': 'count'})
        .reset_index()
    )
    grouped['cost'] = grouped['tokens'] / 1_000_000 * grouped['type'].map(PRICE_PER_MILLION_TOKENS)

    totals = grouped[['tokens', 'message_len', 'message', 'cost']].sum()
    totals['type'] = 'Total'
    return pd.concat([grouped, pd.DataFrame([totals])], ignore_index=True)


# Notebook cells and scripts run with __name__ == '__main__', so this section
# still executes there; the guard only keeps an import of the helpers above
# free of file I/O.
if __name__ == '__main__':
    # errors='replace' keeps a stray undecodable byte from aborting the run.
    with open(LOG_PATH, 'r', encoding='utf-8', errors='replace') as file:
        log_data = parse_log_lines(file)

    df = build_log_frame(log_data)

    # Rows that carry a price: 'Prompt model' and 'Response model'.
    filtered_df = df[df['type'].isin(list(PRICE_PER_MILLION_TOKENS))]

    grouped_df = summarize_costs(filtered_df)
    print(grouped_df.to_markdown(index=False))

| type           |   tokens |   message_len |   message |    cost |
|:---------------|---------:|--------------:|----------:|--------:|
| Prompt model   | 234114   |        936455 |        98 | 3.51171 |
| Response model |  10704.5 |         42818 |        96 | 0.64227 |
| Total          | 244818   |        979273 |       194 | 4.15398 |


  df['timestamp'] = pd.to_datetime(df['timestamp'])
