In [9]:
import pandas as pd
import re
import pyarrow.csv as pv
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

In [18]:
# Constants
# Path of the access log file that the parsing cell opens and reads.
filepath = "access.log"
# Only requests whose path starts with this prefix are kept for analysis, and
# the prefix is dropped from the path to form the `url` column. The empty
# string keeps every request and leaves paths unchanged.
url_prefix = ""
# URLs ending with this suffix are the ones selected for the
# per-second / per-minute time-series plots.
individual_url_postfix = "load"

In [19]:
# Read log lines from file.
# Request paths in an access log are client-controlled and may contain bytes
# that are not valid in the default text encoding; errors="replace" keeps one
# bad byte from aborting the whole load with UnicodeDecodeError.
with open(filepath, "r", errors="replace") as f:
    log_lines = f.readlines()

# Pattern to match log format:
#   <ip> - - [<timestamp>] "<method> <path> HTTP/x.y" <status> <bytes> <response_time>
pattern = re.compile(
    r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+-\s+-\s+\[(?P<timestamp>.*?)\]\s+"(?P<method>\w+)\s+(?P<path>.*?)\s+HTTP/[\d.]+"\s+(?P<status>\d+)\s+(?P<bytes>\d+)\s+(?P<response_time>\d+)'
)

# Parse each line into a dictionary, counting the non-blank lines that do not
# match so that dropped records are visible instead of silently lost.
parsed_logs = []
skipped_lines = 0
for line in log_lines:
    match = pattern.match(line)
    if match:
        parsed_logs.append(match.groupdict())
    elif line.strip():
        skipped_lines += 1

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) that did not match the log pattern")

# Convert to DataFrame.
# The column list is taken from the regex's named groups (in group order) so
# the frame has the full schema even when no line matched; without it an
# empty result has no columns and the conversions below raise KeyError.
log_columns = sorted(pattern.groupindex, key=pattern.groupindex.get)
df = pd.DataFrame(parsed_logs, columns=log_columns)

# Convert numeric columns to appropriate types (the regex captures strings)
df = df.astype({'status': int, 'bytes': int, 'response_time': int})
# Timestamps look like "07/Oct/2014:22:44:16 +0000"
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z')

print(df.head())


               ip                 timestamp  method  \
0  192.100.20.135 2014-10-07 22:44:16+00:00     GET   
1    203.0.113.45 2014-10-07 22:44:17+00:00    POST   
2   198.51.100.12 2014-10-07 22:44:18+00:00     GET   
3   203.0.113.100 2014-10-07 22:44:19+00:00     GET   
4   198.51.100.22 2014-10-07 22:44:20+00:00  DELETE   

                               path  status  bytes  response_time  
0  /ServiceAPI/mappings/123456/load     200   1414           6234  
1      /ServiceAPI/data/789012/push     201   2345           7123  
2  /ServiceAPI/mappings/123456/load     200   1450           6345  
3                /ServiceAPI/status     200    512           1023  
4           /ServiceAPI/data/789012     204      0            789  


In [20]:
# Filter and clean URLs: keep only requests under `url_prefix`.
df = df[df['path'].str.startswith(url_prefix)].copy()
# Drop only the *leading* prefix. Every remaining path is known to start with
# it, so slicing off len(url_prefix) characters is exact. The previous
# str.replace(url_prefix, '') also removed later occurrences of the prefix
# inside the path (e.g. prefix "/api" turned "/api/x/api/y" into "/x/y"),
# which merged distinct URLs into one group.
df['url'] = df['path'].str[len(url_prefix):]

# Group by URL for statistics: request count plus response-time distribution
# (min / max / mean / median and upper percentiles), slowest P95 first.
query_url_stats = (
    df.groupby('url')
    .agg(
        cnt=('response_time', 'size'),
        min=('response_time', 'min'),
        max=('response_time', 'max'),
        mean=('response_time', 'mean'),
        median=('response_time', 'median'),
        P75=('response_time', lambda x: x.quantile(0.75)),
        P90=('response_time', lambda x: x.quantile(0.90)),
        P95=('response_time', lambda x: x.quantile(0.95)),
        P99=('response_time', lambda x: x.quantile(0.99)),
        P999=('response_time', lambda x: x.quantile(0.999)),
    )
    .reset_index()
    .sort_values(by='P95', ascending=False)
)

print(query_url_stats)

                                  url  cnt   min   max         mean  median  \
1        /ServiceAPI/data/789012/push    1  7123  7123  7123.000000  7123.0   
2    /ServiceAPI/mappings/123456/load    3  6001  6345  6193.333333  6234.0   
3  /ServiceAPI/mappings/123456/update    1  4580  4580  4580.000000  4580.0   
4                  /ServiceAPI/status    1  1023  1023  1023.000000  1023.0   
0             /ServiceAPI/data/789012    1   789   789   789.000000   789.0   

      P75     P90     P95      P99      P999  
1  7123.0  7123.0  7123.0  7123.00  7123.000  
2  6289.5  6322.8  6333.9  6342.78  6344.778  
3  4580.0  4580.0  4580.0  4580.00  4580.000  
4  1023.0  1023.0  1023.0  1023.00  1023.000  
0   789.0   789.0   789.0   789.00   789.000  


In [21]:
# Select the requests whose URL ends with the configured suffix.
container_data = df[df['url'].str.endswith(individual_url_postfix)].copy()

# Bucket each request into its second and its minute.
# to_datetime is a no-op when the column is already datetime; it is kept so
# this cell also works if `timestamp` arrives as strings.
container_data['timestamp'] = pd.to_datetime(container_data['timestamp'])
container_data['time_second'] = container_data['timestamp'].dt.floor('s')
container_data['time_minute'] = container_data['timestamp'].dt.floor('min')

# Stats per second: number of requests in each second that saw traffic.
container_stats_per_sec = (
    container_data.groupby('time_second')
    .agg(cnt=('url', 'size'))
    .reset_index()
)

# Plot with Plotly: Requests per second
fig1 = px.line(container_stats_per_sec, x='time_second', y='cnt',
               title='Requests per Second (Container URLs)',
               labels={'time_second': 'Time (sec)', 'cnt': 'Request Count'})
fig1.update_layout(xaxis_tickformat='%H:%M:%S')
fig1.show()

# Stats per minute with percentiles of response_time
container_stats_per_minute = (
    container_data.groupby('time_minute')
    .agg(
        cnt=('url', 'size'),
        mean=('response_time', 'mean'),
        median=('response_time', 'median'),
        P90=('response_time', lambda x: x.quantile(0.90)),
        P95=('response_time', lambda x: x.quantile(0.95)),
        P99=('response_time', lambda x: x.quantile(0.99)),
    )
    .reset_index()
)

# Reshape to long format (one row per minute and metric) so that each metric
# becomes its own coloured line.
long_df = container_stats_per_minute.melt(
    id_vars='time_minute',
    value_vars=['mean', 'P95', 'P99'],
    var_name='Percentile',
    value_name='Time'
)

# Plot with Plotly: Time Percentiles
# NOTE(review): response_time is plotted exactly as logged. The parsed values
# are integers in the hundreds-to-thousands range, so they are not seconds;
# the axis label therefore no longer claims "(s)". Confirm the log's unit and
# either convert or put the real unit in the label.
fig2 = px.line(long_df, x='time_minute', y='Time', color='Percentile',
               title='Processing Time Percentiles Over Time (per minute)',
               labels={'time_minute': 'Time (min)', 'Time': 'Response Time (log units)'})

fig2.update_layout(
    # Anchor the axis at zero but let the upper bound follow the data. The
    # previous fixed range [0, 0.5] lay far below the logged values, so every
    # trace was drawn outside the visible area and the chart looked empty.
    yaxis=dict(rangemode='tozero'),
    xaxis_tickformat='%H:%M',
    legend_title_text='Metric'
)
fig2.show()