In [1]:
#| echo: false

from IPython.display import display_html
def html(text):
    """Display ``text`` as raw HTML in the notebook output.

    The string is handed to ``display_html`` with ``raw=True``, and whatever
    that call returns is passed back to the caller.
    """
    rendered = display_html(text, raw=True)
    return rendered

# Custom log file analysis

* Create your own log format and log file
* Parse into a DataFrame
* Ask any question you want

In [2]:
import advertools as adv
import pandas as pd
import plotly.express as px
from dash_bootstrap_templates import load_figure_template
# Load every figure template shipped with dash_bootstrap_templates.
# NOTE(review): presumably this is what makes template='slate' (used by the
# charts further down) available to plotly — confirm against the library docs.
load_figure_template('all')

## Sample lines from log file

In [3]:
#| echo: false
# Show a sample of the raw log: every 200th line, starting at line index 100
# and stopping before index 2200.
sample_line_numbers = range(100, 2200, 200)
with open('sitemap_logs.log', 'rt') as log_file:
    for line_number, log_line in enumerate(log_file):
        if line_number not in sample_line_numbers:
            continue
        # Lines already end with a newline, so don't let print add another.
        print(log_line, end='')

Calling: download_display_sitemap [2021-06-07 03:43:21.901649  2021-06-07 03:43:23.049807] Args: (1, 'https://catalogos365.com/sitemap.xml', [True]) Kwargs: {}
Calling: download_display_sitemap [2021-06-12 11:00:36.218844  2021-06-12 11:00:42.400995] Args: (1, 'https://www.bestbettingsites.com/sitemap.xml', [True]) Kwargs: {}
Calling: download_display_sitemap [2021-09-19 19:16:27.382222  2021-09-19 19:16:28.991722] Args: (1, 'https://www.sitepen.com/sitemap.xml', [True]) Kwargs: {}
Calling: download_display_sitemap [2022-01-09 22:49:44.248476  2022-01-09 22:49:44.736855] Args: (1, 'https://www.herbalife.co.uk/content/herbalifenutrition/emea/en_gb.sitemap.xml', [True]) Kwargs: {}
Calling: download_display_sitemap [2022-06-03 13:55:47.990933  2022-06-03 13:55:51.198758] Args: (25, 'https://www.holzprofi24.at/sitemap_index.xml/', [True]) Kwargs: {}
Calling: download_display_sitemap [2022-09-07 10:36:07.026254  2022-09-07 10:36:10.659670] Args: (3, 'https://www.samco.in/knowledge-center/ht

## Convert the log file to a parquet file
* All you need is a regular expression matching the template you created

In [4]:
#| output: false
# One capture group per output column, in this order:
#   1. start_end -> everything between the first pair of square brackets
#   2. url       -> text starting with "http" up to the next single quote
#   3. recursive -> either the literal "True" or an empty list "[]"
log_format_regex = r"\[([^]]+)\].*?(http[^']+).*?(True|\[\])"
log_fields = ['start_end', 'url', 'recursive']

adv.logs_to_df(
    log_file='sitemap_logs.log',
    log_format=log_format_regex,
    fields=log_fields,
    output_file='sitemap_logs.parquet',
    errors_file='sitemap_logs_errors.txt')

Parsed               0 lines.Parsed           2,034 lines.


## Minor cleanup

In [5]:
# Load the parsed log once; the original cell read the same parquet file twice.
logs_df = pd.read_parquet('sitemap_logs.parquet')

# `start_end` holds both timestamps separated by two spaces; split them into
# separate `start` and `end` columns.
start_end = (
    logs_df['start_end']
    .str.split('  ', expand=True)
    .rename(columns={0: 'start', 1: 'end'})
)

# Append the split timestamps, then derive the typed/analysis columns.
# Each lambda sees the columns assigned before it, so order matters here.
logs_df = (
    pd.concat([logs_df, start_end], axis=1)
    .assign(
        # The log stores the flag as text: 'True' or '[]'.
        recursive=lambda df: df['recursive'].eq('True'),
        start=lambda df: pd.to_datetime(df['start']),
        end=lambda df: pd.to_datetime(df['end']),
        time_diff=lambda df: df['end'].sub(df['start']),
        seconds=lambda df: df['time_diff'].dt.total_seconds(),
        date=lambda df: df['start'].dt.date,
    )
)

## Final DataFrame preview

In [6]:
# Preview the URL plus the derived timing columns; the raw `start_end` string
# and the `recursive` flag are left out of this view.
logs_df[['url', 'start', 'end', 'time_diff', 'seconds', 'date']]

Unnamed: 0,url,start,end,time_diff,seconds,date
0,https://www.foxnews.com/sitemap.xml?type=news,2021-05-10 13:00:24.424540,2021-05-10 13:00:24.594706,0 days 00:00:00.170166,0.170166,2021-05-10
1,https://video.foxnews.com/videositemap.xml,2021-05-10 13:01:18.726869,2021-05-10 13:01:19.327550,0 days 00:00:00.600681,0.600681,2021-05-10
2,https://video.foxnews.com/videositemap.xml,2021-05-10 13:01:48.376871,2021-05-10 13:01:48.478090,0 days 00:00:00.101219,0.101219,2021-05-10
3,https://video.foxnews.com/videositemap.xml?idx=3,2021-05-10 13:03:04.170089,2021-05-10 13:03:07.091164,0 days 00:00:02.921075,2.921075,2021-05-10
4,https://www.nytimes.com/sitemaps/new/news.xml.gz,2021-05-10 15:11:05.961278,2021-05-10 15:11:07.405241,0 days 00:00:01.443963,1.443963,2021-05-10
...,...,...,...,...,...,...
2030,https://demandsphere.com/sitemap_index.xml,2024-04-27 17:35:13.665991,2024-04-27 17:35:16.142825,0 days 00:00:02.476834,2.476834,2024-04-27
2031,https://www.google.com/sitemap.xml,2024-04-27 18:15:49.215830,2024-04-27 18:15:53.549334,0 days 00:00:04.333504,4.333504,2024-04-27
2032,https://demandsphere.com/sitemap_index.xml,2024-04-27 18:17:19.848497,2024-04-27 18:17:22.209764,0 days 00:00:02.361267,2.361267,2024-04-27
2033,https://www.ppchero.com/robots.txt,2024-04-27 19:38:32.857199,2024-04-27 19:38:41.746417,0 days 00:00:08.889218,8.889218,2024-04-27


#### `start`, `end`: timestamps recorded right before the function was called and right after it finished

#### Unique XML sitemaps processed: 1,206
#### Total sitemaps processed: 2,034

## Sitemap processing time (cumulative) distribution

In [7]:
#| code-fold: true
# Cumulative distribution of processing time: one open-circle marker per log
# entry, with a histogram of the same values as the marginal plot.
fig = px.ecdf(
    logs_df,
    x='seconds',
    ecdfnorm='percent',
    markers=True,
    lines=False,
    marginal='histogram',
    symbol_sequence=['circle-open'],
    opacity=0.8,
    hover_name='url',
    hover_data=['date', 'recursive'],
    labels={'seconds': 'Process time (seconds)'},
    title='XML Sitemap Downloading and Parsing Time<br><b><a href="https://bit.ly/3gUDXvY">https://adver.tools/xml-sitemaps</a></b>',
    template='slate',
    height=650)

# Label the main y-axis and show its ticks as percentages.
fig.update_layout(
    yaxis={'title': 'Sitemaps taking less time', 'ticksuffix': '%'})

# Restyle the first two traces (presumably the ECDF markers and the marginal
# histogram, in that order) so both are drawn in the same colour.
fig.data[0].update(marker={'size': 14, 'color': 'snow'})
fig.data[1].update(marker={'color': 'snow'})
fig

* The largest sitemap recorded took around six minutes
* 99.75% of all sitemaps were processed within two and a half minutes
* 98% of sitemaps were processed within one minute
* No need to make any changes to the server

## Usage over time

In [8]:
#| code-fold: true
# Number of logged requests per date.
fig = px.histogram(
    logs_df,
    x='date',
    hover_name='url',
    hover_data=['date', 'recursive'],
    title='XML Sitemap Usage Over Time<br><b><a href="https://bit.ly/3gUDXvY">https://adver.tools/xml-sitemaps</a></b>',
    template='slate',
    height=650)
fig.update_layout(yaxis={'title': 'Number of requests'})
fig.data[0].update(marker={'color': 'snow'})

# Outline the May–November 2023 window and label it.
fig.add_shape(
    type='rect',
    x0='2023-05-01',
    x1='2023-11-30',
    y0=0,
    y1=90,
    line={'color': 'snow'},
    label={'text': '🔴🔴🔴<br>What the hell happened here??<br>😨😨😨'})

# Call out the point at mid-June 2021 with an arrowed annotation.
fig.add_annotation(
    x='2021-06-15',
    y=310,
    text='🟢🟢🟢<br>SEOFomo newsletter mention.<br>Talk more about it!',
    showarrow=True)
fig

<h3><a href="https://twitter.com/eliasdabbas/status/1401833343369555974">The tweet...</a></h3>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Honored to have my <a href="https://twitter.com/hashtag/XML?src=hash&amp;ref_src=twsrc%5Etfw">#XML</a> <a href="https://twitter.com/hashtag/Sitemap?src=hash&amp;ref_src=twsrc%5Etfw">#Sitemap</a> analysis tool mentioned in this week&#39;s <a href="https://twitter.com/hashtag/SEOFOMO?src=hash&amp;ref_src=twsrc%5Etfw">#SEOFOMO</a> edition.<br><br>- You might need to disable your ad blocker<br>- There was a timeout issue with large sitemaps, it should be much better now<br><br>Thanks a lot <a href="https://twitter.com/aleyda?ref_src=twsrc%5Etfw">@aleyda</a> you made my week!<a href="https://t.co/v2sVU8T7fE">https://t.co/v2sVU8T7fE</a> <a href="https://t.co/tQCnz0Y6e9">https://t.co/tQCnz0Y6e9</a></p>&mdash; Elias Dabbas (@eliasdabbas) <a href="https://twitter.com/eliasdabbas/status/1401833343369555974?ref_src=twsrc%5Etfw">June 7, 2021</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

## Missing tracking
For some reason, calls to this function were not logged between May and November 2023, even though analytics show that pageviews were still happening. It is not clear whether this was caused by a major unknown bug or whether logging simply didn't happen during that period.