# Sitemap Analysis

+ [Content Analysis with XML Sitemaps and Python](https://www.semrush.com/blog/content-analysis-xml-sitemaps-python/)
+ [Analyze and visualize URLs with Network Graph](https://towardsdatascience.com/analyze-and-visualize-urls-with-network-graph-ee3ad5338b69)

In [None]:
# Python3 ipykernel
!pip install advertools polars plotly

In [None]:
!pip install yarl graphistry[bolt,gremlin,nodexl,igraph,networkx] 


In [5]:
# Print the current working directory via a shell escape.
!pwd

/data/org/roam/slips


In [169]:
import advertools as adv
import plotly.graph_objects as go
import polars as pl
import pandas as pd
import numpy as np
import re

In [19]:
# Root directory of the input data, and the JSON file inside it that is
# loaded into `sitemaps_json` further down.
_DATA = "/data/mr/sitemap/data"
_SITEMAPS = "/".join((_DATA, "mkr_curled.json"))

# Makerspace List

## Read JSON into dataframe

In [36]:

# TEST_SITEMAP="https://www.nova-labs.org/sitemap.xml"
# nova_siteindex=adv.sitemap_to_df(TEST_SITEMAP)

# Load the makerspace list. `id` and `http` are declared as strings in the
# read schema and then narrowed to 16-bit integers; `cast` keeps the column
# name, so the two columns are replaced in place.
sitemaps_json = (
    pl.read_json(
        _SITEMAPS,
        schema={
            'id': pl.Utf8,
            'http': pl.Utf8,
            'redirect': pl.Utf8,
            "mmap_eventname": pl.Utf8,
            "mmap_url": pl.Utf8,
            "mmap_city": pl.Utf8,
            "mmap_state": pl.Utf8,
            "mmap_country": pl.Utf8,
            "mmap_zip": pl.Utf8,
            "mmap_lat": pl.Float32,
            "mmap_lng": pl.Float32,
        },
    )
    .with_columns(
        pl.col('id').cast(pl.Int16),
        pl.col('http').cast(pl.Int16),
    )
)


In [61]:
# Split the makerspace list into buckets by the recorded HTTP status code.

# 2xx / 3xx responses (bounds are inclusive, status codes are integers)
success = sitemaps_json.filter(pl.col('http').is_between(200, 299))   # "success"
redirect = sitemaps_json.filter(pl.col('http').is_between(300, 399))  # "redirect"

# Individual client-error codes
offline401 = sitemaps_json.filter(pl.col('http') == 401)
offline402 = sitemaps_json.filter(pl.col('http') == 402)
offline403 = sitemaps_json.filter(pl.col('http') == 403)
offline404 = sitemaps_json.filter(pl.col('http') == 404)
# NOTE: despite its name this bucket holds every status above 404,
# not only 405.
offline405 = sitemaps_json.filter(pl.col('http') > 404)

# continue
# Status below 100, i.e. no HTTP response was recorded
other = sitemaps_json.filter(pl.col('http') < 100)  # "dns"

# np.shape([offline.to_numpy(),redirect.to_numpy()])




In [66]:
# (rows, columns) of every status bucket.
# DataFrame.shape yields the same tuple as np.shape(frame.to_numpy()) without
# first copying each frame into a NumPy array.
[
    (label, frame.shape)
    for label, frame in [
        ("20X", success),
        ("30X", redirect),
        ("401", offline401),
        ("402", offline402),
        ("403", offline403),
        ("404", offline404),
        # offline405 is filtered with `http > 404`, so label it accordingly
        (">404", offline405),
    ]
]

[('20X', (53, 11)),
 ('30X', (85, 11)),
 ('401', (0, 11)),
 ('402', (0, 11)),
 ('403', (1, 11)),
 ('404', (41, 11)),
 ('405', (2, 11))]

In [82]:
# Sites whose status code is in the 2xx or 3xx range (200..399 inclusive).
active_sitemaps = sitemaps_json.filter(pl.col("http").is_between(200, 399))
active_sitemaps.select(['id', 'http', 'mmap_eventname', 'mmap_url', 'redirect'])

# TODO add aliases where to help filter domain redirects

id,http,mmap_eventname,mmap_url,redirect
i16,i16,str,str,str
4401,200,"""Procrastinatio…","""https://www.my…",""""""
4397,200,"""Minnesota Cent…","""https://www.mn…",""""""
4388,301,"""FUSE Makerspac…","""https://www.fu…","""https://www.fu…"
4387,301,"""FirstBuild""","""https://www.fi…","""https://firstb…"
4380,200,"""Valley Makers …","""Https://www.va…",""""""
4379,200,"""MakerFX Makers…","""https://www.ma…",""""""
4376,301,"""The Hub""","""https://www.gr…","""https://www.gr…"
4370,301,"""MPL Makerspace…","""https://eols.o…","""https://eols.o…"
4368,301,"""FLC Innovation…","""https://flcinn…","""https://www.fl…"
4364,301,"""Georgia Cyber …","""https://www.ga…","""https://www.ga…"


In [76]:
# Number of makerspaces per HTTP status code.
sitemaps_by_response = sitemaps_json.group_by(pl.col('http'))
# Only the group size is needed, so aggregate just `pl.len()` instead of
# collecting every column into lists and discarding them afterwards.
# group_by does not guarantee row order; sort for a stable, readable table.
sitemaps_by_response.agg(pl.len()).sort('http')

http,len
i16,u32
400,1
200,53
302,26
403,1
0,28
303,1
500,1
301,58
406,1
404,41


# Fetch Sitemaps

## Dataframe

In [128]:
# Sanity check of the ".xml" suffix pattern. The dot is escaped (raw string)
# so it matches a literal "." rather than any character.
re.match(r"\.xml$", ".xml")

<re.Match object; span=(0, 4), match='.xml'>

In [148]:
def _sitemap_url(row):
    """Return the sitemap URL to request for one row of `active_sitemaps`.

    The recorded `redirect` target is used when it is non-empty; otherwise
    `/sitemap.xml` is appended to the site's `mmap_url`.
    """
    if row['redirect']:  # covers both "" and a null value
        return row['redirect']
    # Drop a trailing slash so the result never contains "//sitemap.xml".
    return f"{row['mmap_url'].rstrip('/')}/sitemap.xml"


# (id, event name, sitemap URL) for every active site.
urls = [
    (row['id'], row['mmap_eventname'], _sitemap_url(row))
    for row in active_sitemaps
    .select(['id', 'mmap_eventname', 'redirect', 'mmap_url'])
    .iter_rows(named=True)
]
# Keep only targets that really end in ".xml". A plain suffix test avoids the
# unescaped-dot regex, which accepted any string ending in "xml".
urls = [entry for entry in urls if entry[2].endswith('.xml')]

In [None]:
# Download every sitemap into a pandas frame and tag its rows with the
# makerspace id they belong to.
dfs = []
# (id, url, error message) for each sitemap that could not be fetched
failed_sitemaps = []
for site_id, _event_name, sitemap_url in urls:
    try:
        df = adv.sitemap_to_df(sitemap_url)
    except Exception as exc:
        # A single unreachable or missing sitemap (e.g. a 404) must not abort
        # the whole crawl; record it and move on to the next site.
        failed_sitemaps.append((site_id, sitemap_url, str(exc)))
        continue
    df['id'] = site_id
    dfs.append(df)

# Failed fetches are collected in `failed_sitemaps` for later inspection.

In [None]:
# Stack the per-site pandas frames into one and hand the result to polars.
# Note that `dfs` is rebound from a list of frames to a single polars frame.
dfs = pl.from_pandas(pd.concat(dfs))

# Transform URLs

The URLs need to be further broken down into their components, producing a new dataframe that forms the basis of a graph.

+ [urlparse and RFC](https://stackoverflow.com/a/70329643)

In [None]:
# Display the combined polars frame built from the per-site sitemap frames.
dfs