# Generating an image XML sitemap from a crawled website file

In [3]:
import advertools as adv
import pandas as pd
pd.options.display.max_columns = None

## Image [sitemap format](https://developers.google.com/search/docs/crawling-indexing/sitemaps/image-sitemaps)
```xml
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
  <url>
    <loc>https://example.com/sample1.html</loc>
    <image:image>
      <image:loc>https://example.com/image.jpg</image:loc>
    </image:image>
    <image:image>
      <image:loc>https://example.com/photo.jpg</image:loc>
    </image:image>
  </url>
  <url>
    <loc>https://example.com/sample2.html</loc>
    <image:image>
      <image:loc>https://example.com/picture.jpg</image:loc>
    </image:image>
  </url>
</urlset>
```

## Read the `url` and `img_src` columns from the crawl file

In [2]:
url_imgsrc = pd.read_parquet('nasa_crawl.parquet', columns=['url', 'img_src'])
url_imgsrc

Unnamed: 0,url,img_src
0,https://www.nasa.gov/,https://www.nasa.gov/wp-content/themes/nasa/as...
1,https://www.nasa.gov/?search=SpaceX%20Crew-2,https://www.nasa.gov/wp-content/themes/nasa/as...
2,https://www.nasa.gov/?search=International%20S...,https://www.nasa.gov/wp-content/themes/nasa/as...
3,https://www.nasa.gov/?search=Mars%20perseverance,https://www.nasa.gov/wp-content/themes/nasa/as...
4,https://www.nasa.gov/?search=Expedition%2064,https://www.nasa.gov/wp-content/themes/nasa/as...
...,...,...
9937,https://www.nasa.gov/image-detail/pushing-enve...,https://www.nasa.gov/wp-content/themes/nasa/as...
9938,https://www.nasa.gov/missions/icon/nasas-icon-...,https://www.nasa.gov/wp-content/themes/nasa/as...
9939,https://www.nasa.gov/image-article/apollo-17-b...,https://www.nasa.gov/wp-content/themes/nasa/as...
9940,https://www.nasa.gov/image-detail/ttbw-prepare...,https://www.nasa.gov/wp-content/themes/nasa/as...


## Summarize images with the `adv.crawlytics.images` function

In [242]:
img_df = adv.crawlytics.images(url_imgsrc)
img_df

Unnamed: 0,url,img_src
0,https://www.nasa.gov/,https://www.nasa.gov/wp-content/themes/nasa/as...
0,https://www.nasa.gov/,https://www.nasa.gov/wp-content/themes/nasa/as...
0,https://www.nasa.gov/,https://www.nasa.gov/wp-content/uploads/2024/0...
0,https://www.nasa.gov/,https://www.nasa.gov/wp-content/uploads/2023/1...
0,https://www.nasa.gov/,https://www.nasa.gov/wp-content/uploads/2024/0...
...,...,...
9941,https://www.nasa.gov/image-detail/bracing-fuel...,https://www.nasa.gov/wp-content/uploads/2023/0...
9941,https://www.nasa.gov/image-detail/bracing-fuel...,https://www.nasa.gov/wp-content/uploads/2023/0...
9941,https://www.nasa.gov/image-detail/bracing-fuel...,https://www.nasa.gov/wp-content/uploads/2023/0...
9941,https://www.nasa.gov/image-detail/bracing-fuel...,https://www.nasa.gov/wp-content/uploads/2023/0...


## Example with a simple dataset:

In [170]:
df = pd.DataFrame({
    'url': ['https://example.com/_A' for i in range(3)] + ['https://example.com/_B' for i in range(3)] + ['https://example.com/_C' for i in range(3)],
    'image': [f'/image_A_{n}.png' for n in range(1, 4)] * 3
})
df

df

Unnamed: 0,url,image
0,https://example.com/_A,/image_A_1.png
1,https://example.com/_A,/image_A_2.png
2,https://example.com/_A,/image_A_3.png
3,https://example.com/_B,/image_A_1.png
4,https://example.com/_B,/image_A_2.png
5,https://example.com/_B,/image_A_3.png
6,https://example.com/_C,/image_A_1.png
7,https://example.com/_C,/image_A_2.png
8,https://example.com/_C,/image_A_3.png


In [234]:
from lxml.builder import E
from lxml import etree

# adapted from the lxml tutorial https://lxml.de/tutorial.html
def prettyprint(element, file=None, **kwargs):
    xml = etree.tostring(element, pretty_print=True, **kwargs)
    print(xml.decode()
          .replace('image_loc', 'image:loc')
          .replace('<image>', '<image:image>')
          .replace('</image>', '</image:image>'),
          file=file,
          end='')


for url in df['url'].drop_duplicates():
    tempdf = df[df['url'].eq(url)]
    image_urls = tempdf['image'].tolist()
    image_elements = [E.image_loc(u) for u in image_urls]
    prettyprint(
        E.url(
            E.loc(url),
            E.image(*image_elements)
        )
    )

<url>
  <loc>https://example.com/_A</loc>
  <image:image>
    <image:loc>/image_A_1.png</image:loc>
    <image:loc>/image_A_2.png</image:loc>
    <image:loc>/image_A_3.png</image:loc>
  </image:image>
</url>
<url>
  <loc>https://example.com/_B</loc>
  <image:image>
    <image:loc>/image_A_1.png</image:loc>
    <image:loc>/image_A_2.png</image:loc>
    <image:loc>/image_A_3.png</image:loc>
  </image:image>
</url>
<url>
  <loc>https://example.com/_C</loc>
  <image:image>
    <image:loc>/image_A_1.png</image:loc>
    <image:loc>/image_A_2.png</image:loc>
    <image:loc>/image_A_3.png</image:loc>
  </image:image>
</url>


## Create an image sitemap for three images of the first five URLs

In [246]:
print("""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">""")
for url in img_df['url'].drop_duplicates()[:5]:
    tempdf = img_df[img_df['url'].eq(url)]
    image_urls = tempdf['img_src'].tolist()
    image_elements = [E.image(E.image_loc(u)) for u in image_urls[:3]]
    prettyprint(
        E.url(
            E.loc(url),
            *image_elements
        ))
print('</urlset>')

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
<url>
  <loc>https://www.nasa.gov/</loc>
  <image:image>
    <image:loc>https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png</image:loc>
  </image:image>
  <image:image>
    <image:loc>https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png</image:loc>
  </image:image>
  <image:image>
    <image:loc>https://www.nasa.gov/wp-content/uploads/2024/02/ksc-20240130-ph-geb01-0093-rotated.jpg?w=640</image:loc>
  </image:image>
</url>
<url>
  <loc>https://www.nasa.gov/?search=SpaceX%20Crew-2</loc>
  <image:image>
    <image:loc>https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png</image:loc>
  </image:image>
  <image:image>
    <image:loc>https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png</image:loc>
  </image:image>
  <image:image>
    <image:loc>ht

## Do the full thing and save to a file
I only saved the first 100 URLs with all theri respective images to save space on Github.

In [249]:
with open('image_sitemap.xml', 'wt') as file:
    print("""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">""", file=file)
    for url in img_df['url'].drop_duplicates():
        tempdf = img_df[img_df['url'].eq(url)].dropna()
        image_urls = tempdf['img_src'].tolist()
        image_elements = [E.image(E.image_loc(u)) for u in image_urls]
        prettyprint(
            E.url(
                E.loc(url),
                *image_elements
            ),
        file=file)
with open('image_sitemap.xml', 'at') as file:
    print('</urlset>', file=file)