# Process Julia Evans' Atom feed into a list of blogposts (dictionaries)

In [None]:
import json
import pprint
from pathlib import Path

import feedparser
import html2text

In [None]:
# Folder that holds the downloaded feed (and, later, the generated JSON).
DATA_FOLDER = Path("data").joinpath("julia_evans")

# Local snapshot of the feed.
# Source: https://jvns.ca/atom.xml, retrieved September 4, 2023.
atom_path = DATA_FOLDER.joinpath('atom.xml')

## Load Julia Evans' atom feed from file

In [None]:
# Fail fast if the feed snapshot is missing: feedparser is lenient about its
# input and is not guaranteed to report a missing local file clearly.
if not atom_path.is_file():
    raise FileNotFoundError(f"Atom feed not found: {atom_path}")

feedparser_result = feedparser.parse(atom_path)

# Later cells index into the entries, so stop here if nothing was parsed
# instead of silently producing an empty result.
if not feedparser_result['entries']:
    raise ValueError(f"No entries could be parsed from {atom_path}")

In [None]:
# Inspect the feed-level metadata (author, links, title, last update).
pprint.pprint(feedparser_result['feed'])

{'author': 'Julia Evans',
 'author_detail': {'name': 'Julia Evans'},
 'authors': [{'name': 'Julia Evans'}],
 'generator': 'Hugo',
 'generator_detail': {'href': 'http://gohugo.io/', 'name': 'Hugo'},
 'guidislink': False,
 'id': 'http://jvns.ca',
 'link': 'http://jvns.ca',
 'links': [{'href': 'http://jvns.ca/atom.xml',
            'rel': 'self',
            'type': 'application/atom+xml'},
           {'href': 'http://jvns.ca', 'rel': 'alternate', 'type': 'text/html'}],
 'title': 'Julia Evans',
 'title_detail': {'base': '',
                  'language': None,
                  'type': 'text/plain',
                  'value': 'Julia Evans'},
 'updated': '2023-08-11T08:13:16+00:00',
 'updated_parsed': time.struct_time(tm_year=2023, tm_mon=8, tm_mday=11, tm_hour=8, tm_min=13, tm_sec=16, tm_wday=4, tm_yday=223, tm_isdst=0)}


In [None]:
# Look at the raw structure of the first entry as returned by feedparser.
pprint.pprint(feedparser_result['entries'][0])

{'content': [{'base': '',
              'language': None,
              'type': 'text/html',
              'value': '<p>I started using Mastodon back in November, and '
                       'it&rsquo;s the Twitter alternative\n'
                       'where I&rsquo;ve been spending most of my time '
                       'recently, mostly because the Fediverse\n'
                       'is where a lot of the Linux nerds seem to be right '
                       'now.</p>\n'
                       '\n'
                       '<p>I&rsquo;ve found Mastodon quite a bit more '
                       'confusing than Twitter because it&rsquo;s a\n'
                       'distributed system, so here are a few technical things '
                       'I&rsquo;ve learned about it\n'
                       'over the last 10 months. I&rsquo;ll mostly talk about '
                       'what using a single-person\n'
                       'server has been like for me, as well as a couple of 

In [None]:
# How many blogposts are in the Atom feed?
print(len(feedparser_result['entries']))

20


## Clean up entries and remove all keys we don't need

In [None]:
# Drop the keys we do not need. 'summary' goes because it is the same as
# 'content' (per the original note on this cell).
for blogpost in feedparser_result['entries']:
    for key in ('summary', 'links', 'title_detail', 'updated_parsed'):
        # pop() with a default keeps this cell safe to re-run and tolerates
        # entries that never had the key; a bare pop(key) raises KeyError.
        blogpost.pop(key, None)

In [None]:
# Show the first entry again to check which keys remain after the cleanup.
pprint.pprint(feedparser_result['entries'][0])

## Turn the `content` field into a Markdown `text` field

In [None]:
# Configure the HTML -> Markdown converter.
# NOTE: options are set through HTML2Text attribute names, which differ from
# some of the command-line flag names. Assigning a name that is not an
# HTML2Text option just creates an unused attribute and changes nothing.
h = html2text.HTML2Text()
h.ignore_images = True   # leave images out of the Markdown
h.ignore_tables = True   # do not render table markup
h.escape_snob = True     # attribute behind the `--escape-all` CLI flag
h.inline_links = False   # attribute behind `--reference-links` (reference-style links)
h.mark_code = True       # mark code blocks explicitly in the output
h.body_width = 0         # 0 disables hard line wrapping

In [None]:
# Convert each entry's HTML content to Markdown and attach the feed author.
for blogpost in feedparser_result['entries']:
    # 'content' is removed once it has been converted, so an entry without it
    # was already processed; skipping it keeps this cell safe to re-run
    # (otherwise the second run raises KeyError: 'content').
    if 'content' not in blogpost:
        continue
    # Only the first content element is used as the post body.
    blogpost['text'] = h.handle(blogpost['content'][0]['value'])
    blogpost['author'] = feedparser_result['feed']['author']
    del blogpost['content']

In [None]:
# Show the first entry once more: it now carries 'text' (Markdown) and 'author'.
pprint.pprint(feedparser_result['entries'][0])

{'author': 'Julia Evans',
 'guidislink': False,
 'id': 'https://jvns.ca/blog/2023/08/11/some-notes-on-mastodon/',
 'link': 'https://jvns.ca/blog/2023/08/11/some-notes-on-mastodon/',
 'text': "I started using Mastodon back in November, and it's the Twitter "
         "alternative where I've been spending most of my time recently, "
         'mostly because the Fediverse is where a lot of the Linux nerds seem '
         'to be right now.\n'
         '\n'
         "I've found Mastodon quite a bit more confusing than Twitter because "
         "it's a distributed system, so here are a few technical things I've "
         "learned about it over the last 10 months. I'll mostly talk about "
         'what using a single-person server has been like for me, as well as a '
         'couple of notes about the API, DMs and ActivityPub.\n'
         '\n'
         "I might have made some mistakes, please let me know if I've gotten "
         'anything wrong!\n'
         '\n'
         "### what's a ma

## Save to data folder

In [None]:
# Serialise the processed entries as indented JSON in the data folder.
blogposts_path = DATA_FOLDER / 'blogposts.json'
with blogposts_path.open('w') as json_file:
    serialized_entries = json.dumps(feedparser_result['entries'], indent=4)
    json_file.write(serialized_entries)