# 0. Upload Pocket export file

First, let's upload the `ril_export.html` file generated at https://getpocket.com/export

In [None]:
from ipywidgets import FileUpload
from IPython.display import display

import os
from pathlib import Path
from time import sleep

# File picker limited to a single `.html` file (the Pocket export).
upload = FileUpload(accept='.html', multiple=False)

def save_file(inputs):
    """Write each uploaded file to disk under the name the widget reports.

    Used as the observer callback of ``upload``. The ``inputs`` change
    event is not inspected; the files are read from ``upload.value``,
    where every entry is indexed by ``'name'`` and ``'content'``.
    """
    for uploaded in upload.value:
        raw = uploaded['content']
        with open(uploaded['name'], 'wb') as out:
            # bytes() turns the widget's buffer into something write() accepts.
            out.write(bytes(raw))

# Call save_file whenever the widget's `value` trait changes.
upload.observe(save_file, names='value')

# Render the upload widget in the cell output.
display(upload)

In [None]:
# Stop here unless the export file is present in the working directory.
assert Path("ril_export.html").exists(), "Upload the file before continue running"

In [None]:
import datetime as dt

from bs4 import BeautifulSoup
import pandas as pd

# Omnivore GraphQL endpoint that every request below is sent to.
OMNIVORE_API_URL = "https://api-prod.omnivore.app/api/graphql"
# The API key will have the following format "00000000-0000-0000-0000-000000000000"
# OMNIVORE_API_KEY = "<your api key>"
OMNIVORE_API_KEY = os.environ.get('OMNIVORE_API_KEY')
# Schema file pinned to one commit of the omnivore repository.
SCHEMA_URL = "https://raw.githubusercontent.com/omnivore-app/omnivore/c9fcbe72ddc6f40dd06e7073b8ffe3c1e71bd650/packages/api/src/generated/schema.graphql"
REQUESTS_SLEEP_TIME = 1 # Number of seconds

if not OMNIVORE_API_KEY:
    # Prompt without echoing: input() would print the secret into the cell
    # output, which is stored together with the notebook.
    from getpass import getpass

    # strip() drops whitespace that is easily pasted along with the key.
    OMNIVORE_API_KEY = getpass('Enter your omnivore API key (should have a format similar to 00000000-0000-0000-0000-000000000000)').strip()

# 1. Parse the export file

The HTML has the following structure:

- `<h1>` Unread
- `<ul>` whose list items each contain an `<a>`. The `href` is the link to the article, and the anchor text is the title. Each `<a>` also has `tags` and `time_added` attributes.
- `<h1>` Read
- Another `<ul>` like the one above

We will transform this into tabular data, with the following fields:
- *read*: Boolean indicating whether the article has been read
- *time_added*: The time the item was added
- *tags*: An array of strings
- *href*: The url
- *title*: The title of the article

In [None]:
# Open as bytes so BeautifulSoup works out the encoding from the document
# itself instead of decoding with the platform's default text encoding.
with open('ril_export.html', 'rb') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Show the document title as a quick check that parsing worked.
soup.title

In [None]:
# Show the section headings found in the export.
soup.findAll('h1')

In [None]:
def process_list(h1) -> pd.DataFrame:
    """Turn the ``<ul>`` that follows a section heading into a table.

    Args:
        h1: Heading element of one export section. Its text decides the
            ``read`` flag: any heading other than ``'Unread'`` marks the
            articles of that section as read.

    Returns:
        A DataFrame with one row per ``<a href=...>`` inside the list and
        the columns ``read``, ``time_added`` (datetime), ``href``,
        ``tags`` (list of strings, as described in the notebook text) and
        ``title``. When the heading has no following list, or the list has
        no links, the frame is empty but still has these columns.
    """
    columns = ['read', 'time_added', 'href', 'tags', 'title']
    read = h1.text != 'Unread'

    ul = h1.find_next_sibling('ul')
    # A heading without a list yields no rows instead of an AttributeError.
    anchors = ul.find_all('a', href=True) if ul is not None else []

    items = [
        {
            'read': read,
            # The attribute is a string of epoch seconds; cast it here
            # because pandas deprecates parsing strings together with unit=.
            'time_added': int(a['time_added']),
            'href': a['href'],
            # Comma-separated attribute -> list, dropping empty entries so
            # an untagged article gets [] rather than [''].
            'tags': [t.strip() for t in a.get('tags', '').split(',') if t.strip()],
            'title': a.text,
        }
        for a in anchors
    ]
    # Count the links: len(ul) would also count whitespace and other
    # child nodes of the list.
    print(len(items), h1.text, 'articles')

    # Passing the columns keeps the frame well-formed when there are no rows.
    df = pd.DataFrame(items, columns=columns)
    df['time_added'] = pd.to_datetime(df['time_added'], unit='s')
    return df

In [None]:
# Stack the per-section tables into one. ignore_index gives every article a
# unique row label; otherwise each section restarts at 0 and labels repeat.
df = pd.concat([process_list(h1) for h1 in soup.findAll('h1')], ignore_index=True)
df

# 2. Realize parse was not needed, just upload the file

At least using bs is fun

In [None]:
# Quietly install the gql GraphQL client together with all of its extras.
!pip install -q gql[all]

In [None]:
import requests

# Download the pinned GraphQL schema. The timeout keeps the cell from
# hanging indefinitely when the server never answers.
with requests.get(SCHEMA_URL, timeout=30) as r:
    r.raise_for_status()
    schema = r.text

    assert schema is not None

# Show the beginning of the schema as a quick sanity check.
print(schema[:100])

In [None]:
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# Transport: HTTP requests against the Omnivore endpoint, with the API key
# sent in the `authorization` header.
transport = RequestsHTTPTransport(url=OMNIVORE_API_URL, headers={'authorization': OMNIVORE_API_KEY})

# Client bound to that transport. The schema is supplied directly, so it is
# not fetched through the transport.
client = Client(
    transport=transport,
    schema=schema,
    fetch_schema_from_transport=False,
)

In [None]:
# Smoke test: ask the API who the current user is, which confirms that the
# endpoint, the API key and the schema all work together.
viewer_query = gql("""
query Viewer {
    me {
        id
        name
        profile {
            username
        }
    }
}
""")
r = client.execute(viewer_query)

USERNAME = r['me']['profile']['username']

print(f"Hello {r['me']['name']} ({USERNAME})!")

In [None]:
# Mutation that asks Omnivore to save a URL. The response is either the
# created saving request (id, status, slug, ...) or a list of error codes.
createArticle = gql("""
  mutation CreateArticleSavingRequest($url: String!) {
    createArticleSavingRequest(input: {url: $url}) {
      ... on CreateArticleSavingRequestSuccess {
        articleSavingRequest {
          id
          status
          slug
          createdAt
          updatedAt
          url
          errorCode
        }
      }
      ... on CreateArticleSavingRequestError {
        errorCodes
      }
    }
  }
""")

# Mutation that sets the `savedAt` date of a page. The response is either
# the updated page or a list of error codes.
updatePageSavedDate =  gql("""
mutation UpdatePageDate($id: ID!, $date: Date!) {
    updatePage(input: {pageId: $id, savedAt: $date}) {
        ... on UpdatePageSuccess {
            updatedPage {
                id
                savedAt
                publishedAt
                title
            }
        }
        ...on UpdatePageError {
            errorCodes
        }
    }
}
""")

# Mutation that marks a link as archived (`archived: true`). The response
# is either the link id with a message, or a message with error codes.
archivePage = gql("""
mutation ArchivePage($id: ID!) {
    setLinkArchived (input: {linkId: $id, archived: true}) {
        ... on ArchiveLinkSuccess {
            linkId
            message
        }
        ... on ArchiveLinkError {
            message
            errorCodes
        }
    }
}
""")

def saveArticle(url, date: dt.datetime = None, archive: bool = False):
    """Save ``url`` to Omnivore, optionally back-dating and archiving it.

    The response of every request that is sent gets printed.

    Args:
        url: Address of the article to save.
        date: When given, sent as the page's ``savedAt`` value, formatted
            with ``isoformat()``.
        archive: When true, the saved link is archived right away.

    Raises:
        RuntimeError: If the API answers the save request with error codes
            instead of a saving request.
    """
    # First createArticleSavingRequest
    r = client.execute(createArticle, variable_values={'url': url})
    # Print before inspecting the payload so a failure is still visible.
    print(r)

    result = r['createArticleSavingRequest']
    saving_request = result.get('articleSavingRequest')
    if saving_request is None:
        # The error branch of the response only carries errorCodes; report
        # them instead of failing with a bare KeyError.
        raise RuntimeError(
            f"Omnivore could not save {url!r}: {result.get('errorCodes')}"
        )
    rid = saving_request['id']

    # Then updatePage to change the saved date
    if date is not None:
        r = client.execute(updatePageSavedDate, variable_values={
            'id': rid,
            'date': date.isoformat(),
        })
        print(r)

    # Then, if the content is already read, archive it directly
    if archive:
        r = client.execute(archivePage, variable_values={
            'id': rid,
        })
        # Printed like the other steps so a failed archive is not silent.
        print(r)

# Trial run with one known article: saved, dated 2023-05-01 and archived.
saveArticle("https://blog.ddavo.me/posts/tutorials/ros2-coppelia-lidar/", dt.datetime(2023,5,1), True)

In [None]:
from tqdm import tqdm
from time import sleep

# Send every parsed article to Omnivore, keeping its original date and
# archiving the ones already read. Pause after each article so the
# requests are spaced out.
for _, article in tqdm(df.iterrows(), total=len(df)):
    saveArticle(url=article['href'], date=article['time_added'], archive=article['read'])
    sleep(REQUESTS_SLEEP_TIME)