This script aims to delete the duplicates created by mistakenly running [pocket2omnivore.ipynb](./pocket2omnivore.ipynb) more than once.

In [None]:
from IPython.display import display
import datetime as dt
from pathlib import Path
import logging
import os
import requests
import pandas as pd
import backoff

from urllib.parse import urlparse, parse_qs

from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

In [None]:
# Endpoint and credentials of the Omnivore GraphQL API. The key comes from the
# environment and is None when OMNIVORE_API_KEY is not set.
OMNIVORE_API_URL = "https://api-prod.omnivore.app/api/graphql"
OMNIVORE_API_KEY = os.getenv("OMNIVORE_API_KEY")

# GraphQL schema file, pinned to one commit of the omnivore repository.
SCHEMA_URL = (
    "https://raw.githubusercontent.com/omnivore-app/omnivore/"
    "c9fcbe72ddc6f40dd06e7073b8ffe3c1e71bd650"
    "/packages/api/src/generated/schema.graphql"
)

# Page size passed to the Articles query.
ARTICLES_BATCH = 1000

# Names of the helper labels this notebook creates on the server.
AMBIGUOUS_TAG = "_p2o_ambiguous"
TODELETE_TAG = "_p2o_to-delete"

In [None]:
# Download the pinned GraphQL schema text; it is handed to the gql Client
# instead of letting the client fetch the schema from the API.
# An explicit timeout is used because requests otherwise waits indefinitely.
with requests.get(SCHEMA_URL, timeout=30) as r:
    r.raise_for_status()
    schema = r.text

# r.text is always a str, so check for an empty body rather than for None.
if not schema:
    raise RuntimeError(f"The schema downloaded from {SCHEMA_URL} is empty")

In [None]:
def create_client():
    """Build a gql ``Client`` for the Omnivore API.

    The client sends ``OMNIVORE_API_KEY`` in the ``authorization`` header and
    uses the already downloaded ``schema`` rather than fetching it from the
    transport.

    Returns:
        A new ``Client``; the rest of the notebook uses it as a context
        manager.

    Raises:
        RuntimeError: If ``OMNIVORE_API_KEY`` is unset or empty.
    """
    # Fail here with a readable message instead of letting a None header value
    # blow up later, inside the HTTP layer, on the first request.
    if not OMNIVORE_API_KEY:
        raise RuntimeError("The OMNIVORE_API_KEY environment variable is not set")

    transport = RequestsHTTPTransport(
        url=OMNIVORE_API_URL,
        headers={
            'authorization': OMNIVORE_API_KEY,
        },
    )
    return Client(transport=transport, schema=schema, fetch_schema_from_transport=False, execute_timeout=None)

In [None]:
# Smoke test: ask the API who the key belongs to, so that a wrong endpoint,
# schema or credential shows up before any real work is done.

with create_client() as session:
    viewer = session.execute(gql("""
    query Viewer {
        me {
            id
            name
            profile {
                username
            }
        }
    }
    """))['me']

    USERNAME = viewer['profile']['username']
    print(f"Hello {viewer['name']} ({USERNAME})!")

In [None]:
# GraphQL document requesting one page of articles: `$batch` is passed as
# `first` and `$cursor` as `after`. The answer is either an ArticlesSuccess
# (pageInfo plus edges) or an ArticlesError (errorCodes).
queryArticles = """
query Articles ($batch: Int, $cursor: String!) {
    articles(after: $cursor, first: $batch, query: "in:all", sharedOnly: false) {
    ... on ArticlesSuccess {
        pageInfo {
            hasNextPage
            totalCount
            endCursor
        }
        edges {
            cursor
            node {
                id
                url
                createdAt
                publishedAt
                savedAt
                isArchived
                slug
                originalArticleUrl
                readingProgressPercent
                wordsCount
                title
                hash
            }
        }
    }
    ... on ArticlesError {
        errorCodes
    }
    }
}
"""

def getAllArticles():
    """Download every article by paging through the ``Articles`` query.

    Pages of ``ARTICLES_BATCH`` items are requested, starting from an empty
    cursor and following ``pageInfo.endCursor`` until ``hasNextPage`` is false.
    Progress is printed after each page.

    Returns:
        list: The ``node`` dict of every edge, in the order received.

    Raises:
        ValueError: If the API answers with ``errorCodes``.
    """
    # The document is identical for every page, so parse it only once.
    query = gql(queryArticles)

    articles = []
    cursor = ""
    hasNextPage = True

    with create_client() as c:
        while hasNextPage:
            page = c.execute(query, {'cursor': cursor, 'batch': ARTICLES_BATCH})['articles']

            if 'errorCodes' in page:
                raise ValueError(page['errorCodes'])

            articles.extend(edge['node'] for edge in page['edges'])

            pageInfo = page['pageInfo']
            hasNextPage = pageInfo['hasNextPage']
            cursor = pageInfo['endCursor']

            print(f"Current cursor: {cursor}, total: {pageInfo['totalCount']}")

    return articles

def clean_url(url):
    """Reduce *url* to its network location followed by its path.

    The scheme, parameters, query string and fragment are discarded and any
    trailing slashes are stripped from the path, e.g.
    ``https://example.com/a/?x=1`` becomes ``example.com/a``.
    """
    parsed = urlparse(url)
    path_without_trailing_slash = parsed.path.rstrip('/')
    return parsed.netloc + path_without_trailing_slash

# Load the whole library into a frame indexed by article id, then add the
# normalized URL that the following cells use to detect duplicates.
articles = pd.DataFrame.from_records(getAllArticles())
articles = articles.set_index('id')
articles['cleanUrl'] = articles['originalArticleUrl'].apply(clean_url)
articles

In [None]:
# Clean URLs that are never treated as duplicates of each other
# (presumably placeholders / redirectors shared by unrelated articles).
IGNORE_URLS = ['omnivore.app/no_url', 'www.google.com/url', 'www.google.com/search']
not_ignored = ~articles['cleanUrl'].isin(IGNORE_URLS)

# Duplicates are matched on cleanUrl; matching on the 'hash' column instead
# (articles.duplicated('hash', keep=False)) was the alternative considered.
# `duplicated` marks every member of a group, `first_duplicates` all but the first.
duplicated = not_ignored & articles.duplicated('cleanUrl', keep=False)
first_duplicates = not_ignored & articles.duplicated('cleanUrl', keep='first')

print(f"There are {duplicated.sum()} duplicates")
articles[duplicated].sort_values('title')

In [None]:
def should_keep(a1, a2):
    """Decide which of two duplicated articles survives.

    Both arguments are article rows (a Series whose ``name`` is the article
    id). The policy from https://github.com/daviddavo/pocket2omnivore/issues/4
    is applied in this order:

    1. an archived article beats a non-archived one;
    2. an article whose ``readingProgressPercent`` is above 1 beats one whose
       progress is not;
    3. otherwise the one with the smallest ``savedAt`` wins (``a1`` on a tie).

    Returns:
        The article to keep (the other one is the candidate for deletion), or
        ``None`` when the word counts differ and the pair is therefore not
        considered comparable.

    Raises:
        ValueError: If both rows are the same article, or if their
            ``cleanUrl`` differs.
    """
    if a1.name == a2.name:
        raise ValueError("Is the same article!")
    if a1['cleanUrl'] != a2['cleanUrl']:
        raise ValueError("Articles have different URL")

    # Different word counts: not safe to call them the same content.
    if a1['wordsCount'] != a2['wordsCount']:
        print(f"Warning: The articles {a1.name} and {a2.name} have different word counts")
        return None

    # Rule 1: exactly one of them is archived.
    archived1 = bool(a1['isArchived'])
    archived2 = bool(a2['isArchived'])
    if archived1 != archived2:
        return a1 if archived1 else a2

    # Rule 2: exactly one of them has been read.
    read1 = bool(a1['readingProgressPercent'] > 1)
    read2 = bool(a2['readingProgressPercent'] > 1)
    if read1 != read2:
        return a1 if read1 else a2

    # Rule 3: fall back to the one saved first.
    earliest = min(a1['savedAt'], a2['savedAt'])
    return a1 if a1['savedAt'] == earliest else a2

# Manual spot check of the policy on one pair of rows.
# NOTE(review): both ids are hard-coded, presumably taken from the author's own
# library; `.loc` raises KeyError when an id is not present in `articles`.
should_keep(articles.loc["d51e0273-cc1a-4d22-8c8f-3df76655a875"], articles.loc["5edf1133-d465-4177-8489-642b52f4c039"])

Now, for each group of duplicates, we run a kind of tournament in which only one article remains.

In [None]:
# Tournament per duplicated URL: the first article starts as the champion and
# every other article of the group challenges it through should_keep().
#   keep[url]   -> the current champion (a row of `articles`)
#   delete[url] -> the rows that lost a comparison
#   ambiguous   -> URLs whose tournament was stopped by a non-comparable pair
keep = {}
delete = {}

ambiguous = []

for url, g in articles[duplicated].groupby('cleanUrl'):
    assert len(g) >= 2
    keep[url] = g.iloc[0]
    delete[url] = []

    # The index label is not needed (it is also available as `a.name`); it is
    # deliberately not called `id`, which would shadow the builtin for every
    # later cell of the notebook.
    for _article_id, a in g.iloc[1:].iterrows():
        ret = should_keep(keep[url], a)

        if ret is None:
            # NOTE(review): articles already appended to delete[url] stay there
            # and the rest of the group is not compared - confirm this is the
            # intended handling of ambiguous groups.
            ambiguous.append(url)
            break
        # delete new
        elif ret.name == keep[url].name:
            delete[url].append(a)
        # keep new
        else:
            delete[url].append(keep[url])
            keep[url] = a

print("The following articles should be checked and deleted manually")
articles[articles['cleanUrl'].isin(ambiguous)]

# Tag articles to delete

## Create tags and needed functions

In [None]:
def getExistingTags():
    """Return the labels that already exist on the server.

    Returns:
        The ``labels`` list of the ``LabelsSuccess`` answer; every item holds
        the ``name`` and ``id`` requested by the query.
    """
    labels_query = gql("""
    query Labels {
        labels {
              ...on LabelsSuccess { 
                  labels { name, id }
              }
          }
    }
    """)

    with create_client() as session:
        response = session.execute(labels_query)

    return response['labels']['labels']

def saveTags(tagName):
    """Create a label named *tagName* with color ``#F00`` and return its id.

    The raw answer of the API is printed before it is inspected.

    Args:
        tagName: Name of the label; it is converted with ``str()``.

    Returns:
        The ``id`` of the created label.

    Raises:
        ValueError: If the API answers with ``errorCodes``.
    """
    # The name is sent as a GraphQL variable instead of being pasted into the
    # document, so quotes or braces in it cannot break or alter the mutation.
    mutation = gql("""
    mutation CreateLabel($name: String!) {
      createLabel(input: {color: "#F00", name: $name}) {
        ... on CreateLabelSuccess {
          label {
            id
            name
            color
            description
            createdAt
          }
        }
        ... on CreateLabelError {
          errorCodes
        }
      }
    }
    """)

    with create_client() as client:
        r = client.execute(mutation, {'name': str(tagName)})

    print(r)

    result = r['createLabel']
    # Same convention as getAllArticles: surface API error codes explicitly
    # rather than failing with a KeyError on the missing 'label' key.
    if 'errorCodes' in result:
        raise ValueError(result['errorCodes'])
    return result['label']['id']

In [None]:
# Make sure both helper labels exist on the server, creating the missing ones.
# `columns` is passed explicitly so that an account without any label still
# produces a frame with a 'name' column instead of raising KeyError below.
server_tags = pd.DataFrame.from_records(getExistingTags(), columns=['name', 'id'])
create_tags = {AMBIGUOUS_TAG, TODELETE_TAG} - set(server_tags['name'])
for tag in create_tags:
    print(tag)
    saveTags(tag)

if create_tags:
    # Read the labels again to pick up the ids of the ones just created.
    server_tags = pd.DataFrame.from_records(getExistingTags(), columns=['name', 'id'])

server_tags

In [None]:
# Parsed mutation that calls `setLabels` for one article (`pageId`) with a list
# of label ids and asks for the resulting label ids back.
# NOTE(review): presumably this replaces the article's current labels rather
# than adding to them - confirm against the API before relying on it.
setLabels = gql("""
mutation SetLabel($articleId: ID!, $labelIds: [ID!]!) { 
    setLabels(input: {pageId: $articleId, labelIds: $labelIds}) {
        ...on SetLabelsSuccess { 
            labels { 
                id
            }
        }
    }
}
""")

@backoff.on_predicate(
    backoff.runtime,
    predicate=lambda r: isinstance(r, RequestsHTTPTransport),
    value=lambda r: int(r.response_headers["RateLimit-Reset"]) + 1,
    jitter=None,
)
def saveLabels(articleId, labels):
    """Run the ``setLabels`` mutation for one article, retrying when rate limited.

    When the request fails with code 429 the transport that performed it is
    returned instead of a result. The ``backoff`` decorator recognizes that
    value, waits ``RateLimit-Reset`` + 1 seconds (taken from the transport's
    response headers) and calls the function again.

    Args:
        articleId: Id of the article, sent as ``articleId``.
        labels: Label ids, sent as ``labelIds``.

    Returns:
        The result of the mutation.

    Raises:
        Exception: Any error raised by the request other than a 429.
    """
    with create_client() as client:
        try:
            return client.execute(setLabels, {'articleId': articleId, 'labelIds': labels})
        except Exception as e:
            if getattr(e, 'code', None) == 429:
                # Return the transport of *this* request: the decorator reads
                # the RateLimit-Reset header from it. The previous code used an
                # unrelated global `session` here.
                return client.transport
            raise

## Now tag the articles

In [None]:
# Apply the to-delete label to every article that lost its tournament.
TODELETE_TAG_ID = server_tags.set_index('name')['id'][TODELETE_TAG]

for lst in delete.values():
    for a in lst:
        # The mutation declares `$labelIds: [ID!]!`, so pass a list explicitly
        # instead of relying on a single id being coerced into one.
        saveLabels(a.name, [TODELETE_TAG_ID])