In [None]:
from pyoxigraph import Store, NamedNode, Literal, Variable, serialize, RdfFormat

#### Fedora 4 objects

In [None]:
store = Store.read_only('./db')

In [None]:
# Look for Proxy objects whose proxyFor value is an empty string ("null" value).
# This query is only defined here; none of the visible cells execute it.
# NOTE(review): STRLEN is specified for string literals; if ?o2 is bound to an
# IRI the FILTER presumably errors and drops the row -- confirm whether
# STRLEN(STR(?o2)) was intended.
query_null = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s
WHERE 
{
    ?s ns001:hasModel ?o1;
         ns012:proxyFor ?o2.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER (STRLEN(?o2) = 0) 
} 
'''

In [None]:
# Find all unique values of the proxyFor relation
query = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?o2
WHERE 
{
    ?s ns001:hasModel ?o1;
         ns012:proxyFor ?o2.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
} 
'''

In [None]:
results = store.query(query)

In [None]:
results = list(results)

In [None]:
[r for r in results if not isinstance(r[Variable('o2')], NamedNode)]

In [None]:
# Find any objects of the proxyFor relation that are not subjects in the graph
query_orphan = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?o2
WHERE 
{
    ?s ns001:hasModel ?o1;
         ns012:proxyFor ?o2.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?o2 ?p ?o3 .
    }
} 
'''

In [None]:
# Run the orphan check defined in the previous cell. The original cell re-ran
# the earlier `query` (all proxyFor values), so query_orphan was never executed.
results = store.query(query_orphan)

In [None]:
len(list(results))

In [None]:
# Find proxy containers lacking the proxyFor predicate
query_dangling = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s
WHERE 
{
    ?s ns001:hasModel ?o1;
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
'''

In [None]:
results = list(store.query(query_dangling))
len(results)

In [None]:
# Dump the subject URI of every result to a text file, one URI per line.
with open('dangling-objects.txt', 'w') as f:
    f.writelines(f"{result['s'].value}\n" for result in results)

In [None]:
results[0]

This record, for instance

- `hasModel` = `ActiveFedora::Aggregation::Proxy`
- `proxyFor` **is missing**
- `hasParent` = `http://localhost:8984/rest/prod/fx/71/9n/26/fx719n26s/members`, which `hasModel` = `ActiveFedora::IndirectContainer`
    - the parent `hasParent` = `http://localhost:8984/rest/prod/fx/71/9n/26/fx719n26s`, which `hasModel` = `GwWork`
    - the parent of the proxy record has **2 children**, one of which is the expected `FileSet` record: `http://localhost:8984/rest/prod/g4/45/cd/81/g445cd81f`

In [None]:
# This should be the usual case
query_normal = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s
WHERE 
{
    ?s ns001:hasModel ?o1;
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
'''

In [None]:
norm_results = list(store.query(query_normal))

In [None]:
len(norm_results)

In [None]:
norm_results[0]

This record, for instance

- `hasModel` = `ActiveFedora::Aggregation::Proxy`
- `proxyFor` = `http://localhost:8984/rest/prod/s4/65/5g/72/s4655g72c`, which `hasModel` = `FileSet`
- `hasParent` = `http://localhost:8984/rest/prod/hd/76/s0/18/hd76s0189/members`, which `hasModel` = `ActiveFedora::IndirectContainer`
    - the parent `hasParent` = `http://localhost:8984/rest/prod/hd/76/s0/18/hd76s0189`, which `hasModel` = `GwEtd`
    - the parent of the proxy record has only 1 child

In [None]:
# For those lacking the proxyFor relation, do their parents have VALID children?
query_valid_sibling = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>
PREFIX ldp:   <http://www.w3.org/ns/ldp#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>

SELECT DISTINCT ?s ?s1 ?s3
WHERE 
{
    ?s ns001:hasModel ?o1.
    ?s fedora:hasParent ?s1.
    ?s1 ldp:contains ?s3.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
    FILTER EXISTS { 
        ?s3  ns012:proxyFor ?o3.
    }
} 
'''

In [None]:
siblings = list(store.query(query_valid_sibling))

In [None]:
len(siblings)

In [None]:
len({q['s'].value for q in siblings})

In [None]:
from csv import DictWriter

In [None]:
# Export every solution of the valid-sibling query as one CSV row.
fields = ['bad_node', 'parent', 'valid_node']
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it rows can gain blank lines on platforms that translate
# '\n', and embedded newlines in values are not handled correctly.
with open('./nodes_to_test.csv', 'w', newline='') as f:
    writer = DictWriter(f, fields)
    writer.writeheader()
    for q in siblings:
        # Presumably iterating a solution yields its terms in SELECT order
        # (?s ?s1 ?s3), which is what the column names assume -- TODO confirm.
        writer.writerow(dict(zip(fields, [s.value for s in q])))

In [None]:
valid_sibling_ids = {q['s'].value for q in siblings}

In [None]:
# One Proxy object without a valid sibling
# This work has no FileSet associated with it
{q['s'].value for q in results} - valid_sibling_ids

In [None]:
# Grandparents of the problematic objects
query_ancestor_work = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>
PREFIX ldp:   <http://www.w3.org/ns/ldp#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?work ?title ?date
WHERE 
{
    ?s ns001:hasModel ?o1.
    ?s fedora:hasParent ?s1.
    ?s1 fedora:hasParent ?work.
    ?work dcterms:title ?title.
    ?work fedora:created ?date.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
'''

In [None]:
works = list(store.query(query_ancestor_work))

In [None]:
works[0]

In [None]:
# Export the ancestor works of the dangling proxies as one CSV row per work.
fields = ['uri', 'title', 'date_created']
# newline='' hands line-ending control to the csv module, as its documentation
# requires; this matters here because titles are free text and may contain
# embedded newlines.
with open('./works_with_dangling_proxies.csv', 'w', newline='') as f:
    writer = DictWriter(f, fields)
    writer.writeheader()
    for q in works:
        # Presumably iterating a solution yields its terms in SELECT order
        # (?work ?title ?date), which is what the column names assume -- TODO confirm.
        writer.writerow(dict(zip(fields, [s.value for s in q])))

In [None]:
# How many objects point to each of these invalid objects?
# For every Proxy lacking proxyFor (?s), match each triple that has ?s as its
# object and count the matches per ?s.
# NOTE(review): COUNT(?other) counts matching triples, so a subject linked to
# ?s through two different predicates would presumably be counted twice; use
# COUNT(DISTINCT ?other) if distinct referrers are wanted -- confirm intent.
query_points_to = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s (COUNT(?other) AS ?other_count)
WHERE 
{
    ?s ns001:hasModel ?o1.
    ?other ?p ?s.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
GROUP BY ?s
'''

In [None]:
points_to = list(store.query(query_points_to))

In [None]:
[p for p in points_to if int(p['other_count'].value) > 1]

##### Mitigation

1. `DELETE` request for the `ActiveFedora::Aggregation::Proxy` object.
    - Response should be `204`
2. `PATCH` request for the `IndirectContainer` object, with the following SPARQL Update body:
```
PREFIX ldp:  <http://www.w3.org/ns/ldp#>
DELETE {
    <> ldp:contains <deleted-uri> .
}
WHERE { }
```

**Note**: It does not appear to be necessary to update the parent of the deleted object; the reference seems to be removed by Fedora upon the object's deletion.

In [None]:
import requests
from requests import HTTPError

In [None]:
def delete_object(session, uri):
    """Issue an HTTP DELETE for ``uri`` through ``session``.

    A 204 response is treated as success. For any other status the
    response's ``raise_for_status()`` is called; if that raises an
    ``HTTPError`` the error is reported with ``print`` (including the
    response body) and swallowed. Always returns ``None``.
    """
    try:
        response = session.delete(uri)
        if response.status_code == 204:
            return
        response.raise_for_status()
    except HTTPError:
        print(f'Error deleting {uri}: {response.text}')

In [None]:
def delete_from_parent(session, uri):
    """Remove the ``ldp:contains`` reference to ``uri`` from its parent resource.

    The parent URI is derived by dropping the last path segment of ``uri``.
    A SPARQL Update ``DELETE`` is sent to the parent with an HTTP PATCH and
    the ``application/sparql-update`` content type.

    A 204 response is treated as success. For any other status the
    response's ``raise_for_status()`` is called; a resulting ``HTTPError``
    is reported with ``print`` and swallowed. Always returns ``None``.
    """
    # Doubled braces are literal braces once str.format() has run.
    update_q = '''
    PREFIX ldp:  <http://www.w3.org/ns/ldp#>

    DELETE {{
        <> ldp:contains <{deleted_uri}> .
    }}
    WHERE {{ }}
    '''
    headers = {'Content-Type': 'application/sparql-update'}
    parent_uri = '/'.join(uri.split('/')[:-1])
    try:
        # The headers were previously built but never sent, so the request
        # went out without the SPARQL Update content type.
        r = session.patch(
            parent_uri,
            data=update_q.format(deleted_uri=uri),
            headers=headers,
        )
        if r.status_code != 204:
            r.raise_for_status()
    except HTTPError:
        print(f'Error patching {uri}: {r.text}')

In [None]:
# Split the results into a single trial record (`test`) and the remainder
# (`rest`); the cells below delete `test` first and then loop over `rest`.
# NOTE(review): `results` is reassigned by several earlier cells; this
# presumably expects it to still hold list(store.query(query_dangling)) --
# confirm before running the deletions.
test, rest = results[0], results[1:]

In [None]:
sess = requests.Session()

In [None]:
delete_object(sess, test['s'].value)

In [None]:
delete_from_parent(sess, test['s'].value)

In [None]:
for r in rest:
    delete_object(sess, r['s'].value)

In [None]:
[r['s'].value for r in rest]

#### Objects outside repo root

In [None]:
# These seem to be content-admin ACL's, unconnected to other objects
query_ca = '''
PREFIX acl: <http://www.w3.org/ns/auth/acl#>

SELECT (COUNT(?s) as ?count)

WHERE { 
    ?s acl:agent <http://projecthydra.org/ns/auth/group#content-admin>
    
}
'''

In [None]:
list(store.query(query_ca))

In [None]:
# Model types for objects outside the root
query_orphans1 = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
PREFIX acl: <http://www.w3.org/ns/auth/acl#>

SELECT DISTINCT ?o ?o2 (COUNT(?s) AS ?count) 

WHERE { 
    ?s ns001:hasModel ?o.
    ?s fedora:hasParent <http://localhost:8984/rest/>.
    ?s acl:agent ?o2
}
GROUP BY ?o ?o2
'''

In [None]:
# All orphans seem to be content-admin ACLs
list(store.query(query_orphans1))

In [None]:
# Comparing these ACL's to those under the repo root
query_acl = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
PREFIX acl: <http://www.w3.org/ns/auth/acl#>

SELECT ?o (COUNT(?s) AS ?count)

WHERE { 
    ?s ns001:hasModel "Hydra::AccessControls::Permission".
    ?s acl:agent <http://projecthydra.org/ns/auth/group#content-admin>.
    ?s fedora:hasParent ?s2.
    ?s2 ns001:hasModel ?o
}
GROUP BY ?o
'''

In [None]:
list(store.query(query_acl))

All objects with the `acl:agent` predicate of `content-admin`, with the exception of those outside the repo root, have a parent of the type `Hydra::AccessControl`. Those outside the root have the `rest` endpoint as their parent, which isn't even a proper container. I think, therefore, that these orphans can be excluded from the export.

#### Deletion of non-prod objects

In [None]:
rest_uri = 'http://localhost:8984/rest/'
headers =  {'Accept': 'application/x-turtle'}

In [None]:
import requests
# Fetch the description of the REST root as Turtle.
rdf = requests.get(rest_uri, headers=headers)
# Fail fast on an HTTP error: the body is parsed below and the parsed graph
# drives the deletion of non-prod objects, so an error page must not be
# mistaken for the repository listing.
rdf.raise_for_status()

In [None]:
print(rdf.text)

In [None]:
prefixes = [r for r in rdf.text.split('\n') if r.startswith('@prefix')]

In [None]:
# Map each Turtle prefix name to its namespace IRI, for serialize(prefixes=...).
prefix_dict = {}
for prefix in prefixes:
    # A declaration line splits into ['@prefix', 'name:', '<iri>', '.'].
    p = prefix.split()
    # Drop the trailing ':' from the name token: the serializer expects bare
    # prefix names (e.g. 'ldp'), and keeping the colon would emit 'ldp::'.
    # The IRI token loses its surrounding angle brackets.
    prefix_dict[p[1].rstrip(':')] = p[2][1:-1]

In [None]:
prefix_dict

In [None]:
from pyoxigraph import parse, RdfFormat, NamedNode
# parse() returns a one-shot iterator. Materialize it so that g can be walked
# more than once: the serialize() cell would otherwise exhaust it and leave
# nothing for the non_prod comprehension to find.
g = list(parse(rdf.text, format=RdfFormat.TURTLE))

In [None]:
print(serialize(input=g, format=RdfFormat.TURTLE, prefixes=prefix_dict).decode())

In [None]:
# Collect the object IRI of every ldp:contains statement in g, skipping the
# one whose object is http://localhost:8984/rest/prod. The resulting URIs are
# what the next cell passes to delete_object.
non_prod = [node.object.value for node in g if node.predicate == NamedNode('http://www.w3.org/ns/ldp#contains') 
            and node.object != NamedNode('http://localhost:8984/rest/prod')]

In [None]:
# Destructive: issues an HTTP DELETE for every URI collected in non_prod.
# Use the Session class directly; requests.session() is only a legacy alias,
# and the mitigation cells earlier in the notebook already use requests.Session().
session = requests.Session()
for uri in non_prod:
    delete_object(session, uri)

#### Reindexing Error

In [None]:
list(store.quads_for_pattern(NamedNode('http://localhost:8984/rest/prod/5t/34/sk/59/5t34sk59v'), None, None))

#### Fedora 6 objects

In [None]:
store_6 = Store.read_only('./db-ocfl')

In [None]:
list(store_6.quads_for_pattern(NamedNode('info:fedora/prod/xd/07/gt/33/xd07gt33n/members/74a1b297-89a2-4c73-873a-2e6fb86c739c'), None, None))