In [1]:
from pyoxigraph import Store, NamedNode, Literal, Variable, serialize, RdfFormat
from itertools import groupby
from collections import defaultdict
import json
from pathlib import Path
import re
from datetime import datetime
from  dateutil.parser import isoparse 

#### Fedora 4 objects

In [None]:
store = Store.read_only('./db')

In [None]:
# Looking for null values in the proxyFor relation
query_null = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s
WHERE 
{
    ?s ns001:hasModel ?o1;
         ns012:proxyFor ?o2.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER (STRLEN(?o2) = 0) 
} 
'''

In [None]:
# Find all unique values of the proxyFor relation
query = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?o2
WHERE 
{
    ?s ns001:hasModel ?o1;
         ns012:proxyFor ?o2.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
} 
'''

In [None]:
results = store.query(query)

In [None]:
results = list(results)

In [None]:
[r for r in results if not isinstance(r[Variable('o2')], NamedNode)]

In [None]:
# Find any objects of the proxyFor relation that are not subjects in the graph
query_orphan = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?o2
WHERE 
{
    ?s ns001:hasModel ?o1;
         ns012:proxyFor ?o2.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?o2 ?p ?o3 .
    }
} 
'''

In [None]:
results = store.query(query)

In [None]:
len(list(results))

In [None]:
# Find proxy containers lacking the proxyFor predicate
query_dangling = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s
WHERE 
{
    ?s ns001:hasModel ?o1;
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
'''

In [None]:
results = list(store.query(query_dangling))
len(results)

In [None]:
with open('dangling-objects.txt', 'w') as f:
    for result in results:
        row = result['s'].value
        f.write(f'{row}\n')

In [None]:
results[0]

This record, for instance

- `hasModel` = `ActiveFedora::Aggregation::Proxy`
- `proxyFor` **is missing**
- `hasParent` = `http://localhost:8984/rest/prod/fx/71/9n/26/fx719n26s/members`, which `hasModel` = `ActiveFedora::IndirectContainer`
    - the parent `hasParent` = `http://localhost:8984/rest/prod/fx/71/9n/26/fx719n26s`, which `hasModel` = `GwWork`
    - the parent of the proxy record has **2 children**, one of which is the expected `FileSet` record: `http://localhost:8984/rest/prod/g4/45/cd/81/g445cd81f`

In [None]:
# This should be the usual case
query_normal = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s
WHERE 
{
    ?s ns001:hasModel ?o1;
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
'''

In [None]:
norm_results = list(store.query(query_normal))

In [None]:
len(norm_results)

In [None]:
norm_results[0]

This record, for instance

- `hasModel` = `ActiveFedora::Aggregation::Proxy`
- `proxyFor` = `http://localhost:8984/rest/prod/s4/65/5g/72/s4655g72c`, which `hasModel` = `FileSet`
- `hasParent` = `http://localhost:8984/rest/prod/hd/76/s0/18/hd76s0189/members`, which `hasModel` = `ActiveFedora::IndirectContainer`
    - the parent `hasParent` = `http://localhost:8984/rest/prod/hd/76/s0/18/hd76s0189`, which `hasModel` = `GwEtd`
    - the parent of the proxy record has only 1 child

In [None]:
# For those lacking the proxyFor relation, do their parents have VALID children?
query_valid_sibling = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>
PREFIX ldp:   <http://www.w3.org/ns/ldp#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>

SELECT DISTINCT ?s ?s1 ?s3
WHERE 
{
    ?s ns001:hasModel ?o1.
    ?s fedora:hasParent ?s1.
    ?s1 ldp:contains ?s3.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
    FILTER EXISTS { 
        ?s3  ns012:proxyFor ?o3.
    }
} 
'''

In [None]:
siblings = list(store.query(query_valid_sibling))

In [None]:
len(siblings)

In [None]:
len({q['s'].value for q in siblings})

In [None]:
from csv import DictWriter

In [None]:
fields = ['bad_node', 'parent', 'valid_node']
with open('./nodes_to_test.csv', 'w') as f:
    writer = DictWriter(f, fields)
    writer.writeheader()
    for q in siblings:
        writer.writerow(dict(zip(fields, [s.value for s in q])))

In [None]:
valid_sibling_ids = {q['s'].value for q in siblings}

In [None]:
# One Proxy object without a valid sibling
# This work has no FileSet associated with it
{q['s'].value for q in results} - valid_sibling_ids

In [None]:
# Grandparents of the problematic objects
query_ancestor_work = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>
PREFIX ldp:   <http://www.w3.org/ns/ldp#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?work ?title ?date
WHERE 
{
    ?s ns001:hasModel ?o1.
    ?s fedora:hasParent ?s1.
    ?s1 fedora:hasParent ?work.
    ?work dcterms:title ?title.
    ?work fedora:created ?date.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
'''

In [None]:
works = list(store.query(query_ancestor_work))

In [None]:
works[0]

In [None]:
fields = ['uri', 'title', 'date_created']
with open('./works_with_dangling_proxies.csv', 'w') as f:
    writer = DictWriter(f, fields)
    writer.writeheader()
    for q in works:
        writer.writerow(dict(zip(fields, [s.value for s in q])))

In [None]:
# How many objects point to each of these invalid objects?
query_points_to = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX ns012: <http://www.openarchives.org/ore/terms/>

SELECT DISTINCT ?s (COUNT(?other) AS ?other_count)
WHERE 
{
    ?s ns001:hasModel ?o1.
    ?other ?p ?s.
    FILTER (?o1 = "ActiveFedora::Aggregation::Proxy")
    FILTER NOT EXISTS { 
        ?s ns012:proxyFor ?o2.
    }
} 
GROUP BY ?s
'''

In [None]:
points_to = list(store.query(query_points_to))

In [None]:
[p for p in points_to if int(p['other_count'].value) > 1]

##### Mitigation

1. `DELETE` query for `AggregationProxy` object.
    - Response should be `204`
2. `PATCH` query for `IndirectContainer` object.
```
PREFIX ldp:  <http://www.w3.org/ns/ldp#>
DELETE {
    <> ldp:contains <deleted-uri> .
}
WHERE { }
```

**Note**: It does not appear to be necessary to update the parent of the deleted object; the reference seems to be removed by Fedora upon the object's deletion.

In [None]:
import requests
from requests import HTTPError

In [None]:
def delete_object(session, uri):
    try:
        r = session.delete(uri)
        if r.status_code != 204:
            r.raise_for_status()
    except HTTPError:
        print(f'Error deleting {uri}: {r.text}')

In [None]:
def delete_from_parent(session, uri):
    update_q = '''
    PREFIX ldp:  <http://www.w3.org/ns/ldp#>

    DELETE {{
        <> ldp:contains <{deleted_uri}> .
    }}
    WHERE {{ }}
    '''
    headers = {'Content-Type': 'application/sparql-update'}
    parent_uri = '/'.join(uri.split('/')[:-1])
    try:
        r = session.patch(parent_uri, data=update_q.format(deleted_uri=uri))
        if r.status_code != 204:
            r.raise_for_status()
    except HTTPError:
        print(f'Error patching {uri}: {r.text}')

In [None]:
test, rest = results[0], results[1:]

In [None]:
sess = requests.Session()

In [None]:
delete_object(sess, test['s'].value)

In [None]:
delete_from_parent(sess, test['s'].value)

In [None]:
for r in rest:
    delete_object(sess, r['s'].value)

In [None]:
[r['s'].value for r in rest]

#### Objects outside repo root

In [None]:
# These seem to be content-admin ACL's, unconnected to other objects
query_ca = '''
PREFIX acl: <http://www.w3.org/ns/auth/acl#>

SELECT (COUNT(?s) as ?count)

WHERE { 
    ?s acl:agent <http://projecthydra.org/ns/auth/group#content-admin>
    
}
'''

In [None]:
list(store.query(query_ca))

In [None]:
# Model types for objects outside the root
query_orphans1 = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
PREFIX acl: <http://www.w3.org/ns/auth/acl#>

SELECT DISTINCT ?o ?o2 (COUNT(?s) AS ?count) 

WHERE { 
    ?s ns001:hasModel ?o.
    ?s fedora:hasParent <http://localhost:8984/rest/>.
    ?s acl:agent ?o2
}
GROUP BY ?o ?o2
'''

In [None]:
# All orphans seems to be content-admin ACL's
list(store.query(query_orphans1))

In [None]:
# Comparing these ACL's to those under the repo root
query_acl = '''
PREFIX ns001: <info:fedora/fedora-system:def/model#>
PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
PREFIX acl: <http://www.w3.org/ns/auth/acl#>

SELECT ?o (COUNT(?s) AS ?count)

WHERE { 
    ?s ns001:hasModel "Hydra::AccessControls::Permission".
    ?s acl:agent <http://projecthydra.org/ns/auth/group#content-admin>.
    ?s fedora:hasParent ?s2.
    ?s2 ns001:hasModel ?o
}
GROUP BY ?o
'''

In [None]:
list(store.query(query_acl))

All objects with the `acl:agent` predicate of `content-admin`, with the exception of the those outside the repo root, have a parent of the type `Hydra::AccessControl`. Those outside the root have the `rest` endpoint as their parent, which isn't even a proper container. I think, therefore, that these orphans can be excluded from the export.

#### Deletion of non-prod objects

In [None]:
rest_uri = 'http://localhost:8984/rest/'
headers =  {'Accept': 'application/x-turtle'}

In [None]:
import requests
rdf = requests.get(rest_uri, headers=headers)

In [None]:
print(rdf.text)

In [None]:
prefixes = [r for r in rdf.text.split('\n') if r.startswith('@prefix')]

In [None]:
prefix_dict = {}
for prefix in prefixes:
    p = prefix.split()
    prefix_dict[p[1]] = p[2][1:-1]

In [None]:
prefix_dict

In [None]:
from pyoxigraph import parse, RdfFormat, NamedNode
g = parse(rdf.text, format=RdfFormat.TURTLE)

In [None]:
print(serialize(input=g, format=RdfFormat.TURTLE, prefixes=prefix_dict).decode())

In [None]:
non_prod = [node.object.value for node in g if node.predicate == NamedNode('http://www.w3.org/ns/ldp#contains') 
            and node.object != NamedNode('http://localhost:8984/rest/prod')]

In [None]:
session = requests.session()
for uri in non_prod:
    delete_object(session, uri)

#### Reindexing Error

In [None]:
list(store.quads_for_pattern(NamedNode('http://localhost:8984/rest/prod/5t/34/sk/59/5t34sk59v'), None, None))

#### Fedora 6 objects: Hyrax 3 vs. Hyrax 5

In [3]:
store_6 = Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/fedora-6-gwss/rdf-db')

In [16]:
yamls = ['https://raw.githubusercontent.com/samvera/hyrax/refs/tags/hyrax-v5.2.0/config/metadata/basic_metadata.yaml',
        'https://raw.githubusercontent.com/samvera/hyrax/refs/tags/hyrax-v5.2.0/config/metadata/core_metadata.yaml',
        'https://raw.githubusercontent.com/samvera/hyrax/refs/tags/hyrax-v5.2.0/config/metadata/file_set_metadata.yaml',
        'https://raw.githubusercontent.com/samvera/hyrax/refs/tags/hyrax-v5.2.0/config/metadata/hyrax_internal_metadata.yaml']


In [17]:
import yaml
from yaml import CLoader
import requests

In [102]:
yaml_predicates = []
for y in yamls:
    r = requests.get(y)
    doc = yaml.load(r.text, Loader=CLoader)
    for k, d in doc['attributes'].items():
        yaml_predicates.append((k, d['predicate']))

In [35]:
missing_predicates = []
for p in yaml_predicates:
    it = iter(store_6.quads_for_pattern(None, NamedNode(p[1]), None))
    try:
        next(it)
    except StopIteration:
        missing_predicates.append(p)

Of the explicitly defined predicates in the Hyrax 5 metadata YAML files, the following appears to hold true:
- The Dublin Core terms are derived in [Hyrax 3.6](https://github.com/samvera/hyrax/blob/hyrax-v3.6.0/app/models/concerns/hyrax/basic_metadata.rb) from `::RDF::Vocab::DC` and `::RDF::Vocab::DC11` (which, n.b., is the same version used in Hyrax 5.x).
- The `#importUrl` and `#relativePath` predicates are defined in Hyrax 3.6 basic metadata, too, using the same URI's.
- The `#arkivoChecksum` is likewise [defined](https://github.com/samvera/hyrax/blob/hyrax-v3.6.0/app/models/concerns/hyrax/works/metadata.rb) and unchanged



In [97]:
pred_query = '''
select distinct ?p
where { 
    ?s ?p ?o.
}
'''

In [103]:
gwss_pred_set = list(store_6.query(pred_query))

In [109]:
gwss_pred_set = sorted([p['p'].value for p in gwss_pred_set])

In [50]:
pcdm_query = '''
PREFIX use: <http://pcdm.org/use#>

SELECT ?s ?p ?o

WHERE { 
    ?s ?p ?o
    FILTER (STRSTARTS(STR(?o), "http://pcdm.org/use"))
}
'''

In [51]:
pcdm_results = iter(store_6.query(pcdm_query))

In [75]:
next(pcdm_results)

<QuerySolution s=<NamedNode value=info:fedora/prod/63/95/w7/69/6395w769w/files/747b4659-5ce2-4177-82b7-e4df0590c52f> p=<NamedNode value=http://www.w3.org/1999/02/22-rdf-syntax-ns#type> o=<NamedNode value=http://pcdm.org/use#ExtractedText>>

The `pcdm` URI's are used to define the role of a file (`OriginalFile` or `ExtractedText`) in GWSS. How does this relate to the `http://vocabulary.samvera.org/ns#pcdmUse` defined in Hyrax 5?

Let's build a graph from a sample Hyrax 5 repo to compare.

In [100]:
store_h5 =Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/hyrax-5/ocfl-db')

In [107]:
h5_pred_set = list(store_h5.query(pred_query))
h5_pred_set = [p['p'].value for p in h5_pred_set]

In [108]:
# Combine YAML-defined predicates with predicates from test repo graph
h5_pred_set = set(h5_pred_set).union({p[1] for p in yaml_predicates})

In [110]:
len(set(gwss_pred_set) & (h5_pred_set))

41

In [112]:
combined_set = set(gwss_pred_set) & (h5_pred_set)
pred_table = []
for pred in h5_pred_set:
    if pred in combined_set:
        pred_table.append({'gwss': pred, 'hyrax-5': pred})
    else:
        pred_table.append({'gwss': None, 'hyrax-5': pred})
for pred in gwss_pred_set:
    if pred not in combined_set:
        pred_table.append({'gwss': pred, 'hyrax-5': None})

In [119]:
from csv import DictWriter
headers = ['gwss', 'hyrax-5']
with open('../predicates.csv', 'w') as f:
    w = DictWriter(f, fieldnames=headers)
    w.writeheader()
    for row in pred_table:
        w.writerow(row)

In [79]:
# Predicates in the Hyrax 5 test not in GWSS
h5_only = [p['p'].value for p in h5_pred_set if p['p'].value not in pred_set]

In [90]:
h5_only = set(h5_only).union(set([p[1] for p in missing_predicates]))

In [81]:
it = iter(store_h5.quads_for_pattern(None, NamedNode(h5_only[0]), None))
next(it)

<Quad subject=<NamedNode value=info:fedora/production/a9/b2/41/54/a9b24154-cefb-4aca-9f5c-f073ae08f4f9> predicate=<NamedNode value=http://vocabulary.samvera.org/ns#pcdmUse> object=<NamedNode value=http://pcdm.org/use#OriginalFile> graph_name=<DefaultGraph>>

In [177]:
def match_1_object(g, pred=None, obj=None):
    query = '''
    select distinct ?s ?p ?o
        where {{
            ?s ?p ?o
            {{select ?s
                where {{  
                    ?s ?p ?o;
                    filter(strstarts(str({node}), "{uri}"))
                }} 
                limit 1
            }}
        }}
    '''
    if pred:
        query = query.format(node="?p", uri=pred)
    else:
        query = query.format(node="?o", uri=obj)
    return list(g.query(query))

In [160]:
def get_model_counts(pred_uri, g):
    query = '''
    prefix fedora: <info:fedora/fedora-system:def/model#>
    
    select (count(distinct ?s) as ?count) ?o
        where {{
            ?s fedora:hasModel ?o;
                ?p ?o1.
                filter(strstarts(str(?p), "{uri}"))
        }}
        group by ?o
    '''
    return {r['o'].value: r['count'].value for r in g.query(query.format(uri=pred_uri))}

In [195]:
def get_objects_no_model(g):
    query = '''
    prefix fedora: <info:fedora/fedora-system:def/model#>
    
    select distinct ?s ?p ?o
        where {
            ?s ?p ?o;
            filter not exists { ?s fedora:hasModel ?o1 }
        }
        order by desc(?s)
    '''
    return {k: list(g) for k, g in groupby(g.query(query), key=lambda x: x['s'].value)}

In [201]:
def count_objects_no_model(g):
    query = '''
        prefix fedora: <info:fedora/fedora-system:def/model#>
        prefix ns: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        select (count(distinct ?s) as ?count) ?o
            where {
                ?s ns:type ?o.
                filter not exists { ?s fedora:hasModel ?o1 }
            }
            group by ?o
        '''
    return {r['o'].value: r['count'].value for r in g.query(query)}

In [211]:
def count_objects_pcdm(g):
    query = '''
     select (count(distinct ?s) as ?count) ?p ?o
            where {
                ?s ?p ?o.
                filter(strstarts(str(?p), "http://vocabulary.samvera.org/ns#pcdm"))
            }
            group by ?p ?o 
        '''
    return {(r['p'].value, r['o'].value): r['count'].value for r in g.query(query)}

In [227]:
def get_links_by_model(g):
    query = '''
    prefix fedora: <info:fedora/fedora-system:def/model#>
    select distinct (?m1 as ?model) (?m2 as ?model_linked_to)
    where { 
        ?s fedora:hasModel ?m1.
        ?s ?p ?s2.
        ?s2 ?p1 ?o.
        ?s2 fedora:hasModel ?m2.
    }
    '''
    return [(r['model'].value, r['model_linked_to'].value) for r in g.query(query)]

In [162]:
get_model_counts('info:fedora/fedora-system:def/model#', store_h5)

{'AcademicDocument': '1',
 'Hyrax::AdministrativeSet': '2',
 'Hyrax::FileMetadata': '61',
 'Hyrax::AccessControl': '40',
 'Hyrax::FileSet': '26',
 'ArchivalDocument': '10',
 'Hyrax::Embargo': '2',
 'CollectionResource': '1',
 'Hyrax::Permission': '212'}

In [163]:
get_model_counts('info:fedora/fedora-system:def/model#', store_6)

{'ActiveFedora::Aggregation::Proxy': '14674',
 'GwJournalIssue': '3',
 'Hydra::AccessControls::Permission': '82313',
 'ActiveFedora::DirectContainer': '13717',
 'ActiveFedora::Aggregation::ListSource': '12667',
 'ActiveFedora::IndirectContainer': '13553',
 'Hydra::AccessControls::Embargo': '3245',
 'FileSet': '13719',
 'GwEtd': '6457',
 'Collection': '11',
 'AdminSet': '3',
 'GwWork': '6216',
 'Hydra::AccessControls::Lease': '1823',
 'Hydra::AccessControl': '30949'}

The following objects in Hyrax 5 would have a model of `Hyrax::FileMetadata`

In [202]:
count_objects_no_model(store_6)

{'http://pcdm.org/use#OriginalFile': '13710',
 'http://pcdm.org/models#File': '24276',
 'http://pcdm.org/use#ExtractedText': '13060'}

In [212]:
count_objects_pcdm(store_h5)

{('http://vocabulary.samvera.org/ns#pcdmUse',
  'http://pcdm.org/use#OriginalFile'): '27',
 ('http://vocabulary.samvera.org/ns#pcdmUse',
  'http://pcdm.org/use#ExtractedText'): '3',
 ('http://vocabulary.samvera.org/ns#pcdmUse',
  'http://pcdm.org/use#ServiceFile'): '12',
 ('http://vocabulary.samvera.org/ns#pcdmUse',
  'http://pcdm.org/use#ThumbnailImage'): '19'}

In [173]:
objs_no_model = get_objects_no_model(store_h5)

In [175]:
objs_no_model

{'info:fedora/production/b9/08/e5/bb/b908e5bb-28be-4733-bd64-e749a13c8207#g746240': [<QuerySolution s=<NamedNode value=info:fedora/production/b9/08/e5/bb/b908e5bb-28be-4733-bd64-e749a13c8207#g746240> p=<NamedNode value=http://www.openarchives.org/ore/terms/proxyFor> o=<NamedNode value=info:fedora/production/0f/d1/37/8d/0fd1378d-6b31-4ce2-86a5-e23da5b739c3>>],
 'info:fedora/production/a6/bb/fc/4a/a6bbfc4a-0b4d-4172-b234-7a85b14c33bd#g634040': [<QuerySolution s=<NamedNode value=info:fedora/production/a6/bb/fc/4a/a6bbfc4a-0b4d-4172-b234-7a85b14c33bd#g634040> p=<NamedNode value=http://www.openarchives.org/ore/terms/proxyFor> o=<NamedNode value=info:fedora/production/44/18/c8/a4/4418c8a4-a0ed-4ee9-9385-e8b29be234c8>>],
 'info:fedora/production/a6/bb/fc/4a/a6bbfc4a-0b4d-4172-b234-7a85b14c33bd#g634020': [<QuerySolution s=<NamedNode value=info:fedora/production/a6/bb/fc/4a/a6bbfc4a-0b4d-4172-b234-7a85b14c33bd#g634020> p=<NamedNode value=http://www.openarchives.org/ore/terms/proxyFor> o=<NamedN

The `updatedAt` and `createdAt` relations are server-managed triples in Fedora 6. Those relations appear to be replciated in the object-level schema using the predicate below (and similar for `#createdAt`) by Hyrax 5.

In [182]:
match_1_object(store_h5, pred='http://vocabulary.samvera.org/ns#updatedAt')

[<QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=http://vocabulary.samvera.org/ns#pcdmUse> o=<NamedNode value=http://pcdm.org/use#ThumbnailImage>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=http://www.loc.gov/premis/rdf/v1#hasSize> o=<Literal value=18218 datatype=<NamedNode value=http://example.com/predicate/valkyrie_int>>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#filename> o=<Literal value=79-thumbnail.jpeg datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=info:fedora/fedora-system:def/model#hasModel> o=<Literal value=Hyrax::FileMetadata data

**QUESTION**: It doesn't appear as though Hyrax 3 created metadata objects for thumbnails, whereas they do exist in Hyrax 5. 

In [180]:
match_1_object(store_6, obj="disk:///")

[]

In [181]:
match_1_object(store_h5, obj="disk:///")

[<QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=http://vocabulary.samvera.org/ns#pcdmUse> o=<NamedNode value=http://pcdm.org/use#ThumbnailImage>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=http://www.loc.gov/premis/rdf/v1#hasSize> o=<Literal value=18218 datatype=<NamedNode value=http://example.com/predicate/valkyrie_int>>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#filename> o=<Literal value=79-thumbnail.jpeg datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/c7/1f/34/bf/c71f34bf-3618-4000-b855-71a3f82a1b08> p=<NamedNode value=info:fedora/fedora-system:def/model#hasModel> o=<Literal value=Hyrax::FileMetadata data

In Hyrax 3, the `#OriginalFile` metadata object does not have a `fedora:hasModel` relation. Rather, it has two instances of `rdf-syntax-ns#type`, `OriginalFile` and `File`. 

If we look at this metadata object on disk (in OCFL), it lives beside the binary object it describes, such that the binary object has the name `a00be7f3-92ac-4dda-aeab-12a1be087b89`, and the metadata triples have the name `a00be7f3-92ac-4dda-aeab-12a1be087b89~fcr-desc.nt`.

In [185]:
match_1_object(store_6, obj="http://pcdm.org/use#OriginalFile")

[<QuerySolution s=<NamedNode value=info:fedora/prod/05/74/1r/76/05741r76p/files/a00be7f3-92ac-4dda-aeab-12a1be087b89> p=<NamedNode value=http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#dateCreated> o=<Literal value=2013:05:08 18:44:13-04:00 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/prod/05/74/1r/76/05741r76p/files/a00be7f3-92ac-4dda-aeab-12a1be087b89> p=<NamedNode value=http://purl.org/dc/elements/1.1/creator> o=<Literal value=Christine datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/prod/05/74/1r/76/05741r76p/files/a00be7f3-92ac-4dda-aeab-12a1be087b89> p=<NamedNode value=http://purl.org/dc/elements/1.1/creator> o=<Literal value=MicrosoftÂ® Word 2010 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/prod/05/74/1r/76/05741r76p/files/a00be7f3-92ac-4dda-aeab-12a1be087b89> p=<

In Hyrax 5, file-level metadata has a `fedora:hasModel` relation of `Hyrax::FileMetadata`. This object also has a reference to the binary stored under the `fileIdentifier` relation. 

In the OCFL, the binary resides under this separate location, as specified by the file identifier, along with what appears to be an empty `.nt` file. 

In [191]:
match_1_object(store_h5, obj="http://pcdm.org/use#OriginalFile")

[<QuerySolution s=<NamedNode value=info:fedora/production/a9/b2/41/54/a9b24154-cefb-4aca-9f5c-f073ae08f4f9> p=<NamedNode value=http://vocabulary.samvera.org/ns#valid> o=<Literal value=true datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/a9/b2/41/54/a9b24154-cefb-4aca-9f5c-f073ae08f4f9> p=<NamedNode value=http://purl.org/dc/terms/id> o=<NamedNode value=info:fedora/production/a9/b2/41/54/a9b24154-cefb-4aca-9f5c-f073ae08f4f9>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/a9/b2/41/54/a9b24154-cefb-4aca-9f5c-f073ae08f4f9> p=<NamedNode value=http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#width> o=<Literal value=6690 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>>>,
 <QuerySolution s=<NamedNode value=info:fedora/production/a9/b2/41/54/a9b24154-cefb-4aca-9f5c-f073ae08f4f9> p=<NamedNode value=http://www.w3.org/2003/12/exif/ns#colorSpace> o=<Literal value=YCbCr datatype=<Named

In [228]:
get_links_by_model(store_6)

[('AdminSet', 'Hydra::AccessControl'),
 ('Collection', 'Hydra::AccessControl'),
 ('FileSet', 'Hydra::AccessControl'),
 ('FileSet', 'Hydra::AccessControls::Embargo'),
 ('FileSet', 'Hydra::AccessControls::Lease'),
 ('GwEtd', 'FileSet'),
 ('GwEtd', 'AdminSet'),
 ('GwEtd', 'Hydra::AccessControls::Lease'),
 ('GwEtd', 'Hydra::AccessControls::Embargo'),
 ('GwEtd', 'Hydra::AccessControl'),
 ('GwJournalIssue', 'GwWork'),
 ('GwJournalIssue', 'Collection'),
 ('GwJournalIssue', 'AdminSet'),
 ('GwJournalIssue', 'Hydra::AccessControl'),
 ('GwJournalIssue', 'FileSet'),
 ('GwWork', 'FileSet'),
 ('GwWork', 'AdminSet'),
 ('GwWork', 'Hydra::AccessControl'),
 ('GwWork', 'Collection'),
 ('GwWork', 'Hydra::AccessControls::Lease'),
 ('GwWork', 'Hydra::AccessControls::Embargo'),
 ('GwWork', 'GwWork'),
 ('Hydra::AccessControls::Permission', 'GwWork'),
 ('Hydra::AccessControls::Permission', 'FileSet'),
 ('Hydra::AccessControls::Permission', 'GwEtd'),
 ('Hydra::AccessControls::Permission', 'Collection'),
 ('Hydr

In [229]:
get_links_by_model(store_h5)

[('Hyrax::FileSet', 'Hyrax::FileSet'),
 ('Hyrax::FileSet', 'Hyrax::FileMetadata'),
 ('Hyrax::FileSet', 'Hyrax::Embargo'),
 ('Hyrax::FileMetadata', 'Hyrax::FileSet'),
 ('Hyrax::FileMetadata', 'Hyrax::FileMetadata'),
 ('CollectionResource', 'CollectionResource'),
 ('Hyrax::AccessControl', 'Hyrax::FileSet'),
 ('Hyrax::AccessControl', 'Hyrax::Permission'),
 ('Hyrax::AccessControl', 'Hyrax::AccessControl'),
 ('Hyrax::AccessControl', 'ArchivalDocument'),
 ('Hyrax::AccessControl', 'Hyrax::AdministrativeSet'),
 ('Hyrax::AccessControl', 'CollectionResource'),
 ('Hyrax::AccessControl', 'AcademicDocument'),
 ('ArchivalDocument', 'ArchivalDocument'),
 ('ArchivalDocument', 'Hyrax::AdministrativeSet'),
 ('ArchivalDocument', 'Hyrax::FileSet'),
 ('ArchivalDocument', 'CollectionResource'),
 ('AcademicDocument', 'AcademicDocument'),
 ('AcademicDocument', 'Hyrax::AdministrativeSet'),
 ('AcademicDocument', 'Hyrax::FileSet'),
 ('AcademicDocument', 'Hyrax::Embargo')]

#### Mapping GWSS/Fedora 6 objects to Solr documents

In [2]:
store_6 = Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/fedora-6-gwss/rdf-db')

In [3]:
def to_fedora_node(solr_id):
    return NamedNode(f'info:fedora/prod/{solr_id[:2]}/{solr_id[2:4]}/{solr_id[4:6]}/{solr_id[6:8]}/{solr_id}')

In [4]:
fedora_id_pattern = re.compile(r'info:fedora/prod/(?:[a-z0-9]{2}/){4}(.+)')

In [11]:
def to_solr_id(uri):
    m = fedora_id_pattern.match(uri)
    if m:
        return m.group(1)

In [33]:
def convert_timestamp(t1):
    try:
        return isoparse(t1).strftime('%Y-%m-%dT%H:%M:%S')
    except ValueError:
        return t1

`SolrFedoraMapping.map_docs` iterates through a directory of `.json` files exported from a Hyrax Solr index, creating a mapping from Solr fields to the Fedora predicates that serve as their counterparts.

Predicates are mapped to Solr fields using the predicate/field values, which are assumed (in most cases) to be identical. The mapping is still indeterminate, however, since there is no guarantee that the same value will not appear in multiple fields in the same document. 

Values that require transformation:
- Timestamps, which are standardized by removing time zone information and converting to ISO format strings.
- Object identifiers, which are converted between Solr strings and Fedora pairtree URIs

The process is as follows:
1. For each Solr doc, invert it, so that the values becomes keys, with each value pointing to one or more fields that hold that value in the Solr doc. Ignore any values that are blank/null.
2. Retrieve the node (a collection of triples) from the Fedora RDF graph that corresponds to this object, using the Solr document identifier.
3. For each predicate, try to find a match by value in the inverted Solr doc.
   - To aid in the reduction of ambiguity, each possible match is counted, resulting in an overall tally (across all Solr documents) of how many documents contained that possible predicate/field pair.
4. Record the tally of possible matches, grouped by Fedora object model 
5. Unmapped predicates are recorded (with a count of the models associated with each)
6. Solr IDs with no match in the RDF grap are also recorded.

In [103]:
class SolrFedoraMapping:

    def __init__(self, path_to_solr_docs, rdf_graph):
        self.g = rdf_graph
        self.path = Path(path_to_solr_docs)
        self.mapping = defaultdict(dict)
        # URI for model definition
        self.model_predicate = NamedNode('info:fedora/fedora-system:def/model#hasModel')
        # Used for type definitions for file metadata objects in Hyrax 3
        self.type_predicate = NamedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
        # Unmapped predicates
        self.unmapped = defaultdict(dict)
        # Unmatched objects
        self.unmatched = []

    def make_solr_lookup(self, solr_doc):
         # Construct reverse lookup for Solr doc
        lookup = defaultdict(list)
        for k, v in solr_doc.items():
            # Solr internal fields
            if k == 'id' or k == 'score':
                continue
            # Handle multi-valued predicates
            if isinstance(v, list):
                for value in v:
                    # Skip empty values
                    if value:
                        lookup[value].append(k)
            elif isinstance(v, str) and k.endswith('dtsi'):
                ts_key = convert_timestamp(v)
                lookup[ts_key].append(k) 
            else:
                lookup[v].append(k)
        return lookup
    
    def add_to_map(self, solr_doc):
        _id = solr_doc.get('id')
        lookup = self.make_solr_lookup(solr_doc)
        # Get all triples with this ID as the subject
        triples = self.g.quads_for_pattern(to_fedora_node(_id), None, None)
        mapping = defaultdict(dict)
        unmapped = []
        model = None
        types = []
        unmatched = True
        for triple in triples:
            unmatched = False
            # The object value for the triple should match the datum in at least one Solr field       
            lookup_key = triple.object.value
            solr_field = lookup.get(lookup_key) or lookup.get(to_solr_id(lookup_key)) or lookup.get(convert_timestamp(lookup_key))
            if solr_field:
                # In case the mapping is not 1:1
                for field in solr_field:
                    mapping[triple.predicate.value][field] =  mapping[triple.predicate.value].get(field, 0) + 1
            else:
                unmapped.append(triple.predicate.value)
            if triple.predicate == self.model_predicate:
                model = triple.object.value
            elif not model and triple.predicate == self.type_predicate:
                types.append(triple.object.value)
        if unmatched:
            self.unmatched.append(_id)
            return
        if model:
            model_key = (model,)
        else:
            model_key = tuple(sorted(types))
        model_mapping = self.mapping[model_key]
        for k, v in mapping.items():
            if k in model_mapping:
                for value, count in v.items():
                    model_mapping[k][value] = model_mapping[k].get(value, 0) + count
            else:
                model_mapping[k] = v
        for pred in unmapped:
            self.unmapped[pred][model_key] = self.unmapped[pred].get(model_key, 0) + 1
                    
    def map_docs(self):
        for p in self.path.glob('*.json'):
            with open(p) as f:
                docs = json.load(f)
                for doc in docs:
                    self.add_to_map(doc)

In [16]:
test_file = '/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/gwss/solr_docs/solr-batch-184.json'

In [17]:
with open(test_file) as f:
    test_doc = json.load(f)

In [49]:
test_doc[0]['id']

'xs55mc65x'

In [104]:
sfm_test = SolrFedoraMapping('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/gwss/solr_docs', store_6)

In [105]:
for d in test_doc[:100]:
    sfm_test.add_to_map(d)

In [None]:
sfm_test.mapping

In [107]:
sfm = SolrFedoraMapping('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/gwss/solr_docs', store_6)

In [108]:
sfm.map_docs()

In [123]:
with open('../gwss-solr-fedora-mapping.pkl', 'wb') as f:
    pickle.dump(sfm.mapping, f)
with open('../gwss-solr-fedora-unmapped.pkl', 'wb') as f:
    pickle.dump(sfm.unmapped, f)
with open('../gwss-solr-fedora-unmatched.json', 'w') as f:
    json.dump(sfm.unmatched, f)

In [110]:
len(sfm.unmatched)

6012

In [124]:
sfm.mapping.keys()

dict_keys([('Hydra::AccessControls::Permission',), ('Hydra::AccessControl',), ('Hydra::AccessControls::Lease',), ('Hydra::AccessControls::Embargo',), ('FileSet',), ('ActiveFedora::DirectContainer',), ('GwEtd',), ('ActiveFedora::Aggregation::ListSource',), ('ActiveFedora::IndirectContainer',), ('ActiveFedora::Aggregation::Proxy',), ('GwWork',), ('Collection',), ('GwJournalIssue',), ('AdminSet',)])

#### Resource Types

In [4]:
store_6 = Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/fedora-6-gwss/rdf-db')

In [9]:
def count_resource_type(g):
    query = '''
    prefix pcdm_type: <http://purl.org/dc/terms/type>

    select (count(distinct ?s) as ?count) ?o
    where { 
        ?s pcdm_type: ?o
    }
    group by ?o
    '''
    return {r['o'].value: r['count'].value for r in g.query(query)}

In [10]:
count_resource_type(store_6)

{'Research Paper': '61',
 'Poster': '177',
 'Microfilm': '2511',
 'Brochure': '1',
 'Newsletter': '115',
 'Meeting minutes': '734',
 'Meeting Minutes': '184',
 'Other': '717',
 'article': '1',
 'Journal': '329',
 'Exhibition catalog': '7',
 'Dataset': '7',
 'Dissertation': '4321',
 'Book review': '1',
 'Archival materials': '3432',
 'Working Paper': '41',
 'Report': '160',
 'Article': '724',
 'Thesis or Dissertation': '45',
 'Part of Book': '15',
 'Artice': '14',
 'Book': '6',
 "Master's Thesis": '2128',
 'Commentary': '78',
 'Presentation': '23',
 'Capstone Project': '10',
 'Project': '11',
 'Research paper': '6',
 'Book Review': '98',
 'Capstone Project ': '26',
 'Image': '123'}

In [44]:
archival = ['Microfilm', 'Brochure', 'Meeting minutes', 'Meeting Minutes', 'Exhibition catalog', 'Archival materials', 'Image']

In [125]:
def get_ids_by_type_with_file_data(g, resource_types):
    values_string = " ".join([f'("{t}")' for t in resource_types])
    query = '''
    prefix pcdm_type: <http://purl.org/dc/terms/type>
    prefix pcdm_models: <http://pcdm.org/models#>
    prefix sd: <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo>
    prefix ns: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    prefix ec: <http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#>
    prefix premis: <http://www.loc.gov/premis/rdf/v1#>
    
    select distinct ?s (group_concat(?type; separator="|") as ?resource_types) ?mimetype ?filesize
    where {{ 
        values (?type) {{ {values_string} }}
        ?s pcdm_type: ?type.
        ?s pcdm_models:hasMember ?fileset.
        ?fileset pcdm_models:hasFile ?file.
        ?file ns:type ?filetype.
        ?file ec:hasMimeType ?mimetype.
        ?file premis:hasSize ?filesize.
        filter(?filetype = <http://pcdm.org/use#OriginalFile>)
    }} 
    group by ?s ?mimetype ?filesize
    '''
    return [[r.value for r in result] for result in g.query(query.format(values_string=values_string))]

In [126]:
archival_objects = get_ids_by_type_with_file_data(store_6, archival)

In [134]:
import csv

In [135]:
with open('gwss-archival-works.csv', 'w') as f:
    writer = csv.writer(f)
    for ao in archival_objects:
        uri = f"https://scholarspace.library.gwu.edu/concern/gw_works/{ao[0].split('/')[-1]}"
        writer.writerow([uri] + ao[1:])

#### Embargo analysis

In [3]:
store_gwss = Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/gwss/rdf-db/')

In [12]:
# Work that should have embargo on the file
embargoed_work = 'dv13zv069'

In [5]:
def id_to_uri(_id):
    return f'http://localhost:8984/rest/prod/{_id[:2]}/{_id[2:4]}/{_id[4:6]}/{_id[6:8]}/{_id}'

In [70]:
def make_gwss_url(uri, work_type):
    _id = uri.split('/')[-1]
    work_type = 'gw_works' if work_type == 'GwWork' else 'gw_etds'
    return f'https://scholarspace.library.gwu.edu/concern/{work_type}/{_id}'

In [105]:
def make_fileset_url(parent, self):
    parent_id = parent.split('/')[-1]
    self_id = self.split('/')[-1]
    return f'https://scholarspace.library.gwu.edu/concern/parent/{parent_id}/file_sets/{self_id}'

In [108]:
def get_embargoes(g):
    emb_query = '''
    prefix fedora_model: <info:fedora/fedora-system:def/model#>
    prefix hydra_acl: <http://projecthydra.org/ns/auth/acl#>
    prefix pcdm_model: <http://pcdm.org/models#>
    
    select distinct ?embargoSubject ?sModel ?parent ?pModel ?visibility ?releaseDate 
    where {
        ?s fedora_model:hasModel ?model.
        ?s hydra_acl:embargoReleaseDate ?releaseDate.
        ?s hydra_acl:visibilityDuringEmbargo ?visibility.
        ?embargoSubject hydra_acl:hasEmbargo ?s.
        ?embargoSubject fedora_model:hasModel ?sModel.
        optional {?parent pcdm_model:hasMember ?embargoSubject.
                ?parent fedora_model:hasModel ?pModel}
        filter(str(?model) = "Hydra::AccessControls::Embargo")
    
    }
    '''
    results = []
    for result in g.query(emb_query):
        row = {'embargoed_object': result['embargoSubject'].value.split('/')[-1],
               'object_model': result['sModel'].value,
                'visibility': result['visibility'].value,
               'release_date': result['releaseDate'].value}
        if result['parent']:
            row['parent_url'] = make_gwss_url(result['parent'].value, result['pModel'].value)
            row['work_type'] = result['pModel'].value
            row['url'] = make_fileset_url(result['parent'].value, result['embargoSubject'].value)
        else:
            row['url'] = make_gwss_url(result['embargoSubject'].value, result['sModel'].value)
        results.append(row)
    return results

In [109]:
embargoes = get_embargoes(store_gwss)

In [124]:
import requests
from datetime import datetime

In [134]:
def check_access(record):
    if check_embargo_date(record):
        return False, record['url']
    json_url = record['url'] + '.json'
    r = requests.get(json_url)
    r = r.json()
    if (not r.get('code')) or (not (r['code'] == 401)):
        return True, record['url']
    return False, record['url']

In [135]:
access_checks = [check_access(e) for e in embargoes]

In [131]:
def check_embargo_date(record):
    return datetime.fromisoformat(record['release_date']).replace(tzinfo=None) < datetime.now()

In [137]:
[row for row in access_checks if row[0]]

[(True,
  'https://scholarspace.library.gwu.edu/concern/parent/12579t09n/file_sets/8336h293m'),
 (True,
  'https://scholarspace.library.gwu.edu/concern/parent/08612p543/file_sets/tt44pn73b')]

#### Version Analysis

In [138]:
def get_version_stats(g):
    versions_query = '''
    PREFIX fedora: <http://fedora.info/definitions/v4/repository#>
    PREFIX fedmodel: <info:fedora/fedora-system:def/model#> 
    PREFIX ldp: <http://www.w3.org/ns/ldp#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX pcdm: <http://pcdm.org/models#>
    prefix ebucore: <http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#> 
    
     select ?work ?model ?filename ?num_versions
     where {
         { select ?work ?model ?filename (count(?version) as ?num_versions)
    
            where { 
                ?file fedora:hasParent ?f.
                ?file rdf:type <http://pcdm.org/use#OriginalFile>.
                ?f ldp:hasMemberRelation <http://pcdm.org/models#hasFile>.
                ?f ldp:membershipResource ?fileset.
                ?work pcdm:hasMember ?fileset.
                ?work fedmodel:hasModel ?model.
                ?file fedora:hasVersion ?version.
                ?file ebucore:filename ?filename
            }
            group by ?work ?model ?filename
        }
        filter(?num_versions > 1)
    }
    '''
    return list(dict(zip(['work', 'model', 'filename', 'num_versions'], [c.value for c in r]))
                for r in g.query(versions_query))

In [None]:
store_gwss = Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/gwss/rdf-db/')

In [None]:
versions = get_version_stats(store_gwss)

In [6]:
def fed_to_gwss(record):
    model = record['model'].lower()
    model = f'{model[:2]}_{model[2:]}s'
    _id = record['work'].split('/')[-1]
    return f'https://scholarspace.library.gwu.edu/concern/{model}/{_id}'

In [None]:
from csv import DictWriter
with open('../gwss/multiple_versions.csv', 'w') as f:
    fieldnames = versions[0].keys()
    writer = DictWriter(f, fieldnames)
    writer.writeheader()
    for version in versions:
        row = {'work': fed_to_gwss(version)}
        row['model'] = version['model']
        row['filename'] = version['filename']
        row['num_versions'] = version['num_versions']
        writer.writerow(row)

#### Language field analysis

In [2]:
store_gwss = Store.read_only('/Users/dsmith/Documents/code/gwss/testing/migration/fcrepo_graph/gwss/rdf-db/')

In [12]:
def get_language_and_model(g):
    lang_query = '''
    prefix fedora: <info:fedora/fedora-system:def/model#>
    select ?resource ?language ?model
    where { 
        values ?hasLang { <http://purl.org/dc/elements/1.1/language> }
        ?resource  <http://purl.org/dc/elements/1.1/language> ?language.
        ?resource fedora:hasModel ?model
    }
    '''
    return [{"work": r['resource'].value,
            "language": r['language'].value,
            "model": r['model'].value}
    for r in g.query(lang_query)]

In [13]:
resources_with_lang = get_language_and_model(store_gwss)

In [21]:
aggregate = {k: len(list(g)) for k, g in groupby(sorted(resources_with_lang, key=lambda x: (x["model"], x["language"])), 
                                     key=lambda x: (x["model"], x["language"]))}

In [24]:
import csv
with open('./languages_by_model.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["model", "language", "count"])
    for k, v in aggregate.items():
        writer.writerow([*k, v])