# Playing around with solr update operations.

See also: https://hpad.dataone.org/AwEwLAZhBMBsEFpoQIawWAjHBAOArLAEYKzQDMmmAnGNVHEA

Notes:

Use `update` operations to set or add values for specific fields of a record, and to create the record if it doesn't exist. See https://lucene.apache.org/solr/guide/6_6/updating-parts-of-documents.html#updating-parts-of-documents (the doc is for Solr 6.6 but applies to 5.2.1 as well).

Use `commitWithin` rather than specific `commit` requests (has the effect of batching during high indexing load).

Use `get` rather than `search` when retrieving a document by Id. See https://lucene.apache.org/solr/guide/6_6/realtime-get.html


* [Indexing is much faster when batched](https://lucidworks.com/2015/10/05/really-batch-updates-solr-2/)
* [Simple atomic update examples](http://yonik.com/solr/atomic-updates/)

In [18]:
import requests
import json
from datetime import datetime
from pprint import pprint
from werkzeug.datastructures import MultiDict

# The Solr port may need forwarding from the remote host first, e.g.:
#   ssh -L8983:localhost:8983 cn-dev-ucsb-2.test.dataone.org

# Name of the search index
INDEX_NAME = "search_core"

# Number of milliseconds before Solr does a commit for any pending changes
COMMIT_WITHIN = 1000

# Solr likes this format for dateTime values
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

# URL to the Solr end point
SOLR = "http://localhost:8983/solr/"

# JSON is used for both requests and responses here
HEADERS = {
    "user-agent": "python-d1_solr_update_play-0.0",
    "Content-Type": "application/json",
}

# Prefix for identifiers used here for testing purposes
id_p = "dv20190412_"


def solrCommit(index_name=INDEX_NAME):
    """
    Request a hard commit on the named index and print the HTTP status code.

    Hard commits can be slow, so avoid calling this often while heavy
    indexing is in progress.
    """
    commit_url = f"{SOLR}{index_name}/update"
    response = requests.get(commit_url, params={"commit": "true"})
    print(f"COMMIT status = {response.status_code}")

    
def deleteDocFromSolr(did, index_name=INDEX_NAME):
    """
    Remove the document with the specified id from the index.

    No explicit commit is issued. The request carries ``commitWithin`` so Solr
    commits the delete within COMMIT_WITHIN milliseconds. Call solrCommit()
    afterwards if an immediate hard commit is needed.

    did: identifier of the document to delete
    index_name: name of the index the document is removed from

    Prints the response body, prefixed with DELETE OK or DELETE FAILED
    depending on whether the HTTP status was 200.
    """
    url = f"{SOLR}{index_name}/update"
    data = {
        "delete": {
            "id": did,
        }
    }
    # Use the shared setting so deletes and updates are committed on the same schedule
    params = {"commitWithin": COMMIT_WITHIN}
    # Update commands carry a JSON body, so they are sent with POST rather than GET
    response = requests.post(url, headers=HEADERS, params=params, data=json.dumps(data))
    if response.status_code == 200:
        print(f"DELETE OK: {response.text}")
    else:
        print(f"DELETE FAILED: {response.text}")
    
        

def deleteDocsFromSolr(ids, index_name=INDEX_NAME):
    """
    Delete each identified document from the index, one request per id.
    """
    for did in ids:
        deleteDocFromSolr(did, index_name=index_name)


def sendDocToSolr(doc, index_name=INDEX_NAME):
    """
    Uses the update method in Solr to perform the equivalent of an UPSERT operation.

    If a document with the same id exists, then it is updated with the doc properties.

    If no document with that id exists, then a new one is created.

    doc: dict of field name to value; must include "id"
    index_name: name of the index the document is sent to

    The "id" value is sent as is. Fields named in add_fields are sent with the
    "add" modifier and every other field with the "set" modifier. The payload,
    request URL, response status and response body are printed.
    """
    # Fields whose values are added to the existing ones instead of replacing them
    add_fields = ("documents", "isDocumentedBy", "resourceMap")
    fields = {}
    for k, v in doc.items():
        if k == "id":
            fields[k] = v
        elif k in add_fields:
            fields[k] = {"add": v}
        else:
            fields[k] = {"set": v}

    payload = {
        "add": {
            "doc": fields,
            "overwrite": True,
        },
    }
    print("###")
    print("Sending document to Solr: ")
    pprint(payload, indent=2)
    url = f"{SOLR}{index_name}/update"
    # No explicit commit is requested; commitWithin leaves the timing to Solr
    params = {"commitWithin": COMMIT_WITHIN}
    response = requests.post(url, headers=HEADERS, params=params, data=json.dumps(payload))
    print(f"URL for POST request = {response.url}")
    print(f"Response status = {response.status_code}")
    print(f"Response message = {response.text}")
    

def getDocFromSolr(did, index_name=INDEX_NAME):
    """
    Use the realtime get functionality to retrieve a document from solr by Id.
    This is faster than retrieval by /select and works without the need for a hard commit.

    did: identifier of the document to retrieve
    index_name: name of the index to read from

    Returns the value of "doc" from the JSON response. Returns None, after
    printing the response body, when the HTTP status is not 200.
    """
    url = f"{SOLR}{index_name}/get"
    params = {"id": did}
    response = requests.get(url, headers=HEADERS, params=params)
    if response.status_code != 200:
        # Report the failure instead of indexing into a missing response body
        print(f"GET FAILED: {response.text}")
        return None
    return json.loads(response.text).get("doc")


def getDocsFromSolr(ids, index_name=INDEX_NAME):
    """
    Retrieve each identified document with getDocFromSolr and return the
    results as a list, in the iteration order of ids.
    """
    return [getDocFromSolr(did, index_name=index_name) for did in ids]




    

In [19]:
def createIndexDoc(**kw):
    """
    Build an index doc dict from keyword arguments.

    "id" is required (KeyError if missing) and is copied to "identifier".
    "title" and "dateModified" get default values. Any other keyword is added
    to the doc and overrides a default of the same name.
    """
    did = kw.pop("id")
    doc = {
        "id": did,
        "title": "Test data - safe to delete",
        "dateModified": datetime.utcnow().strftime(DATE_FORMAT),
        "identifier": did,
    }
    # Remaining keywords go in last so that they win over the defaults above
    doc.update(kw)
    return doc



# formatId values used for the three test documents
ORE = "http://www.openarchives.org/ore/terms"
DATA = "text/csv"
METADATA = "eml://ecoinformatics.org/eml-2.1.1"

# Identifiers for a simple dataset entry:
# 01 = ORE
# 02 = Metadata
# 03 = Data
pid_ore = id_p + "01"
pid_metadata = id_p + "02"
pid_data = id_p + "03"

docs = [
    # ORE stub
    createIndexDoc(formatType="RESOURCE", formatId=ORE, id=pid_ore),
    # Metadata entry, linked to the ORE and to the data it documents
    createIndexDoc(
        formatType="METADATA",
        formatId=METADATA,
        id=pid_metadata,
        resourceMap=pid_ore,
        documents=pid_data,
    ),
    # Data entry, linked to the ORE and to the metadata documenting it
    createIndexDoc(
        formatType="DATA",
        formatId=DATA,
        id=pid_data,
        resourceMap=pid_ore,
        isDocumentedBy=pid_metadata,
    ),
]

print("Documents before sending to Solr:")
pprint(docs)

# Send docs to solr
for doc in docs:
    sendDocToSolr(doc)

# Set of the document identifiers
ids = {entry["id"] for entry in docs}
pprint(ids)

# Retrieve docs from solr
d = getDocsFromSolr(ids)
print("Documents retrieved from Solr:")
pprint(d)


Documents before sending to Solr:
[{'dateModified': '2019-04-15T15:57:29.874600Z',
  'formatId': 'http://www.openarchives.org/ore/terms',
  'formatType': 'RESOURCE',
  'id': 'dv20190412_01',
  'identifier': 'dv20190412_01',
  'title': 'Test data - safe to delete'},
 {'dateModified': '2019-04-15T15:57:29.874676Z',
  'documents': 'dv20190412_03',
  'formatId': 'eml://ecoinformatics.org/eml-2.1.1',
  'formatType': 'METADATA',
  'id': 'dv20190412_02',
  'identifier': 'dv20190412_02',
  'resourceMap': 'dv20190412_01',
  'title': 'Test data - safe to delete'},
 {'dateModified': '2019-04-15T15:57:29.874722Z',
  'formatId': 'text/csv',
  'formatType': 'DATA',
  'id': 'dv20190412_03',
  'identifier': 'dv20190412_03',
  'isDocumentedBy': 'dv20190412_02',
  'resourceMap': 'dv20190412_01',
  'title': 'Test data - safe to delete'}]
###
Sending document to Solr: 
{ 'add': { 'doc': { 'dateModified': {'set': '2019-04-15T15:57:29.874600Z'},
                    'formatId': { 'set': 'http://www.openarchi

In [20]:

# Now update the metadata partial document with some additional information.
# Get the metadata doc before updating properties so the two states can be compared.
d = getDocFromSolr(pid_metadata)
print("Metadata doc before update:")
pprint(d)
# Only id, title and submitter are passed here. createIndexDoc also fills in
# identifier and a fresh dateModified; the other fields of the existing
# document (e.g. formatId, documents) are not part of this update.
doc = createIndexDoc(
    id=pid_metadata,
    title="Updated Test Data - DELETE ME",
    submitter="TEST"
)
sendDocToSolr(doc)
# Read the doc back straight away; no explicit commit is issued in between
d = getDocFromSolr(pid_metadata)
print("Metadata doc after update:")
pprint(d)


Metadata doc before update:
{'_version_': 1630896233069412352,
 'dateModified': '2019-04-15T15:57:29.874Z',
 'documents': ['dv20190412_03'],
 'formatId': 'eml://ecoinformatics.org/eml-2.1.1',
 'formatType': 'METADATA',
 'id': 'dv20190412_02',
 'isService': False,
 'resourceMap': ['dv20190412_01'],
 'serviceCoupling': 'false',
 'title': 'Test data - safe to delete'}
###
Sending document to Solr: 
{ 'add': { 'doc': { 'dateModified': {'set': '2019-04-15T15:57:31.518709Z'},
                    'id': 'dv20190412_02',
                    'identifier': {'set': 'dv20190412_02'},
                    'submitter': {'set': 'TEST'},
                    'title': {'set': 'Updated Test Data - DELETE ME'}},
           'overwrite': True}}
URL for POST request = http://localhost:8983/solr/search_core/update?commitWithin=1000
Response status = 200
Response message = {"responseHeader":{"status":0,"QTime":38}}

Metadata doc after update:
{'_version_': 1630896234589847552,
 'dateModified': '2019-04-15T15:57:

In [21]:
# Clean up: delete every test document listed in ids from the index
deleteDocsFromSolr(ids)

DELETE OK: {"responseHeader":{"status":0,"QTime":36}}

DELETE OK: {"responseHeader":{"status":0,"QTime":34}}

DELETE OK: {"responseHeader":{"status":0,"QTime":34}}

