Merge pull request #77 from collective/update_content
Support update and replace for existing content
pbauer committed Dec 8, 2021
2 parents 046e7fd + 7bcd36b commit 5583517
Showing 5 changed files with 217 additions and 27 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
@@ -50,6 +50,9 @@ Changelog
- Add hook global_obj_hook_before_deserializing to modify the created obj before deserializing the data.
[pbauer]

- Add support to update and to replace existing content during import (#76)
[pbauer]


1.2 (2021-10-11)
----------------
10 changes: 10 additions & 0 deletions README.rst
@@ -105,6 +105,7 @@ You can use this addon to
* Combine content from multiple Plone sites into one.
* Import a Plone site as a subsite into another.
* Import content from other systems as long as it fits the required format.
* Update or replace existing data.
* ...

Details
@@ -175,6 +176,13 @@ To fix this you can check the checkbox "Modify exported data for migrations". Th
Control creating imported content
---------------------------------

You can choose between four options for how to deal with content that already exists (see the sketch after this list):

* Skip: Don't import at all
* Replace: Delete item and create new
* Update: Reuse and only overwrite imported data
* Ignore: Create with a new id
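
For example, a hypothetical remote-controlled import could select one of these options by posting to the import form. This is a minimal sketch, not part of the addon: the URL and credentials are assumptions, while the field names (``jsonfile``, ``handle_existing_content``, ``form.submitted``) match the import form in this repository::

    import requests

    # A sketch, assuming a local Plone site and valid credentials (both hypothetical).
    # "2" selects "Update: Reuse and only overwrite imported data";
    # the options are 0=Skip, 1=Replace, 2=Update, 3=Ignore.
    with open("Document.json", "rb") as f:
        response = requests.post(
            "http://localhost:8080/Plone/@@import_content",
            auth=("admin", "admin"),
            data={"handle_existing_content": "2", "form.submitted": "1"},
            files={"jsonfile": ("Document.json", f, "application/json")},
        )
    response.raise_for_status()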

Imported content is initially created with ``invokeFactory``, using the portal_type and id of the exported item, before deserializing the rest of the data.
You can set additional values by specifying a dict ``factory_kwargs`` that will be passed to the factory.
This way you can set values on the imported object that subscribers to IObjectAddedEvent expect to be there.
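
For example, a hypothetical item in the exported JSON could pass extra values to the factory like this (a sketch; ``language`` is just an illustrative kwarg, not something the addon requires)::

    [
        {
            "@id": "http://localhost:8080/Plone/example",
            "@type": "Document",
            "id": "example",
            "title": "Example",
            "factory_kwargs": {"language": "en"}
        }
    ]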
@@ -189,6 +197,8 @@ Exporting and importing large amounts of content can take a while. Export is pre
* Importing 5000 Documents takes >25 minutes because of versioning.
* Importing 5000 Documents without versioning takes ~7 minutes.

During import you can commit after every x items, which frees up memory and disk space in your TMPDIR (where blobs are added before each commit).
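
Under the hood this amounts to periodic transaction commits. A minimal sketch of the pattern (not the addon's actual code; ``import_one`` is a hypothetical per-item handler)::

    import transaction

    def import_items(items, commit_interval=500):
        for index, item in enumerate(items, start=1):
            import_one(item)
            if commit_interval and index % commit_interval == 0:
                # Write pending changes (including staged blobs) to the ZODB
                # and free the memory and TMPDIR space they were occupying.
                transaction.commit()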

When exporting large numbers of blobs (binary files and images) you will get huge JSON files and may run out of memory.
You have various options to deal with this.
The best way depends on how you are going to import the blobs:
59 changes: 45 additions & 14 deletions src/collective/exportimport/import_content.py
@@ -93,7 +93,14 @@ def __call__(self, jsonfile=None, return_json=False, limit=None, server_file=Non
         self.limit = limit
         self.commit = int(request["commit"]) if request.get("commit") else None
         self.import_to_current_folder = request.get("import_to_current_folder", False)
-        self.skip_existing_content = request.get("skip_existing_content", False)
+
+        self.handle_existing_content = int(request.get("handle_existing_content", 0))
+        self.handle_existing_content_options = (
+            ("0", "Skip: Don't import at all"),
+            ("1", "Replace: Delete item and create new"),
+            ("2", "Update: Reuse and only overwrite imported data"),
+            ("3", "Ignore: Create with a new id"),
+        )
 
         if not self.request.form.get("form.submitted", False):
             return self.template()
@@ -278,25 +285,47 @@ def import_new_content(self, data):  # noqa: C901
                 )
                 continue
 
+            # Speed up import by not using autogenerated ids for conflicts
+            factory_kwargs = item.get("factory_kwargs", {})
+
             # Handle existing content
+            self.update_existing = False
             if new_id in container:
-                if self.skip_existing_content:
+                if self.handle_existing_content == 0:
+                    # Skip
                     logger.info(u"{} ({}) already exists. Skipping it.".format(
                         new_id, item["@id"])
                     )
                     continue
-                duplicate = new_id
-                new_id = "{}-{}".format(new_id, random.randint(1000, 9999))
-                item["id"] = new_id
-                logger.info(
-                    u"{} ({}) already exists. Created as {}".format(
-                        duplicate, item["@id"], new_id
+
+                elif self.handle_existing_content == 1:
+                    # Replace content before creating it new
+                    logger.info(u"{} ({}) already exists. Replacing it.".format(
+                        new_id, item["@id"])
                     )
-                )
+                    api.content.delete(container[new_id])
 
-            factory_kwargs = item.get("factory_kwargs", {})
-            container.invokeFactory(item["@type"], item["id"], **factory_kwargs)
-            new = container[item["id"]]
+                elif self.handle_existing_content == 2:
+                    # Update existing item
+                    logger.info(u"{} ({}) already exists. Updating it.".format(
+                        new_id, item["@id"])
+                    )
+                    self.update_existing = True
+                    new = container[new_id]
+
+                else:
+                    # Create with a new id. Speed this up by using a random id.
+                    duplicate = new_id
+                    new_id = "{}-{}".format(new_id, random.randint(1000, 9999))
+                    item["id"] = new_id
+                    logger.info(
+                        u"{} ({}) already exists. Created as {}".format(
+                            duplicate, item["@id"], new_id
+                        )
+                    )
+
+            if not self.update_existing:
+                container.invokeFactory(item["@type"], item["id"], **factory_kwargs)
+                new = container[item["id"]]
 
             new, item = self.global_obj_hook_before_deserializing(new, item)
 
@@ -320,6 +349,7 @@ def import_new_content(self, data):  # noqa: C901
             self.custom_obj_hook(new, item)
 
             uuid = self.set_uuid(item, new)
+
             if uuid != item["UID"]:
                 item["UID"] = uuid
 
@@ -649,8 +679,9 @@ def create_container(self, item):
 
     def set_uuid(self, item, obj):
         uuid = item["UID"]
-        if api.content.find(UID=uuid):
+        if not self.update_existing and api.content.find(UID=uuid):
             # this should only happen if you run import multiple times
+            # without updating existing content
             uuid = obj.UID()
             logger.info(
                 "UID {} of {} already in use by {}. Using {}".format(
30 changes: 18 additions & 12 deletions src/collective/exportimport/templates/import_content.pt
Expand Up @@ -34,6 +34,24 @@
</div>
</tal:block>

<div class="field mb-3">
<label for="include_blobs">Handle existing content</label>
<span class="formHelp">
How should content be handled that exists with the same id/path?
</span>
<div class="widget">
<select name="handle_existing_content" class="">
<option value="0"
tal:repeat="current python:view.handle_existing_content_options"
tal:attributes="value python: current[0];
selected python:'selected' if int(current[0]) == view.handle_existing_content else False"
tal:content="python:current[1]">
0
</option>
</select>
</div>
</div>

<div class="field mb-3">
<label for="commit">Do a commit after each number of items</label>
<div class="widget">
@@ -54,18 +72,6 @@
         </label>
       </div>
 
-      <div class="field">
-        <label>
-          <input
-            type="checkbox"
-            name="skip_existing_content:boolean"
-            id="skip_existing_content"
-            tal:attributes="checked python:view.skip_existing_content"
-          />
-          Skip existing content
-        </label>
-      </div>
-
       <div class="formControls" class="form-group">
         <input type="hidden" name="form.submitted" value="1"/>
         <button class="btn btn-primary submit-widget button-field context"
142 changes: 141 additions & 1 deletion src/collective/exportimport/tests/test_import.py
@@ -206,24 +206,164 @@ def test_import_content_document(self):
         self.assertEqual(new_doc.portal_type, "Document")
         self.assertEqual(new_doc.UID(), doc_uid)
 
-        # See what happens when we import it a second time.
+        # When we import it a second time it is skipped by default.
         original_ids = portal.contentIds()
         browser = self.open_page("@@import_content")
         upload = browser.getControl(name="jsonfile")
         upload.add_file(raw_data, "application/json", "Document.json")
         browser.getForm(action="@@import_content").submit()
+        self.assertIn("Imported 0 items in 0 seconds", browser.contents)
+
+        # Now we use the "Ignore" option to create it again with a new id.
+        original_ids = portal.contentIds()
+        browser = self.open_page("@@import_content")
+        upload = browser.getControl(name="jsonfile")
+        upload.add_file(raw_data, "application/json", "Document.json")
+        browser.getControl(name="handle_existing_content").value = ["3"]
+        browser.getForm(action="@@import_content").submit()
         self.assertIn("Imported 1 items", browser.contents)
 
         # A second document should be there.
         new_ids = [docid for docid in portal.contentIds() if docid not in original_ids]
         self.assertEqual(len(new_ids), 1)
+        self.assertEqual(len(portal.contentIds()), 2)
         doc2 = portal[new_ids[0]]
         # doc2 is the same as the original
         self.assertEqual(doc2.Title(), "Document 1")
         self.assertEqual(doc2.portal_type, "Document")
         # except for the UID
         self.assertNotEqual(doc2.UID(), doc_uid)

+    def test_import_content_update(self):
+        # First create some content.
+        app = self.layer["app"]
+        portal = self.layer["portal"]
+        login(app, SITE_OWNER_NAME)
+        doc = api.content.create(
+            container=portal, type="Document", id="doc1", title="Document 1"
+        )
+        doc_uid = doc.UID()
+        transaction.commit()
+
+        # Now export it.
+        browser = self.open_page("@@export_content")
+        browser.getControl(name="portal_type").value = ["Document"]
+        browser.getForm(action="@@export_content").submit(name="submit")
+
+        # We should have gotten json.
+        raw_data = browser.contents
+        if not browser.contents:
+            raw_data = DATA[-1]
+
+        # Remove the added content.
+        api.content.delete(doc)
+        transaction.commit()
+        self.assertNotIn("doc1", portal.contentIds())
+
+        # Now import it.
+        browser = self.open_page("@@import_content")
+        upload = browser.getControl(name="jsonfile")
+        upload.add_file(raw_data, "application/json", "Document.json")
+        browser.getForm(action="@@import_content").submit()
+        self.assertIn("Imported 1 items", browser.contents)
+
+        # The document should be back.
+        self.assertIn("doc1", portal.contentIds())
+        new_doc = portal["doc1"]
+        self.assertEqual(new_doc.Title(), "Document 1")
+        self.assertEqual(new_doc.portal_type, "Document")
+        self.assertEqual(new_doc.UID(), doc_uid)
+
+        # Now import it a second time, updating it with some changed data.
+        data = json.loads(raw_data)
+        data[0]["title"] = "A different title"
+        data[0].pop("description")
+        changed_raw_data = json.dumps(data)
+
+        browser = self.open_page("@@import_content")
+        upload = browser.getControl(name="jsonfile")
+        upload.add_file(changed_raw_data.encode(), "application/json", "Document.json")
+        browser.getControl(name="handle_existing_content").value = ["2"]  # update!
+        browser.getForm(action="@@import_content").submit()
+        self.assertIn("Imported 1 items in 0 seconds", browser.contents)
+
+        # new_doc now has an updated title
+        new_doc = portal["doc1"]
+        self.assertEqual(len(portal.contentIds()), 1)
+        self.assertEqual(new_doc.Title(), "A different title")
+        self.assertEqual(new_doc.portal_type, "Document")
+        # The UID is still the same
+        self.assertEqual(new_doc.UID(), doc_uid)

+    def test_import_content_replace(self):
+        # First create some content.
+        app = self.layer["app"]
+        portal = self.layer["portal"]
+        login(app, SITE_OWNER_NAME)
+        doc = api.content.create(
+            container=portal,
+            type="Document",
+            id="doc1",
+            title="Document 1",
+            description="A Description",
+        )
+        doc_uid = doc.UID()
+        transaction.commit()
+
+        # Now export it.
+        browser = self.open_page("@@export_content")
+        browser.getControl(name="portal_type").value = ["Document"]
+        browser.getForm(action="@@export_content").submit(name="submit")
+
+        # We should have gotten json.
+        raw_data = browser.contents
+        if not browser.contents:
+            raw_data = DATA[-1]
+
+        # Remove the added content.
+        api.content.delete(doc)
+        transaction.commit()
+        self.assertNotIn("doc1", portal.contentIds())
+
+        # Now import it.
+        browser = self.open_page("@@import_content")
+        upload = browser.getControl(name="jsonfile")
+        upload.add_file(raw_data, "application/json", "Document.json")
+        browser.getForm(action="@@import_content").submit()
+        self.assertIn("Imported 1 items", browser.contents)
+
+        # The document should be back.
+        self.assertIn("doc1", portal.contentIds())
+        new_doc = portal["doc1"]
+        self.assertEqual(new_doc.Title(), "Document 1")
+        self.assertEqual(new_doc.Description(), "A Description")
+        self.assertEqual(new_doc.portal_type, "Document")
+        self.assertEqual(new_doc.UID(), doc_uid)
+
+        # Now import it a second time, replacing it with different data.
+        data = json.loads(raw_data)
+        data[0]["title"] = "A different title"
+        data[0].pop("description")
+        changed_raw_data = json.dumps(data)
+
+        browser = self.open_page("@@import_content")
+        upload = browser.getControl(name="jsonfile")
+        upload.add_file(changed_raw_data.encode(), "application/json", "Document.json")
+        browser.getControl(name="handle_existing_content").value = ["1"]  # replace!
+        browser.getForm(action="@@import_content").submit()
+        self.assertIn("Imported 1 items in 0 seconds", browser.contents)
+
+        # still only one item
+        self.assertEqual(len(portal.contentIds()), 1)
+        # new_doc now has an updated title
+        new_doc = portal["doc1"]
+        self.assertEqual(new_doc.Title(), "A different title")
+        # the description is empty again, since it was not in the imported data
+        self.assertEqual(new_doc.Description(), "")
+        self.assertEqual(new_doc.portal_type, "Document")
+        # The UID is still the same
+        self.assertEqual(new_doc.UID(), doc_uid)

     def test_import_content_with_missing_folder(self):
         # First create some content.
         app = self.layer["app"]