In [1]:
%load_ext lab_black

In [2]:
import re
import json

from uuid import uuid4

from django.db import connections

from rich import print

# Manual Stuff

* Set DELETE_WAGTAIL_IMAGES=False to deactivate post_delete_file_cleanup, to avoid deleting original 🥶
* Start only postgres and then: dropdb homepage && createdb homepage && python manage.py migrate
* Sometimes after convert: python manage.py fixtree
* Sometimes, you have to run: python manage.py sqlsequencereset cast + execute via pgcli

# Create New Empty DB

In [3]:
import os

from django.conf import settings
from django.core.management import call_command

from pathlib import Path

In [4]:
current_working_dir = Path.cwd()
os.chdir(settings.ROOT_DIR)
!dropdb homepage && createdb homepage
call_command("migrate")
os.chdir(current_working_dir)

Operations to perform:
  Apply all migrations: account, admin, auth, authtoken, cast, contenttypes, django_comments, filepond, fluent_comments, indieweb, sessions, sites, socialaccount, taggit, threadedcomments, users, wagtailadmin, wagtailcore, wagtaildocs, wagtailembeds, wagtailforms, wagtailimages, wagtailredirects, wagtailsearch, wagtailusers, watson
Running migrations:
  Applying contenttypes.0001_initial... OK
  Applying contenttypes.0002_remove_content_type_name... OK
  Applying auth.0001_initial... OK
  Applying auth.0002_alter_permission_name_max_length... OK
  Applying auth.0003_alter_user_email_max_length... OK
  Applying auth.0004_alter_user_username_opts... OK
  Applying auth.0005_alter_user_last_login_null... OK
  Applying auth.0006_require_contenttypes_0002... OK
  Applying auth.0007_alter_validators_add_error_messages... OK
  Applying auth.0008_alter_user_username_max_length... OK
  Applying users.0001_initial... OK
  Applying account.0001_initial... OK
  Applying accou

# Fetch Legacy Data from Database and Restore

In [5]:
def dictfetchall(cursor):
    "Return all rows from a cursor as a dict"
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]


class Legacy:
    def __init__(self, db_name="legacy"):
        self.db_name = db_name
        self.users = self.fetch_rows("select * from users_user")
        self.blogs = self.fetch_rows("select * from cast_blog")
        self.posts = self.fetch_rows("select * from cast_post")
        self.images = self.fetch_rows("select * from cast_image")
        self.galleries = self.fetch_rows("select * from cast_gallery")
        self.gallery_images = self.fetch_rows("select * from cast_gallery_images")
        self.videos = self.fetch_rows("select * from cast_video")
        self.audios = self.fetch_rows("select * from cast_audio")

    def fetch_rows(self, stmt):
        with connections[self.db_name].cursor() as cursor:
            cursor.execute(stmt)
            rows = dictfetchall(cursor)
        return rows


class Converter:
    def __init__(self, legacy):
        self.legacy = legacy
        self.blog_content_type = ContentType.objects.get(app_label="cast", model="blog")

    def users(self):
        for l_user in self.legacy.users:
            user = User(**l_user)
            user.save()
        return {user.pk: user for user in User.objects.all()}

    def blogs(self):
        blog_legacy_to_wagtail = {}
        root = Page.objects.get(title="Welcome to your new Wagtail site!")
        for l_blog in self.legacy.blogs:
            kwargs = l_blog.copy()
            del kwargs["user_id"]
            del kwargs["id"]
            kwargs["owner"] = self.user_lookup[l_blog["user_id"]]
            kwargs["last_published_at"] = l_blog["modified"]
            # l_blog["content_type"] = self.blog_content_type
            blog = Blog(**kwargs)
            blog = root.add_child(instance=blog)
            blog_legacy_to_wagtail[l_blog["id"]] = blog.pk
        return blog_legacy_to_wagtail

    def images(self):
        for num, l_image in enumerate(self.legacy.images):
            image = Image(
                pk=l_image["id"],
                file=l_image["original"],
                uploaded_by_user=self.user_lookup[l_image["user_id"]],
                created_at=l_image["created"],
                width=l_image["original_width"],
                height=l_image["original_height"],
            )
            image.save()
            if num % 300 == 0:
                print(num)

    def galleries(self):
        for l_gallery in self.legacy.galleries:
            kwargs = {k: v for k, v in l_gallery.items() if k != "user_id"}
            gallery = Gallery(**kwargs)
            gallery.save()

    def gallery_image_links(self):
        links = [
            (gi["id"], gi["gallery_id"], gi["image_id"])
            for gi in self.legacy.gallery_images
        ]
        stmt = "insert into cast_gallery_images (id, gallery_id, image_id) values (%s, %s, %s)"
        with connections["default"].cursor() as cursor:
            cursor.executemany(stmt, links)

    def videos(self):
        for num, video in enumerate(self.legacy.videos):
            upload_user = self.user_lookup[video["user_id"]]
            video = Video(
                pk=video["id"],
                title=Path(video["original"]).name,
                user=upload_user,
                poster=video["poster"],
                poster_seconds=video["poster_seconds"],
                original=video["original"],
                created=video["created"],
                modified=video["modified"],
            )
            video.save(poster=False)
            if num % 50 == 0:
                print(num)

    def convert(self):
        self.user_lookup = self.users()
        self.blog_lookup = self.blogs()
        self.images()
        self.galleries()
        self.gallery_image_links()
        self.videos()

In [6]:
converter = Converter(Legacy())

In [7]:
%%time
converter.convert()

CPU times: user 4.82 s, sys: 413 ms, total: 5.23 s
Wall time: 4.73 s


# Migrate Posts

In [8]:
def is_tag(text):
    return text.startswith("{%") and text.endswith("%}")


def tag_to_block(tag):
    tag_name, tag_id = tag.strip("{%").strip("%}").split()
    tag_id = int(tag_id)
    if tag_name == "gallery":
        image_blocks = []
        for image in Gallery.objects.get(pk=tag_id).images.all():
            image_blocks.append({"type": "item", "value": image.pk, "id": str(uuid4())})
        return {"type": tag_name, "value": image_blocks}
    return {"type": tag_name, "value": tag_id}


def content_to_blocks(content):
    blocks = []
    just_tag = re.compile(r"({% \w+ \d+ %})")
    for part in just_tag.split(content):
        if len(part) == 0:
            continue
        if is_tag(part):
            blocks.append(tag_to_block(part))
        else:
            blocks.append({"type": "paragraph", "value": part})
    return blocks


def content_to_streamfield(content):
    detail_blocks = None
    pattern = r"(?P<overview>.*){% if include_detail %}(?P<detail>.*){% endif %}.*"
    match = re.search(pattern, content, re.DOTALL)
    if match is not None:
        overview_blocks = content_to_blocks(match.group("overview"))
        detail_blocks = content_to_blocks(match.group("detail"))
    else:
        overview_blocks = content_to_blocks(content)
    streamfield = [{"type": "overview", "value": overview_blocks}]
    if detail_blocks is not None:
        streamfield.append({"type": "detail", "value": detail_blocks})
    return streamfield


def build_post_from_legacy(legacy, user_lookup):
    # post = Post(title=legacy["title"], visible_date=legacy["visible_date"])
    kwargs = {
        k: v
        for k, v in legacy.items()
        if k not in ["content", "author_id", "blog_id", "id"]
    }
    kwargs["content_type"] = ContentType.objects.get(app_label="cast", model="post")
    kwargs["live"] = legacy["pub_date"] is not None
    if kwargs["live"]:
        kwargs["first_published_at"] = legacy["pub_date"]
        kwargs["last_published_at"] = legacy["modified"]
    kwargs["latest_revision_created_at"] = legacy["modified"]
    kwargs["owner"] = user_lookup[legacy["author_id"]]
    # print(kwargs)
    post = Post(**kwargs)
    raw_body = content_to_streamfield(legacy["content"])
    post.body = json.dumps(raw_body)
    return post

In [9]:
legacy_post_lookup = {p["id"]: p for p in converter.legacy.posts}

In [10]:
# for lpost in legacy_post_lookup.values():
#     if "video" in lpost["content"] and lpost["blog_id"] == 1:
#         print(lpost)

In [11]:
# for lpost in legacy_post_lookup.values():
#     if "include_detail" in lpost["content"]:
#         print(lpost)

In [12]:
%%time
blog_lookup = {k: Page.objects.get(pk=v) for k, v in converter.blog_lookup.items()}
legacy_post_lookup = {p["id"]: p for p in converter.legacy.posts}
# for p_id in (9,):
# for p_id in (183,):
# for p_id in (9, 183, 332, 333):
# for p_id in (329,):
for num, p_id in enumerate(legacy_post_lookup.keys()):
    l_post = legacy_post_lookup[p_id]
    blog = blog_lookup[l_post["blog_id"]]
    post = build_post_from_legacy(l_post, converter.user_lookup)
    post = blog.add_child(instance=post)
    if num % 20 == 0:
        print(num)

CPU times: user 8.43 s, sys: 84.3 ms, total: 8.51 s
Wall time: 9.41 s


In [13]:
[l_post] = [p for p in converter.legacy.posts if p["id"] == 329]

In [14]:
l_post

{'id': 329,
 'created': datetime.datetime(2022, 10, 22, 10, 37, 0, 444369, tzinfo=datetime.timezone.utc),
 'modified': datetime.datetime(2022, 10, 22, 10, 56, 8, 61463, tzinfo=datetime.timezone.utc),
 'title': 'Django Beginner Series: Update Python',
 'pub_date': None,
 'visible_date': datetime.datetime(2022, 10, 22, 0, 0, tzinfo=datetime.timezone.utc),
 'content': 'cd ~/.pyenv<br />\r\ngit pull<br />\r\n<br />\r\npyenv install 3.10.8<br />\r\n<br />\r\npyenv global 3.10.8',
 'slug': 'update-python',
 'author_id': 1,
 'blog_id': 1,
 'podcast_audio_id': None,
 'uuid': UUID('bf0666a1-62d2-438d-9d11-1d195388b68d'),
 'keywords': '',
 'explicit': 1,
 'block': False,
 'comments_enabled': True}

In [29]:
counter = {}
for page in Page.objects.all():
    counter[page.owner] = counter.get(page.owner, 0) + 1
print(counter)

In [36]:
Post.objects.first().visible_date

datetime.datetime(2018, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)

In [15]:
Page.objects.all()[5].post.visible_date

datetime.datetime(2018, 1, 24, 0, 0, tzinfo=datetime.timezone.utc)

In [13]:
# for page in Page.objects.all():
#    print(page.first_published_at)

# Backup migrated Database

Before applying the backup, change the site name in wagtail settings to the new fqdn + port 443. Otherwise item.get_full_url() will not be correct.

```
pg_dump homepage | gzip > backups/db.psql.gz
cd deploy
ansible-playbook restore_database.yml
```

In [21]:
import requests

In [26]:
for page in list(Page.objects.all()):
    url = f"https://homepage.staging.django-cast.com{page.url}"
    print(url)
    r = requests.get(url)
    print(r.status_code)

ConnectionError: HTTPSConnectionPool(host='homepage.staging.django-cast.comnone', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x11c8ca450>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))