In [None]:
# necessary imports and driver initialization

import getpass
import os
import time
from datetime import datetime
from io import BytesIO

import win32clipboard
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# Start Chrome with the "start-maximized" flag and load the blog to scrape.
BLOG_URL = "https://artofproblemsolving.com/community/c542329"  # link to blog

options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(options=options)
driver.get(BLOG_URL)

In [None]:
# scroll all the way down until stationary

# Keep jumping to the bottom of the document until its height stops changing
# between two consecutive measurements.
PAGE_HEIGHT_JS = "return document.body.scrollHeight"

last_height = None
new_height = driver.execute_script(PAGE_HEIGHT_JS)
while new_height != last_height:
    last_height = new_height
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.3)  # pause before re-measuring the page height
    new_height = driver.execute_script(PAGE_HEIGHT_JS)

In [None]:
# delete the banner and other static elements

def delete_by_id(id):
    """Remove the element with the given DOM id from the current page, if present.

    The id is handed to the browser as a script argument instead of being
    spliced into the JavaScript source, so an id containing quotes or
    backslashes cannot break the script. A missing element is ignored rather
    than raising a JavaScript error, which makes the call safe to repeat
    (e.g. when the cell is re-run).

    Args:
        id: value of the ``id`` attribute of the element to delete.
    """
    driver.execute_script(
        """
        var element = document.getElementById(arguments[0]);
        if (element && element.parentNode) {
            element.parentNode.removeChild(element);
        }
        """,
        id,
    )

# Ids of the static page elements to strip out, removed in this order.
for static_id in ("navigation_box", "blog_keywords", "feed-global-tab"):
    delete_by_id(static_id)

In [None]:
# explode all hidden text

def explode(post):
    """Click every collapsed hide-heading inside ``post`` until none remain.

    A heading counts as collapsed while its ``class`` attribute does not
    contain ``cmty-hide-open``. The search is repeated after each round of
    clicks, so the loop only ends once a fresh query finds no collapsed
    heading.

    Args:
        post: element exposing ``find_elements(by, selector)``.
    """
    def collapsed_headings():
        # Re-query on every call so each round works on a fresh element list.
        candidates = post.find_elements("css selector", ".cmty-hide-heading.faux-link")
        return [
            heading
            for heading in candidates
            if "cmty-hide-open" not in heading.get_attribute("class")
        ]

    pending = collapsed_headings()
    while pending:
        for heading in pending:
            heading.click()
        pending = collapsed_headings()

In [None]:
# find all post elements and explode them

# Every element with class "entry" is treated as one blog post; `posts` is
# reused by later cells.
posts = driver.find_elements("css selector", ".entry")
for entry in posts:
    explode(entry)  # expand all hidden sections inside this post

In [None]:
# scroll down by page height and take full page screenshots and save them to ./images/screenshots/

# Capture the page one viewport at a time, starting from the top.
#
# Each entry of `imgs` is (image, index, y before scroll, y after scroll, delta),
# where `index` is the entry's position in the list (the merge cell relies on it).
driver.execute_script("window.scrollTo(0, 0);")
prev_y = driver.execute_script("return window.scrollY")
i = 0
imgs = []
last_step = None  # distance covered by the most recent scroll, in page units
while True:
    img = Image.open(BytesIO(driver.get_screenshot_as_png()))
    driver.execute_script("window.scrollTo(0, window.scrollY + window.innerHeight);")
    # Wait AFTER scrolling so the page can settle before the position is read
    # and the next screenshot is taken.
    time.sleep(0.3)
    curr_y = driver.execute_script("return window.scrollY")
    if curr_y == prev_y:
        # The scroll did not move: `img` shows the bottom of the page. Keep
        # the part of it that earlier screenshots have not covered instead of
        # discarding it.
        if last_step is None:
            # No scroll ever succeeded, so this single shot is the whole page.
            tail = img
        else:
            viewport_h = driver.execute_script("return window.innerHeight")
            # Convert the last scroll distance to image pixels; assumes the
            # screenshot spans exactly one viewport -- TODO confirm on HiDPI.
            tail_px = round(last_step * img.height / viewport_h) if viewport_h else img.height
            tail_px = min(img.height, tail_px)
            tail = img.crop((0, img.height - tail_px, img.width, img.height)) if tail_px > 0 else None
        if tail is not None:
            imgs.append((tail, i, prev_y, curr_y, 0))
        break
    last_step = curr_y - prev_y
    imgs.append((img, i, prev_y, curr_y, last_step))
    prev_y = curr_y
    i += 1

# Make sure the output directory exists before writing the individual shots.
os.makedirs("./images/screenshots", exist_ok=True)
for shot, index, *_ in imgs:
    shot.save(f"./images/screenshots/{index}.png")

In [None]:
# merge all screenshots into one image vertically

# Stack the captured screenshots top-to-bottom into a single tall image.
cnt = len(imgs)
shots = [entry[0] for entry in imgs]
widths = [shot.size[0] for shot in shots]
heights = [shot.size[1] for shot in shots]

# All screenshots must share one width, otherwise the stacking is meaningless.
max_width = max(widths)
assert all(width == max_width for width in widths)
sum_height = sum(heights)

merged = Image.new("RGB", (max_width, sum_height))

# Paste each shot directly below the previous one.
y_offset = 0
for shot, height in zip(shots, heights):
    merged.paste(shot, (0, y_offset))
    y_offset += height

merged.save("./images/merged.png")

In [None]:
# unwrap each post to its title, link, date posted, and screenshot of the post

def unwrap_post(post, byline_prefix="by fatant, "):
    """Extract title, link, posting date and message bounding box from a post.

    Args:
        post: post element exposing ``find_element(by, selector)``.
        byline_prefix: leading text of the ``h2`` byline that precedes the
            date. Only its length is used: that many characters are dropped
            from the front of the byline.

    Returns:
        Tuple ``(title, link, date, date_ts, pos)`` where ``date`` is the raw
        date text, ``date_ts`` is the same instant formatted as
        ``"%Y-%m-%d %H:%M:%S"`` and ``pos`` is the list
        ``[left, top, right, bottom]`` built from the ``.message`` element's
        reported ``location`` and ``size``.

    Raises:
        ValueError: if the date text does not match ``"%b %d, %Y, %I:%M %p"``.
    """
    title = post.find_element("css selector", "h1").text
    link = post.find_element("css selector", "h1 a").get_attribute("href")
    byline = post.find_element("css selector", "h2").text
    date = byline[len(byline_prefix):]
    date_ts = datetime.strptime(date, "%b %d, %Y, %I:%M %p").strftime("%Y-%m-%d %H:%M:%S")
    entry = post.find_element("css selector", ".entrywrap")
    msg = entry.find_element("css selector", ".message")
    # Read the geometry once each: every access asks the element again, so a
    # single snapshot is both cheaper and guaranteed to be self-consistent.
    location = msg.location
    size = msg.size
    left, top = location["x"], location["y"]
    pos = [left, top, left + size["width"], top + size["height"]]
    return title, link, date, date_ts, pos

In [None]:
# aggregate all posts into a list of tuples then sort

# One tuple per post, ordered by the formatted timestamp (tuple index 3).
# The sort is stable, so posts with equal timestamps keep their page order.
post_data = sorted(
    (unwrap_post(entry) for entry in posts),
    key=lambda record: record[3],
)

In [None]:
# ad hoc solution bc i posted twice within the same minute

# Exchange the first two records in place.
# NOTE: this is not idempotent -- running the cell a second time swaps them back.
post_data[0], post_data[1] = post_data[1], post_data[0]
post_data

In [None]:
# sanity check, should be approximately equal
# difference is due to scroll bar and a little off the bottom since we only merged full page screenshots

# Compare the document's scroll dimensions with the size of the merged image.
page_width = driver.execute_script("return document.body.scrollWidth")
page_height = driver.execute_script("return document.body.scrollHeight")
page_dim = (page_width, page_height)
img_dim = merged.size

for label, dim in (("Page dimensions:", page_dim), ("Image dimensions:", img_dim)):
    print(label, dim)

In [None]:
# print out all post data and save screenshots

# Setting this to None turns off Pillow's decompression-bomb size check;
# presumably needed because the merged page image is very large.
Image.MAX_IMAGE_PIXELS = None

# Make sure the output directory exists before writing the crops.
os.makedirs("./images/posts", exist_ok=True)

# NOTE: the file name is derived only from the timestamp, which has minute
# resolution (the parsed date format has no seconds field). Two posts made
# within the same minute therefore map to the same file, and the later crop
# overwrites the earlier one.
with Image.open("./images/merged.png") as final:
    for title, link, date, date_ts, pos in post_data:
        print(title, link, date, date_ts)
        file_name = date_ts.replace(':', '-')  # ':' is not allowed in Windows file names
        final.crop(pos).save(f"./images/posts/{file_name}.png")

In [None]:
# sign into substack

# Credentials are never stored in the notebook: they come from the
# SUBSTACK_EMAIL / SUBSTACK_PASSWORD environment variables, with an
# interactive prompt as fallback (the password prompt does not echo).
email = os.environ.get("SUBSTACK_EMAIL") or input("Substack email: ")
password = os.environ.get("SUBSTACK_PASSWORD") or getpass.getpass("Substack password: ")

# Open the sign-in form, choose the password login option and submit.
driver.get("https://substack.com/")
driver.find_element("css selector", ".button.sign-in-link.outline-grayscale").click()
driver.find_element("css selector", ".login-option.substack-login__login-option").click()
driver.find_element("css selector", "input[type=email]").send_keys(email)
driver.find_element("css selector", "input[type=password]").send_keys(password)
driver.find_element("css selector", "button[type=submit]").click()

In [None]:
# copy raw data / an image object to the Windows clipboard

def copy_to_clipboard(clip_type, data):
    """Replace the clipboard contents with ``data`` in format ``clip_type``.

    The clipboard is always closed again, even when emptying or setting it
    raises, so a failure here cannot leave the clipboard held open.

    Args:
        clip_type: clipboard format identifier passed to ``SetClipboardData``.
        data: payload passed to ``SetClipboardData``.
    """
    win32clipboard.OpenClipboard()
    try:
        win32clipboard.EmptyClipboard()
        win32clipboard.SetClipboardData(clip_type, data)
    finally:
        win32clipboard.CloseClipboard()
    
def copy_img_to_clipboard(img):
    """Put ``img`` on the clipboard in ``CF_DIB`` format.

    The image is converted to RGB and serialised as BMP in memory; the first
    14 bytes (the BMP file header) are dropped and the remainder is handed to
    ``copy_to_clipboard``.

    Args:
        img: image object exposing ``convert(mode)`` whose result exposes
            ``save(fp, format)``.
    """
    with BytesIO() as buffer:
        img.convert("RGB").save(buffer, "BMP")
        dib_bytes = buffer.getvalue()[14:]  # strip the 14-byte BMP file header
    copy_to_clipboard(win32clipboard.CF_DIB, dib_bytes)

In [None]:
# post given image/data to substack

def post_substack(post, url):
    """Create and publish one Substack post from a scraped blog entry.

    Opens ``url``, fills in the title and a subtitle carrying the original
    posting date, inserts the original link as a hyperlink, pastes the post's
    saved screenshot from the clipboard and finally clicks through the
    publish dialog.

    Args:
        post: tuple ``(title, link, date, date_true, pos)``; ``pos`` is unused.
        url: address of the editor page to load.
    """
    def find(selector):
        # Look the element up afresh on every use, exactly one query per call.
        return driver.find_element("css selector", selector)

    editor = ".tiptap.ProseMirror.mousetrap"
    publish_button = '[data-testid="publish-button"]'

    driver.get(url)
    title, link, date, date_true, pos = post
    file_name = date_true.replace(':', '-')
    img = Image.open(f"./images/posts/{file_name}.png")

    # Title and subtitle.
    find(".page-title.mousetrap").send_keys(title)
    posted_on = datetime.strptime(date, "%b %d, %Y, %I:%M %p")
    regular_date = posted_on.strftime("%B %d, %Y").lower()
    subtitle = f"initially posted to aops on {regular_date}"
    find(".subtitle.mousetrap").send_keys(subtitle)

    # Body: type the link, select it all and turn it into a hyperlink.
    find(editor).click()
    find(editor).send_keys(link)
    find(editor).send_keys(Keys.CONTROL, "a")
    find('[title="Link ^K"]').click()
    find(".url-input").send_keys(link, Keys.ENTER)

    # Move past the link and start a new line, then paste the screenshot.
    for key in (Keys.RIGHT, Keys.ENTER):
        find(editor).send_keys(key)
    copy_img_to_clipboard(img)
    find(editor).send_keys(Keys.CONTROL, "v")

    # Poll until the publish button exists (there is no timeout).
    while not driver.find_elements("css selector", publish_button):
        time.sleep(0.3)
    find(publish_button).click()
    time.sleep(0.3)

    # Click the currently checked checkbox in the publish dialog.
    find('.frontend-pencraft-form-Checkbox-module__container--bnhv5.frontend-pencraft-form-Checkbox-module__checked--SUdWp').click()

    # Click the first button labelled "Publish now", if there is one.
    publish_now = next(
        (button for button in driver.find_elements("css selector", "button")
         if button.text == "Publish now"),
        None,
    )
    if publish_now is not None:
        publish_now.click()

In [None]:
# skip first post since i already posted it manually for testing

# Publish every record except the first, pausing five seconds between posts.
# NOTE: re-running this cell publishes all of them again.
PUBLISH_URL = "https://f4t4nt.substack.com/publish/post"

for record in post_data[1:]:
    post_substack(record, PUBLISH_URL)
    time.sleep(5)