In [None]:
import asyncio
import datetime
import os
import shutil
import time
from collections import deque

import instaloader
from dotenv import dotenv_values
from instaloader import Profile, Post, Instaloader
from webdav3.client import Client



Load configuration variables from a `.env` file.

Expected Variables:
- WEBDAV_DOMAIN_NAME
- WEBDAV_USERNAME
- WEBDAV_PASSWORD
- INSTAGRAM_USERNAME
- INSTAGRAM_PASSWORD

In [None]:
# Read the settings from the `.env` file (relative path); later cells index into `config` by variable name.
config = dotenv_values(".env") 

# Test WebDav API

Test that the WebDAV API can connect to my Nextcloud instance.

Configure WebDav Client to connect to my Nextcloud server.

In [None]:


# Connection settings for the WebDAV client. The hostname points at the
# per-user files endpoint (`/remote.php/dav/files/<username>/`) so that all
# remote paths used later are relative to that user's root folder.
webdav_options = dict(
    webdav_hostname=f'https://{config["WEBDAV_DOMAIN_NAME"]}/remote.php/dav/files/{config["WEBDAV_USERNAME"]}/',
    webdav_login=config["WEBDAV_USERNAME"],
    webdav_password=config["WEBDAV_PASSWORD"],
)

# Shared client instance reused by every later cell.
client = Client(webdav_options)



Test that a folder can be created via WebDav.

In [None]:
# Smoke test: create a folder on the server, then print the result of checking for it.
client.mkdir("fox-drive")
print(client.check("fox-drive"))

# Get Instagram Stories

In [None]:
# Shared Instaloader session. Later cells reuse `L` (and `L.context`) for every Instagram request,
# so this cell must run before any of them.
L = instaloader.Instaloader()
L.login(config["INSTAGRAM_USERNAME"], config["INSTAGRAM_PASSWORD"])

In [None]:
# Print every story item returned for the logged-in session.
# `L.get_stories()` yields Story objects; each Story yields StoryItem objects.
for item in (entry for story in L.get_stories() for entry in story.get_items()):
    print(item)
    # To save the item locally instead of printing it:
    # L.download_storyitem(item, ':stories')

# Get Instagram posts

Test downloading the first post returned for [juniperfoxx](https://www.instagram.com/juniperfoxx/) on Instagram.

In [None]:
# Download only the first post the profile's post iterator yields.
profile = Profile.from_username(L.context, 'juniperfoxx')
post = next(iter(profile.get_posts()), None)
if post is not None:
    print(post)
    L.download_post(post, target=f'downloads')

## Design Plan:

1. Look up the datetime of the most recent post saved to Nextcloud.
2. Get the list of posts made after that datetime.
3. For each post, download it and save it to Nextcloud.

In [None]:
# Baseline timing: download up to MAX_POSTS posts into one local folder, upload that
# folder in a single `upload_sync` call, and measure the whole round trip.
# The same name is used for the local folder and for the remote folder.
test_sync_download_path = "test-sync-downloads"


# Start of the overall timer (covers download, upload and local cleanup).
start = time.perf_counter()

profile = Profile.from_username(L.context, 'juniperfoxx')
MAX_POSTS = 1
for count, post in enumerate(profile.get_posts()):
    # Stop once MAX_POSTS posts have been downloaded.
    if MAX_POSTS <= count:
        break
    # Per-post download time is printed separately from the overall time.
    start_time_download = time.perf_counter()
    L.download_post(post, target=test_sync_download_path)
    print(time.perf_counter() - start_time_download)
client.upload_sync(remote_path=test_sync_download_path, local_path=test_sync_download_path)

# Remove the temporary local files.
shutil.rmtree(test_sync_download_path)

sync_elapse_time = time.perf_counter() - start

print(sync_elapse_time)

# Clean up the remote test folder for the next run (not included in the timing).
client.clean(test_sync_download_path)


In [None]:
async def download(post: Post, remote_path):
    """Download one Instagram post locally, upload it via WebDAV, then delete the local copy.

    Uses the notebook-level `L` (Instaloader) and `client` (WebDAV client).

    Note: `L.download_post` and `client.upload_sync` are blocking calls and this
    coroutine never awaits, so several of these passed to `asyncio.gather` run
    one after another rather than overlapping.

    Parameters
    ----------
    post: Post
        The post to download. Its `shortcode` names the temporary local folder
        `post-<shortcode>` (relative to the current directory).
    remote_path
        The remote path handed to `client.upload_sync`.
    """
    temp_download_path = f'post-{post.shortcode}'
    try:
        L.download_post(post, target=temp_download_path)
        client.upload_sync(remote_path=remote_path, local_path=temp_download_path)
    finally:
        # Remove the temporary folder even when the download or upload failed,
        # so a failed run does not leave files behind. The folder may not exist
        # if the download failed before creating it.
        if os.path.isdir(temp_download_path):
            shutil.rmtree(temp_download_path)

async def dowload_all_posts(test_async_download_path, profile_username='juniperfoxx', max_posts=20):
    """Download and upload the first `max_posts` posts of a profile via `download`.

    Uses the notebook-level `L` (Instaloader) to look up the profile.

    Parameters
    ----------
    test_async_download_path
        The remote path passed to `download` for every post.
    profile_username: str, optional
        The Instagram username whose posts are fetched (default is 'juniperfoxx').
    max_posts: int, optional
        The maximum number of posts to take from the profile's post iterator (default is 20).

    Returns
    -------
    The list returned by `asyncio.gather`: one result per `download` call, in post order.
    """
    profile = Profile.from_username(L.context, profile_username)
    # `range` comes first in the zip on purpose: zip stops as soon as its first
    # iterable is exhausted, so the lazy post iterator is never advanced past
    # `max_posts` items (the reverse order would pull one extra post).
    posts = [post for _, post in zip(range(max_posts), profile.get_posts())]

    return await asyncio.gather(*(download(post, test_async_download_path) for post in posts))

# Time the `dowload_all_posts` variant end to end so it can be compared with the sync baseline.
test_async_download_path = "test-async-downloads"
start = time.perf_counter()

# Top-level `await` relies on the notebook's running event loop.
await dowload_all_posts(test_async_download_path)

async_elapse_time = time.perf_counter() - start

print(async_elapse_time)

# Clean up the remote test folder for the next run (not included in the timing).
client.clean(test_async_download_path)

In [None]:
def sync_store_post(post:Post, webdav_client:Client, instaloader:Instaloader, remote_path:str):
    """Synchronously store the downloaded Instagram `Post` on a WebDAV-compatible fileserver.

    The post is downloaded into a temporary local folder, each file in that folder is
    uploaded to `remote_path` one at a time, and the temporary folder is then removed
    (also when the download or an upload fails).

    Note: Files will be temporarily stored to the local filesystem in a folder called `<current directory>/post-<the posts shortcode>`.

    Downloaded Files
    ----------------
    <datetime>_UTC.jpg
        The first image in the post. The datetime format is `YYYY-MM-DD_HH-MM-SS`
    <datetime>_UTC.mp4
        An optional video file that goes with the image that has the same filename.
        If this is downloaded, then the corresponding image would be this video's thumbnail/preview.
    <datetime>_UTC_<count>.jpg
        Any additional images in the post will have `count` suffix starting with the value `1`
    <datetime>_UTC_<count>.mp4
        An optional video file that goes with the image that has the same filename.
        If this is downloaded, then the corresponding image would be this video's thumbnail/preview.
    <datetime>_UTC.txt
        Contains post's caption
    <datetime>_UTC.json.xz
        A compressed json file that contains information about the post

    Parameters
    ----------
    post: Post
        The post to download from instagram and store to the fileserver.
    webdav_client: Client
        The WebDav API client used to store the files to the fileserver.
    instaloader: Instaloader
        The logged in Instagram API used to locally download the post.
    remote_path: str
        The remote path to store files into on the fileserver

    Raises
    ------
    NotImplementedError
        If any of the parameters `post`, `webdav_client`, `instaloader`, or `remote_path` are None
    """
    if post is None:
        raise NotImplementedError('`post` is a required parameter.')
    if webdav_client is None:
        raise NotImplementedError('`webdav_client` is a required parameter.')
    if instaloader is None:
        raise NotImplementedError('`instaloader` is a required parameter.')
    if remote_path is None:
        raise NotImplementedError('`remote_path` is a required parameter.')

    temp_download_path = f'post-{post.shortcode}'
    try:
        instaloader.download_post(post, target=temp_download_path)
        webdav_client.mkdir(remote_path)
        for file in os.listdir(temp_download_path):
            local_file = os.path.join(os.getcwd(), temp_download_path, file)
            # Remote paths always use forward slashes and are relative to the
            # WebDAV root, so normalise Windows separators and drop a leading '/'.
            remote_file = os.path.join(remote_path, file).replace("\\", "/").lstrip('/')
            webdav_client.upload_file(remote_path=remote_file, local_path=local_file)
    finally:
        # Always remove the temporary folder so a failed download/upload does not
        # leave stale files behind. It may not exist if the download failed early.
        if os.path.isdir(temp_download_path):
            shutil.rmtree(temp_download_path)

async def upload_file(webdav_client:Client, local_file_dir:str, local_file_name:str, remote_dir:str ):
    """Upload a single local file to the fileserver without blocking the event loop.

    The blocking `webdav_client.upload_file` call is handed to the event loop's
    default executor, so several `upload_file` coroutines passed to
    `asyncio.gather` can overlap instead of running one after another.

    NOTE(review): this assumes `webdav_client` tolerates being called from
    several threads at once - confirm against the client library.

    Parameters
    ----------
    webdav_client: Client
        The WebDav API client used to store the file to the fileserver.
    local_file_dir: str
        The directory, relative to the current working directory, that contains the file.
    local_file_name: str
        The name of the file inside `local_file_dir`; also used as the remote file name.
    remote_dir: str
        The remote directory to store the file into.
    """
    local_file = os.path.join(os.getcwd(), local_file_dir, local_file_name)
    # Remote paths always use forward slashes and are relative to the WebDAV root.
    remote_file = os.path.join(remote_dir, local_file_name).replace('\\', '/').lstrip('/')
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(
        None,
        lambda: webdav_client.upload_file(remote_path=remote_file, local_path=local_file),
    )

async def async_store_post(post: Post, webdav_client:Client, instaloader: Instaloader, remote_path:str):
    """Asynchronously store the downloaded Instagram `Post` on a WebDAV-compatible fileserver.

    The post is downloaded into a temporary local folder (a blocking call), the files
    in that folder are uploaded through `upload_file` using `asyncio.gather`, and the
    temporary folder is removed once every upload has settled, also on failure.

    Note: Files will be temporarily stored to the local filesystem in a folder called `<current directory>/post-<the posts shortcode>`.

    Downloaded Files
    ----------------
    <datetime>_UTC.jpg
        The first image in the post. The datetime format is `YYYY-MM-DD_HH-MM-SS`
    <datetime>_UTC.mp4
        An optional video file that goes with the image that has the same filename.
        If this is downloaded, then the corresponding image would be this video's thumbnail/preview.
    <datetime>_UTC_<count>.jpg
        Any additional images in the post will have `count` suffix starting with the value `1`
    <datetime>_UTC_<count>.mp4
        An optional video file that goes with the image that has the same filename.
        If this is downloaded, then the corresponding image would be this video's thumbnail/preview.
    <datetime>_UTC.txt
        Contains post's caption
    <datetime>_UTC.json.xz
        A compressed json file that contains information about the post

    Parameters
    ----------
    post: Post
        The post to download from instagram and store to the fileserver.
    webdav_client: Client
        The WebDav API client used to store the files to the fileserver.
    instaloader: Instaloader
        The logged in Instagram API used to locally download the post.
    remote_path: str
        The remote path to store files into on the fileserver

    Raises
    ------
    NotImplementedError
        If any of the parameters `post`, `webdav_client`, `instaloader`, or `remote_path` are None
    """
    if post is None:
        raise NotImplementedError('`post` is a required parameter.')
    if webdav_client is None:
        raise NotImplementedError('`webdav_client` is a required parameter.')
    if instaloader is None:
        raise NotImplementedError('`instaloader` is a required parameter.')
    if remote_path is None:
        raise NotImplementedError('`remote_path` is a required parameter.')

    temp_download_path = f'post-{post.shortcode}'
    try:
        instaloader.download_post(post, target=temp_download_path)
        webdav_client.mkdir(remote_path)

        # Upload with the client that was passed in (not the notebook-level one).
        # `return_exceptions=True` makes gather wait for every upload to settle,
        # so the temporary folder is never deleted while an upload is still running.
        outcomes = await asyncio.gather(
            *(
                upload_file(webdav_client, temp_download_path, file, remote_path)
                for file in os.listdir(temp_download_path)
            ),
            return_exceptions=True,
        )
    finally:
        # Always remove the temporary folder; it may not exist if the download failed early.
        if os.path.isdir(temp_download_path):
            shutil.rmtree(temp_download_path)

    # Surface the first upload failure now that cleanup is done.
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome


# TODO evaluate whether to refactor this so it does not sort in ascending order.
# The benefit of ascending order is that it becomes easier to recover from failure states,
# since you can easily determine which post to download next by checking the date
# of the most recent post saved. The downside is that every new post since that point
# must be held in memory, because the lazy iterator has to be converted to a list in order to sort it.
# This took 2 minutes to execute.
def fetch_posts(profile_username:str, after_date:datetime.datetime = None, max_posts:int = -1):
    """Fetch the list of Instagram posts for an instagramer's username.

    Posts are ordered by date posted (`Post.date_utc`) in ascending order.
    Every post of the profile is iterated, so no assumption is made about the
    order in which Instagram returns them. Uses the notebook-level `L`
    (Instaloader) to look up the profile.

    Parameters
    ----------
    profile_username: str
        The username of the instagram poster
    after_date: datetime.datetime, optional
        The datetime to filter the posts by those posted afterwards (default is None).
        If None posts will not be filtered by date.
    max_posts: int, optional
        The max number of posts to return (default is -1).
        If None or less than 0, return all posts.
        When a limit applies, the oldest `max_posts` matching posts are returned.

    Raises
    ------
    NotImplementedError
        If `profile_username` is None

    Returns
    -------
    List of Instagram Posts in ascending order by date posted (`Post.date_utc`) for the user.

    """
    if profile_username is None:
        raise NotImplementedError('`profile_username` is a required parameter.')

    profile = Profile.from_username(L.context, profile_username)
    posts = sorted(
        (post for post in profile.get_posts() if after_date is None or post.date_utc > after_date),
        key=lambda post: post.date_utc,
    )
    # `None` is documented as "no limit"; comparing it with 0 would raise TypeError.
    if max_posts is not None and max_posts >= 0:
        return posts[:max_posts]

    return posts


In [None]:
def fast_fetch_posts(profile_username:str, after_date:datetime.datetime = None, max_posts:int = -1):
    """Fetch the list of Instagram posts for an instagramer's username, stopping early when possible.

    Posts are ordered by date posted (`Post.date_utc`) in ascending order.
    Uses the notebook-level `L` (Instaloader) to look up the profile.

    Unlike `fetch_posts`, iteration stops at the first post that is not newer than
    `after_date`. This assumes `profile.get_posts()` yields posts in descending
    order of date posted.
    NOTE(review): posts that Instagram lists out of date order (e.g. pinned posts)
    would break this assumption - confirm before relying on it.

    Parameters
    ----------
    profile_username: str
        The username of the instagram poster
    after_date: datetime.datetime, optional
        The datetime to filter the posts by those posted afterwards (default is None).
        If None posts will not be filtered by date.
    max_posts: int, optional
        The max number of posts to return (default is -1).
        If None or less than 0, return all posts.
        When a limit applies, the oldest `max_posts` matching posts are kept.

    Raises
    ------
    NotImplementedError
        If `profile_username` is None

    Returns
    -------
    List of Instagram Posts in ascending order by date posted (`Post.date_utc`) for the user.

    """
    if profile_username is None:
        raise NotImplementedError('`profile_username` is a required parameter.')

    profile = Profile.from_username(L.context, profile_username)

    # `None` is documented as "no limit"; comparing it with 0 would raise TypeError.
    limit = max_posts if max_posts is not None and max_posts >= 0 else None

    # Posts arrive newest first, so pushing each one on the left keeps the
    # collection in ascending order. When the bounded deque is full, pushing on
    # the left drops the item on the right, i.e. the newest post, which leaves
    # the oldest `limit` posts. Each push is O(1), unlike `list.insert(0, ...)`.
    posts = deque(maxlen=limit)
    for post in profile.get_posts():
        if after_date is not None and post.date_utc <= after_date:
            break
        posts.appendleft(post)

    return list(posts)
    

## Profile

In [None]:
# Upper bound on the number of posts fetched by each profiling cell below.
max_posts = 30

In [None]:
# Time `fetch_posts` with a date filter; the elapsed time is reported in the results cell.
start = time.perf_counter()
posts = fetch_posts('juniperfoxx',after_date=datetime.datetime(2022, 1, 26, 0, 0), max_posts = max_posts)
fetch_posts_elapsed_time_with_date_parameter = time.perf_counter() - start
print(posts)

In [None]:
# Time `fetch_posts` without a date filter; the elapsed time is reported in the results cell.
start = time.perf_counter()
posts = fetch_posts('juniperfoxx', max_posts = max_posts)
fetch_posts_elapsed_time = time.perf_counter() - start
print(posts)

In [None]:
# Time `fast_fetch_posts` with a date filter; the elapsed time is reported in the results cell.
start = time.perf_counter()
posts = fast_fetch_posts('juniperfoxx',after_date=datetime.datetime(2022, 1, 26, 0, 0), max_posts = max_posts)
fast_fetch_posts_elapsed_time_with_date_parameter = time.perf_counter() - start
print(posts)

In [None]:
# Time `fast_fetch_posts` without a date filter. The resulting `posts` list is the one
# the store-post profiling cells below iterate over.
start = time.perf_counter()
posts = fast_fetch_posts('juniperfoxx', max_posts = max_posts)
fast_fetch_posts_elapsed_time = time.perf_counter() - start
print(posts)

In [None]:
# Time storing every post in `posts` one after another with `sync_store_post`.
start = time.perf_counter()
test_sync_download_path = "test-sync-downloads"
for post in posts:
    sync_store_post(post, client, L, test_sync_download_path)
sync_store_post_elapsed_time = time.perf_counter() - start
# Remote cleanup happens after the timer stops, so it is not part of the measurement.
client.clean(test_sync_download_path) #cleanup test

In [None]:
# Time storing every post in `posts` by gathering one `async_store_post` coroutine per post.
start = time.perf_counter()
test_async_download_path = "test-async-downloads"
await asyncio.gather(*(async_store_post(post, client, L, test_async_download_path) for post in posts))
async_store_post_elapsed_time = time.perf_counter() - start
# Remote cleanup happens after the timer stops, so it is not part of the measurement.
client.clean(test_async_download_path) #cleanup test

In [None]:
# Report every measurement collected by the profiling cells above.
# Values are `time.perf_counter()` differences, i.e. seconds.
print(f'fetch_posts elapsed time: {fetch_posts_elapsed_time}')
print(f'fetch_posts with date parameter elapsed time: {fetch_posts_elapsed_time_with_date_parameter}')

print(f'fast_fetch_posts elapsed time: {fast_fetch_posts_elapsed_time}')
print(f'fast_fetch_posts with date parameter elapsed time: {fast_fetch_posts_elapsed_time_with_date_parameter}')

print(f'sync_store_post elapsed time: {sync_store_post_elapsed_time}')
print(f'async_store_post elapsed time: {async_store_post_elapsed_time}')

## Summary of Profiling

For a small number of posts, the bottleneck is fetching the posts. Fetching returns a lazy iterator, so it only looks up the next batch of posts as it is needed. Furthermore, the posts are returned in descending order by date posted, so to get the oldest post you have to iterate over the whole collection. The async version of the store-post method does appear to be faster.


For `max_posts = 50` (elapsed times are in seconds, as measured with `time.perf_counter()`):

|Function | Elapsed Time |
|---------|--------------|
|fetch_posts | 113.28898620000109 |
|fetch_posts with date parameter | 131.27258950000396 |
|fast_fetch_posts | 117.25117950000276 |
|fast_fetch_posts with date parameter | 3.3295254000113346 |
|sync_store_post | 202.49611910000385 |
|async_store_post | 131.32413129998895 |

