In [6]:
import json
import os
from datetime import datetime
from jinja2 import Environment, FileSystemLoader
from collections import defaultdict


def process_instagram_files(directory):
    """Collect Instagram post metadata from the JSON files under ``directory``.

    The directory tree is walked recursively. A file is treated as a post
    when its name ends in ``.json`` and its lowercased name contains neither
    ``tagged`` nor ``comments``; every other file is counted as skipped.

    Args:
        directory: Root folder to scan.

    Returns:
        A ``defaultdict(list)`` mapping the post's year (derived from
        ``taken_at_timestamp`` via ``datetime.fromtimestamp``, i.e. in the
        local timezone) to a list of post dicts with the keys ``caption``,
        ``comments``, ``like_count``, ``shortcode``, ``timestamp`` and
        ``images``. ``images`` holds the paths of the ``.jpg`` files in the
        same folder whose name starts with the JSON file's base name.

    Files that are not valid JSON, or whose content lacks the expected
    structure, are reported on stdout and left out of the result.
    """
    print(f"\nScanning directory: {directory}")
    posts_by_year = defaultdict(list)

    # Count the post files up front so the progress line can show a percentage.
    total_files = sum(
        1
        for root, _, files in os.walk(directory)
        for f in files
        if f.endswith(".json")
        and "tagged" not in f.lower()
        and "comments" not in f.lower()
    )

    processed_files = 0
    skipped_files = 0

    # Process each JSON file
    for root, _, files in os.walk(directory):
        for filename in files:
            # Skip non-JSON files and files with 'tagged' or 'comments' in the name
            if (
                "tagged" in filename.lower()
                or "comments" in filename.lower()
                or not filename.endswith(".json")
            ):
                skipped_files += 1
                continue

            processed_files += 1
            print(
                f"\rProcessing file {processed_files}/{total_files} ({(processed_files/total_files)*100:.1f}%)",
                end="",
            )

            with open(os.path.join(root, filename), "r", encoding="utf-8") as file:
                try:
                    data = json.load(file)
                    # Extract timestamp and convert to year
                    timestamp = data["node"]["taken_at_timestamp"]
                    year = datetime.fromtimestamp(timestamp).year

                    # Extract required fields
                    post = {
                        "caption": (
                            data["node"]["edge_media_to_caption"]["edges"][0]["node"][
                                "text"
                            ]
                            if data["node"]["edge_media_to_caption"]["edges"]
                            else ""
                        ),
                        "comments": data["node"]["comments"],
                        "like_count": data["node"]["edge_media_preview_like"]["count"],
                        "shortcode": data["node"]["shortcode"],
                        "timestamp": timestamp,
                        "images": [],
                    }

                    # Attach the images that belong to this post. os.path.exists()
                    # does not expand wildcards, so the "<base>*.jpg" pattern is
                    # applied by hand to the directory listing os.walk already
                    # returned. This covers both "<base>.jpg" and numbered
                    # variants such as "<base>_1.jpg"; sorting keeps the order
                    # stable across platforms.
                    base_filename = filename[:-5]  # Remove the .json extension
                    post["images"] = [
                        os.path.join(root, candidate)
                        for candidate in sorted(files)
                        if candidate.startswith(base_filename)
                        and candidate.endswith(".jpg")
                    ]

                    posts_by_year[year].append(post)
                except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
                    # Best effort: one malformed or differently shaped JSON file
                    # must not abort the whole scan.
                    print(f"\nError processing {filename}: {str(e)}")

    print("\n\nProcessing summary:")
    print(f"Total files found: {total_files + skipped_files}")
    print(f"Files processed: {processed_files}")
    print(f"Files skipped: {skipped_files}")
    print(f"Years found: {sorted(posts_by_year.keys())}")
    print(
        f"Total posts processed: {sum(len(posts) for posts in posts_by_year.values())}"
    )

    return posts_by_year

# Folder that holds the downloaded post JSON files.
json_directory = "data/forummuenchenev"  # Change this to your directory path

# Banner: title line followed by a 30-character rule.
print("Instagram JSON to HTML Processor", "=" * 30, sep="\n")

# Walk the folder and keep the result returned by process_instagram_files.
posts_by_year = process_instagram_files(json_directory)


Instagram JSON to HTML Processor

Scanning directory: data/forummuenchenev


Processing summary:
Total files found: 0
Files processed: 0
Files skipped: 0
Years found: []
Total posts processed: 0


In [8]:
# Show the value returned by process_instagram_files(json_directory) as the cell output.
posts_by_year

defaultdict(list, {})

In [5]:
import json
import os
from datetime import datetime
from jinja2 import Environment, FileSystemLoader
from collections import defaultdict


def process_instagram_files(directory):
    """Debugging pass over the post JSON files below ``directory``.

    Walks the tree and ignores every file whose name does not end in
    ``.json`` or whose lowercased name contains ``tagged`` or ``comments``.
    For each remaining file it prints a progress counter, the file name
    without its ``.json`` suffix, the ``<name>*.jpg`` pattern joined to the
    file's folder, and finally the open file object. Files that are not
    valid JSON or that lack one of the expected keys are reported with an
    "Error processing" line instead.

    Nothing is collected: the function returns ``None``.
    """
    print(f"\nScanning directory: {directory}")

    def wanted(name):
        # A post file is a .json file that is neither a "tagged" nor a
        # "comments" dump.
        lowered = name.lower()
        return not (
            "tagged" in lowered
            or "comments" in lowered
            or not name.endswith(".json")
        )

    # First pass: number of post files, needed for the progress percentage.
    total_files = 0
    for _root, _dirs, names in os.walk(directory):
        total_files += len([name for name in names if wanted(name)])

    processed_files = 0

    # Second pass: parse each post file and print the debug details.
    for root, _, files in os.walk(directory):
        for filename in filter(wanted, files):
            processed_files += 1
            print(
                f"\rProcessing file {processed_files}/{total_files} ({(processed_files/total_files)*100:.1f}%)",
                end="",
            )

            with open(os.path.join(root, filename), "r", encoding="utf-8") as file:
                try:
                    node = json.load(file)["node"]

                    # Timestamp first, then the conversion; the year itself is
                    # not used any further in this debugging variant.
                    timestamp = node["taken_at_timestamp"]
                    datetime.fromtimestamp(timestamp).year

                    # Same lookups, in the same order, as the post record needs;
                    # a missing key is reported by the handler below.
                    caption_edges = node["edge_media_to_caption"]["edges"]
                    if caption_edges:
                        caption = caption_edges[0]["node"]["text"]
                    else:
                        caption = ""
                    _post = {
                        "caption": caption,
                        "comments": node["comments"],
                        "like_count": node["edge_media_preview_like"]["count"],
                        "shortcode": node["shortcode"],
                        "timestamp": timestamp,
                        "images": [],
                    }

                    # Print the base name and the image pattern derived from it.
                    base_filename = filename[: -len(".json")]
                    print(base_filename)
                    print(os.path.join(root, f"{base_filename}*.jpg"))
                except (KeyError, json.JSONDecodeError) as e:
                    print(f"\nError processing {filename}: {str(e)}")
                # Printed for every handled file, whether or not parsing worked.
                print(file)

In [3]:
# NOTE(review): in the cells visible here, `root` and `base_filename` are only
# assigned inside process_instagram_files, so this scratch cell presumably
# relied on leftover kernel state and would raise NameError on a fresh
# kernel — confirm whether it can be removed.
image_path = os.path.join(root, f"{base_filename}.jpg")

defaultdict(list, {})