In [61]:
import requests
from bs4 import BeautifulSoup
import csv
from PIL import Image
from io import BytesIO
from collections import Counter

# Image URLs for which a CSV row has already been written (used to skip duplicates)
image_urls = []
# Maps each scraped year to the list of flag image URLs found on that year's page
flags = {}


def download_image(url, timeout=30):
    """Download the image at *url* and open it with Pillow.

    Args:
        url: Address of the image file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


def get_color_category(rgb_color):
    """Return the name of the palette colour nearest to *rgb_color*.

    Nearness is the sum of squared per-channel differences between
    *rgb_color* and each of the twelve reference colours. On a tie the
    reference listed first wins.
    """
    palette = (
        ("yellow", (255, 255, 0)),
        ("yellow-green", (154, 205, 50)),
        ("green", (0, 128, 0)),
        ("blue-green", (0, 255, 255)),
        ("blue", (0, 0, 255)),
        ("blue-violet", (138, 43, 226)),
        ("violet", (128, 0, 128)),
        ("red-violet", (255, 0, 255)),
        ("red", (255, 0, 0)),
        ("red-orange", (255, 69, 0)),
        ("orange", (255, 165, 0)),
        ("yellow-orange", (255, 215, 0)),
    )

    best_name = None
    best_distance = float("inf")

    for name, reference in palette:
        distance = 0
        for channel, target in zip(rgb_color, reference):
            distance += (channel - target) ** 2
        # Strict comparison keeps the earliest entry when distances are equal
        if distance < best_distance:
            best_name, best_distance = name, distance

    return best_name


def analyze_image_colors(image):
    """Return the percentage of pixels that fall into each colour category.

    The image is converted to RGB, every pixel is assigned a category by
    ``get_color_category`` and each category's share of the classified
    pixels is rounded to two decimals.

    Args:
        image: Object providing ``convert("RGB")`` and ``getdata()``
            (the script passes the result of ``download_image``).

    Returns:
        A dict mapping each of the twelve category names to its percentage;
        categories without any pixel map to 0.
    """
    image = image.convert("RGB")

    # Classify each distinct colour once and weight it by how often it
    # occurs, rather than classifying every single pixel (twice). The
    # resulting counts are the same.
    pixel_counts = Counter(image.getdata())
    color_counts = Counter()
    for pixel, occurrences in pixel_counts.items():
        category = get_color_category(pixel)
        if category is not None:
            color_counts[category] += occurrences

    # Sum the classified pixels and convert the counts to percentages
    total_pixels = sum(color_counts.values())
    color_percentages = {
        color: round((count / total_pixels) * 100, 2)
        for color, count in color_counts.items()
    }

    # Make sure every category is present in the result
    for color in (
        "yellow",
        "yellow-green",
        "green",
        "blue-green",
        "blue",
        "blue-violet",
        "violet",
        "red-violet",
        "red",
        "red-orange",
        "orange",
        "yellow-orange",
    ):
        color_percentages.setdefault(color, 0)

    return color_percentages


# Create (or overwrite) flags.csv and write its header row: seven metadata
# columns followed by one column per colour category
with open("flags.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(
        [
            "Image URL",
            "Region",
            "State Name",
            "Flag Name",
            "Additional Info",
            "Year Added",
            "Year Removed",
        ]
        + [
            "yellow",
            "yellow-green",
            "green",
            "blue-green",
            "blue",
            "blue-violet",
            "violet",
            "red-violet",
            "red",
            "red-orange",
            "orange",
            "yellow-orange",
        ]
    )

# Crawl flaglog.com one year at a time: append a CSV row for every flag image
# seen for the first time, then stamp "Year Removed" on flags that were listed
# for the previous year but are missing from the current one.
color_columns = [
    "yellow",
    "yellow-green",
    "green",
    "blue-green",
    "blue",
    "blue-violet",
    "violet",
    "red-violet",
    "red",
    "red-orange",
    "orange",
    "yellow-orange",
]

for year in range(1874, 2024):
    url = f"https://flaglog.com/{year}"
    flags[year] = []

    # Without a timeout a single stalled connection would block the crawl forever
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    for section in soup.find_all("section"):
        if section.get("id") == "Other":
            continue
        region = section.get("id", "Unknown")

        for flag_div in section.find_all("div", class_="flag"):
            image_url = "https://flaglog.com/" + flag_div.find("img")["src"]

            # State name: markup of the "flagname" div with its opening tag
            # removed, cut at the next tag
            state_name_div = flag_div.find("div", class_="flagname")
            state_name = str(state_name_div).replace('<div class="flagname">', "")
            state_name = state_name.split("<")[0]

            # Flag name: text of the "type" div up to the first digit
            flag_name_div = flag_div.find("div", class_="type")
            flag_name = "N/A"
            if flag_name_div:
                flag_name = flag_name_div.text.strip()
                for i, char in enumerate(flag_name):
                    if char.isdigit():
                        flag_name = flag_name[:i].strip()
                        break

            comment_element = flag_div.find("span", class_="popnote")
            comment = comment_element.text.strip() if comment_element else "N/A"
            # BeautifulSoup has already decoded "&nbsp;" to U+00A0, so the
            # decoded character must be removed too; replacing only the
            # literal entity text never matches.
            comment = comment.replace("&nbsp;", "").replace("\xa0", "")
            comment = "".join(ch for ch in comment if not ch.isdigit())

            # The year is taken from the digits in the image URL; anything
            # other than exactly four digits is recorded as unknown
            year_added = "".join(ch for ch in image_url if ch.isdigit())
            if len(year_added) != 4:
                year_added = "?"

            if image_url not in image_urls:
                try:
                    image = download_image(image_url)
                    color_percentages = analyze_image_colors(image)
                except Exception as e:
                    # Best effort: a broken image still gets a row, with all
                    # colour shares set to 0
                    print(f"Fehler beim Verarbeiten des Bildes {image_url}: {e}")
                    color_percentages = dict.fromkeys(color_columns, 0)

                with open(
                    "flags.csv", mode="a", newline="", encoding="utf-8"
                ) as file:
                    writer = csv.writer(file)
                    writer.writerow(
                        [
                            image_url,
                            region,
                            state_name,
                            flag_name,
                            comment,
                            year_added,
                            "still in use",
                        ]
                        + [color_percentages[color] for color in color_columns]
                    )
                image_urls.append(image_url)
            flags[year].append(image_url)

    previous_year_flags = flags.get(year - 1)
    if previous_year_flags is None:
        # First scraped year: there is no earlier year to compare against
        continue

    # Flags listed last year but not this year were removed in this year
    current_year_flags = set(flags[year])
    rip_flags = {
        flag for flag in previous_year_flags if flag not in current_year_flags
    }
    if not rip_flags:
        # Nothing changed, so re-reading and rewriting the CSV would be a no-op
        continue

    with open("flags.csv", mode="r", newline="", encoding="utf-8") as read_file:
        existing_rows = list(csv.reader(read_file))

    for row in existing_rows:
        if row[0] in rip_flags:
            row[6] = year

    with open("flags.csv", mode="w", newline="", encoding="utf-8") as write_file:
        csv.writer(write_file).writerows(existing_rows)

print("Daten wurden erfolgreich in 'flags.csv' gespeichert.")



Fehler beim Verarbeiten des Bildes https://flaglog.com/img/france1940-free.png: 404 Client Error: Not Found for url: https://flaglog.com/img/france1940-free.png
Daten wurden erfolgreich in 'flags.csv' gespeichert.


In [59]:
import os
import requests
from bs4 import BeautifulSoup

def download_image(url, save_path, timeout=30):
    """Download the file at *url* and write its bytes to *save_path*.

    Args:
        url: Address of the image file.
        save_path: Destination path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``); nothing is written in that case.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

# Create the "flags" folder if it does not exist yet
os.makedirs('flags', exist_ok=True)

# Years whose pages are scanned for flag images
years = range(1874, 2024)

# Image URLs that were downloaded successfully (prevents duplicate downloads)
downloaded_images = set()

for year in years:
    url = f"https://flaglog.com/{year}"

    # Fetch the page for this year; the timeout keeps a stalled connection
    # from hanging the whole run
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    for section in soup.find_all('section'):
        if section.get('id') == 'Other':
            continue

        for flag_div in section.find_all('div', class_='flag'):
            image_url = 'https://flaglog.com/' + flag_div.find('img')['src']

            # Skip images that have already been downloaded
            if image_url in downloaded_images:
                continue

            image_name = image_url.split('/')[-1]
            save_path = os.path.join('flags', image_name)

            try:
                download_image(image_url, save_path)
                print(f"Bild heruntergeladen und gespeichert: {save_path}")
                downloaded_images.add(image_url)
            except Exception as e:
                # A failed URL is not recorded, so it is attempted again if it
                # shows up on a later year's page
                print(f"Fehler beim Herunterladen des Bildes {image_url}: {e}")

print("Alle Bilder wurden erfolgreich heruntergeladen und im Ordner 'flags' gespeichert.")

Bild heruntergeladen und gespeichert: flags/turkey1844.png
Bild heruntergeladen und gespeichert: flags/tunisia1831.png
Bild heruntergeladen und gespeichert: flags/egypt1841.png
Bild heruntergeladen und gespeichert: flags/jebelshammar1836.png
Bild heruntergeladen und gespeichert: flags/nejd1750.png
Bild heruntergeladen und gespeichert: flags/tripoli1727.png
Bild heruntergeladen und gespeichert: flags/morocco1667.png
Bild heruntergeladen und gespeichert: flags/muscat1649.png
Bild heruntergeladen und gespeichert: flags/qasimi1820.png
Bild heruntergeladen und gespeichert: flags/madagascar1817.png
Bild heruntergeladen und gespeichert: flags/persia1797.png
Bild heruntergeladen und gespeichert: flags/lebanon1861.png
Bild heruntergeladen und gespeichert: flags/futajallon1725.png
Bild heruntergeladen und gespeichert: flags/sokoto1804.png
Bild heruntergeladen und gespeichert: flags/orange1856.png
Bild heruntergeladen und gespeichert: flags/transvaal1874.png
Bild heruntergeladen und gespeichert: 

In [3]:
import requests  # Library to make HTTP requests
from bs4 import BeautifulSoup  # Library to parse HTML and XML documents
import csv  # Library to handle CSV file operations
from PIL import Image  # Python Imaging Library to handle image processing
from io import BytesIO  # Library to handle byte streams
from collections import Counter  # Library to count hashable objects

# Image URLs for which a CSV row has already been written (used to skip duplicates)
image_urls = []
# Maps each scraped year to the list of flag image URLs found on that year's page
flags1 = {}

def download_image(url, timeout=30):
    """Download the image at *url* and open it with Pillow.

    Args:
        url: Address of the image file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

def get_color_category(rgb_color):
    """Return the name of the reference colour nearest to *rgb_color*.

    Nearness is the sum of squared per-channel differences between
    *rgb_color* and each of the ten reference colours. On a tie the
    reference listed first wins.
    """
    references = [
        ("green", (0, 128, 0)),
        ("turquoise", (0, 255, 255)),  # previously "blue-green"
        ("blue", (0, 0, 255)),
        ("violet", (128, 0, 128)),
        ("pink", (255, 0, 255)),  # previously "red-violet"
        ("red", (255, 69, 0)),  # previously "red-orange"
        ("orange", (255, 165, 0)),
        ("yellow", (255, 215, 0)),  # previously "yellow-orange"
        ("white", (255, 255, 255)),  # new color
        ("black", (0, 0, 0)),  # new color
    ]

    # (distance, name) of the best match found so far
    best = (float("inf"), None)

    for name, reference in references:
        squared_diffs = [(a - b) ** 2 for a, b in zip(rgb_color, reference)]
        distance = sum(squared_diffs)
        # Strict comparison keeps the earliest entry when distances are equal
        if distance < best[0]:
            best = (distance, name)

    return best[1]

def analyze_image_colors(image):
    """Return the percentage of pixels that fall into each colour category.

    The image is converted to RGB, every pixel is assigned a category by
    ``get_color_category`` and each category's share of the classified
    pixels is rounded to two decimals.

    Args:
        image: Object providing ``convert("RGB")`` and ``getdata()``
            (the script passes the result of ``download_image``).

    Returns:
        A dict mapping each of the ten category names to its percentage;
        categories without any pixel map to 0.
    """
    image = image.convert("RGB")

    # Classify each distinct colour once and weight it by how often it
    # occurs, rather than classifying every single pixel (twice). The
    # resulting counts are the same.
    pixel_counts = Counter(image.getdata())
    color_counts = Counter()
    for pixel, occurrences in pixel_counts.items():
        category = get_color_category(pixel)
        if category is not None:
            color_counts[category] += occurrences

    # Total number of classified pixels and the percentage of each colour
    total_pixels = sum(color_counts.values())
    color_percentages = {
        color: round((count / total_pixels) * 100, 2)
        for color, count in color_counts.items()
    }

    # Ensure all colours are represented in the output
    for color in (
        "green",
        "turquoise",
        "blue",
        "violet",
        "pink",
        "red",
        "orange",
        "yellow",
        "white",
        "black",
    ):
        color_percentages.setdefault(color, 0)

    return color_percentages

# Create (or overwrite) flags1.csv and write its header row: seven metadata
# columns followed by one column per colour category
with open("flags1.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(
        [
            "Image URL",
            "Region",
            "State Name",
            "Flag Name",
            "Additional Info",
            "Year Added",
            "Year Removed",
        ]
        + [
            "green",
            "turquoise",
            "blue",
            "violet",
            "pink",
            "red",
            "orange",
            "yellow",
            "white",
            "black",
        ]
    )

# Crawl flaglog.com one year at a time: append a CSV row for every flag image
# seen for the first time, then stamp "Year Removed" on flags that were listed
# for the previous year but are missing from the current one.
color_columns = [
    "green",
    "turquoise",
    "blue",
    "violet",
    "pink",
    "red",
    "orange",
    "yellow",
    "white",
    "black",
]

for year in range(1874, 2024):  # Years 1874 through 2023
    url = f"https://flaglog.com/{year}"
    flags1[year] = []

    # Without a timeout a single stalled connection would block the crawl forever
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    for section in soup.find_all("section"):
        if section.get("id") == "Other":  # Sections with id "Other" are skipped
            continue
        region = section.get("id", "Unknown")

        for flag_div in section.find_all("div", class_="flag"):
            image_url = "https://flaglog.com/" + flag_div.find("img")["src"]

            # State name: markup of the "flagname" div with its opening tag
            # removed, cut at the next tag
            state_name_div = flag_div.find("div", class_="flagname")
            state_name = str(state_name_div).replace('<div class="flagname">', "")
            state_name = state_name.split("<")[0]

            # Flag name: text of the "type" div up to the first digit
            flag_name_div = flag_div.find("div", class_="type")
            flag_name = "N/A"
            if flag_name_div:
                flag_name = flag_name_div.text.strip()
                for i, char in enumerate(flag_name):
                    if char.isdigit():
                        flag_name = flag_name[:i].strip()
                        break

            comment_element = flag_div.find("span", class_="popnote")
            comment = comment_element.text.strip() if comment_element else "N/A"
            # BeautifulSoup has already decoded "&nbsp;" to U+00A0, so the
            # decoded character must be removed too; replacing only the
            # literal entity text never matches.
            comment = comment.replace("&nbsp;", "").replace("\xa0", "")
            comment = "".join(ch for ch in comment if not ch.isdigit())

            # The year is taken from the digits in the image URL; anything
            # other than exactly four digits is recorded as unknown
            year_added = "".join(ch for ch in image_url if ch.isdigit())
            if len(year_added) != 4:
                year_added = "?"

            if image_url not in image_urls:
                try:
                    image = download_image(image_url)
                    color_percentages = analyze_image_colors(image)
                except Exception as e:
                    # Best effort: a broken image still gets a row, with all
                    # colour shares set to 0
                    print(f"Fehler beim Verarbeiten des Bildes {image_url}: {e}")
                    color_percentages = dict.fromkeys(color_columns, 0)

                with open(
                    "flags1.csv", mode="a", newline="", encoding="utf-8"
                ) as file:
                    writer = csv.writer(file)
                    writer.writerow(
                        [
                            image_url,
                            region,
                            state_name,
                            flag_name,
                            comment,
                            year_added,
                            "still in use",
                        ]
                        + [color_percentages[color] for color in color_columns]
                    )
                image_urls.append(image_url)
            flags1[year].append(image_url)

    previous_year_flags = flags1.get(year - 1)
    if previous_year_flags is None:
        # First scraped year: there is no earlier year to compare against
        continue

    # Flags listed last year but not this year were removed in this year
    current_year_flags = set(flags1[year])
    rip_flags = {
        flag for flag in previous_year_flags if flag not in current_year_flags
    }
    if not rip_flags:
        # Nothing changed, so re-reading and rewriting the CSV would be a no-op
        continue

    with open("flags1.csv", mode="r", newline="", encoding="utf-8") as read_file:
        existing_rows = list(csv.reader(read_file))

    for row in existing_rows:
        if row[0] in rip_flags:
            row[6] = year

    with open("flags1.csv", mode="w", newline="", encoding="utf-8") as write_file:
        csv.writer(write_file).writerows(existing_rows)

KeyboardInterrupt: 