In [None]:
%cd /tf/notebooks

In [None]:
import os
import requests
import json
import unicodedata
from urllib.parse import quote

def remove_non_ascii(text):
    return ''.join(char for char in text if ord(char) < 128)

def download_images(data):
    for anime in data["animeData"]:
        title = anime["title"]["romaji"]
        year = anime["startDate"]["year"]
        characters = anime["characters"]["edges"]
        
        for character in characters:
            character_name = character["node"]["name"]["full"]
            image_url = character["node"]["image"]["medium"]
            
            # Replacing spaces with underscores in title and character name
            title = title.replace(' ', '_')
            character_name = character_name.replace(' ', '_')
            
            # Remove non-ASCII characters
            title = remove_non_ascii(title)
            character_name = remove_non_ascii(character_name)
            
            # Creating folder if it doesn't exist
            folder_name = "data"
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
            
            # Extracting file extension from URL
            file_extension = image_url.split(".")[-1]
            
            # Creating the filename
            filename = f"{year}_{title}_{character_name}.{file_extension}"
            filename = quote(filename)
            file_path = os.path.join(folder_name, filename)
            
            # Check if the file already exists
            if not os.path.exists(file_path):
                # Downloading the image
                response = requests.get(image_url)
                if response.status_code == 200:
                    with open(file_path, 'wb') as file:
                        file.write(response.content)
                        print(f"Downloaded: {filename}")
                else:
                    print(f"Failed to download: {filename}")
            else:
                print(f"File already exists: {filename}")

# Load JSON data from the file
with open('anime_data_and_progress.json', 'r') as file:
    anime_data = json.load(file)

# Call function to download images
download_images(anime_data)
print("Done!")

In [None]:
%pip install Pillow imagehash

In [13]:
import os
from PIL import Image
import imagehash

def delete_duplicate_images(folder_path, image_hashes):
    # Get the absolute path of the folder containing images
    folder_path = os.path.abspath(folder_path)

    count = 0
    array_image_deleted = []

    # Iterate through all files in the folder
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)

            try:
                # Open the image using PIL
                img = Image.open(file_path)

                # Generate the hash for the image
                img_hash = str(imagehash.average_hash(img))

                # If the hash already exists in the dictionary, delete the image
                if img_hash in image_hashes:
                    os.remove(file_path)
                    print(f"Deleted duplicate image: {file_path}")
                    count += 1
                    array_image_deleted.append(file_path)
                else:
                    # Add the hash to the dictionary
                    image_hashes[img_hash] = file_path

            except (OSError, Image.UnidentifiedImageError):
                print(f"Unable to open {file_path}. It might not be an image file.")
    
    print(f"Deleted {count} duplicate images.")
    json.dump(array_image_deleted, open("array_image_deleted.json", "w"))

folder_path = '/tf/notebooks/data'
image_to_delete = '/tf/notebooks/no_image.jpg'
image_hashes = {}

# hash image_to_delete and add it to the dictionary
img = Image.open(image_to_delete)
img_hash = str(imagehash.average_hash(img))
image_hashes[img_hash] = image_to_delete

delete_duplicate_images(folder_path, image_hashes)


Deleted duplicate image: /tf/notebooks/data/2015_3Ping_Lovers%21_Ippu_Nisai_no_Sekai_e_Youkoso_THE_ANIMATION_Alice_Erzan.png
Deleted duplicate image: /tf/notebooks/data/2015_3Ping_Lovers%21_Ippu_Nisai_no_Sekai_e_Youkoso_THE_ANIMATION_Frey_Ringitt.png
Deleted duplicate image: /tf/notebooks/data/2015_3Ping_Lovers%21_Ippu_Nisai_no_Sekai_e_Youkoso_THE_ANIMATION_Tomohiro_Kutsuki.png
Deleted duplicate image: /tf/notebooks/data/2015_Aikatsu%21_Music_Award%3A_Minna_de_Shou_wo_Moraima_SHOW%21_Akari%0D%0A_Oozora.png
Deleted duplicate image: /tf/notebooks/data/2015_Animegatari_x_Koukaku_Kidoutai_Collab_Eizou_Erika_Aoyama.jpg
Deleted duplicate image: /tf/notebooks/data/2015_Animegatari_x_Koukaku_Kidoutai_Collab_Eizou_Maya_Asagaya.jpg
Deleted duplicate image: /tf/notebooks/data/2015_Animegatari_x_Koukaku_Kidoutai_Collab_Eizou_Nishiazabu.jpg
Deleted duplicate image: /tf/notebooks/data/2015_Animegatari_x_Shingeki_no_Kyojin_Collab_Eizou_Erika_Aoyama.jpg
Deleted duplicate image: /tf/notebooks/data/2015

In [20]:
# transform all images to 256x256 ( from 150 by 50, and fill with white, dont crop) and save them as jpg in a new folder data_clean
from PIL import Image
import os

def resize_images(folder_path):
    # Get the absolute path of the folder containing images
    folder_path = os.path.abspath(folder_path)

    # Creating folder if it doesn't exist
    folder_name = "data_clean"
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Iterate through all files in the folder
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)

            try:
                # Open the image using PIL
                img = Image.open(file_path)
                img = img.resize((64, 64), Image.LANCZOS)

                # save image with jpg 
                file_extension = file_path.split(".")[-1]
                filename = file_path.split("/")[-1]
                filename = filename.replace(file_extension, "jpg")
                filename = os.path.join(folder_name, filename)
                img.save(filename, "JPEG", quality=100)

            except (OSError, Image.UnidentifiedImageError):
                print(f"Unable to open {file_path}. It might not be an image file.")

    print("Done!")

folder_path = '/tf/notebooks/data'
resize_images(folder_path)

Unable to open /tf/notebooks/data/2015_3Ping_Lovers%21Ippu_Nisai_no_Sekai_e_Youkoso_THE_ANIMATION_Alice_Erzan.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_3Ping_Lovers%21Ippu_Nisai_no_Sekai_e_Youkoso_THE_ANIMATION_Frey_Ringitt.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_3Ping_Lovers%21Ippu_Nisai_no_Sekai_e_Youkoso_THE_ANIMATION_Tomohiro_Kutsuki.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_Aikatsu%21_4_Akari%0D%0A_Oozora.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_Aikatsu%21_Music_Award%3A_Minna_de_Shou_wo_Moraima_SHOW%21_Aoi_Kiriya.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_Aikatsu%21_Music_Award%3A_Minna_de_Shou_wo_Moraima_SHOW%21_Ran_Shibuki.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_Akatsuki_no_Yona_OVA_Jae-Ha.png. It might not be an image file.
Unable to open /tf/notebooks/data/2015_Aldnoah.Zero_2_L

In [22]:
# create anime_before_clean.json that contain all filenames before cleaning in data_clean
import os
import json

def create_json(folder_path):
    # Get the absolute path of the folder containing images
    folder_path = os.path.abspath(folder_path)

    # Creating folder if it doesn't exist
    folder_name = "data_clean"
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Iterate through all files in the folder
    anime_before_clean = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            anime_before_clean.append(file_path)

    print("Done!")
    json.dump(anime_before_clean, open("anime_before_clean.json", "w"))

folder_path = '/tf/notebooks/data_clean'
create_json(folder_path)

Done!
