# Data Preprocessing
The plan is to get the data organized. Here, the goal is to ensure that I have sorted all the data and placed each image in the right folder.

## Step One
Get a full list of breeds in the folder.

In [9]:
import os

image_folder = 'images' 
breed_names = set()

# Loop through all files in the folder
for filename in os.listdir(image_folder):
    if filename.endswith(('.jpg', '.jpeg', '.png')):
        breed = filename.split('_')[0]
        breed_names.add(breed)

# Convert to sorted list
sorted_breeds = sorted(breed_names)

# Print or save
for breed in sorted_breeds:
    print(breed)

print(f"\nTotal unique breeds: {len(sorted_breeds)}")


Abyssinian
Bengal
Birman
Bombay
British
Egyptian
Maine
Persian
Ragdoll
Russian
Siamese
Sphynx
american
basset
beagle
boxer
chihuahua
english
german
great
havanese
japanese
keeshond
leonberger
miniature
newfoundland
pomeranian
pug
saint
samoyed
scottish
shiba
staffordshire
wheaten
yorkshire

Total unique breeds: 35


## Step Two
Remove the mat files in the folders which I do not wish to use. 

In [21]:
import os

folder = 'images'
for filename in os.listdir(folder):
    if filename.endswith('.mat'):
        os.remove(os.path.join(folder, filename))

print("All .mat files removed.")


All .mat files removed.


## Step Three
I used ChatGPT to sort out the breeds to know which ones are dogs and which ones are cats. 

In [11]:
cat_breeds = [
    "abyssinian",
    "bengal",
    "birman",
    "bombay",
    "british",
    "egyptian",
    "maine",
    "persian",
    "ragdoll",
    "russian",
    "scottish",
    "siamese",
    "sphynx"
]
dog_breeds = [
    "american",
    "basset",
    "beagle",
    "boxer",
    "chihuahua",
    "english",
    "german",
    "great",
    "havanese",
    "japanese",
    "keeshond",
    "leonberger",
    "miniature",
    "newfoundland",
    "pomeranian",
    "pug",
    "saint",
    "samoyed",
    "shiba",
    "staffordshire",
    "wheaten",
    "yorkshire"
]


## Step Four
Place each breed in its own category, be it cat and dog. 

In [22]:
import os
import shutil

source_folder = 'images'
destination_folder_cat = 'cat_images'
destination_folder_dog = 'dog_images'
label_cat = 'cat'
label_dog = 'dog'

# List and sort files in the source folder
files = sorted(os.listdir(source_folder))

# Filter cat and dog images
cat_files = [f for f in files if any(f.lower().startswith(breed.lower()) for breed in cat_breeds)]
dog_files = [f for f in files if any(f.lower().startswith(breed.lower()) for breed in dog_breeds)]

print(len(cat_files))
print(len(dog_files))

# Rename and copy cat files
for idx, filename in enumerate(cat_files, start=1):
    ext = os.path.splitext(filename)[1]
    new_name = f"{label_cat}_{idx}{ext}"
    src = os.path.join(source_folder, filename)
    dst = os.path.join(destination_folder_cat, new_name)
    shutil.copy(src, dst)

# Rename and copy dog files
for idx, filename in enumerate(dog_files, start=1):
    ext = os.path.splitext(filename)[1]
    new_name = f"{label_dog}_{idx}{ext}"
    src = os.path.join(source_folder, filename)
    dst = os.path.join(destination_folder_dog, new_name)
    shutil.copy(src, dst)

print("All files copied and renamed successfully.")




2599
4791
All files copied and renamed successfully.


## Step Five
Get the names of the people whose pictures I have access to. 

In [13]:
import os

lfw_path = "archive/lfw-deepfunneled/lfw-deepfunneled" 

# Get only directories
person_folders = [name for name in os.listdir(lfw_path) if os.path.isdir(os.path.join(lfw_path, name))]

output_file = "people_names.txt"
# Write to a text file
with open(output_file, "w") as f:
    for person in sorted(person_folders):
        f.write(f"{person}\n")

print(f"Saved {len(person_folders)} names to '{output_file}'")


Saved 5749 names to 'people_names.txt'


In [14]:
len(person_folders)

5749

## Step Six
Get the images of all humans in one folder. 

In [15]:
import os
import shutil
import glob


# List of folders you want to copy images from
new_folder = [lfw_path + "/" + k for k in person_folders]


# The folder where you want to copy the images
destination_folder = 'human_images'

# Iterate over each folder
for folder in new_folder:
    # Use glob to find all image files (e.g., jpg, png)
    images = glob.glob(os.path.join(folder, '*.jpg'))  # Add more extensions if necessary
    images += glob.glob(os.path.join(folder, '*.png'))  # Include PNG files
    images += glob.glob(os.path.join(folder, '*.jpeg'))
    images += glob.glob(os.path.join(folder, '*.bmp'))

    # Copy each image to the destination folder
    for image in images:
        try:
            shutil.copy(image, destination_folder)
        except Exception as e:
            print(f"Error copying {image}: {e}")


## convert bmp files

In [20]:
from PIL import Image
import os

folder = "human_images"

for file in os.listdir(folder):
    if file.endswith(".bmp"):
        img = Image.open(os.path.join(folder, file))
        new_name = file.replace(".bmp", ".jpg")
        img.convert("RGB").save(os.path.join(folder, new_name), "JPEG")
        print(f"{new_name} converted")


## Get a count of images per breed

In [16]:
import os
from collections import defaultdict

image_folder = 'images'  # Path to your image folder


# Create breed count dictionaries
cat_counts = defaultdict(int)
dog_counts = defaultdict(int)

# Loop through images
for filename in os.listdir(image_folder):
    if filename.endswith(('.jpg', '.jpeg', '.png')):
        breed = filename.split('_')[0].lower()
        if breed in cat_breeds:
            cat_counts[breed] += 1
        elif breed in dog_breeds:
            dog_counts[breed] += 1

# Print results
print("🐱 Cat Breed Counts:\n")
for breed in sorted(cat_counts):
    print(f"{breed}: {cat_counts[breed]}")

print("\n🐶 Dog Breed Counts:\n")
for breed in sorted(dog_counts):
    print(f"{breed}: {dog_counts[breed]}")

print(f"\nTotal cat images: {sum(cat_counts.values())}")
print(f"Total dog images: {sum(dog_counts.values())}")


🐱 Cat Breed Counts:

abyssinian: 200
bengal: 200
birman: 200
bombay: 200
british: 200
egyptian: 200
maine: 200
persian: 200
ragdoll: 200
russian: 200
scottish: 199
siamese: 200
sphynx: 200

🐶 Dog Breed Counts:

american: 400
basset: 200
beagle: 200
boxer: 200
chihuahua: 200
english: 400
german: 200
great: 200
havanese: 200
japanese: 200
keeshond: 200
leonberger: 200
miniature: 200
newfoundland: 200
pomeranian: 200
pug: 200
saint: 200
samoyed: 200
shiba: 200
staffordshire: 191
wheaten: 200
yorkshire: 200

Total cat images: 2599
Total dog images: 4791
