In [1]:
import pandas as pd
import numpy as np

In [19]:
# Load the FaceScrub metadata CSV (path is relative to the notebook's working
# directory) into `df`, then display up to the first 20 rows as a preview.
df = pd.read_csv("./facescrub_metadata.csv")
df.head(20)

Unnamed: 0,name,image_id,face_id,url,bbox,sha256
0,Aaron Eckhart,1,1,http://upload.wikimedia.org/wikipedia/commons/...,53177418542,dec996994cf1eec33b53c203cff0e8f25638829fa2ad71...
1,Aaron Eckhart,2,2,http://movies.dosthana.com/sites/default/files...,80102260282,f84d0c3b1b854a51e6bc031bc353e801834e81df795e85...
2,Aaron Eckhart,3,3,http://upload.wikimedia.org/wikipedia/commons/...,2038029751574,8548658ef00f2ac4c384fbfff9d3ae225b4b9e0c2aa45e...
3,Aaron Eckhart,4,4,http://25.media.tumblr.com/nJ2vga5sae9o2ks4Flt...,6290231259,658d83f35859d2f313ff660c1900427c21eae1c41e3035...
4,Aaron Eckhart,5,5,http://upload.wikimedia.org/wikipedia/commons/...,276120492336,1fa14fed3371280e8785df42cdc5d0335e7923a38f1b06...
5,Aaron Eckhart,6,6,http://media.zenfs.com/en_us/Movies/PhotoG/2nd...,235158540463,121e8bea5caae215b537dc116534bbd2fa7a9a60c19107...
6,Aaron Eckhart,7,7,http://img2.timeinc.net/people/i/2008/news/080...,6057178175,bb2e227a9420d6325b67974231d334fc4620e807543a16...
7,Aaron Eckhart,8,8,http://latimesblogs.latimes.com/photos/uncateg...,30149326445,4937ad41d13c493f6b1bd5dfd560c3b2a4bf3fdeb97315...
8,Aaron Eckhart,9,9,http://collider.com/wp-content/uploads/Aaron-E...,69773318781914,3705523152829d26aa5691a81f61f4708d10231a6c5b97...
9,Aaron Eckhart,10,10,http://movies.dosthana.com/sites/default/files...,375157571353,0ffd2e7003856122e275ec621cba760e05db0a42a7e453...


In [20]:
# Count how many rows each name has, sort those counts in ascending order
# (sort_values default), and show the first 20 entries, i.e. the names with
# the fewest rows.
df['name'].value_counts().sort_values().reset_index().head(20)

Unnamed: 0,name,count
0,Adrien Brody,10
1,Al Pacino,10
2,Alan Alda,10
3,Alan Arkin,10
4,Alan Rickman,10
5,Alec Baldwin,10
6,Alexander Skarsgård,10
7,Alfred Molina,10
8,Amaury Nolasco,10
9,Andy Garcia,10


In [26]:
# clean the name column from non ASCII characters
import unicodedata

def clean_text_unicode(text):
    """
    Strip diacritics from ``text`` and return a pure-ASCII string.

    Relies only on the standard-library ``unicodedata`` module.
    """
    # Canonical decomposition (NFD) splits an accented letter into its base
    # letter followed by separate combining marks.
    decomposed = unicodedata.normalize('NFD', text)

    # Drop the nonspacing marks (category 'Mn'), i.e. the detached accents.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    without_marks = ''.join(base_chars)

    # Anything still outside ASCII is discarded by the 'ignore' error handler.
    return without_marks.encode('ascii', 'ignore').decode('ascii')

# Apply the cleaner element-wise; the function is passed directly, no lambda wrapper needed
df['name_clean'] = df['name'].apply(clean_text_unicode)

In [29]:
# Build the image title as "<name_clean>_<image_id>".
# Vectorised string concatenation replaces the row-wise df.apply(axis=1),
# which calls a Python lambda once per row and is much slower on large frames.
df["title"] = df["name_clean"] + "_" + df["image_id"].astype(str)
df.head()

Unnamed: 0,name,image_id,face_id,url,bbox,sha256,name_clean,image_name,title
0,Aaron Eckhart,1,1,http://upload.wikimedia.org/wikipedia/commons/...,53177418542,dec996994cf1eec33b53c203cff0e8f25638829fa2ad71...,Aaron Eckhart,Aaron Eckhart_1,Aaron Eckhart_1
1,Aaron Eckhart,2,2,http://movies.dosthana.com/sites/default/files...,80102260282,f84d0c3b1b854a51e6bc031bc353e801834e81df795e85...,Aaron Eckhart,Aaron Eckhart_2,Aaron Eckhart_2
2,Aaron Eckhart,3,3,http://upload.wikimedia.org/wikipedia/commons/...,2038029751574,8548658ef00f2ac4c384fbfff9d3ae225b4b9e0c2aa45e...,Aaron Eckhart,Aaron Eckhart_3,Aaron Eckhart_3
3,Aaron Eckhart,4,4,http://25.media.tumblr.com/nJ2vga5sae9o2ks4Flt...,6290231259,658d83f35859d2f313ff660c1900427c21eae1c41e3035...,Aaron Eckhart,Aaron Eckhart_4,Aaron Eckhart_4
4,Aaron Eckhart,5,5,http://upload.wikimedia.org/wikipedia/commons/...,276120492336,1fa14fed3371280e8785df42cdc5d0335e7923a38f1b06...,Aaron Eckhart,Aaron Eckhart_5,Aaron Eckhart_5


### Download the images

In [31]:
import requests
from pathlib import Path
from tqdm import tqdm
import time

def download_images_from_csv(csv_file, output_folder='downloaded_images'):
    """
    Download every image listed in a metadata CSV into ``output_folder``.

    The CSV must contain a ``url`` column plus either

    * ``name`` and ``image_id`` columns, from which the file title is built as
      ``<ASCII-cleaned name>_<image_id>``, or
    * a ready-made ``title`` column (used only when ``name``/``image_id`` are
      absent).

    Example CSV:
    url,name,image_id
    https://example.com/image1.jpg,Some Person,1
    https://example.com/image2.png,Some Person,2

    Parameters
    ----------
    csv_file : str or path-like
        Path of the CSV file, passed to ``pd.read_csv``.
    output_folder : str or path-like, default 'downloaded_images'
        Destination directory; created (including parents) if missing.

    Returns
    -------
    None
        Per-file failures and a final summary are printed. A failed download
        is counted and skipped; it never aborts the whole run.
    """
    # Function-scope import keeps this cell self-contained (standard library).
    from urllib.parse import urlparse

    # Create output folder (parents=True so nested destinations work too)
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Read CSV
    df = pd.read_csv(csv_file)

    # Validate columns BEFORE using them, so a malformed CSV produces the
    # friendly message below instead of a KeyError.
    columns = set(df.columns)
    can_build_title = {'name', 'image_id'} <= columns
    if 'url' not in columns or not (can_build_title or 'title' in columns):
        print("Error: CSV must have a 'url' column and either a 'title' "
              "column or both 'name' and 'image_id' columns")
        return

    if can_build_title:
        # Strip non-ASCII characters from the names so they are safe filenames
        df['name_clean'] = df['name'].apply(clean_text_unicode)

        # Title used as the file stem: "<name_clean>_<image_id>"
        df['title'] = df['name_clean'] + "_" + df['image_id'].astype(str)

    successful = 0
    failed = 0

    # Download each image
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Downloading"):
        url = row['url']
        title = row['title']
        tmp_path = None

        try:
            # Take the extension from the URL *path* only, so query strings
            # ("photo.jpg?w=100") and bare host names ("example.com") do not
            # end up in the filename. Fall back to .jpg when there is none.
            ext = Path(urlparse(url).path).suffix or '.jpg'

            filepath = output_path / f"{title}{ext}"
            # Stream into a temporary sibling file first: an interrupted
            # transfer then never leaves a truncated image at the final path.
            tmp_path = filepath.with_name(filepath.name + '.part')

            # The context manager releases the streamed connection even when
            # raise_for_status() or the write loop raises.
            with requests.get(url, timeout=10, stream=True) as response:
                response.raise_for_status()

                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            tmp_path.replace(filepath)
            successful += 1

        except Exception as e:
            # Deliberately broad: this is a best-effort bulk download and one
            # bad row (dead link, NaN url, disk error) must not stop the rest.
            print(f"\nFailed to download {title}: {e}")
            failed += 1
            if tmp_path is not None and tmp_path.exists():
                tmp_path.unlink()
            continue

        # Be nice to servers
        time.sleep(0.1)

    print(f"\n{'='*60}")
    print("Download Complete!")
    print(f"{'='*60}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Output folder: {output_path.absolute()}")
    print(f"{'='*60}")

ModuleNotFoundError: No module named 'tqdm'