In [1]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm

# Configuration
# Expected layout on disk: <DATASET_ROOT>/<DATASET_NAME>/{LABELS_FILE, IMAGE_DIR/}
DATASET_ROOT = '/gpfs/workdir/restrepoda/datasets'
DATASET_NAME = 'fakeddit'
IMAGE_DIR = 'images'
# labels.csv is assumed because it is the default used by vlm_embeddings.py
LABELS_FILE = 'labels.csv'

# Resolve the concrete paths that the remaining cells rely on.
dataset_path = os.path.join(DATASET_ROOT, DATASET_NAME)
labels_path, images_path = (
    os.path.join(dataset_path, leaf) for leaf in (LABELS_FILE, IMAGE_DIR)
)

# Echo the resolved locations so a wrong root is obvious before any I/O happens.
print(f"Checking dataset at: {dataset_path}")
print(f"Labels file: {labels_path}")
print(f"Images directory: {images_path}")

Checking dataset at: /gpfs/workdir/restrepoda/datasets/fakeddit
Labels file: /gpfs/workdir/restrepoda/datasets/fakeddit/labels.csv
Images directory: /gpfs/workdir/restrepoda/datasets/fakeddit/images


In [2]:
# Load the labels table into `df`.
# The file is parsed as comma-separated first; if pandas cannot tokenize it
# that way, it is retried as tab-separated (common for fakeddit).
if os.path.exists(labels_path):
    try:
        df = pd.read_csv(labels_path)
        print(f"Loaded labels.csv with {len(df)} rows.")
    except pd.errors.ParserError:
        # Only a parsing failure justifies the TSV retry. Anything else
        # (permissions, interrupts, ...) should surface instead of being
        # silently swallowed by a bare `except:`.
        print("Failed to read as standard CSV, trying TSV...")
        df = pd.read_csv(labels_path, sep='\t')
        print(f"Loaded labels as TSV with {len(df)} rows.")
else:
    print(f"Error: {labels_path} does not exist.")
    # List files in the dataset directory to help debug, but only if that
    # directory exists -- os.listdir would otherwise raise and hide the
    # message printed above.
    if os.path.isdir(dataset_path):
        print("Files in dataset directory:", os.listdir(dataset_path))
    else:
        print(f"Dataset directory {dataset_path} does not exist either.")

Loaded labels.csv with 91043 rows.


In [3]:
# Inspect dataframe: preview the first rows of the loaded labels table to
# confirm the column layout before running the image check.
df.head()

Unnamed: 0,id,text,split,2_way_label,3_way_label,6_way_label
0,6d50rl,major thermos,train,0,2,2
1,86byl8,rabbi meat from cloned pig could be kosher for...,train,1,0,0
2,bl11vl,this car has a door bolt,train,1,0,0
3,61uy4u,el chapo escapes altiplano prison th,train,0,2,2
4,63r5xj,comfort cases former foster child helps other ...,train,1,0,0


In [4]:
from joblib import Parallel, delayed
import multiprocessing

def verify_single_image(idx, img_id, image_dir_path):
    """Check that the image belonging to one label row exists and can be decoded.

    The file name is ``img_id`` itself when it already ends in .jpg/.jpeg/.png
    (case-insensitive); otherwise ``.jpg`` is appended. If that file is absent,
    the bare ``img_id`` is tried as a file name before giving up.

    Args:
        idx: Row identifier, passed through unchanged into the result.
        img_id: Image identifier (string), with or without a file extension.
        image_dir_path: Directory that contains the image files.

    Returns:
        One of the following tuples (note the differing lengths):
            ('valid', idx, img_id, full_path)
            ('missing', idx, img_id, full_path)
            ('corrupted', idx, img_id, full_path, error_message)
    """
    known_suffixes = ('.jpg', '.jpeg', '.png')
    has_suffix = img_id.lower().endswith(known_suffixes)
    # Default to .jpg when the ID carries no extension (e.g. d1vwukd -> d1vwukd.jpg)
    candidate_name = img_id if has_suffix else img_id + '.jpg'
    full_path = os.path.join(image_dir_path, candidate_name)

    if not os.path.exists(full_path):
        # Fall back to the ID exactly as given before declaring the file missing.
        bare_path = os.path.join(image_dir_path, img_id)
        if not os.path.exists(bare_path):
            return ('missing', idx, img_id, full_path)
        full_path = bare_path

    try:
        with Image.open(full_path) as img:
            # Converting triggers decoding of the pixel data, so a file that
            # cannot be read as an image raises here.
            img.convert("RGB")
    except Exception as e:
        return ('corrupted', idx, img_id, full_path, str(e))
    return ('valid', idx, img_id, full_path)

def check_images_parallel(df, image_col='id', image_dir_path='', n_jobs=None):
    """Verify every image referenced by ``df[image_col]`` using parallel workers.

    Args:
        df: DataFrame holding one image identifier per row.
        image_col: Name of the column containing the image identifiers.
        image_dir_path: Directory that contains the image files.
        n_jobs: Number of joblib workers. ``None`` (default) uses
            ``multiprocessing.cpu_count()``, matching the previous behaviour;
            pass a smaller number to avoid saturating a shared machine.

    Returns:
        Tuple ``(missing_files, corrupted_files, valid_count)`` where
        ``missing_files`` is a list of ``(idx, img_id, full_path)``,
        ``corrupted_files`` is a list of ``(idx, img_id, full_path, error)``
        and ``valid_count`` is the number of images that decoded successfully.
    """
    # Iterate over the single column instead of df.iterrows(): iterrows builds
    # a Series per row (slow) and may upcast values to a common dtype, which
    # could change what str() produces for the identifier.
    tasks = [
        (idx, str(img_id), image_dir_path)
        for idx, img_id in df[image_col].items()
    ]

    if n_jobs is None:
        # Use all available cores
        n_jobs = multiprocessing.cpu_count()
    print(f"Running on {n_jobs} cores...")

    # tqdm wraps the task iterator, so the bar advances as joblib consumes
    # tasks from it.
    results = Parallel(n_jobs=n_jobs)(
        delayed(verify_single_image)(idx, img_id, path)
        for idx, img_id, path in tqdm(tasks, total=len(tasks))
    )

    missing_files = []
    corrupted_files = []
    valid_count = 0

    # Aggregate results; the leading status tag is stripped from the stored tuples.
    for res in results:
        status = res[0]
        if status == 'valid':
            valid_count += 1
        elif status == 'missing':
            missing_files.append(res[1:])  # (idx, img_id, full_path)
        elif status == 'corrupted':
            corrupted_files.append(res[1:])  # (idx, img_id, full_path, error)

    return missing_files, corrupted_files, valid_count

# Run check
# Ensure we use the correct column name for Fakeddit (usually 'id' or 'clean_url' which maps to image)
# generate_embeddings_all.py config says: 'image_col': 'id'
IMAGE_COL = 'id'

# Writing the list of bad IDs to disk is opt-in. It was previously disabled
# while the cell still printed a "Saved ..." message, which was misleading.
SAVE_BAD_IDS = False
BAD_IDS_FILE = 'fakeddit_bad_images.csv'

# Always defined, so the filtering cell further down also works when the
# check finds no problematic images.
bad_ids = []

if 'df' in locals():
    print("Starting parallel check...")
    missing, corrupted, valid = check_images_parallel(df, image_col=IMAGE_COL, image_dir_path=images_path)

    print("\nCheck Complete.")
    print(f"Valid images: {valid}")
    print(f"Missing images: {len(missing)}")
    print(f"Corrupted images: {len(corrupted)}")

    if len(missing) > 0:
        print("\nSample missing:", missing[:5])

    if len(corrupted) > 0:
        print("\nSample corrupted:", corrupted[:5])

    # Element 1 of every result tuple is the image ID (element 0 is the row index).
    bad_ids = [x[1] for x in missing] + [x[1] for x in corrupted]

    if bad_ids:
        print(f"\nTotal problematic rows: {len(bad_ids)}")

        # Optionally save the list of IDs to remove
        if SAVE_BAD_IDS:
            pd.DataFrame({'id': bad_ids}).to_csv(BAD_IDS_FILE, index=False)
            print(f"Saved bad image IDs to {BAD_IDS_FILE}")
else:
    print("Dataframe not loaded, cannot check.")

Starting parallel check...
Running on 40 cores...


100%|██████████| 91043/91043 [17:11<00:00, 88.27it/s] 



Check Complete.
Valid images: 90992
Missing images: 0
Corrupted images: 51

Sample corrupted: [(1090, 'd1vwukd', '/gpfs/workdir/restrepoda/datasets/fakeddit/images/d1vwukd.jpg', "cannot identify image file '/gpfs/workdir/restrepoda/datasets/fakeddit/images/d1vwukd.jpg'"), (4870, 'c8fjng3', '/gpfs/workdir/restrepoda/datasets/fakeddit/images/c8fjng3.jpg', "cannot identify image file '/gpfs/workdir/restrepoda/datasets/fakeddit/images/c8fjng3.jpg'"), (7242, 'd1ulk01', '/gpfs/workdir/restrepoda/datasets/fakeddit/images/d1ulk01.jpg', "cannot identify image file '/gpfs/workdir/restrepoda/datasets/fakeddit/images/d1ulk01.jpg'"), (10250, 'c89m77i', '/gpfs/workdir/restrepoda/datasets/fakeddit/images/c89m77i.jpg', "cannot identify image file '/gpfs/workdir/restrepoda/datasets/fakeddit/images/c89m77i.jpg'"), (11949, 'de62ivj', '/gpfs/workdir/restrepoda/datasets/fakeddit/images/de62ivj.jpg', "cannot identify image file '/gpfs/workdir/restrepoda/datasets/fakeddit/images/de62ivj.jpg'")]

Total probl

### Filter labels.csv to exclude rows whose images are missing or corrupted

In [18]:
# Drop every row whose image was reported as missing or corrupted.
# `bad_ids` holds stringified IDs (check_images_parallel applies str() to each
# value), so the column is stringified the same way before matching; comparing
# the raw column would silently match nothing if the IDs were not strings.
keep_mask = ~df[IMAGE_COL].map(str).isin(bad_ids)

# NOTE: this overwrites the labels file at `labels_path` in place.
df[keep_mask].to_csv(labels_path, index=False)

In [23]:
# Re-read the labels file that was just rewritten and display it, as a sanity
# check that the filtered table was saved.
pd.read_csv(labels_path)

Unnamed: 0,id,text,split,2_way_label,3_way_label,6_way_label
0,6d50rl,major thermos,train,0,2,2
1,86byl8,rabbi meat from cloned pig could be kosher for...,train,1,0,0
2,bl11vl,this car has a door bolt,train,1,0,0
3,61uy4u,el chapo escapes altiplano prison th,train,0,2,2
4,63r5xj,comfort cases former foster child helps other ...,train,1,0,0
...,...,...,...,...,...,...
90987,5v3mp5,british muslims donate blood under imam hussai...,test,1,0,0
90988,40n81r,we did it for ransom in nazi to criticize stalin,test,0,2,3
90989,1g6fp7,microsoft exec offline gamers should stick wit...,test,1,0,0
90990,1p9r8t,the air condition in my bus looked slightly te...,test,0,2,2
