# 3&4 Combined
This should download a video and then extact images delete video and move on to the next

In [1]:
import os
import cv2
import requests
import numpy as np
import pandas as pd
from urllib.parse import urlparse

In [2]:
API_KEY = "illecible"

## Check for duplicates

In [3]:
df_results = pd.read_csv(os.path.join("..", "data", "full_video_dataset_relevant.csv"))


In [7]:
df_results.head(1)

Unnamed: 0,Title,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider,docs,Topic,Topic_Probability
0,Voetbalwedstrijd Tuschinski - Cinema Royal,bioscoopjournaals waarin nederlandse onderwerp...,https://www.openbeelden.nl/media/1005717,https://www.europeana.eu/item/2051906/data_eus...,1928-01-01,association football images association football,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,voetbalwedstrijd tuschinski - cinema royal [SE...,53,0.910389


In [None]:
# Check for duplicates based on the 'Video URL' column
duplicate_videos = df_results[df_results.duplicated(subset=['Video URL'], keep=False)]

# Display the duplicate rows
print(f"Found {len(duplicate_videos)} duplicate entries based on 'Video URL'.")


Found 0 duplicate entries based on 'Video URL'.


## START

In [10]:
# 🔹 Define Paths
VIDEO_FOLDER = os.path.join("..", "videos")
IMAGE_FOLDER = os.path.join("..", "images")
DATA_FOLDER = os.path.join("..", "data_info")
IMAGE_METADATA_FILE = os.path.join(DATA_FOLDER, "extracted_images_metadata.csv")
IMAGE_QUALITY = 70  # Compression quality (0-100, higher is better)
IMAGE_EXTENSION = ".jpg"
MAX_FRAMES_PER_VIDEO = 30  # Max images extracted per video
DARK_THRESHOLD = 50  # Pixel intensity below this is considered dark (0-255)
DARK_PERCENTAGE = 80  # % of image pixels that must be dark to be removed
WHITE_THRESHOLD = 225  # Threshold for white images
SIMILARITY_THRESHOLD = 0.8  # Similarity threshold for duplicate detection

# 🔹 Ensure folders exist
os.makedirs(VIDEO_FOLDER, exist_ok=True)
os.makedirs(IMAGE_FOLDER, exist_ok=True)
os.makedirs(DATA_FOLDER, exist_ok=True)

# 🔹 Load dataset with relevant videos
df_results = pd.read_csv(os.path.join("..", "data", "full_video_dataset_relevant.csv"))

# 🔹 Function to extract item ID from URL
def extract_item_id(europeana_url):
    path_parts = urlparse(europeana_url).path.strip("/").split("/")
    return "/".join(path_parts[-2:]) if len(path_parts) > 1 else None

# 🔹 Function to clean and format title as a safe filename
def clean_title(title):
    title = title.lower().replace(" ", "_").replace("/", "_")
    return "".join(c for c in title if c.isalnum() or c in "_-")[:100]  # Safe filename, limit length

# 🔹 Function to check if an image is mostly blank (dark or white)
def is_blank_image(image, dark_threshold=DARK_THRESHOLD, dark_percentage=DARK_PERCENTAGE, white_threshold=WHITE_THRESHOLD):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    mean_intensity = np.mean(gray)

    if mean_intensity < dark_threshold or mean_intensity > white_threshold:
        return True  # Fully black or white

    num_dark_pixels = np.sum(gray < dark_threshold)
    total_pixels = gray.size
    dark_ratio = (num_dark_pixels / total_pixels) * 100

    return dark_ratio > dark_percentage  # Remove if mostly dark

# 🔹 Function to compute image histogram for duplicate detection
def get_histogram(image):
    hist = cv2.calcHist([image], [0], None, [256], [0, 256])
    cv2.normalize(hist, hist)
    return hist

# 🔹 Function to get the full video duration
def get_video_duration(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    cap.release()
    
    if fps > 0:
        return int(frame_count / fps)  # Convert to seconds
    return None  # Return None if duration cannot be calculated

# 🔹 Modify extract_frames() to save images in a subfolder per video and track duration
def extract_frames(video_path, video_title, europeana_url, video_duration):
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_interval = fps * 10  # Every 10 seconds
    start_frame = fps * 3  # Start at 3rd second

    frame_count = 0
    saved_frames = 0
    previous_histogram = None
    extracted_images = []

    # 🔹 Create a subfolder inside the image folder for this video
    video_image_folder = os.path.join(IMAGE_FOLDER, video_title)
    os.makedirs(video_image_folder, exist_ok=True)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or saved_frames >= MAX_FRAMES_PER_VIDEO:
            break

        if frame_count >= start_frame and (frame_count - start_frame) % frame_interval == 0:
            timestamp_seconds = frame_count // fps
            frame_filename = f"{video_title}_{timestamp_seconds}{IMAGE_EXTENSION}"
            frame_path = os.path.join(video_image_folder, frame_filename)

            if is_blank_image(frame):
                frame_count += 1
                continue  

            current_histogram = get_histogram(frame)
            if previous_histogram is not None:
                similarity = cv2.compareHist(previous_histogram, current_histogram, cv2.HISTCMP_CORREL)
                if similarity > SIMILARITY_THRESHOLD:
                    frame_count += 1
                    continue  

            previous_histogram = current_histogram  
            cv2.imwrite(frame_path, frame, [cv2.IMWRITE_JPEG_QUALITY, IMAGE_QUALITY])  

            extracted_images.append({
                "Image": frame_filename,
                "Video": video_title,
                "Second": timestamp_seconds,
                "Path": frame_path,
                "Europeana URL": europeana_url,
                "Video Duration (seconds)": video_duration  # 🔹 Add duration to metadata
            })
            saved_frames += 1

        frame_count += 1

    cap.release()
    return extracted_images

# 🔹 Modify process_video() to pass the URL and get video duration
def process_video(europeana_url, title):
    item_id = extract_item_id(europeana_url)
    if not item_id:
        print(f"❌ Invalid URL format: {europeana_url}")
        return []

    metadata_url = f"https://api.europeana.eu/record/v2/{item_id}.json"
    response = requests.get(metadata_url, params={"wskey": API_KEY})

    if response.status_code == 200:
        data = response.json()
        aggregation = data.get("object", {}).get("aggregations", [{}])[0]
        video_url = aggregation.get("edmIsShownBy", "Unknown")

        if video_url != "Unknown":
            video_filename = os.path.join(VIDEO_FOLDER, f"{clean_title(title)}.mp4")
            print(f"⬇️ Downloading: {video_url} -> {video_filename}")

            video_response = requests.get(video_url, stream=True)
            if video_response.status_code == 200:
                with open(video_filename, "wb") as video_file:
                    for chunk in video_response.iter_content(chunk_size=1024):
                        video_file.write(chunk)
                print(f"✅ Downloaded: {video_filename}")

                # 🔹 Get video duration
                video_duration = get_video_duration(video_filename)

                # 🔹 Extract images & pass the original URL and duration
                extracted_images = extract_frames(video_filename, clean_title(title), europeana_url, video_duration)

                # 🔹 Delete video after processing
                os.remove(video_filename)
                print(f"🗑️ Deleted video: {video_filename}")

                return extracted_images
            else:
                print(f"❌ Failed to download video, Status Code: {video_response.status_code}")
        else:
            print(f"⚠️ No video URL found for {europeana_url}")
    else:
        print(f"❌ Failed to fetch item {item_id} from Europeana API, Status Code: {response.status_code}")

    return []

# 🔹 Process multiple videos in a loop
def process_all_videos(df):
    image_data = []
    processed_videos = 0

    for _, row in df.iterrows():
        extracted_images = process_video(row["Europeana URL"], row["Title"])
        image_data.extend(extracted_images)
        processed_videos += 1

        # Print progress every 100 videos
        if processed_videos % 100 == 0:
            print(f"✅ Processed {processed_videos} videos...")

    # 🔹 Save extracted image metadata
    df_images = pd.DataFrame(image_data)
    df_images.to_csv(IMAGE_METADATA_FILE, index=False)
    print(f"✅ Saved metadata for {len(df_images)} images to '{IMAGE_METADATA_FILE}'.")




In [None]:
# 🔹 Run the full pipeline
process_all_videos(df_results)
print(f"✅ All videos processed and images extracted.")

## 2nd Go

In [9]:
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', None)

In [6]:
df_results.head(1)

Unnamed: 0,Title,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider,docs,Topic,Topic_Probability
0,Voetbalwedstrijd Tuschinski - Cinema Royal,bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd,https://www.openbeelden.nl/media/1005717,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005717?utm_source=api&utm_medium=api&utm_campaign=illecible,1928-01-01,association football images association football,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,voetbalwedstrijd tuschinski - cinema royal [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] association football images association football [SEP] kingdom netherlands netherlands eurozone,53,0.910389


In [8]:
# 🔹 Define Paths
DATA_FOLDER = os.path.join("..", "data")
VIDEO_FOLDER = os.path.join("..", "videos")
IMAGE_FOLDER = os.path.join("..", "images")
IMAGE_METADATA_FILE = os.path.join("..", "data_info", "extracted_images_metadata.csv")

# 🔹 Load dataset
file_path = os.path.join(DATA_FOLDER, "full_video_dataset_relevant.csv")
if not os.path.exists(file_path):
    print(f"❌ File not found: {file_path}")
    # exit()

df_results = pd.read_csv(file_path)

# 🔹 Normalize title column to lowercase for case-insensitive search
df_results['Title_lower'] = df_results['Title'].str.lower()

# 🔹 Find the last processed title (case-insensitive)
last_processed_title = "motorwedstrijden op de grasbaan"
index_match = df_results[df_results['Title_lower'] == last_processed_title.lower()].index

if not index_match.empty:
    last_index = index_match[0]  # Get the first matching index
    print(f"✅ Found '{last_processed_title}' at index {last_index}")
else:
    print(f"❌ Title '{last_processed_title}' not found in dataset.")
    #exit()


✅ Found 'motorwedstrijden op de grasbaan' at index 2292


In [None]:
# 🔹 Process remaining videos
df_remaining = df_results.iloc[last_index + 1:]  # Select remaining rows

# Function to process the videos
def process_all_videos(df):
    for _, row in df.iterrows():
        extracted_images = process_video(row["Europeana URL"], row["Title"])

# Run the remaining processing
process_all_videos(df_remaining)
print("✅ All remaining videos processed.")

# Data 

In [4]:
df_results.head(1)

Unnamed: 0,Title,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider,docs,Topic,Topic_Probability
0,Voetbalwedstrijd Tuschinski - Cinema Royal,bioscoopjournaals waarin nederlandse onderwerp...,https://www.openbeelden.nl/media/1005717,https://www.europeana.eu/item/2051906/data_eus...,1928-01-01,association football images association football,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,voetbalwedstrijd tuschinski - cinema royal [SE...,53,0.910389


In [5]:
len(df_results)

3593

In [6]:
import pandas as pd
import os

IMAGE_METADATA_FILE = os.path.join("..", "data_info", "extracted_images_metadata.csv")

# Read the CSV file as a DataFrame
df_image = pd.read_csv(IMAGE_METADATA_FILE)


In [10]:
df_image.head(1)

Unnamed: 0,Image,Video,Second,Path
0,2051906_data_euscreenXL_https___www_openbeelden_nl_media_1005287_3.jpg,2051906_data_euscreenXL_https___www_openbeelden_nl_media_1005287,3,../data_info/frames\2051906_data_euscreenXL_https___www_openbeelden_nl_media_1005287\2051906_data_euscreenXL_https___www_openbeelden_nl_media_1005287_3.jpg


In [8]:
len(df_image)

1004

In [21]:
# Define the base directory
IMAGE_BASE_DIR = os.path.join("..", "images")

# Initialize an empty list to store metadata
image_metadata = []

# Iterate over all folders in the images directory
for video_title in os.listdir(IMAGE_BASE_DIR):
    video_path = os.path.join(IMAGE_BASE_DIR, video_title)
    
    # Check if it's a directory (a video folder)
    if os.path.isdir(video_path):
        # Iterate over images in the video folder
        for image_name in os.listdir(video_path):
            if image_name.lower().endswith((".jpeg", ".jpg", ".png")):  # Ensure it's a JPEG image
                relative_path = os.path.join("..", "images", video_title, image_name)
                
                # Append metadata to the list
                image_metadata.append({
                    "Video": video_title,
                    "Image": image_name,
                    "Path": relative_path
                })

# Convert to DataFrame
df_image = pd.DataFrame(image_metadata)

# Save to CSV
output_file = os.path.join("..", "data", "extracted_images_metadata.csv")
df_image.to_csv(output_file, index=False, encoding="utf-8")



In [25]:
df_image.head(5)

Unnamed: 0,Video,Image,Path
0,10-jarig_bestaan_bio-vacantie_oord,10-jarig_bestaan_bio-vacantie_oord_103.jpg,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_103.jpg
1,10-jarig_bestaan_bio-vacantie_oord,10-jarig_bestaan_bio-vacantie_oord_113.jpg,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_113.jpg
2,10-jarig_bestaan_bio-vacantie_oord,10-jarig_bestaan_bio-vacantie_oord_63.jpg,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_63.jpg
3,10-jarig_bestaan_bio-vacantie_oord,10-jarig_bestaan_bio-vacantie_oord_93.jpg,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_93.jpg
4,10-jarig_bestaan_van_de_school_voor_reserveofficieren_der_infanterie,10-jarig_bestaan_van_de_school_voor_reserveofficieren_der_infanterie_113.jpg,..\images\10-jarig_bestaan_van_de_school_voor_reserveofficieren_der_infanterie\10-jarig_bestaan_van_de_school_voor_reserveofficieren_der_infanterie_113.jpg


In [23]:
len(df_image)

19410

In [24]:
len(df_image["Video"].value_counts())

3166

Windows: 19.410 Files, 3.392 Folders, so some folders ( name after videos) have the same name somehow?

## Clean Data

In [26]:
# Assuming df_image is already loaded
# Reorder columns: Image, Path, Video
df_image = df_image[['Image', 'Path', 'Video']]

# Function to extract the second timestamp
def extract_time(image_name):
    # Remove file extension
    image_name = os.path.splitext(image_name)[0]
    # Extract the last numeric part after the last '_'
    timestamp = image_name.split("_")[-1]
    # Convert to integer (fallback to 0 if error)
    return int(timestamp) if timestamp.isdigit() else 0

# Apply extraction function
df_image["Time"] = df_image["Image"].apply(extract_time)

# Convert time to mm:ss format
df_image["Time_Formatted"] = df_image["Time"].apply(lambda x: f"{x // 60}:{x % 60:02}")

In [None]:
# Remove file extension from Image column
df_image["Image"] = df_image["Image"].apply(lambda x: os.path.splitext(x)[0])

# Save the updated DataFrame
output_file = os.path.join("..", "data", "extracted_images_metadata.csv")
df_image.to_csv(output_file, index=False, encoding="utf-8")


In [30]:
# Replace underscores with spaces in Image and Video columns
df_image["Image"] = df_image["Image"].str.replace("_", " ", regex=False)
df_image["Video"] = df_image["Video"].str.replace("_", " ", regex=False)

# Save the updated DataFrame
output_file = os.path.join("..", "data_info", "updated_extracted_images_metadata_cleaned.csv")
df_image.to_csv(output_file, index=False, encoding="utf-8")

In [31]:
df_image.head(4)

Unnamed: 0,Image,Path,Video,Time,Time_Formatted
0,10-jarig bestaan bio-vacantie oord 103,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_103.jpg,10-jarig bestaan bio-vacantie oord,103,1:43
1,10-jarig bestaan bio-vacantie oord 113,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_113.jpg,10-jarig bestaan bio-vacantie oord,113,1:53
2,10-jarig bestaan bio-vacantie oord 63,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_63.jpg,10-jarig bestaan bio-vacantie oord,63,1:03
3,10-jarig bestaan bio-vacantie oord 93,..\images\10-jarig_bestaan_bio-vacantie_oord\10-jarig_bestaan_bio-vacantie_oord_93.jpg,10-jarig bestaan bio-vacantie oord,93,1:33


In [32]:
# Save the updated DataFrame
output_file = os.path.join("..", "data", "cleaned_extracted_images_metadata.csv")
df_image.to_csv(output_file, index=False, encoding="utf-8")

In [34]:
# Ensure Title column exists before proceeding
if "Title" in df_results.columns:
    # Create a lowercase version of Title
    df_results["Title_Lower"] = df_results["Title"].str.lower()

    # Reorder columns to place Title_Lower second
    cols = df_results.columns.tolist()
    cols.insert(1, cols.pop(cols.index("Title_Lower")))  # Move Title_Lower to second position
    df_results = df_results[cols]


In [35]:
df_results.head(1)

Unnamed: 0,Title,Title_Lower,Description,Video URL,Europeana URL,Date,Subjects,Places,Provider,docs,Topic,Topic_Probability
0,Voetbalwedstrijd Tuschinski - Cinema Royal,voetbalwedstrijd tuschinski - cinema royal,bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd,https://www.openbeelden.nl/media/1005717,https://www.europeana.eu/item/2051906/data_euscreenXL_https___www_openbeelden_nl_media_1005717?utm_source=api&utm_medium=api&utm_campaign=illecible,1928-01-01,association football images association football,kingdom netherlands netherlands eurozone,Netherlands Institute for Sound & Vision,voetbalwedstrijd tuschinski - cinema royal [SEP] bioscoopjournaals waarin nederlandse onderwerpen bepaalde gepresenteerd [SEP] association football images association football [SEP] kingdom netherlands netherlands eurozone,53,0.910389


In [57]:
import re

# Function to clean text
def clean_text(text):
    if isinstance(text, str):  # Ensure it's a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove non-Latin characters
        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
        return text
    return text  # Return original if not string

# Apply cleaning to both columns
df_image["Video"] = df_image["Video"].apply(clean_text)
df_results["Title_Lower"] = df_results["Title_Lower"].apply(clean_text)

In [67]:
# Save the updated DataFrame
output_file = os.path.join("..", "data", "extracted_images_metadata_180225.csv")
df_image.to_csv(output_file, index=False, encoding="utf-8")

In [68]:
# Save the updated DataFrame
output_file = os.path.join("..", "data", "df_results_cleaned_180225.csv")
df_results.to_csv(output_file, index=False, encoding="utf-8")

## What is missing

In [58]:
# Convert to sets for comparison
titles_in_results = set(df_results["Title_Lower"].unique())
videos_in_images = set(df_image["Video"].unique())

# Find differences
missing_in_images = titles_in_results - videos_in_images  # Present in df_results but not in df_image
missing_in_results = videos_in_images - titles_in_results  # Present in df_image but not in df_results


In [59]:
len(missing_in_images)

230

In [60]:
len(missing_in_results)

4

In [61]:
# Display the first 120 elements of each set
print("First 10 values missing in df_image (from df_results):")
print(list(missing_in_images)[:10])

print("\nFirst 10 values missing in df_results (from df_image):")
print(list(missing_in_results)[:10])


First 10 values missing in df_image (from df_results):
['het bokbier wordt binnengehaald', 'nieuws uit indonesie autodumps in batavia nederlandse rode kruis hulp in pangkalpinang tinbewerking op bangka', 'dutch east indies footage by willy mullens part 1', 'dutch east indies miscellaneous', 'begrafenis professor treub', 'biljartwedstrijd om het kampioenschap van europa', 'concert door bond amsterdam dilettanten muziek vereniging', 'handboogwedstrijd nederlandbelgi', 'bloemenfeest van het genootschap nederlanditali', 'om het wereldkampioenschap biljarten cadre']

First 10 values missing in df_results (from df_image):
['pembasmian malaria 2 3', 'pembasmian malaria 3 3', 'pembasmian malaria 1 3', 'de fransman galmiche europeesch kampioen biljarten kader 47 2']


In [65]:
unique_video_count = df_image["Video"].nunique()
print(f"Number of unique values in df_image['Video']: {unique_video_count}")
# This is our final video count

Number of unique values in df_image['Video']: 3165


In [None]:
x = str(len(df_image))
print(f"Number of images in df_image: {x}")
# Final image count

Number of images in df_image: 19410
