# Dataset Preprocessing for Embedding Evaluation

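This notebook preprocesses and filters a dataset for embedding evaluation. It is intended to be dataset-agnostic: the content, identifier, and metadata column names and the output path are read from the evaluation config file. Note that the row filters in `process_dataset` (origin and release year) are currently specific to the movie-plots dataset.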

In [53]:
import pandas as pd
import numpy as np
import yaml
from pathlib import Path
import re
from typing import Dict, List
import logging
import json

# Setup logging
# INFO level so the logger.info(...) progress messages from later cells are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [54]:
# Load configuration
config_path = Path('./config/embedding_eval.yaml')
# Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) would garble any non-ASCII characters in the YAML file.
with open(config_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo the dataset section so the run records the settings it used.
dataset_config = config['dataset']
print("Dataset configuration:")
print(yaml.dump(dataset_config, indent=2))

Dataset configuration:
cache_dir: data/cache
columns:
  content: PlotSummary
  identifier: Title
  metadata: Genre
evaluation:
  query_template: Find content similar to '{identifier}' ({metadata})
  similarity_threshold: 0.5
  test_cases_path: config/test_cases.json
path: ./data/processed_dataset.csv
sampling:
  n_queries: 100
  n_samples: 1000



In [55]:
"""Load and inspect raw data"""
# Raw input CSV -- point this at any other dataset to preprocess it instead.
# Source: https://huggingface.co/datasets/vishnupriyavr/wiki-movie-plots-with-summaries/blob/main/wiki_movie_plots_deduped_with_summaries.csv
data_path = Path("./data/wiki_movie_plots_deduped_with_summaries.csv")

logger.info(f"Loading data from {data_path}")
df = pd.read_csv(data_path)
logger.info(f"Loaded {len(df)} rows")

# One line per column: dtype and number of non-null values
print("\nDataset columns:")
for col_name, series in df.items():
    print(f"{col_name}: {series.dtype} ({series.count()} non-null values)")

# Preview the first rows
print("\nSample data:")
display(df.head())

INFO:__main__:Loading data from data/wiki_movie_plots_deduped_with_summaries.csv
INFO:__main__:Loaded 34886 rows



Dataset columns:
Release Year: int64 (34886 non-null values)
Title: object (34886 non-null values)
Origin/Ethnicity: object (34886 non-null values)
Director: object (34886 non-null values)
Cast: object (33464 non-null values)
Genre: object (34886 non-null values)
Wiki Page: object (34886 non-null values)
Plot: object (34886 non-null values)
PlotSummary: object (34886 non-null values)

Sample data:


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,PlotSummary
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",Carrie Nation and her followers burst into a s...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,The first shot is set in a wood during winter ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,The earliest known adaptation of the classic f...


## Data Quality Checks

In [57]:
def check_data_quality(df: pd.DataFrame, config: Dict) -> None:
    """Print basic quality diagnostics for the configured columns.

    Args:
        df: Frame to inspect.
        config: Dataset configuration; every value of ``config['columns']``
            must name a column of ``df``, and ``config['columns']['content']``
            names the text column whose lengths are summarized.

    Raises:
        ValueError: If any configured column is absent from ``df``.
    """
    column_map = config['columns']
    expected = list(column_map.values())

    # Fail fast when the config and the data disagree on column names
    absent = []
    for name in expected:
        if name not in df.columns:
            absent.append(name)
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    # Nulls per configured column
    nulls_per_column = df[expected].isnull().sum()
    print("\nNull counts in required columns:")
    print(nulls_per_column)

    # Character-length distribution of the content column
    content_name = column_map['content']
    char_counts = df[content_name].str.len()
    print(f"\n{content_name} length statistics:")
    print(char_counts.describe())

# Run the checks on the raw frame, before any filtering or cleaning is applied.
check_data_quality(df, dataset_config)


Null counts in required columns:
PlotSummary    0
Title          0
Genre          0
dtype: int64

PlotSummary length statistics:
count    34886.000000
mean       380.977727
std         97.721757
min        160.000000
25%        302.000000
50%        365.000000
75%        452.000000
max        697.000000
Name: PlotSummary, dtype: float64


## Data Preprocessing

In [58]:
def preprocess_text(text: str) -> str:
    """Normalize a single text value into a clean one-line string.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            missing values (as reported by ``pd.isna``) become ``""``.

    Returns:
        The text with leading/trailing whitespace removed and every run of
        whitespace (spaces, tabs, newlines, ...) collapsed to a single space.
    """
    if pd.isna(text):
        return ""

    # Convert to string if needed
    text = str(text)

    # `\s+` already matches \r and \n, so this single substitution handles
    # both repeated spaces and line breaks; a separate newline pass would
    # never find anything left to replace.
    return re.sub(r'\s+', ' ', text.strip())

def process_dataset(df: pd.DataFrame, config: Dict) -> pd.DataFrame:
    """Filter and clean the dataset according to configuration.

    Steps, in order:
      1. Keep only rows with ``Origin/Ethnicity == 'American'`` and
         ``Release Year >= 1950`` (movie-specific; can be made configurable).
      2. Clean the content column with ``preprocess_text``. If the frame has
         both ``Plot`` and ``Summary`` columns, the content column is instead
         rebuilt as their cleaned concatenation.
      3. Fill missing values in the metadata column with ``'Unknown'``.
      4. Drop rows whose content text is empty.

    Args:
        df: Raw input frame. It is not modified.
        config: Dataset configuration; ``config['columns']['content']`` and
            ``config['columns']['metadata']`` name the text and category
            columns.

    Returns:
        A new, filtered DataFrame that keeps the original index labels.

    Raises:
        KeyError: If ``Origin/Ethnicity``, ``Release Year`` or the configured
            content column is missing from ``df``.
    """
    text_col = config['columns']['content']
    category_col = config['columns']['metadata']

    logger.info("Applying dataset filters to match movie recommendation app...")

    # Apply constraints first so text cleaning only runs on surviving rows.
    # The explicit .copy() detaches the result from `df`, so the column
    # assignments below cannot touch the caller's frame and do not trigger
    # pandas' SettingWithCopyWarning.
    keep_rows = (
        (df['Origin/Ethnicity'] == 'American') &
        (df['Release Year'] >= 1950)
    )
    processed_df = df.loc[keep_rows].copy()

    # For movies dataset specifically (can be made configurable)
    if 'Plot' in processed_df.columns and 'Summary' in processed_df.columns:
        processed_df[text_col] = (
            processed_df['Plot'].fillna('').apply(preprocess_text) + ' ' +
            processed_df['Summary'].fillna('').apply(preprocess_text)
        ).str.strip()
    else:
        # Clean text
        processed_df[text_col] = processed_df[text_col].apply(preprocess_text)

    # Clean category/genre (if applicable). Done before the final row filter
    # so that every column assignment happens on the owned copy made above.
    if category_col in processed_df.columns:
        processed_df[category_col] = processed_df[category_col].fillna('Unknown')

    # Remove empty texts
    processed_df = processed_df[processed_df[text_col].str.len() > 0]

    logger.info(f"Processed dataset shape: {processed_df.shape}")
    return processed_df

# Process the dataset
# `df` (the raw frame) is left untouched; process_dataset works on a copy.
processed_df = process_dataset(df, dataset_config)
print("\nProcessed dataset sample:")
display(processed_df.head())

INFO:__main__:Applying dataset filters to match movie recommendation app...
INFO:__main__:Processed dataset shape: (12340, 9)



Processed dataset sample:


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,PlotSummary
5037,1950,8 Ball Bunny,American,Chuck Jones,Looney Tunes,animation,https://en.wikipedia.org/wiki/8_Ball_Bunny,The Brooklyn Ice Palace shuts down after the I...,"""Playboy"" Penguin is found by Bugs Bunny, who ..."
5038,1950,711 Ocean Drive,American,Joseph M. Newman,"Edmond O'Brien, Joanne Dru, Dorothy Patrick",crime drama,https://en.wikipedia.org/wiki/711_Ocean_Drive,Knowing how much telephone repairman Mal Grang...,Mal Granger (Edmond O'Brien) is a telephone re...
5039,1950,Abbott and Costello in the Foreign Legion,American,Charles Lamont,"Abbott and Costello, Patricia Medina",comedy,https://en.wikipedia.org/wiki/Abbott_and_Coste...,Bud Jones (Bud Abbott) and Lou Hotchkiss (Lou ...,Bud Jones (Bud Abbott) and Lou Hotchkiss (Lou ...
5040,1950,The Admiral Was a Lady,American,Albert S. Rogell,"Edmond O'Brien, Wanda Hendrix, Steve Brodie",comedy,https://en.wikipedia.org/wiki/The_Admiral_Was_...,"After the end of World War II, Jean Madison (W...","Jean Madison, a former WAVE ensign, meets the ..."
5041,1950,All About Eve,American,Joseph L. Mankiewicz,"Bette Davis, Anne Baxter, Gary Merrill, Celest...",drama,https://en.wikipedia.org/wiki/All_About_Eve,Margo Channing (Bette Davis) is one of the big...,Margo Channing (Bette Davis) is one of the big...


## Save Processed Dataset

In [59]:
# Save processed dataset as CSV
# (`json` is already imported in the notebook's import cell.)
processed_path = Path(dataset_config['path'])
print(processed_path)
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
processed_path.parent.mkdir(parents=True, exist_ok=True)
processed_df.to_csv(processed_path, index=False)
logger.info(f"Saved processed dataset to {processed_path}")

# Save dataset statistics
content_col = dataset_config['columns']['content']
metadata_col = dataset_config['columns']['metadata']
stats = {
    'total_rows': len(processed_df),
    'columns': processed_df.columns.tolist(),
    'content_length_stats': processed_df[content_col].str.len().describe().to_dict(),
    'metadata_distribution': processed_df[metadata_col].value_counts().to_dict()
}

# Written next to the CSV, e.g. processed_dataset.csv -> processed_dataset.stats.json
stats_path = processed_path.with_suffix('.stats.json')
with open(stats_path, 'w') as f:
    json.dump(stats, f, indent=2)

logger.info(f"Saved dataset statistics to {stats_path}")

# Display first few rows of processed data
display(processed_df.head())
display(processed_df.count())

data/processed_dataset.csv


INFO:__main__:Saved processed dataset to data/processed_dataset.csv
INFO:__main__:Saved dataset statistics to data/processed_dataset.stats.json


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,PlotSummary
5037,1950,8 Ball Bunny,American,Chuck Jones,Looney Tunes,animation,https://en.wikipedia.org/wiki/8_Ball_Bunny,The Brooklyn Ice Palace shuts down after the I...,"""Playboy"" Penguin is found by Bugs Bunny, who ..."
5038,1950,711 Ocean Drive,American,Joseph M. Newman,"Edmond O'Brien, Joanne Dru, Dorothy Patrick",crime drama,https://en.wikipedia.org/wiki/711_Ocean_Drive,Knowing how much telephone repairman Mal Grang...,Mal Granger (Edmond O'Brien) is a telephone re...
5039,1950,Abbott and Costello in the Foreign Legion,American,Charles Lamont,"Abbott and Costello, Patricia Medina",comedy,https://en.wikipedia.org/wiki/Abbott_and_Coste...,Bud Jones (Bud Abbott) and Lou Hotchkiss (Lou ...,Bud Jones (Bud Abbott) and Lou Hotchkiss (Lou ...
5040,1950,The Admiral Was a Lady,American,Albert S. Rogell,"Edmond O'Brien, Wanda Hendrix, Steve Brodie",comedy,https://en.wikipedia.org/wiki/The_Admiral_Was_...,"After the end of World War II, Jean Madison (W...","Jean Madison, a former WAVE ensign, meets the ..."
5041,1950,All About Eve,American,Joseph L. Mankiewicz,"Bette Davis, Anne Baxter, Gary Merrill, Celest...",drama,https://en.wikipedia.org/wiki/All_About_Eve,Margo Channing (Bette Davis) is one of the big...,Margo Channing (Bette Davis) is one of the big...


Release Year        12340
Title               12340
Origin/Ethnicity    12340
Director            12340
Cast                12128
Genre               12340
Wiki Page           12340
Plot                12340
PlotSummary         12340
dtype: int64