In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from cts_recommender.io import readers
from cts_recommender.settings import get_settings
from cts_recommender.features.whatson_csv_schema import ORIGINAL_WHATSON_COLUMNS, WHATSON_RENAME_MAP
import pandas as pd
import numpy as np
import re

cfg = get_settings()

# Load data
whatson_df = readers.read_csv(cfg.raw_dir / "original_raw_whatson.csv", sep='\t')
print(f"Loaded {len(whatson_df)} records")
print("\nOriginal columns:")
print(whatson_df.columns.tolist())

Loaded 17336 records

Original columns:
['Collection', 'Titre', 'Titre Original', 'Durée', 'Description courte', 'Code contenu', 'Régions de Production', 'Année de Production', 'Contrôle Parental', 'Départment', 'Réalisation', 'Acteurs/Actrices', 'Début des droits TV', 'Fin des droits TV', 'Nb de diffusions total', 'Nb de diffusions consommées', 'Nb de diffusions disponibles', 'Nb diff. RTS1/RTS2', 'Date de 1ère diffusion', 'Date rediffusion 1', 'Date rediffusion 2', 'Date rediffusion 3', 'Date rediffusion 4', 'Date dernière diff', 'Dernière diff Rating', 'Dernière diff Rating+7', 'Nb Droits TV', 'Nb Droits TV valides', 'Référence Externe']


In [3]:
# Validate original columns
assert whatson_df.columns.tolist() == ORIGINAL_WHATSON_COLUMNS, "Column mismatch!"
print("✓ Columns validated")

✓ Columns validated


In [4]:
# Rename columns immediately
whatson_df = whatson_df.rename(columns=WHATSON_RENAME_MAP)
print("✓ Columns renamed to:")
print(whatson_df.columns.tolist())

✓ Columns renamed to:
['collection', 'title', 'original_title', 'duration', 'short_description', 'class_key', 'production_regions', 'production_year', 'parental_control', 'department', 'director', 'actors', 'tv_rights_start', 'tv_rights_end', 'total_broadcasts', 'consumed_broadcasts', 'available_broadcasts', 'rts1_rts2_broadcasts', 'first_broadcast_date', 'rebroadcast_date_1', 'rebroadcast_date_2', 'rebroadcast_date_3', 'rebroadcast_date_4', 'last_broadcast_date', 'last_broadcast_rating', 'last_broadcast_rating_plus_7', 'tv_rights_count', 'valid_tv_rights_count', 'external_reference']


In [5]:
title_counts = whatson_df['title'].value_counts()
duplicates = title_counts[title_counts > 1]
duplicates

title
Episode 1                                  40
Episode 4                                  37
Episode 3                                  37
Episode 2                                  36
Top models                                 34
                                           ..
Les géants du ciel                          2
La fabuleuse histoire de la warner bros     2
Jude                                        2
Princesse caraboo                           2
Vingt mille lieues sous les mers            2
Name: count, Length: 811, dtype: int64

## Data Analysis: Understanding Episode Patterns

In [6]:
# Analyze title patterns to identify episode indicators
print("=== ANALYZING TITLE PATTERNS ===\n")

# Check for numeric patterns in title
numeric_pattern = whatson_df['title'].str.contains(r'\d+', na=False)
print(f"Titles containing numbers: {numeric_pattern.sum()}")
print("Sample:", whatson_df[numeric_pattern]['title'].head(20).tolist())

print("\n" + "="*50 + "\n")

# Check for common episode indicators
episode_patterns = {
    'ép.': r'ép\.',
    'episode': r'épisode|episode',
    'ep.': r'\bep\.',
    'numbers only': r'^\d+$',
    'E followed by number': r'\bE\d+',
    'S followed by number': r'\bS\d+'
}

for name, pattern in episode_patterns.items():
    matches = whatson_df['title'].str.contains(pattern, case=False, na=False, regex=True)
    if matches.sum() > 0:
        print(f"\n{name.upper()} pattern ({pattern}): {matches.sum()} matches")
        print("Samples:", whatson_df[matches]['title'].head(5).tolist())

=== ANALYZING TITLE PATTERNS ===

Titles containing numbers: 1594
Sample: ['ép.1', 'ép. 2', 'ép. 3', 'ép. 4', 'ép. 5', 'ép. 6', 'Le piège (1/2)', 'Le piège (2/2)', 'Les 1001 nuits ', '30000 pieds ', 'Box office - 2018-06-11', 'Box office - 2018-06-04', 'Box office - 2002-11-11', 'Intro 1/intro 2', 'Presentation de sauvez willy 2', 'Episode 1', 'Episode 2', 'Episode 3', 'Episode 4', 'Episode 1']



ÉP. pattern (ép\.): 10 matches
Samples: ['ép.1', 'ép. 2', 'ép. 3', 'ép. 4', 'ép. 5']

EPISODE pattern (épisode|episode): 285 matches
Samples: ['Episode 1', 'Episode 2', 'Episode 3', 'Episode 4', 'Episode 1']

EP. pattern (\bep\.): 36 matches
Samples: ['Ep. 1', 'Ep. 2', 'Ep. 3', 'Ep. 4', 'Ep. 5']

NUMBERS ONLY pattern (^\d+$): 36 matches
Samples: ['2', '1', '1', '2', '3']


In [7]:
# Check when collection and title differ - often indicates episodes
print("=== COLLECTION vs TITLE ANALYSIS ===\n")

diff_mask = (whatson_df['collection'] != whatson_df['title']) & \
            whatson_df['collection'].notna() & \
            whatson_df['title'].notna()

print(f"Records where collection ≠ title: {diff_mask.sum()}")
print("\nSample cases:")
whatson_df[diff_mask][['collection', 'title', 'original_title', 'class_key']].head(20)

=== COLLECTION vs TITLE ANALYSIS ===

Records where collection ≠ title: 17222

Sample cases:


Unnamed: 0,collection,title,original_title,class_key
0,24 heures pour survivre,L'ombre du soir,Ex : un samedi pas comme les autres,72 - Téléfilms
1,24 heures pour survivre,Celle qui n'existait plus,(Ex :Thandi),72 - Téléfilms
2,24 heures pour survivre,L'inconnue de Belfast,,72 - Téléfilms
3,24 heures pour survivre,"Un taxi, la nuit",ex : L'aigle de la nuit,72 - Téléfilms
4,A l'est d'éden,A l'est d'eden,East of Eden,71 - Films de cinéma
5,À l'instinct,En eaux profondes,,72 - Téléfilms
6,À l'instinct,La mort en marche,,72 - Téléfilms
8,Addict,ép.1,,72 - Téléfilms
9,Addict,ép. 2,,72 - Téléfilms
10,Addict,ép. 3,,72 - Téléfilms


In [8]:
# Analyze collection patterns to identify film collections
print("=== COLLECTION PATTERN ANALYSIS ===\n")

collection_counts = whatson_df['collection'].value_counts()
print(f"Total unique collections: {len(collection_counts)}")
print(f"\nTop 50 collections:")
print(collection_counts.head(20))

# Look for patterns that suggest film collections
print("\n\n=== POTENTIAL FILM COLLECTION PATTERNS ===\n")

# Search for keywords that suggest film collections
film_keywords = ['film', 'cinéma', 'cinema', 'fiction', 'jeunesse', 'téléfilm', 
                 'classique', 'comédie', 'action', 'émotion', 'minuit', 'écran', 'box']

potential_film_collections = []
for collection in collection_counts.index:
    if pd.notna(collection):
        collection_lower = str(collection).lower()
        for keyword in film_keywords:
            if keyword in collection_lower:
                potential_film_collections.append(collection)
                break

print(f"Collections containing film-related keywords ({len(potential_film_collections)}):")
for col in sorted(potential_film_collections):
    count = collection_counts[col]
    print(f"  - {col}: {count} records")

=== COLLECTION PATTERN ANALYSIS ===

Total unique collections: 321

Top 50 collections:
collection
Film                                     8003
Téléfilm                                 3615
Fiction\Achats                           3522
Fiction                                   166
Camping Paradis                           100
Rosamunde Pilcher                          78
Clem                                       52
Les petits meurtres d'Agatha Christie      45
Capitaine Marleau                          39
Top Models                                 38
Alex Hugo                                  35
Cassandre                                  34
Crimes parfaits                            32
Inga Lindström                             32
Cinéma                                     29
La stagiaire                               29
Une famille formidable                     27
Court métrage                              25
Nocturne                                   25
Jeunesse\Achats            

In [9]:
# Analyze duration field and collection=title cases
print("=== DURATION ANALYSIS ===\n")

# Check duration format
print("Sample durations:")
print(whatson_df['duration'].head(20).tolist())

# Find rows where collection == title
same_collection_title = whatson_df[whatson_df['collection'] == whatson_df['title']].copy()
print(f"\n\nRows where collection == title: {len(same_collection_title)}")
print("\nSample:")
display(same_collection_title[['collection', 'title', 'duration', 'original_title', 'class_key']].head(20))

# Analyze duration format to understand how to parse it
print("\n\nDuration value types:")
print(whatson_df['duration'].apply(type).value_counts())
print("\nUnique duration patterns (first 30):")
print(whatson_df['duration'].unique()[:30])

=== DURATION ANALYSIS ===

Sample durations:
['01:31:52', '01:31:35', '01:26:57', '01:29:32', '01:49:49', '01:31:37', '01:32:23', '01:30:00', '00:46:27', '00:53:23', '00:50:01', '00:45:58', '00:47:12', '00:46:04', '01:30:09', '01:23:51', '01:27:49', '01:29:49', '01:31:29', '01:29:49']


Rows where collection == title: 114

Sample:


Unnamed: 0,collection,title,duration,original_title,class_key
7,À l'instinct,À l'instinct,01:30:00,,72 - Téléfilms
59,Alice au pays des merveilles,Alice au pays des merveilles,01:12:02,,71 - Films de cinéma
104,Beethoven,Beethoven,01:23:10,Beethoven,71 - Films de cinéma
112,Bibifoc,Bibifoc,00:00:00,,71 - Films de cinéma
120,Box office,Box office,01:20:00,,71 - Films de cinéma
121,Box office,Box office,00:00:00,,71 - Films de cinéma
270,Cap danger,Cap danger,01:26:22,,71 - Films de cinéma
383,Christine,Christine,00:00:00,,71 - Films de cinéma
439,Clem,Clem,01:35:51,,72 - Téléfilms
483,"Comédie, Comédie","Comédie, Comédie",00:00:00,,71 - Films de cinéma




Duration value types:
duration
<class 'str'>    17336
Name: count, dtype: int64

Unique duration patterns (first 30):
['01:31:52' '01:31:35' '01:26:57' '01:29:32' '01:49:49' '01:31:37'
 '01:32:23' '01:30:00' '00:46:27' '00:53:23' '00:50:01' '00:45:58'
 '00:47:12' '00:46:04' '01:30:09' '01:23:51' '01:27:49' '01:29:49'
 '01:31:29' '01:30:53' '01:30:33' '01:35:16' '01:28:02' '01:31:02'
 '01:31:48' '01:29:15' '01:31:56' '01:30:25' '01:29:47' '01:31:44']


In [10]:
# Analyze title-in-collection patterns
print("=== TITLE IN COLLECTION ANALYSIS ===\n")

# Check cases where title appears as substring in collection
def title_in_collection(row):
    """Check if title is a substring of collection"""
    if pd.isna(row['title']) or pd.isna(row['collection']):
        return False
    title_str = str(row['title']).strip().lower()
    collection_str = str(row['collection']).strip().lower()
    # Avoid false positives from very short titles
    if len(title_str) < 3:
        return False
    return title_str in collection_str

title_in_coll_mask = whatson_df.apply(title_in_collection, axis=1)
print(f"Rows where title is in substring of collection: {title_in_coll_mask.sum()}")
print("\nSample:")
display(whatson_df[title_in_coll_mask][['collection', 'title', 'original_title', 'duration', 'class_key']].head(20))

=== TITLE IN COLLECTION ANALYSIS ===

Rows where title is in substring of collection: 184

Sample:


Unnamed: 0,collection,title,original_title,duration,class_key
7,À l'instinct,À l'instinct,,01:30:00,72 - Téléfilms
59,Alice au pays des merveilles,Alice au pays des merveilles,,01:12:02,71 - Films de cinéma
104,Beethoven,Beethoven,Beethoven,01:23:10,71 - Films de cinéma
112,Bibifoc,Bibifoc,,00:00:00,71 - Films de cinéma
120,Box office,Box office,,01:20:00,71 - Films de cinéma
121,Box office,Box office,,00:00:00,71 - Films de cinéma
270,Cap danger,Cap danger,,01:26:22,71 - Films de cinéma
383,Christine,Christine,,00:00:00,71 - Films de cinéma
439,Clem,Clem,,01:35:51,72 - Téléfilms
483,"Comédie, Comédie","Comédie, Comédie",,00:00:00,71 - Films de cinéma


## Data Processing: Episode Detection and Title Selection

In [11]:
from cts_recommender.preprocessing.whatson_extraction import *

In [12]:
# Create a copy for processing
processed_df = whatson_df.copy()

# Apply the title selection
processed_df['best_title'] = processed_df.apply(select_best_title, axis=1)

# Show examples of title selection
print("Examples of title selection:")
print("\nFirst 10 rows:")
display(processed_df[['collection', 'title', 'original_title', 'best_title', 'duration', 'short_description']].head(10))

print("\nRows where title OR original_title is episode-like:")
episode_mask = processed_df.apply(lambda r: is_episode_title(r['title']) or is_episode_title(r['original_title']), axis=1)
print(f"Total rows with episode patterns: {episode_mask.sum()}")
display(processed_df[episode_mask][['collection', 'title', 'original_title', 'best_title']].head(15))

Examples of title selection:

First 10 rows:


Unnamed: 0,collection,title,original_title,best_title,duration,short_description
0,24 heures pour survivre,L'ombre du soir,Ex : un samedi pas comme les autres,Ex : un samedi pas comme les autres,01:31:52,Ce soir Marc et Eva vont se marier. Mais cette...
1,24 heures pour survivre,Celle qui n'existait plus,(Ex :Thandi),(Ex :Thandi),01:31:35,"De Paris en Afrique du Sud, Sébastien retrouve..."
2,24 heures pour survivre,L'inconnue de Belfast,,L'inconnue de Belfast,01:26:57,Sean n'a que 24 heures pour savoir qui a tué l...
3,24 heures pour survivre,"Un taxi, la nuit",ex : L'aigle de la nuit,ex : L'aigle de la nuit,01:29:32,"Daniel, reporter français, a épousé il y a que..."
4,A l'est d'éden,A l'est d'eden,East of Eden,East of Eden,01:49:49,"1914, Salinas Valley, Californie. Adam Trask, ..."
5,À l'instinct,En eaux profondes,,En eaux profondes,01:31:37,Un an après un tragique accident de car qui a ...
6,À l'instinct,La mort en marche,,La mort en marche,01:32:23,"Dans un musée à Lyon, une exposition sur les r..."
7,À l'instinct,À l'instinct,,À l'instinct,01:30:00,
8,Addict,ép.1,,Addict,00:46:27,"Élodie et Yvan Marsais, son mari, s’installent..."
9,Addict,ép. 2,,Addict,00:53:23,Les soupçons qui pèsent sur Yvan rafraîchissen...



Rows where title OR original_title is episode-like:
Total rows with episode patterns: 143


Unnamed: 0,collection,title,original_title,best_title
8,Addict,ép.1,,Addict
9,Addict,ép. 2,,Addict
10,Addict,ép. 3,,Addict
11,Addict,ép. 4,,Addict
12,Addict,ép. 5,,Addict
13,Addict,ép. 6,,Addict
464,Clem,Les retrouvailles Partie 1,,Clem
465,Clem,Les retrouvailles Partie 2,,Clem
507,Contact,2,,Contact
508,Contact,1,,Contact


## Filtering: Keep Only Movies

In [13]:
# MOVIE FILTERING LOGIC
# Following the specific rules:
# 1. If episode pattern in title or original_title -> NOT a movie
# 2. If date pattern in title (XX-XX-XX or XX.XX.XX) -> NOT a movie
# 3. If title appears more than 3 times in dataset -> NOT a movie (likely series)
# 4. If collection appears more than twice AND doesn't have film keyword -> NOT a movie (likely series collection)
# 5. If duration is present (not 00:00:00) but < 35 min -> NOT a movie
# 6. If collection == title AND collection matches film keywords -> NOT a movie (generic)
# 7. If collection == title, check duration (must be >= 35 min or 00:00:00) -> else NOT a movie
# 8. If title == original_title (exact match) -> IS a movie
# 9. If title is substring of collection -> NOT a movie
# 10. If collection matches film patterns -> IS a movie
# 11. Otherwise -> NOT a movie (conservative approach)

# Pre-compute title counts for Rule 3
title_counts = whatson_df['title'].value_counts()

# Pre-compute collection counts for Rule 4
collection_counts = whatson_df['collection'].value_counts()

# Apply filter
movies_only_df = processed_df[processed_df.apply(should_keep_as_movie, axis=1, args=(title_counts, collection_counts))].copy()

print(f"Total records: {len(whatson_df)}")
print(f"Movies kept: {len(movies_only_df)}")
print(f"Filtered out: {len(whatson_df) - len(movies_only_df)} ({100*(len(whatson_df) - len(movies_only_df))/len(whatson_df):.1f}%)")
print(f"\nContent codes distribution:")
print(movies_only_df['class_key'].value_counts())
print(f"\nCollection value counts (top 20):")
print(movies_only_df['collection'].value_counts().head(20))

# Show statistics about filtering
print(f"\n=== FILTERING BREAKDOWN ===")
print(f"Titles with dates: {whatson_df['title'].apply(has_date_pattern).sum()}")
print(f"Titles appearing >3 times: {sum(1 for t in whatson_df['title'] if title_counts.get(t, 0) > 3)}")
print(f"Collections appearing >2 times without film keywords: {sum(1 for c, count in collection_counts.items() if count > 2 and not is_film_collection(c))}")
print(f"Titles == Original titles: {sum(1 for _, row in whatson_df.iterrows() if pd.notna(row['title']) and pd.notna(row['original_title']) and str(row['title']).strip() == str(row['original_title']).strip())}")

Total records: 17336
Movies kept: 14942
Filtered out: 2394 (13.8%)

Content codes distribution:
class_key
71 - Films de cinéma                10956
72 - Téléfilms                       3856
761 - Longs métrages d'animation      130
Name: count, dtype: int64

Collection value counts (top 20):
collection
Film                        7926
Téléfilm                    3568
Fiction\Achats              3211
Fiction                      130
Cinéma                        20
Emotions fortes               18
Nocturne                      17
Film d'action                 10
Film Jeunesse                  9
Film de minuit                 8
Comédie, Comédie               7
Ecran TV                       4
Les classiques du cinéma       3
Box office                     3
La culture en films            2
Court métrage                  2
Beethoven                      1
Echo                           1
Film du matin                  1
Les 7 mercenaires              1
Name: count, dtype: int64

=== FILTE

## Finalization: Apply Best Title and Validate

In [14]:
# Replace the generic 'title' with our selected 'best_title'
final_df = movies_only_df.copy()
final_df['title'] = final_df['best_title']

# Drop intermediate columns
final_df = final_df.drop(columns=['best_title'])

# Show sample of cleaned data
print("Cleaned dataset columns:")
print(final_df.columns.tolist())
print(f"\nShape: {final_df.shape}")
print("\nSample records:")
final_df[['collection', 'title', 'original_title', 'class_key', 'duration', 'production_year', 'director']].head(10)

Cleaned dataset columns:
['collection', 'title', 'original_title', 'duration', 'short_description', 'class_key', 'production_regions', 'production_year', 'parental_control', 'department', 'director', 'actors', 'tv_rights_start', 'tv_rights_end', 'total_broadcasts', 'consumed_broadcasts', 'available_broadcasts', 'rts1_rts2_broadcasts', 'first_broadcast_date', 'rebroadcast_date_1', 'rebroadcast_date_2', 'rebroadcast_date_3', 'rebroadcast_date_4', 'last_broadcast_date', 'last_broadcast_rating', 'last_broadcast_rating_plus_7', 'tv_rights_count', 'valid_tv_rights_count', 'external_reference']

Shape: (14942, 29)

Sample records:


Unnamed: 0,collection,title,original_title,class_key,duration,production_year,director
104,Beethoven,Beethoven,Beethoven,71 - Films de cinéma,01:23:10,1992.0,Brian Levant
123,Box office,Intro,,71 - Films de cinéma,00:00:00,,
124,Box office,Presentation de taxi,,71 - Films de cinéma,00:00:00,,
126,Box office,Intro 1/intro 2,,71 - Films de cinéma,00:00:00,,
386,Cinéma,Rosetta,,71 - Films de cinéma,01:30:06,1999.0,"Luc Dardenne, Jean-Pierre Dardenne"
387,Cinéma,Mediterraneo,,71 - Films de cinéma,01:26:09,1991.0,Gabriele Salvatores
388,Cinéma,Ridicule,,71 - Films de cinéma,01:37:57,1996.0,Patrice Leconte
390,Cinéma,Les cachetonneurs,,71 - Films de cinéma,01:26:32,1998.0,
391,Cinéma,Le grand alibi,,71 - Films de cinéma,00:00:00,,
392,Cinéma,La main au collet,,71 - Films de cinéma,00:00:00,,


In [15]:
# VALIDATION: Test each filtering rule
print("=== FILTERING RULE VALIDATION ===\n")

# Test Rule 1: Episode patterns
episode_in_title = processed_df[processed_df['title'].apply(is_episode_title)]
episode_in_original = processed_df[processed_df['original_title'].apply(is_episode_title)]
total_episodes = processed_df[processed_df.apply(lambda r: is_episode_title(r['title']) or is_episode_title(r['original_title']), axis=1)]

print(f"Rule 1 - Episode patterns:")
print(f"  Episodes in title: {len(episode_in_title)}")
print(f"  Episodes in original_title: {len(episode_in_original)}")
print(f"  Total with episode patterns: {len(total_episodes)}")
print(f"  Any kept as movies?: {total_episodes.apply(should_keep_as_movie, axis=1, args = (title_counts, collection_counts)).sum()} (should be 0)")

# Test Rule 2: Collection == Title with duration check
same_coll_title = processed_df[processed_df['collection'] == processed_df['title']]
print(f"\nRule 2 - Collection == Title:")
print(f"  Total with same collection/title: {len(same_coll_title)}")
same_coll_title_copy = same_coll_title.copy()
same_coll_title_copy['duration_min'] = same_coll_title_copy['duration'].apply(parse_duration_minutes)
short_duration = same_coll_title_copy[same_coll_title_copy['duration_min'] < 35]
print(f"  With duration < 35 min: {len(short_duration)}")
print(f"  Short ones kept as movies?: {short_duration.apply(should_keep_as_movie, axis=1).sum()} (should be 0)")

# Test Rule 3: Title in Collection
def title_in_coll(row):
    if pd.isna(row['title']) or pd.isna(row['collection']):
        return False
    title_str = str(row['title']).strip().lower()
    collection_str = str(row['collection']).strip().lower()
    return len(title_str) >= 3 and title_str != collection_str and title_str in collection_str

title_in_collection_mask = processed_df.apply(title_in_coll, axis=1)
title_in_collection_rows = processed_df[title_in_collection_mask]
print(f"\nRule 3 - Title in Collection:")
print(f"  Total with title in collection: {len(title_in_collection_rows)}")
print(f"  Kept as movies ?: {title_in_collection_rows.apply(should_keep_as_movie, axis=1).sum()} (should be 0)")

# Test Rule 4: Film collections
film_coll_rows = processed_df[processed_df['collection'].apply(is_film_collection)]
print(f"\nRule 4 - Film Collections:")
print(f"  Total in film collections: {len(film_coll_rows)}")
print(f"  Kept as movies: {len(movies_only_df)}")
print(f"  Film collection rows kept: {film_coll_rows.apply(should_keep_as_movie, axis=1).sum()}")

print("\n✓ Validation complete")

=== FILTERING RULE VALIDATION ===

Rule 1 - Episode patterns:
  Episodes in title: 127
  Episodes in original_title: 23
  Total with episode patterns: 143
  Any kept as movies?: 0 (should be 0)

Rule 2 - Collection == Title:
  Total with same collection/title: 114
  With duration < 35 min: 31


TypeError: should_keep_as_movie() missing 2 required positional arguments: 'title_counts' and 'collection_counts'

In [None]:
# Check for any remaining episode-like titles in final dataset
print("=== FINAL DATASET CHECK ===\n")

episode_pattern = final_df['title'].apply(is_episode_title)
print(f"Remaining episode-like titles: {episode_pattern.sum()}")

if episode_pattern.sum() > 0:
    print("\nRemaining episodes that need review:")
    display(final_df[episode_pattern][['title', 'collection', 'original_title', 'class_key']].head(20))
else:
    print("✓ No episode-like titles remaining!")

# Check collections in final dataset
print(f"\nCollections in final dataset:")
print(final_df['collection'].value_counts().head(30))

=== FINAL DATASET CHECK ===

Remaining episode-like titles: 0
✓ No episode-like titles remaining!

Collections in final dataset:
collection
Film                        7926
Téléfilm                    3568
Fiction\Achats              3211
Fiction                      130
Cinéma                        20
Emotions fortes               18
Nocturne                      17
Film d'action                 10
Film Jeunesse                  9
Film de minuit                 8
Comédie, Comédie               7
Ecran TV                       4
Box office                     3
Les classiques du cinéma       3
Court métrage                  2
Pour l'amour du risque         2
La culture en films            2
L'arme fatale                  2
Beethoven                      1
Echo                           1
Documentaire                   1
Film du matin                  1
Les 7 mercenaires              1
Name: count, dtype: int64


In [None]:
# Summary statistics
print("=== FINAL SUMMARY ===")
print(f"\nOriginal records: {len(whatson_df)}")
print(f"Final movie records: {len(final_df)}")
print(f"Filtered out: {len(whatson_df) - len(final_df)} ({100*(len(whatson_df) - len(final_df))/len(whatson_df):.1f}%)")
print(f"\nNull values in key columns:")
print(final_df[['title', 'original_title', 'collection', 'production_year', 'director']].isnull().sum())
print(f"\nUnique titles: {final_df['title'].nunique()}")
print(f"Duplicate titles: {len(final_df) - final_df['title'].nunique()}")

=== FINAL SUMMARY ===

Original records: 17336
Final movie records: 14947
Filtered out: 2389 (13.8%)

Null values in key columns:
title                 0
original_title     7193
collection            0
production_year    2500
director           2527
dtype: int64

Unique titles: 14334
Duplicate titles: 613


In [None]:
final_df

Unnamed: 0,collection,title,original_title,duration,short_description,class_key,production_regions,production_year,parental_control,department,...,rebroadcast_date_1,rebroadcast_date_2,rebroadcast_date_3,rebroadcast_date_4,last_broadcast_date,last_broadcast_rating,last_broadcast_rating_plus_7,tv_rights_count,valid_tv_rights_count,external_reference
104,Beethoven,Beethoven,Beethoven,01:23:10,,71 - Films de cinéma,Etats-Unis d'Amérique,1992.0,Jeunesse,,...,,,,,,,,1,0,100035798
123,Box office,Intro,,00:00:00,,71 - Films de cinéma,,,Achats UAP,,...,,,,,,,,0,0,100189141
124,Box office,Presentation de taxi,,00:00:00,,71 - Films de cinéma,Suisse Romande,,Achats UAP,,...,,,,,,,,0,0,100185092
126,Box office,Intro 1/intro 2,,00:00:00,,71 - Films de cinéma,,,Achats UAP,,...,,,,,,,,0,0,100174259
386,Cinéma,Rosetta,,01:30:06,"Chaque jour, Rosetta part au front à la recher...",71 - Films de cinéma,Belgique,1999.0,Achats UAP,,...,,,,,2001-10-04,,,1,0,100203009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17178,Téléfilm,Gothard,,00:00:55,Un extrait où l'on découvre que les mineurs em...,72 - Téléfilms,Suisse,2016.0,Fictions,,...,,,,,,,,0,0,100527666
17179,Téléfilm,Salomé,,01:26:26,"Tragédie en un acte d'Oscar Wilde, que le poèt...",72 - Téléfilms,France,1969.0,Achats UAP,,...,,,,,,,,0,0,500647632
17180,Téléfilm,La reine de Saba,,01:33:30,Oeuvre écrite par Maurice Clavel sur une idée ...,72 - Téléfilms,"France, Iran",1975.0,Achats UAP,,...,,,,,,,,0,0,500647633
17181,Téléfilm,Françoise et Udo...,,01:06:52,"Dans un train, un chanteur autrichien (Udo Jur...",72 - Téléfilms,France,1968.0,Achats UAP,,...,,,,,,,,0,0,500647634
