In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that imported
# modules are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from utils import date_formatting

# Base name of the catalog file; directory and extension are added below.
catalog_name = 'whatson_catalog_df'
catalog_path = 'data/' + catalog_name + '.parquet'
catalog_df = pd.read_parquet(catalog_path)

# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever come from a trusted source.
featured_movies_df = pd.read_pickle('data/featured_movies_dataframe.pkl')

# Two-column slice of the catalog: processed title next to its catalog id.
lookup_columns = ['processed_title', 'catalog_id']
lookup = catalog_df[lookup_columns]

# Cell output: the catalog's column names.
catalog_df.columns

Index(['title', 'actors', 'adult', 'available_num_diff', 'collection',
       'consumed_num_diff', 'content_class_key', 'date_diff_1',
       'date_last_diff', 'date_rediff_1', 'date_rediff_2', 'date_rediff_3',
       'date_rediff_4', 'department', 'director', 'duration', 'duration_dt',
       'duration_min', 'end_rights', 'external_reference', 'genres',
       'last_diff_rating', 'last_diff_rating_7', 'missing_tmdb_id',
       'num_diff_RTS1_RTS2', 'num_rights_TV', 'original_language',
       'original_title', 'parental_control', 'popularity', 'processed_title',
       'production_region', 'production_year', 'release_date', 'revenue',
       'start_rights', 'tmdb_id', 'total_num_diff', 'valid_num_rights_TV',
       'vote_average', 'missing_release_date', 'missing_tmdb', 'is_movie',
       'movie_age', 'times_shown', 'catalog_id'],
      dtype='object')

In [3]:
# Build a plain dict mapping each catalog title to its catalog id.
# When a title occurs more than once, the dict keeps the id of the last occurrence.
catalog_id_by_title = catalog_df.set_index('title')['catalog_id']
title_to_id = catalog_id_by_title.to_dict()

In [4]:
# Columns of the featured-movies frame that are carried over into the
# historical data set.
keep_columns = ['title', 'date', 'BrdCstClassKey', 'channel', 'duration', 'hour', 'day', 'weekday', 
                'is_weekend', 'month', 'rt_m', 'tmdb_id', 'season', 'processed_title', 'missing_tmdb_id']
# Select first, then copy: only the kept columns are duplicated, and the result
# is an explicit independent frame that can be given new columns without
# touching featured_movies_df.
historical_data_df = featured_movies_df[keep_columns].copy()

#def match_catalog_ids(historical_data_df: pd.DataFrame, title_to_id: dict):
#    # First: match via TMDB ID
#    historical_data_df['catalog_id'] = str(int(historical_data_df['tmdb_id'])) if historical_data_df['tmdb_id'] is not None else historical_data_df['tmdb_id']
#
#    # Second: fallback for rows where TMDB ID is missing or didn't match
#    missing_mask = historical_data_df['catalog_id'].isna()
#    historical_data_df.loc[missing_mask, 'catalog_id'] = historical_data_df.loc[missing_mask, 'title'].map(title_to_id)
#
#    # If the movie is still missing a catalog_id, it is not in the catalog
#    missing_mask_catalog = historical_data_df['catalog_id'].isna()
#    historical_data_df.loc[missing_mask_catalog, 'catalog_id'] =  -1 # Assign -1 for missing catalog_id
#
#    return historical_data_df
#
#historical_data_df = match_catalog_ids(historical_data_df, title_to_id)

def match_id(row, title_lookup=None):
    """Return the catalog id for one broadcast row.

    Parameters
    ----------
    row : mapping-like
        One record (e.g. a DataFrame row) exposing ``row['tmdb_id']`` and
        ``row['title']``.
    title_lookup : dict, optional
        Mapping of title -> catalog id used when the TMDB id is missing.
        Defaults to the notebook-level ``title_to_id``.

    Returns
    -------
    str
        ``str(int(tmdb_id))`` when a TMDB id is present. Otherwise the id
        stored for the title, normalised to a string so the resulting column
        does not mix strings with numbers, or ``'-1'`` when the title is not
        in the lookup. A null value stored in the lookup is returned unchanged
        so that ``isna()`` checks on the column still detect it.
    """
    lookup = title_to_id if title_lookup is None else title_lookup

    tmdb = row['tmdb_id']
    if pd.notnull(tmdb):
        # convert numeric TMDB id to a string (drops the float ".0")
        return str(int(tmdb))

    # fallback: look up by title, or '-1' if not found
    catalog_id = lookup.get(row['title'], '-1')
    if isinstance(catalog_id, str) or pd.isnull(catalog_id):
        # Already a string, or a missing value that must stay detectable.
        return catalog_id
    if isinstance(catalog_id, float) and catalog_id.is_integer():
        # Avoid ids such as '699.0' when the catalog stores them as floats.
        catalog_id = int(catalog_id)
    return str(catalog_id)

# Resolve a catalog id for every row (row-wise apply), then store the result
# as a new 'catalog_id' column.
resolved_catalog_ids = historical_data_df.apply(match_id, axis=1)
historical_data_df['catalog_id'] = resolved_catalog_ids

In [5]:
# Keep only the broadcasts aired on RTS 1 or RTS 2.
is_rts_channel = historical_data_df['channel'].isin(['RTS 1', 'RTS 2'])
historical_data_df_interest = historical_data_df[is_rts_channel]

# Of those, show the rows whose catalog_id is NaN. Rows that match_id filled
# with the '-1' fallback are not NaN, so they are not selected here.
has_null_catalog_id = historical_data_df_interest['catalog_id'].isna()
temp = historical_data_df_interest[has_null_catalog_id]
temp

Unnamed: 0,title,date,BrdCstClassKey,channel,duration,hour,day,weekday,is_weekend,month,rt_m,tmdb_id,season,processed_title,missing_tmdb_id,catalog_id


In [6]:
# Show the catalog entries whose title contains this text.
# regex=False matches the text literally instead of as a regular expression;
# na=False treats missing titles as non-matching, so the boolean mask never
# contains NaN (which would make the indexing raise).
catalog_df[catalog_df['title'].str.contains('Rien que pour vos yeux', regex=False, na=False)]

Unnamed: 0,title,actors,adult,available_num_diff,collection,consumed_num_diff,content_class_key,date_diff_1,date_last_diff,date_rediff_1,...,tmdb_id,total_num_diff,valid_num_rights_TV,vote_average,missing_release_date,missing_tmdb,is_movie,movie_age,times_shown,catalog_id
4798,Rien que pour vos yeux,"Roger Moore, Carole Bouquet, Chaim Topol, Lynn...",False,1.0,Film,1.0,71,1994-01-08,2024-01-02,2016-12-14,...,699.0,2.0,0,6.537,True,False,True,44,0,699


In [7]:
# Cell output: the current column names of the historical frame.
historical_data_df.columns

Index(['title', 'date', 'BrdCstClassKey', 'channel', 'duration', 'hour', 'day',
       'weekday', 'is_weekend', 'month', 'rt_m', 'tmdb_id', 'season',
       'processed_title', 'missing_tmdb_id', 'catalog_id'],
      dtype='object')

In [8]:
# Rename the two columns that change, addressing them by name. Overwriting the
# whole column list positionally would silently mislabel columns if the order
# or number of kept columns ever changed; rename() leaves all other columns
# untouched and is safe to re-run.
historical_data_df = historical_data_df.rename(
    columns={
        'BrdCstClassKey': 'content_class_key',
        'duration': 'duration_min',
    }
)

In [9]:
# Spot check: list every row whose title is exactly 'Cher Noël'.
is_cher_noel = historical_data_df['title'] == 'Cher Noël'
historical_data_df[is_cher_noel]

Unnamed: 0,title,date,content_class_key,channel,duration_min,hour,day,weekday,is_weekend,month,rt_m,tmdb_id,season,processed_title,missing_tmdb_id,catalog_id
22,Cher Noël,2024-01-01,72,RTS 1,86.116667,14,1,0,False,1,22.1,744941.0,winter,Cher Noël,False,744941
68279,Cher Noël,2024-11-25,72,RTS 1,87.05,9,25,0,False,11,7.2,744941.0,fall,Cher Noël,False,744941


In [10]:
# Guard: stop here if any row still has a null catalog_id.
all_ids_present = historical_data_df['catalog_id'].notna().all()
assert all_ids_present, "There are still missing catalog IDs in the historical data."

In [11]:
# Persist the historical frame (including catalog_id) as a pickle file.
historical_pickle_path = "data/historical_data_df.pkl"
historical_data_df.to_pickle(historical_pickle_path)

In [12]:
# Cell output: display the historical frame that was just saved.
historical_data_df

Unnamed: 0,title,date,content_class_key,channel,duration_min,hour,day,weekday,is_weekend,month,rt_m,tmdb_id,season,processed_title,missing_tmdb_id,catalog_id
22,Cher Noël,2024-01-01,72,RTS 1,86.116667,14,1,0,False,1,22.1,744941.0,winter,Cher Noël,False,744941
24,James Bond: Jamais plus jamais,2024-01-01,71,RTS 1,126.333333,15,1,0,False,1,44.0,36670.0,winter,Jamais plus jamais,False,36670
32,Super-héros malgré lui,2024-01-01,71,RTS 1,77.350000,20,1,0,False,1,75.4,746333.0,winter,Super-héros malgré lui,False,746333
33,Qu'est-ce qu'on a fait au Bon Dieu ?,2024-01-01,71,RTS 1,93.716667,21,1,0,False,1,44.9,262391.0,winter,Qu'est-ce qu'on a fait au Bon Dieu ?,False,262391
34,Séduis-moi si tu peux,2024-01-01,71,RTS 1,112.800000,23,1,0,False,1,14.5,459992.0,winter,Séduis-moi si tu peux !,False,459992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85478,Fiston,2025-02-21,71,RTS 2,84.783333,21,21,4,False,2,15.5,252607.0,winter,Fiston,False,252607
85573,VOYAGE A TOKYO,2025-02-21,AAF,France 3,45.683333,25,21,4,False,2,0.5,18148.0,winter,Voyage à Tokyo,False,18148
85653,Adorables,2025-02-22,71,RTS 1,89.983333,20,22,5,True,2,50.8,701437.0,winter,Adorables,False,701437
85654,Illusions perdues,2025-02-22,71,RTS 1,146.483333,22,22,5,True,2,11.3,43799.0,winter,Illusions perdues,False,43799


In [13]:
# Show the rows whose catalog_id value is a Python str.
has_string_id = historical_data_df['catalog_id'].apply(lambda value: isinstance(value, str))
historical_data_df[has_string_id]

Unnamed: 0,title,date,content_class_key,channel,duration_min,hour,day,weekday,is_weekend,month,rt_m,tmdb_id,season,processed_title,missing_tmdb_id,catalog_id
22,Cher Noël,2024-01-01,72,RTS 1,86.116667,14,1,0,False,1,22.1,744941.0,winter,Cher Noël,False,744941
24,James Bond: Jamais plus jamais,2024-01-01,71,RTS 1,126.333333,15,1,0,False,1,44.0,36670.0,winter,Jamais plus jamais,False,36670
32,Super-héros malgré lui,2024-01-01,71,RTS 1,77.350000,20,1,0,False,1,75.4,746333.0,winter,Super-héros malgré lui,False,746333
33,Qu'est-ce qu'on a fait au Bon Dieu ?,2024-01-01,71,RTS 1,93.716667,21,1,0,False,1,44.9,262391.0,winter,Qu'est-ce qu'on a fait au Bon Dieu ?,False,262391
34,Séduis-moi si tu peux,2024-01-01,71,RTS 1,112.800000,23,1,0,False,1,14.5,459992.0,winter,Séduis-moi si tu peux !,False,459992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85478,Fiston,2025-02-21,71,RTS 2,84.783333,21,21,4,False,2,15.5,252607.0,winter,Fiston,False,252607
85573,VOYAGE A TOKYO,2025-02-21,AAF,France 3,45.683333,25,21,4,False,2,0.5,18148.0,winter,Voyage à Tokyo,False,18148
85653,Adorables,2025-02-22,71,RTS 1,89.983333,20,22,5,True,2,50.8,701437.0,winter,Adorables,False,701437
85654,Illusions perdues,2025-02-22,71,RTS 1,146.483333,22,22,5,True,2,11.3,43799.0,winter,Illusions perdues,False,43799
