In [1]:
%load_ext autoreload
%autoreload 2

## Load dataset

In [2]:
import pandas as pd
import numpy as np

featured_movies_df = pd.read_pickle('data/raw_data/featured_movies_dataframe.pkl')
featured_movies_df.columns

Index(['BrdCstClassKey', 'adult', 'day', 'doy', 'duration', 'genres', 'hour',
       'is_weekend', 'month', 'original_language', 'pdm', 'popularity',
       'public_holiday', 'release_date', 'revenue', 'rt_m', 'season',
       'start_time', 'title', 'tmdb_id', 'vote_average', 'weekday', 'year'],
      dtype='object')

In [3]:
# Adult = False as default
featured_movies_df['adult'] = np.where(
    featured_movies_df['adult'].isna(),         # condition per-row
    False,                                      # value if True
    featured_movies_df['adult']                 # value if False
)

# Missing original_language put as 'unknown'
featured_movies_df['original_language'] = np.where(
    featured_movies_df['original_language'].isna(),         # condition per-row
    'unknown',                                              # value if True
    featured_movies_df['original_language']                 # value if False
)

featured_movies_df['genres'] = featured_movies_df['genres'].apply(
    lambda x: x if isinstance(x, list) else []
)

# Missing release_date put as '1900-01-01'
featured_movies_df['release_date'] = np.where(
    featured_movies_df['release_date'].isna(),         # condition per-row
    '1900-01-01',                                      # value if True
    featured_movies_df['release_date']                 # value if False
)

# Add a missing_release_date flag
featured_movies_df.loc[:, 'missing_release_date'] = np.where(
    featured_movies_df['release_date'].isna(),  # condition per-row
    False,                                      # value if True
    True                                        # value if False
)
featured_movies_df.loc[:, 'missing_release_date'] = featured_movies_df.loc[:, 'missing_release_date'].apply(lambda s: False if s == '' else True)
featured_movies_df.loc[:, 'release_date'] = featured_movies_df.loc[:, 'release_date'].apply(lambda s: '1900-01-01' if s == '' else s)

# Missing Revenue put as 0 similarly to TMDB API
featured_movies_df['revenue'] = np.where(
    featured_movies_df['revenue'].isna(),         # condition per-row
    0,                                            # value if True
    featured_movies_df['revenue']                 # value if False
)

# missing tmdb id flag
featured_movies_df.loc[:, 'missing_tmdb'] = np.where(
    featured_movies_df['tmdb_id'].isna(),  # condition per-row
    True,                                  # value if True
    False                                  # value if False
)

# Add vote average as zero
featured_movies_df.loc[:, 'vote_average'] = featured_movies_df['vote_average'].fillna(0)

#  Add popularity zero 
featured_movies_df.loc[:, 'popularity'] = featured_movies_df['popularity'].fillna(0)

# Separate Movies and TV Shows
featured_movies_df.loc[:, 'is_movie'] = True

In [4]:
# Place title as first column
featured_movies_df = featured_movies_df[['title']+featured_movies_df.drop(columns=['title']).columns.tolist()]

In [5]:
drop = ['day', 'doy', 'hour',
       'is_weekend', 'month', 'pdm', 
       'public_holiday', 'rt_m',
       'start_time', 'weekday', 'year', 'season']
featured_movies_df = featured_movies_df.drop(columns=drop)

In [6]:
featured_movies_df

Unnamed: 0,title,BrdCstClassKey,adult,duration,genres,original_language,popularity,release_date,revenue,tmdb_id,vote_average,missing_release_date,missing_tmdb,is_movie
22,Cher Noël,72,False,86.116667,"[{'id': 10770, 'name': 'Téléfilm'}, {'id': 107...",en,0.2812,2020-11-27,0.0,744941.0,6.657,True,False,True
24,James Bond: Jamais plus jamais,71,False,126.333333,"[{'id': 12, 'name': 'Aventure'}, {'id': 28, 'n...",en,15.4075,1983-10-07,160000000.0,36670.0,6.049,True,False,True
32,Super-héros malgré lui,71,False,77.350000,"[{'id': 35, 'name': 'Comédie'}, {'id': 28, 'na...",fr,1.7759,2022-02-02,7375119.0,746333.0,6.352,True,False,True
33,Qu'est-ce qu'on a fait au Bon Dieu ?,71,False,93.716667,"[{'id': 35, 'name': 'Comédie'}]",fr,2.7478,2014-04-16,176404493.0,262391.0,6.663,True,False,True
34,Séduis-moi si tu peux,71,False,112.800000,"[{'id': 35, 'name': 'Comédie'}, {'id': 10749, ...",en,3.1147,2019-05-02,0.0,459992.0,6.725,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17763,Rencard avec le diable,72,False,85.283333,"[{'id': 10770, 'name': 'Téléfilm'}, {'id': 18,...",en,0.1858,2022-09-14,0.0,1025208.0,6.500,True,False,True
17779,What We Become,71,False,52.250000,"[{'id': 27, 'name': 'Horreur'}, {'id': 53, 'na...",da,0.5022,2015-09-29,0.0,356326.0,5.552,True,False,True
17812,Adorables,71,False,89.983333,"[{'id': 35, 'name': 'Comédie'}]",fr,0.4237,2020-07-22,0.0,701437.0,5.880,True,False,True
17813,Illusions perdues,71,False,146.483333,"[{'id': 35, 'name': 'Comédie'}, {'id': 10749, ...",en,0.5496,1941-04-20,0.0,43799.0,6.560,True,False,True


In [7]:
catalog_df = featured_movies_df.drop_duplicates(subset=['title'], keep='first')
catalog_df.reset_index(drop=True, inplace=True)

In [8]:
catalog_df

Unnamed: 0,title,BrdCstClassKey,adult,duration,genres,original_language,popularity,release_date,revenue,tmdb_id,vote_average,missing_release_date,missing_tmdb,is_movie
0,Cher Noël,72,False,86.116667,"[{'id': 10770, 'name': 'Téléfilm'}, {'id': 107...",en,0.2812,2020-11-27,0.0,744941.0,6.657,True,False,True
1,James Bond: Jamais plus jamais,71,False,126.333333,"[{'id': 12, 'name': 'Aventure'}, {'id': 28, 'n...",en,15.4075,1983-10-07,160000000.0,36670.0,6.049,True,False,True
2,Super-héros malgré lui,71,False,77.350000,"[{'id': 35, 'name': 'Comédie'}, {'id': 28, 'na...",fr,1.7759,2022-02-02,7375119.0,746333.0,6.352,True,False,True
3,Qu'est-ce qu'on a fait au Bon Dieu ?,71,False,93.716667,"[{'id': 35, 'name': 'Comédie'}]",fr,2.7478,2014-04-16,176404493.0,262391.0,6.663,True,False,True
4,Séduis-moi si tu peux,71,False,112.800000,"[{'id': 35, 'name': 'Comédie'}, {'id': 10749, ...",en,3.1147,2019-05-02,0.0,459992.0,6.725,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,La fillette sans nom,72,False,89.983333,[],unknown,0.0000,1900-01-01,0.0,,0.000,True,True,True
886,Rencard avec le diable,72,False,85.283333,"[{'id': 10770, 'name': 'Téléfilm'}, {'id': 18,...",en,0.1858,2022-09-14,0.0,1025208.0,6.500,True,False,True
887,What We Become,71,False,52.250000,"[{'id': 27, 'name': 'Horreur'}, {'id': 53, 'na...",da,0.5022,2015-09-29,0.0,356326.0,5.552,True,False,True
888,Adorables,71,False,89.983333,"[{'id': 35, 'name': 'Comédie'}]",fr,0.4237,2020-07-22,0.0,701437.0,5.880,True,False,True


In [9]:
catalog_df.to_pickle("data/catalog_df.pkl")