In [1]:
# J'importe mes bibliothèques
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Bibiliothèques de ML
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Import pipeline
from sklearn.pipeline import Pipeline

# Import outil standardisation de la donnée
from sklearn.preprocessing import StandardScaler

# Le module pour spliter le modèle
from sklearn.model_selection import train_test_split

# Import des métriques
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error


# Gestion des warnings
import warnings

In [2]:
url = "https://raw.githubusercontent.com/halekss/.fivesensefilms.io/refs/heads/main/Films.csv"
films = pd.read_csv(url, sep=";")

# Nettoyage

In [3]:
def quick_explore(dataframe):
    """
    Fonction d'exploration rapide des données.
    """
    # Afficher quelques lignes
    print("###### Observer des lignes ######")
    display(dataframe.head(10), "\n")


    # Afficher le nombre de lignes et colonnes
    print("###### Dimensions du dataset ######")
    print(f"Lignes : {dataframe.shape[0]}, Colonnes : {dataframe.shape[1]}\n")

    # Informations sur les colonnes
    print("###### Informations sur les colonnes ######")
    print(dataframe.info(), "\n")

    # Les nom de colonnes
    print(list(dataframe.columns))


    # Nombre de valeurs uniques par colonne
    print("###### Nombre de valeurs uniques par colonne ######")
    print(dataframe.nunique(), "\n")

    # Description des colonnes numériques avec les déciles
    print("###### Description des colonnes numériques ######")
    stats = dataframe.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]).T
    print( stats.to_markdown(), "\n")

    # Nombre de valeurs manquantes par colonne
    print("###### Nombre de NaN par colonne ######")
    nan_nb = dataframe.isna().sum()
    print(nan_nb)

    # Pourcentage de valeurs manquantes par colonne
    print("###### Pourcentage de NaN par colonne ######")
    nan_percentage = (dataframe.isnull().sum() / len(dataframe)) * 100
    only_nan  = nan_percentage[nan_percentage > 0].sort_values(ascending=False)
    print(only_nan, "\n")

    # Nombre de doublons
    print("###### Nombre de doublons ######")
    print(dataframe.duplicated().sum(), "doublon(s) trouvé(s).\n")

print(quick_explore(films))

###### Observer des lignes ######


Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Id des genres,Genre Principal,Genre secondaire,Genre tertiaire,...,Société de production,Pays d'origine de la production,URL complète,Résolution,Type de vidéo,Officiel,Date de mise en ligne,Id Youtube,Affiche du Film,Arrière plan film
0,tt1524169,321040,Day Camp,Day Camp,"2009-01-01 00:00:00,000",72,[35],Comedy,Non renseigné,Non renseigné,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
1,tt1603444,129466,Unitards,Unitards,"2010-01-01 00:00:00,000",107,[],Comedy,Family,Music,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
2,tt11702252,916721,The Year Between,The Year Between,"2023-03-03 00:00:00,000",94,"[35, 18]",Comedy,Drama,Non renseigné,...,"[{'id': 124473, 'logo_path': '/jW6q5HfAh4LrIQH...","[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...
3,tt0350703,293411,La chatte andalouse,La chatte andalouse,"2002-11-25 00:00:00,000",48,[18],Comedy,Drama,Non renseigné,...,"[{'id': 104, 'logo_path': '/9aotxauvc9685tq9pT...","[{'iso_3166_1': 'FR', 'name': 'France'}]",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
4,tt1087829,12174,Aide-toi le ciel t'aidera,Aide-toi le ciel t'aidera,"2008-08-30 00:00:00,000",94,[],Comedy,Non renseigné,Non renseigné,...,[],"[{'iso_3166_1': 'FR', 'name': 'France'}]",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...
5,tt2519088,731774,Dangerous People,Dangerous People,"2015-04-01 00:00:00,000",95,"[53, 35]",Comedy,Horror,Thriller,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
6,tt3627780,308919,Le Noël de mes dix ans,Wishin' and Hopin',"2014-11-23 00:00:00,000",83,"[18, 35, 10770]",Comedy,Drama,Family,...,"[{'id': 62763, 'logo_path': '/xuR3B9b9lAvpoTcS...","[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...
7,tt2644044,395978,Where is Rocky II?,Where is Rocky II?,"2016-05-01 00:00:00,000",93,"[35, 99, 9648]",Comedy,Documentary,Mystery,...,"[{'id': 7144, 'logo_path': '/wqc8HTYdCCFnguN2M...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
8,tt0215516,48345,Nobody's Baby,Nobody's Baby,"2001-01-01 00:00:00,000",110,[35],Comedy,Drama,Non renseigné,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
9,tt2609778,319017,Le dernier hétéro sur Terre,The Last Straight Man,"2014-05-06 00:00:00,000",110,"[18, 35, 10749]",Comedy,Drama,Romance,...,"[{'id': 45765, 'logo_path': None, 'name': 'Thr...","[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...


'\n'

###### Dimensions du dataset ######
Lignes : 2517, Colonnes : 31

###### Informations sur les colonnes ######
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 31 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   tconst                           2517 non-null   object 
 1   Id_TMDB                          2517 non-null   int64  
 2   Titre                            2517 non-null   object 
 3   Titre Original                   2517 non-null   object 
 4   Date de sortie                   2517 non-null   object 
 5   Durée                            2517 non-null   int64  
 6   Id des genres                    2517 non-null   object 
 7   Genre Principal                  2517 non-null   object 
 8   Genre secondaire                 2517 non-null   object 
 9   Genre tertiaire                  2517 non-null   object 
 10  Genres                           2

In [4]:
display(films.head(10))

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Id des genres,Genre Principal,Genre secondaire,Genre tertiaire,...,Société de production,Pays d'origine de la production,URL complète,Résolution,Type de vidéo,Officiel,Date de mise en ligne,Id Youtube,Affiche du Film,Arrière plan film
0,tt1524169,321040,Day Camp,Day Camp,"2009-01-01 00:00:00,000",72,[35],Comedy,Non renseigné,Non renseigné,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
1,tt1603444,129466,Unitards,Unitards,"2010-01-01 00:00:00,000",107,[],Comedy,Family,Music,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
2,tt11702252,916721,The Year Between,The Year Between,"2023-03-03 00:00:00,000",94,"[35, 18]",Comedy,Drama,Non renseigné,...,"[{'id': 124473, 'logo_path': '/jW6q5HfAh4LrIQH...","[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...
3,tt0350703,293411,La chatte andalouse,La chatte andalouse,"2002-11-25 00:00:00,000",48,[18],Comedy,Drama,Non renseigné,...,"[{'id': 104, 'logo_path': '/9aotxauvc9685tq9pT...","[{'iso_3166_1': 'FR', 'name': 'France'}]",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
4,tt1087829,12174,Aide-toi le ciel t'aidera,Aide-toi le ciel t'aidera,"2008-08-30 00:00:00,000",94,[],Comedy,Non renseigné,Non renseigné,...,[],"[{'iso_3166_1': 'FR', 'name': 'France'}]",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...
5,tt2519088,731774,Dangerous People,Dangerous People,"2015-04-01 00:00:00,000",95,"[53, 35]",Comedy,Horror,Thriller,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
6,tt3627780,308919,Le Noël de mes dix ans,Wishin' and Hopin',"2014-11-23 00:00:00,000",83,"[18, 35, 10770]",Comedy,Drama,Family,...,"[{'id': 62763, 'logo_path': '/xuR3B9b9lAvpoTcS...","[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...
7,tt2644044,395978,Where is Rocky II?,Where is Rocky II?,"2016-05-01 00:00:00,000",93,"[35, 99, 9648]",Comedy,Documentary,Mystery,...,"[{'id': 7144, 'logo_path': '/wqc8HTYdCCFnguN2M...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
8,tt0215516,48345,Nobody's Baby,Nobody's Baby,"2001-01-01 00:00:00,000",110,[35],Comedy,Drama,Non renseigné,...,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,
9,tt2609778,319017,Le dernier hétéro sur Terre,The Last Straight Man,"2014-05-06 00:00:00,000",110,"[18, 35, 10749]",Comedy,Drama,Romance,...,"[{'id': 45765, 'logo_path': None, 'name': 'Thr...","[{'iso_3166_1': 'US', 'name': 'United States o...",,,Non renseigné,Non renseigné,,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w600_and_h900_bestv...


In [5]:
films["URL complète"]=films["URL complète"].fillna("Non renseigné")
films["Arrière plan film"]=films["Arrière plan film"].fillna("Non renseigné")

In [6]:
films['Résolution'] = films['Résolution'].str.replace(',', '.', regex=False).astype(float)
#films['Résolution'].astype(int)
films['Résolution'].describe()

count    1872.000000
mean      949.615385
std       279.662050
min       360.000000
25%      1080.000000
50%      1080.000000
75%      1080.000000
max      2160.000000
Name: Résolution, dtype: float64

In [7]:
films['Résolution'].fillna(films['Résolution'].median(), inplace=True)
display(films['Résolution'].isna().sum())

0

In [8]:
films['Nombre de votants'].fillna(0, inplace=True)

In [9]:
films.isna().sum()

tconst                                0
Id_TMDB                               0
Titre                                 0
Titre Original                        0
Date de sortie                        0
Durée                                 0
Id des genres                         0
Genre Principal                       0
Genre secondaire                      0
Genre tertiaire                       0
Genres                                0
Mots clés                             0
Budget                             1026
Recettes                            917
Pays d'origine                        0
Langue Originale                      0
Moyenne des votes                     0
Nombre de votants                     0
Popularité                            0
Résumé                                0
Slogan                                0
Société de production                 0
Pays d'origine de la production       0
URL complète                          0
Résolution                            0


In [10]:
films.columns

Index(['tconst', 'Id_TMDB', 'Titre', 'Titre Original', 'Date de sortie',
       'Durée', 'Id des genres', 'Genre Principal', 'Genre secondaire',
       'Genre tertiaire', 'Genres', 'Mots clés', 'Budget', 'Recettes',
       'Pays d'origine', 'Langue Originale', 'Moyenne des votes',
       'Nombre de votants', 'Popularité', 'Résumé', 'Slogan',
       'Société de production', 'Pays d'origine de la production',
       'URL complète', 'Résolution', 'Type de vidéo', 'Officiel',
       'Date de mise en ligne', 'Id Youtube', 'Affiche du Film',
       'Arrière plan film'],
      dtype='object')

In [11]:
films.drop(columns=["Genres","Id des genres", "Id Youtube", "Arrière plan film", "Arrière plan film", "Officiel", "Résolution", "Slogan", "Type de vidéo", "Date de mise en ligne"], inplace=True)

In [12]:
# Je décide de créer une fonction pour concaténer le type 1 et le type 2 (pour rappel 48% de valeurs manquantes sur la colonne 'Type 2')

def concatenate_type(row):
  genre1 = row['Genre Principal']
  genre2 = row['Genre secondaire']
  genre3 = row['Genre tertiaire']

  # Si mon type 2 n'est pas un nan alors retournes moi type1,type2:
  if genre2 != 'Non renseigné':
    genre1 = genre1 + ',' + genre2
    if genre3 != 'Non renseigné':
        genre1 = genre1 + ',' + genre3
    return genre1
  # sinon tu me retoures uniquement le type 1 (dans le cas ou type 2 est un nan)
  else:
    return genre1

In [13]:
films['Genres'] = films.apply(concatenate_type, axis=1)

In [14]:
display(films.sample(15))

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Genre Principal,Genre secondaire,Genre tertiaire,Mots clés,...,Langue Originale,Moyenne des votes,Nombre de votants,Popularité,Résumé,Société de production,Pays d'origine de la production,URL complète,Affiche du Film,Genres
1401,tt1260502,14092,Ghost in the Shell 2.0,GHOST IN THE SHELL／攻殻機動隊2.0,"2008-07-12 00:00:00,000",83,Action,Animation,Crime,"{2964: 'future', 803: 'android', 679: 'cyborg'...",...,ja,74,441.0,19,L'histoire se déroule dans un monde futuriste ...,"[{'id': 529, 'logo_path': '/rwB6w2aPENQbx756pB...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Animation,Crime"
1372,tt7921248,504582,Les Siffleurs,La Gomera,"2019-09-13 00:00:00,000",98,Comedy,Crime,Drama,{207268: 'neo-noir'},...,ro,6,176.0,5,"Cristi, un inspecteur de police de Bucarest co...","[{'id': 51274, 'logo_path': None, 'name': '42 ...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",https://www.youtube.com/watch?v=r9fOKXPBEuU,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Crime,Drama"
1961,tt6865690,467956,Richard says goodbye,The Professor,"2018-05-17 00:00:00,000",90,Comedy,Drama,Non renseigné,"{6564: 'terminal illness', 14964: 'drugs', 151...",...,en,68,1219.0,34,Un professeur de littérature d'une université ...,"[{'id': 104972, 'logo_path': '/y1MOtyfXpKTZVJs...","[{'iso_3166_1': 'US', 'name': 'United States o...",https://www.youtube.com/watch?v=VThUDaDIv4o,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama"
931,tt0330069,23626,Blue Collar Comedy Tour: The Movie,Blue Collar Comedy Tour: The Movie,"2003-01-10 00:00:00,000",105,Comedy,Documentary,Non renseigné,{10250: 'redneck'},...,en,58,53.0,35,Non renseigné,"[{'id': 19342, 'logo_path': '/8niiDsqdIQESh7cY...","[{'iso_3166_1': 'US', 'name': 'United States o...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Documentary"
1994,tt1119646,18785,Very Bad Trip,The Hangover,"2009-06-02 00:00:00,000",95,Comedy,Non renseigné,Non renseigné,"{7996: 'blackjack', 8070: 'stag night', 11462:...",...,en,73,17622.0,142,Ils avaient prévu un enterrement de vie de gar...,"[{'id': 923, 'logo_path': '/5UQsZrfbfG2dYJbx8D...","[{'iso_3166_1': 'US', 'name': 'United States o...",https://www.youtube.com/watch?v=hHqR9Tq16_E,https://image.tmdb.org/t/p/w600_and_h900_bestv...,Comedy
1054,tt2378281,211954,Ni repris ni échangé,No se aceptan devoluciones,"2013-07-20 00:00:00,000",117,Comedy,Drama,Non renseigné,"{1338: 'bachelor', 155480: 'vespa'}",...,es,77,938.0,22,Valentin est un playboy vivant à Acapulco. Un ...,"[{'id': 58399, 'logo_path': '/zhTPxItVCkuxXKic...","[{'iso_3166_1': 'US', 'name': 'United States o...",https://www.youtube.com/watch?v=KiLhpZpVqCg,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama"
302,tt19785426,958291,Fièvre Méditerranéenne,حمى البحر المتوسط,"2022-12-14 00:00:00,000",108,Comedy,Drama,Non renseigné,Non renseigné,...,ar,67,10.0,7,"Walid, 40 ans, palestinien vivant à Haifa, ave...","[{'id': 82386, 'logo_path': '/g9AkB14mgz3UiMxU...","[{'iso_3166_1': 'CY', 'name': 'Cyprus'}, {'iso...",https://www.youtube.com/watch?v=Jo5Eqje90So,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama"
223,tt8406738,587986,Advantages of Travelling by Train,Ventajas de viajar en tren,"2019-11-08 00:00:00,000",104,Comedy,Mystery,Thriller,"{818: 'based on novel or book', 744: 'madrid, ...",...,es,65,170.0,17,Après avoir fait interner son mari fou dans un...,"[{'id': 10031, 'logo_path': '/cUOviFxM9l3dYuNe...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Mystery,Thriller"
811,tt8523334,469274,Nicky Larson et le parfum de Cupidon,Nicky Larson et le parfum de Cupidon,"2019-02-06 00:00:00,000",91,Action,Comedy,Crime,"{2708: 'hitman', 3149: 'gangster', 3088: 'body...",...,fr,65,814.0,17,Nicky Larson est le meilleur des gardes du cor...,"[{'id': 13319, 'logo_path': '/eEaB6ztT13s4vhM9...","[{'iso_3166_1': 'FR', 'name': 'France'}]",https://www.youtube.com/watch?v=jcfYjOEiddY,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Comedy,Crime"
2382,tt2713180,228150,Fury,Fury,"2014-10-15 00:00:00,000",134,Action,Drama,War,"{1701: 'hero', 2652: 'nazi', 502: 'ambush', 39...",...,en,75,12574.0,138,Avril 1945. Les Alliés mènent leur ultime offe...,"[{'id': 5, 'logo_path': '/71BqEFAF4V3qjjMPCpLu...","[{'iso_3166_1': 'US', 'name': 'United States o...",https://www.youtube.com/watch?v=X7kpBE8piQ0,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Drama,War"


In [15]:
films.drop(columns=['Genre Principal', 'Genre secondaire', 'Genre tertiaire'],inplace=True)

In [16]:
import ast

def extraire_mots_cles(x):
    try:
        d = ast.literal_eval(x)     # transforme "{212: 'london', ...}" en vrai dict Python
        if isinstance(d, dict):
            return ", ".join(d.values())
    except:
        return "Non renseigné"

films["Mots clés propre"] = films["Mots clés"].apply(extraire_mots_cles)

In [17]:
films.drop(columns=['Mots clés'], inplace=True)

In [18]:
films['Pays d\'origine'] = (
    films['Pays d\'origine']
    .str.replace(r"[\[\]']", "", regex=True)  # enlève crochets et apostrophes
    .str.strip()
)


In [19]:
import ast

def extraire_pays(x):
    try:
        liste = ast.literal_eval(x)  # convertit la chaîne en vraie liste de dict
        if isinstance(liste, list):
            # on prend uniquement la valeur "name" dans chaque dictionnaire
            noms = [d.get("name", "") for d in liste if isinstance(d, dict)]
            return ", ".join(noms)
    except:
        return 'Non renseigné'

films["Pays_production_propre"] = films["Pays d'origine de la production"].apply(extraire_pays)


In [20]:
def extraire_noms_societes(x):
    try:
        data = ast.literal_eval(x)  # transforme la chaîne en vraie liste
        if isinstance(data, list):
            noms = [d.get("name", "") for d in data if isinstance(d, dict)]
            return ", ".join(noms)
    except:
        return "Non renseigné"

films["Société_production_name"] = films["Société de production"].apply(extraire_noms_societes)


In [21]:
def extraire_logo_path(x):
    try:
        data = ast.literal_eval(x)  # convertit texte → liste de dict
        if isinstance(data, list):
            logos = [d.get("logo_path") for d in data if isinstance(d, dict)]
            # enlever les None et ne conserver que les vraies valeurs
            logos = [l for l in logos if l]
            return ", ".join(logos) if logos else None
    except:
        return "Non renseigné"

films["Société_production_logo"] = films["Société de production"].apply(extraire_logo_path)


In [22]:
def construire_urls(cell):
    if not isinstance(cell, str):
        return 'Non renseigné'
    base = "https://image.tmdb.org/t/p/w300"
    logos = [x.strip() for x in cell.split(",")]
    return ", ".join([base + x for x in logos if x])

films["logo_urls"] = films["Société_production_logo"].apply(construire_urls)


In [23]:
films.columns

Index(['tconst', 'Id_TMDB', 'Titre', 'Titre Original', 'Date de sortie',
       'Durée', 'Budget', 'Recettes', 'Pays d'origine', 'Langue Originale',
       'Moyenne des votes', 'Nombre de votants', 'Popularité', 'Résumé',
       'Société de production', 'Pays d'origine de la production',
       'URL complète', 'Affiche du Film', 'Genres', 'Mots clés propre',
       'Pays_production_propre', 'Société_production_name',
       'Société_production_logo', 'logo_urls'],
      dtype='object')

In [24]:
films.drop(columns=['Société_production_logo','Société de production', "Pays d'origine de la production" ],inplace=True)

In [25]:
films["Popularité"] = (
    films["Popularité"]
    .astype(str)
    .str.replace(",", ".", regex=False)
)

films["Popularité"] = pd.to_numeric(films["Popularité"], errors="coerce")


In [26]:
films["Moyenne des votes"] = (
    films["Moyenne des votes"]
    .astype(str)
    .str.replace(",", ".", regex=False)
)

films["Moyenne des votes"] = pd.to_numeric(films["Moyenne des votes"], errors="coerce")


In [27]:
def fill_with_group_median(s):
    s_non_na = s.dropna()
    if s_non_na.empty:   # Aucun budget/recette valide dans ce pays
        return s         # On laisse tel quel
    med = s_non_na.median()
    return s.fillna(med)

films["Budget"] = films.groupby("Pays_production_propre")["Budget"].transform(fill_with_group_median)
films["Recettes"] = films.groupby("Pays_production_propre")["Recettes"].transform(fill_with_group_median)


In [28]:
films["Budget"] = pd.to_numeric(films["Budget"], errors="coerce")
films["Recettes"] = pd.to_numeric(films["Recettes"], errors="coerce")

In [29]:
films.isna().sum()

tconst                       0
Id_TMDB                      0
Titre                        0
Titre Original               0
Date de sortie               0
Durée                        0
Budget                     193
Recettes                   156
Pays d'origine               0
Langue Originale             0
Moyenne des votes            0
Nombre de votants            0
Popularité                   0
Résumé                       0
URL complète                 0
Affiche du Film              0
Genres                       0
Mots clés propre             0
Pays_production_propre       0
Société_production_name      0
logo_urls                    0
dtype: int64

In [30]:
def fill_with_group_median(s):
    s_non_na = s.dropna()
    if s_non_na.empty:
        return s
    med = s_non_na.median()
    return s.fillna(med)

films["Budget"] = films.groupby("Pays_production_propre")["Budget"].transform(fill_with_group_median)
films["Recettes"] = films.groupby("Pays_production_propre")["Recettes"].transform(fill_with_group_median)


In [31]:
# Construction de la matrice de similarité

films["Budget"] = films["Budget"].fillna(films["Budget"].median())
films["Recettes"] = films["Recettes"].fillna(films["Recettes"].median())

In [32]:
# Observation des index sur les colonnes restantes pour la construction de la matrice de similarité
# films.columns

films.columns

Index(['tconst', 'Id_TMDB', 'Titre', 'Titre Original', 'Date de sortie',
       'Durée', 'Budget', 'Recettes', 'Pays d'origine', 'Langue Originale',
       'Moyenne des votes', 'Nombre de votants', 'Popularité', 'Résumé',
       'URL complète', 'Affiche du Film', 'Genres', 'Mots clés propre',
       'Pays_production_propre', 'Société_production_name', 'logo_urls'],
      dtype='object')

In [33]:
# Ré-indexation des colonnes pour une meilleure lisibilité.

films.reindex(columns=['tconst', 'Id_TMDB', 'Titre', 'Titre Original', 'Date de sortie',
       'Durée', 'Budget', 'Recettes', 'Genres', "Pays d'origine", "Langue Originale",'Mots clés propre',
       'Moyenne des votes', 'Nombre de votants', 'Popularité', 'Résumé',
       'URL complète', 'Affiche du Film',
       'Pays_production_propre', 'Société_production_name', 'logo_urls'])

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Budget,Recettes,Genres,Pays d'origine,...,Mots clés propre,Moyenne des votes,Nombre de votants,Popularité,Résumé,URL complète,Affiche du Film,Pays_production_propre,Société_production_name,logo_urls
0,tt1524169,321040,Day Camp,Day Camp,"2009-01-01 00:00:00,000",72,40000000.0,9.200000e+07,Comedy,US,...,Non renseigné,4.0,0.0,3.6,Non renseigné,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,,Non renseigné
1,tt1603444,129466,Unitards,Unitards,"2010-01-01 00:00:00,000",107,40000000.0,9.200000e+07,"Comedy,Family,Music",US,...,Non renseigné,4.0,0.0,0.0,Non renseigné,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,,Non renseigné
2,tt11702252,916721,The Year Between,The Year Between,"2023-03-03 00:00:00,000",94,40000000.0,9.200000e+07,"Comedy,Drama",US,...,"mental illness, woman director, bipolar disord...",4.2,10.0,1.8,Après avoir été diagnostiquée avec une maladie...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,"Level Forward, Full Spectrum Features, Chicago...",https://image.tmdb.org/t/p/w300/jW6q5HfAh4LrIQ...
3,tt0350703,293411,La chatte andalouse,La chatte andalouse,"2002-11-25 00:00:00,000",48,8616269.0,8.562951e+06,"Comedy,Drama",US,...,Non renseigné,4.5,4.0,0.6,"Âgé de 21 ans à peine, Sœur Angèle vend le mie...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,France,Canal+,https://image.tmdb.org/t/p/w300/9aotxauvc9685t...
4,tt1087829,12174,Aide-toi le ciel t'aidera,Aide-toi le ciel t'aidera,"2008-08-30 00:00:00,000",94,8616269.0,8.562951e+06,Comedy,US,...,Non renseigné,4.5,4.0,1.4,"Sonia, jolie black, mariée, quatre enfants, es...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,France,,Non renseigné
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,tt3915174,315162,Le Chat Potté 2 : la dernière quête,Puss in Boots: The Last Wish,"2022-12-07 00:00:00,000",102,90000000.0,4.847000e+08,"Action,Adventure,Animation",US,...,"fairy tale, sequel, talking dog, spin off, aft...",8.2,8496.0,12.5,Le Chat Potté découvre que sa passion pour l'a...,https://www.youtube.com/watch?v=5fZ5hVKw2tg,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,DreamWorks Animation,https://image.tmdb.org/t/p/w300/3BPX5VGBov8SDq...
2513,tt4154756,299536,Avengers : Infinity War,Avengers: Infinity War,"2018-04-25 00:00:00,000",149,300000000.0,2.052415e+09,"Action,Adventure,Sci-Fi",US,...,"sacrifice, magic, superhero, based on comic, s...",8.2,31157.0,24.5,Les Avengers et leurs alliés devront être prêt...,https://www.youtube.com/watch?v=Rh90-sCZWYA,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,Marvel Studios,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...
2514,tt4154796,299534,Avengers : Endgame,Avengers: Endgame,"2019-04-24 00:00:00,000",181,356000000.0,2.799439e+09,"Action,Adventure,Sci-Fi",US,...,"superhero, time travel, space travel, time mac...",8.2,26944.0,13.9,Après leur défaite face au Titan Thanos qui da...,https://www.youtube.com/watch?v=jTC2fgxMwxU,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,Marvel Studios,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...
2515,tt9362722,569094,Spider-Man : Across the Spider-Verse,Spider-Man: Across the Spider-Verse,"2023-05-31 00:00:00,000",140,100000000.0,6.908979e+08,"Action,Adventure,Animation",US,...,"new york city, hero, sacrifice, superhero, vil...",8.3,8039.0,17.2,"Après avoir retrouvé Gwen Stacy, Spider-Man, l...",https://www.youtube.com/watch?v=hrCX4trbaNE,https://image.tmdb.org/t/p/w600_and_h900_bestv...,United States of America,"Columbia Pictures, Sony Pictures Animation, Lo...",https://image.tmdb.org/t/p/w300/71BqEFAF4V3qjj...


In [34]:
# Renommage des colonnes pour un affichage plus lisible

films.rename(columns={'URL complète': 'Lien_vidéo', 
       'Pays_production_propre':'Pays_production', 
       'Société_production_name':'Société_production', 
        'Mots clés propre':'Mots clés',
       'logo_urls':'Logo'}, inplace=True)

In [35]:
# Dernière lecture des données.

films.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tconst              2517 non-null   object 
 1   Id_TMDB             2517 non-null   int64  
 2   Titre               2517 non-null   object 
 3   Titre Original      2517 non-null   object 
 4   Date de sortie      2517 non-null   object 
 5   Durée               2517 non-null   int64  
 6   Budget              2517 non-null   float64
 7   Recettes            2517 non-null   float64
 8   Pays d'origine      2517 non-null   object 
 9   Langue Originale    2517 non-null   object 
 10  Moyenne des votes   2517 non-null   float64
 11  Nombre de votants   2517 non-null   float64
 12  Popularité          2517 non-null   float64
 13  Résumé              2517 non-null   object 
 14  Lien_vidéo          2517 non-null   object 
 15  Affiche du Film     2517 non-null   object 
 16  Genres

In [36]:
display(films)

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Budget,Recettes,Pays d'origine,Langue Originale,...,Nombre de votants,Popularité,Résumé,Lien_vidéo,Affiche du Film,Genres,Mots clés,Pays_production,Société_production,Logo
0,tt1524169,321040,Day Camp,Day Camp,"2009-01-01 00:00:00,000",72,40000000.0,9.200000e+07,US,en,...,0.0,3.6,Non renseigné,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,Comedy,Non renseigné,United States of America,,Non renseigné
1,tt1603444,129466,Unitards,Unitards,"2010-01-01 00:00:00,000",107,40000000.0,9.200000e+07,US,en,...,0.0,0.0,Non renseigné,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Family,Music",Non renseigné,United States of America,,Non renseigné
2,tt11702252,916721,The Year Between,The Year Between,"2023-03-03 00:00:00,000",94,40000000.0,9.200000e+07,US,en,...,10.0,1.8,Après avoir été diagnostiquée avec une maladie...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama","mental illness, woman director, bipolar disord...",United States of America,"Level Forward, Full Spectrum Features, Chicago...",https://image.tmdb.org/t/p/w300/jW6q5HfAh4LrIQ...
3,tt0350703,293411,La chatte andalouse,La chatte andalouse,"2002-11-25 00:00:00,000",48,8616269.0,8.562951e+06,US,en,...,4.0,0.6,"Âgé de 21 ans à peine, Sœur Angèle vend le mie...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama",Non renseigné,France,Canal+,https://image.tmdb.org/t/p/w300/9aotxauvc9685t...
4,tt1087829,12174,Aide-toi le ciel t'aidera,Aide-toi le ciel t'aidera,"2008-08-30 00:00:00,000",94,8616269.0,8.562951e+06,US,en,...,4.0,1.4,"Sonia, jolie black, mariée, quatre enfants, es...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,Comedy,Non renseigné,France,,Non renseigné
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,tt3915174,315162,Le Chat Potté 2 : la dernière quête,Puss in Boots: The Last Wish,"2022-12-07 00:00:00,000",102,90000000.0,4.847000e+08,US,en,...,8496.0,12.5,Le Chat Potté découvre que sa passion pour l'a...,https://www.youtube.com/watch?v=5fZ5hVKw2tg,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Animation","fairy tale, sequel, talking dog, spin off, aft...",United States of America,DreamWorks Animation,https://image.tmdb.org/t/p/w300/3BPX5VGBov8SDq...
2513,tt4154756,299536,Avengers : Infinity War,Avengers: Infinity War,"2018-04-25 00:00:00,000",149,300000000.0,2.052415e+09,US,en,...,31157.0,24.5,Les Avengers et leurs alliés devront être prêt...,https://www.youtube.com/watch?v=Rh90-sCZWYA,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Sci-Fi","sacrifice, magic, superhero, based on comic, s...",United States of America,Marvel Studios,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...
2514,tt4154796,299534,Avengers : Endgame,Avengers: Endgame,"2019-04-24 00:00:00,000",181,356000000.0,2.799439e+09,US,en,...,26944.0,13.9,Après leur défaite face au Titan Thanos qui da...,https://www.youtube.com/watch?v=jTC2fgxMwxU,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Sci-Fi","superhero, time travel, space travel, time mac...",United States of America,Marvel Studios,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...
2515,tt9362722,569094,Spider-Man : Across the Spider-Verse,Spider-Man: Across the Spider-Verse,"2023-05-31 00:00:00,000",140,100000000.0,6.908979e+08,US,en,...,8039.0,17.2,"Après avoir retrouvé Gwen Stacy, Spider-Man, l...",https://www.youtube.com/watch?v=hrCX4trbaNE,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Animation","new york city, hero, sacrifice, superhero, vil...",United States of America,"Columbia Pictures, Sony Pictures Animation, Lo...",https://image.tmdb.org/t/p/w300/71BqEFAF4V3qjj...


La donnée est nettoyée, les valeurs vides traités, les colonnes non pertinente supprimé.

Le dataframe est à présent près pour le traitement des dummies

# Encodage des variables

In [37]:
# Encodage de la colonne 'Genres' en dummies pour usage binaire du ML
# films['Genres'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

dummies = films['Genres'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

display(dummies)

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2513,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2514,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2515,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Je concate le df initial et le dummies

df_films = pd.concat([films,dummies], axis=1).drop(columns=['Genres'])

In [39]:
# Même démarche que pour la colonne Genres avec pays_production

# films['pays_production'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

dummies_2 = films['Pays_production'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

display(dummies_2)

Unnamed: 0,Afghanistan,Algeria,Argentina,Armenia,Australia,Austria,Belgium,Bosnia and Herzegovina,Brazil,Bulgaria,...,South Korea,Spain,Swaziland,Sweden,Switzerland,Taiwan,Thailand,United Arab Emirates,United Kingdom,United States of America
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2513,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [40]:
# Je concate le df initial et le dummies

df_films = pd.concat([df_films,dummies_2], axis=1).drop(columns= ['Pays_production'])
display(df_films.shape)

(2517, 187)

In [41]:
# sur une copie du datframe je stock les comptage unitaire de chaque mots clés
films_copie = films['Mots clés'].str.split(',').explode().value_counts().sort_values(ascending=False)
display(films_copie.describe(percentiles= (0.8,0.9 , 0.95, 0.96, 0.97)))

count    8122.000000
mean        2.979685
std         7.837514
min         1.000000
50%         1.000000
80%         3.000000
90%         5.000000
95%        10.000000
96%        12.000000
97%        14.000000
max       334.000000
Name: count, dtype: float64

In [42]:
# 1. On éclate tous les mots clés film par film
mot_cle_individuel = (
    films['Mots clés']
    .dropna()
    .str.split(',')      # "a,b,c" -> ["a","b","c"]
    .explode()           # 1 mot clé par ligne
    .str.strip()         # on enlève les espaces
)

# 2. On enlève la pseudo valeur "Non renseigné"
mot_cle_individuel = mot_cle_individuel[mot_cle_individuel != 'Non renseigné']

# 3. Fréquence de chaque mot clé
mot_cle_comptage = mot_cle_individuel.value_counts()

# 4. Seuil du 95e percentile
top5 = mot_cle_comptage.quantile(0.95)

# 5. Ensemble des mots clés à conserver (top 5 %)
top_mot_cle = mot_cle_comptage[mot_cle_comptage >= top5].index
top_mot_cle_set = set(top_mot_cle)  # plus rapide pour le test d'appartenance

In [43]:
def filter_top_keywords(cell):
    # cas NaN
    if pd.isna(cell):
        return np.nan
    
    # on découpe les mots clés de ce film
    lst = [k.strip() for k in cell.split(',')]
    
    # on ne garde que ceux du top 5 %
    kept = [k for k in lst if k in top_mot_cle_set]
    
    # si un film n'a aucun mot clé "top 5 %"
    if not kept:
        return np.nan    # ou '' si tu préfères
    return ','.join(kept)

films['Mots_cles_top5'] = films['Mots clés'].apply(filter_top_keywords)
display(films)

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Budget,Recettes,Pays d'origine,Langue Originale,...,Popularité,Résumé,Lien_vidéo,Affiche du Film,Genres,Mots clés,Pays_production,Société_production,Logo,Mots_cles_top5
0,tt1524169,321040,Day Camp,Day Camp,"2009-01-01 00:00:00,000",72,40000000.0,9.200000e+07,US,en,...,3.6,Non renseigné,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,Comedy,Non renseigné,United States of America,,Non renseigné,
1,tt1603444,129466,Unitards,Unitards,"2010-01-01 00:00:00,000",107,40000000.0,9.200000e+07,US,en,...,0.0,Non renseigné,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Family,Music",Non renseigné,United States of America,,Non renseigné,
2,tt11702252,916721,The Year Between,The Year Between,"2023-03-03 00:00:00,000",94,40000000.0,9.200000e+07,US,en,...,1.8,Après avoir été diagnostiquée avec une maladie...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama","mental illness, woman director, bipolar disord...",United States of America,"Level Forward, Full Spectrum Features, Chicago...",https://image.tmdb.org/t/p/w300/jW6q5HfAh4LrIQ...,"mental illness,woman director"
3,tt0350703,293411,La chatte andalouse,La chatte andalouse,"2002-11-25 00:00:00,000",48,8616269.0,8.562951e+06,US,en,...,0.6,"Âgé de 21 ans à peine, Sœur Angèle vend le mie...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama",Non renseigné,France,Canal+,https://image.tmdb.org/t/p/w300/9aotxauvc9685t...,
4,tt1087829,12174,Aide-toi le ciel t'aidera,Aide-toi le ciel t'aidera,"2008-08-30 00:00:00,000",94,8616269.0,8.562951e+06,US,en,...,1.4,"Sonia, jolie black, mariée, quatre enfants, es...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,Comedy,Non renseigné,France,,Non renseigné,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,tt3915174,315162,Le Chat Potté 2 : la dernière quête,Puss in Boots: The Last Wish,"2022-12-07 00:00:00,000",102,90000000.0,4.847000e+08,US,en,...,12.5,Le Chat Potté découvre que sa passion pour l'a...,https://www.youtube.com/watch?v=5fZ5hVKw2tg,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Animation","fairy tale, sequel, talking dog, spin off, aft...",United States of America,DreamWorks Animation,https://image.tmdb.org/t/p/w300/3BPX5VGBov8SDq...,"fairy tale,sequel,spin off,aftercreditsstinger..."
2513,tt4154756,299536,Avengers : Infinity War,Avengers: Infinity War,"2018-04-25 00:00:00,000",149,300000000.0,2.052415e+09,US,en,...,24.5,Les Avengers et leurs alliés devront être prêt...,https://www.youtube.com/watch?v=Rh90-sCZWYA,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Sci-Fi","sacrifice, magic, superhero, based on comic, s...",United States of America,Marvel Studios,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,"magic,superhero,based on comic,space,super pow..."
2514,tt4154796,299534,Avengers : Endgame,Avengers: Endgame,"2019-04-24 00:00:00,000",181,356000000.0,2.799439e+09,US,en,...,13.9,Après leur défaite face au Titan Thanos qui da...,https://www.youtube.com/watch?v=jTC2fgxMwxU,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Sci-Fi","superhero, time travel, space travel, time mac...",United States of America,Marvel Studios,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,"superhero,time travel,space travel,based on co..."
2515,tt9362722,569094,Spider-Man : Across the Spider-Verse,Spider-Man: Across the Spider-Verse,"2023-05-31 00:00:00,000",140,100000000.0,6.908979e+08,US,en,...,17.2,"Après avoir retrouvé Gwen Stacy, Spider-Man, l...",https://www.youtube.com/watch?v=hrCX4trbaNE,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Animation","new york city, hero, sacrifice, superhero, vil...",United States of America,"Columbia Pictures, Sony Pictures Animation, Lo...",https://image.tmdb.org/t/p/w300/71BqEFAF4V3qjj...,"new york city,hero,superhero,villain,based on ..."


In [44]:
# Même démarche que pour la colonne Genres avec mots_clés

# films['Mots clés'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

dummies_3 = films['Mots_cles_top5'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

display(dummies_3)

Unnamed: 0,18th century,1930s,1940s,1950s,1960s,1970s,1980s,1990s,19th century,3d animation,...,widower,witch,witty,wolf,woman director,wonder,world war ii,writer,zealous,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2513,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Je concate le df initial et le dummies

df_films = pd.concat([df_films,dummies_3], axis=1).drop(columns=['Mots clés'])
display(df_films.shape)

(2517, 571)

In [46]:
# Même démarche que pour la colonne Genres avec mots_clés
# films['Mots clés'].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule

dummies_4 = films["Pays d'origine"].str.get_dummies(',')  # syntaxe lorsque l'on a plusieurs catégories dans la même cellule
display(dummies_4)

Unnamed: 0,AE,AM,AR,AS,AT,AU,BE,BO,CA,CH,...,RU,SA,SE,SG,SZ,TH,TW,UA,US,ZA
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2513,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [47]:
df_films = pd.concat([df_films,dummies_4], axis=1).drop(columns=["Pays d'origine"])
display(df_films.shape)

(2517, 684)

In [48]:
films_copie_sp = films['Société_production'].str.split(',').explode().value_counts().sort_values(ascending=False)
display(films_copie_sp.describe(percentiles= (0.8,0.9 , 0.95, 0.96, 0.97, 0.995, 0.996, 0.997, 0.998, 0.999)))

count    4444.000000
mean        2.110486
std         4.470745
min         1.000000
50%         1.000000
80%         2.000000
90%         3.000000
95%         6.000000
96%         7.000000
97%         8.000000
99.5%      30.785000
99.6%      35.228000
99.7%      44.026000
99.8%      50.228000
99.9%      58.013000
max        99.000000
Name: count, dtype: float64

In [49]:
# 1. On éclate touses les socitété film par film
societe_prod_individuelle = (
    films['Société_production']
    .dropna()
    .str.split(',')      # "a,b,c" -> ["a","b","c"]
    .explode()           # 1 mot clé par ligne
    .str.strip()         # on enlève les espaces
)

# 2. On enlève la pseudo valeur "Non renseigné"
societe_prod_individuelle = societe_prod_individuelle[societe_prod_individuelle != 'Non renseigné']

# 3. Fréquence de chaque mot clé
societe_prod_comptage = societe_prod_individuelle.value_counts()

# 4. Seuil du 95e percentile
top5_sp = societe_prod_comptage.quantile(0.95)

# 5. Ensemble des mots clés à conserver (top 5 %)
top_societe_prod = societe_prod_comptage[societe_prod_comptage >= top5_sp].index
top_societe_prod_set = set(top_societe_prod)  # plus rapide pour le test d'appartenance

In [50]:
top_societe_prod_set

{'',
 '20th Century Fox',
 '20th Century Fox Animation',
 '20th Century Studios',
 '21 Laps Entertainment',
 '3 Arts Entertainment',
 'A24',
 'ARP Sélection',
 'ARTE',
 'ARTE France Cinéma',
 'Aardman',
 'Agat Films & Cie / Ex Nihilo',
 'Amazon MGM Studios',
 'Amazon Studios',
 'Amblin Entertainment',
 'American Empirical Pictures',
 'Animal Logic',
 'Annapurna Pictures',
 'Anonymous Content',
 'Anton Capital Entertainment',
 'Apatow Productions',
 'Apple Studios',
 'Arad Productions',
 'Artémis Productions',
 'Atlas Entertainment',
 'Auvergne-Rhône-Alpes Cinéma',
 'BBC Film',
 'Bac Films',
 'Bad Robot',
 'BeTV',
 'Belgacom',
 'BiM Distribuzione',
 'Big Beach',
 'Big Indie Pictures',
 'Big Talk Studios',
 'Black Label Media',
 'Blue Sky Studios',
 'Blumhouse Productions',
 'Bron Studios',
 'CNC',
 'Canal+',
 'Castle Rock Entertainment',
 'Chapter 2',
 'Chernin Entertainment',
 'Chi-Fou-Mi Productions',
 'Ciné+',
 'CinéCinéma',
 'Cinéfrance Studios',
 'Closest to the Hole Productions',


In [51]:
def filter_top_societe_prod(cell):
    # cas NaN
    if pd.isna(cell):
        return np.nan
    
    # on découpe les société de ce film
    lst = [k.strip() for k in cell.split(',')]
    
    # on ne garde que ceux du top 5 %
    kept = [k for k in lst if k in top_societe_prod_set]
    
    # si un film n'a aucun mot clé "top 5 %"
    if not kept:
        return np.nan    # ou '' si tu préfères
    return ','.join(kept)

films['Société_production_top5'] = films['Société_production'].apply(filter_top_societe_prod)

In [52]:
dummies_5 = films['Société_production_top5'].str.get_dummies(',')  

display(dummies_5)

Unnamed: 0,20th Century Fox,20th Century Fox Animation,20th Century Studios,21 Laps Entertainment,3 Arts Entertainment,A24,ARP Sélection,ARTE,ARTE France Cinéma,Aardman,...,Wild Bunch,Wonderland Sound and Vision,Working Title Films,XYZ Films,ZDF/Arte,dentsu,di Bonaventura Pictures,uFilm,uFund,uMedia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2513,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2515,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
df_films = pd.concat([df_films,dummies_5], axis=1).drop(columns=["Société_production"])
display(df_films.shape)

(2517, 905)

In [54]:
# je converti la colonne en format datetime
films['Date de sortie'] = pd.to_datetime(films['Date de sortie'], errors='coerce')
# je ne conserve que l'année de sortie
films['Annee_sortie'] = films['Date de sortie'].dt.year

  films['Date de sortie'] = pd.to_datetime(films['Date de sortie'], errors='coerce')


In [55]:
df_films = pd.concat([df_films,films['Annee_sortie']], axis=1).drop(columns=["Date de sortie"])
display(df_films.shape)

(2517, 905)

In [56]:
films[films['Moyenne des votes']>8]

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Budget,Recettes,Pays d'origine,Langue Originale,...,Lien_vidéo,Affiche du Film,Genres,Mots clés,Pays_production,Société_production,Logo,Mots_cles_top5,Société_production_top5,Annee_sortie
125,tt6977442,461844,Liyana,Liyana,2017-07-15,77,27000000.0,38058340.0,US,en,...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Animation,Documentary","africa, orphan, storytelling, part animation, ...","Qatar, Swaziland, United States of America","Intaba Creative, DFI, Shine Global, Fork Films...",https://image.tmdb.org/t/p/w300/shnLD1Pj9d1vB2...,"africa,orphan",,2017
126,tt0232537,385893,Road Dogs,Road Dogs,2003-07-03,90,40000000.0,92000000.0,US,en,...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Drama",Non renseigné,United States of America,Epiphany Pictures,Non renseigné,,,2003
237,tt0319821,214436,Maria,Maria,2003-11-21,97,27000000.0,38058340.0,RO,ro,...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama",Non renseigné,"Germany, Romania, France",Artis Film,Non renseigné,,,2003
426,tt6478218,313995,Houdini : Le Film,Houdini - Le Film,2014-12-27,52,8616269.0,8562951.0,FR,fr,...,https://www.youtube.com/watch?v=VCHAnHk-0N8,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Adventure,Animation",magic,France,"Dandelooo, Walking The Dog, 2 Minutes",https://image.tmdb.org/t/p/w300/eOLN5m2nnhaKLf...,magic,,2014
772,tt0468569,155,The Dark Knight : Le Chevalier noir,The Dark Knight,2008-07-16,152,185000000.0,1004558000.0,US,en,...,https://www.youtube.com/watch?v=UMgb3hQCb08,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Crime,Drama","sadism, chaos, secret identity, crime fighter,...","United Kingdom, United States of America","Warner Bros. Pictures, Legendary Pictures, Syn...",https://image.tmdb.org/t/p/w300/ingPVoHnINIrFR...,"secret identity,superhero,anti hero,based on c...","Warner Bros. Pictures,Legendary Pictures,Syncopy",2008
926,tt5869370,644479,Dedicada A Mi Ex,Dedicada A Mi Ex,2019-11-01,94,27000000.0,1315376.0,EC,es,...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,Comedy,"fame, rock band, heartbreak, sketch comedy, ec...","Colombia, Ecuador, United States of America","Touché Films, Dynamo",https://image.tmdb.org/t/p/w300/lyjwhVUoPpfBy8...,,,2019
1056,tt4772188,823219,"Flow, le chat qui n'avait plus peur de l'eau",Straume,2024-08-29,85,3700000.0,17660110.0,LV,lv,...,https://www.youtube.com/watch?v=vob8AGZM4Wc,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Adventure,Animation,Family","ocean, cat, boat, dystopia, dog, ruins, flood,...","Latvia, Belgium, France","Dream Well Studio, Sacrebleu Productions, Take...",https://image.tmdb.org/t/p/w300/bEsO7l12cRGAY2...,"cat,dystopia,dog,3d animation","ARTE France Cinéma,RTBF",2024
1057,tt0347149,4935,Le Château ambulant,ハウルの動く城,2004-09-09,119,24000000.0,236049800.0,JP,ja,...,https://www.youtube.com/watch?v=n4zU2R1eyPc,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Adventure,Animation,Family","witch, flying, rain, castle, scarecrow, body e...",Japan,"Studio Ghibli, Tokuma Shoten, Nippon Televisio...",https://image.tmdb.org/t/p/w300/uFuxPEZRUcBTEi...,"witch,flying,melancholy,steampunk,anime,though...",dentsu,2004
1152,tt1675434,77338,Intouchables,Intouchables,2011-11-02,112,13000000.0,426590300.0,FR,fr,...,https://www.youtube.com/watch?v=cXu2MhWYUuE,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama","friendship, male friendship, based on true sto...",France,"Gaumont, Quad Productions, Chaocorp, Ten Films...",https://image.tmdb.org/t/p/w300/nda3dTUYdDrJ6r...,"friendship,male friendship,based on true story...","Gaumont,Quad Productions,TF1 Films Production",2011
1222,tt1375666,27205,Inception,Inception,2010-07-15,148,160000000.0,839030600.0,"US, GB",en,...,https://www.youtube.com/watch?v=9aijfXkbkXg,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Action,Adventure,Sci-Fi","rescue, mission, dreams, airplane, paris, fran...","United Kingdom, United States of America","Legendary Pictures, Syncopy, Warner Bros. Pict...",https://image.tmdb.org/t/p/w300/5UQsZrfbfG2dYJ...,"rescue,mission,dreams,airplane,paris,france,ki...","Legendary Pictures,Syncopy,Warner Bros. Pictures",2010


# Normalisation des données

In [57]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

In [58]:
films.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   tconst                   2517 non-null   object        
 1   Id_TMDB                  2517 non-null   int64         
 2   Titre                    2517 non-null   object        
 3   Titre Original           2517 non-null   object        
 4   Date de sortie           2517 non-null   datetime64[ns]
 5   Durée                    2517 non-null   int64         
 6   Budget                   2517 non-null   float64       
 7   Recettes                 2517 non-null   float64       
 8   Pays d'origine           2517 non-null   object        
 9   Langue Originale         2517 non-null   object        
 10  Moyenne des votes        2517 non-null   float64       
 11  Nombre de votants        2517 non-null   float64       
 12  Popularité               2517 non-

In [59]:
# Normalisation des données
scaler = MinMaxScaler()

# Correction de la colonne 'Moyenne des votes' au lieu de 'Moyenne des notes'
cols_scaler = ['Durée','Moyenne des votes', 'Popularité', 'Budget', 'Recettes', 'Nombre de votants', 'Annee_sortie']
df_films[cols_scaler] = scaler.fit_transform(df_films[cols_scaler])

In [60]:
# Je sauvegarde le fichier clean en csv (pour streamlit)
df_films.to_csv("df_films_scaler.csv",index = False)

In [61]:
df_films.head(10)

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Durée,Budget,Recettes,Langue Originale,Moyenne des votes,Nombre de votants,...,Wonderland Sound and Vision,Working Title Films,XYZ Films,ZDF/Arte,dentsu,di Bonaventura Pictures,uFilm,uFund,uMedia,Annee_sortie
0,tt1524169,321040,Day Camp,Day Camp,0.291498,0.081627,0.031467,en,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.36
1,tt1603444,129466,Unitards,Unitards,0.433198,0.081627,0.031467,en,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.4
2,tt11702252,916721,The Year Between,The Year Between,0.380567,0.081627,0.031467,en,0.033333,0.000261,...,0,0,0,0,0,0,0,0,0,0.92
3,tt0350703,293411,La chatte andalouse,La chatte andalouse,0.194332,0.017564,0.002928,en,0.083333,0.000105,...,0,0,0,0,0,0,0,0,0,0.08
4,tt1087829,12174,Aide-toi le ciel t'aidera,Aide-toi le ciel t'aidera,0.380567,0.017564,0.002928,en,0.083333,0.000105,...,0,0,0,0,0,0,0,0,0,0.32
5,tt2519088,731774,Dangerous People,Dangerous People,0.384615,0.081627,0.031467,en,0.116667,7.8e-05,...,0,0,0,0,0,0,0,0,0,0.6
6,tt3627780,308919,Le Noël de mes dix ans,Wishin' and Hopin',0.336032,0.081627,0.031467,en,0.133333,0.000314,...,0,0,0,0,0,0,0,0,0,0.56
7,tt2644044,395978,Where is Rocky II?,Where is Rocky II?,0.376518,0.05509,0.013017,en,0.166667,0.0,...,0,0,0,0,0,0,0,0,0,0.64
8,tt0215516,48345,Nobody's Baby,Nobody's Baby,0.445344,0.081627,0.031467,en,0.183333,0.000732,...,0,0,0,0,0,0,0,0,0,0.04
9,tt2609778,319017,Le dernier hétéro sur Terre,The Last Straight Man,0.445344,0.081627,0.031467,en,0.183333,0.000994,...,0,0,0,0,0,0,0,0,0,0.56


# Machine Learning

In [62]:
# Définissons X
X = df_films.drop(columns=['tconst', 'Id_TMDB', 'Titre', 'Titre Original', 'Lien_vidéo','Affiche du Film', 'Logo', 'Résumé', 'Langue Originale']) 


In [63]:
# Je définis mon modèle

model = NearestNeighbors(n_neighbors=6)  # Car 5 films à recommander

# et je l'entraine sur mon dataframe df_films

model.fit(X)

In [64]:
distance, indices = model.kneighbors(X)

In [65]:
print(distance) # Retourne la distance entre les films et ses plus proches voisins

[[2.98023224e-08 3.29891152e-01 4.18444109e-01 6.05044480e-01
  1.04068209e+00 1.06102053e+00]
 [0.00000000e+00 1.05371409e+00 1.42258580e+00 1.45685953e+00
  1.46743373e+00 1.48125066e+00]
 [0.00000000e+00 1.10960631e+00 1.44737220e+00 1.44997742e+00
  1.45146888e+00 1.47097272e+00]
 ...
 [8.42936970e-08 3.37976438e+00 3.39602031e+00 3.48235951e+00
  3.51670140e+00 3.52347796e+00]
 [8.42936970e-08 3.18023977e+00 3.64488365e+00 4.02061934e+00
  4.03182616e+00 4.04468338e+00]
 [0.00000000e+00 3.03390985e+00 3.04145448e+00 3.04552888e+00
  3.18023977e+00 3.20541958e+00]]


In [66]:
print(indices) # Retournes les index des voisins

[[   0   23 1779  682 1751   26]
 [   1   26    0   23  971 1779]
 [   2  785  242   43   34 2045]
 ...
 [2514 2375 2390 2513 2468 2476]
 [2515 2516 2417 2485   82 1852]
 [2516 2417 2101 1848 2515 1869]]


In [67]:
print(indices.shape)

(2517, 6)


In [68]:
input_nom = "Avengers : Infinity War"
display(df_films[df_films['Titre'] == input_nom])
film_index = df_films.index[df_films['Titre'] == input_nom][0]
#display(df_films.loc[film_index, ['tconst', 'Id_TMDB', 'Titre']])
reco_indices = indices[film_index, 1:]   # 1: pour ne pas garder Coco lui-même

print("Index du film demandé :", film_index)
print("\n Les 5 films recommandés sont: ",reco_indices)

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Durée,Budget,Recettes,Langue Originale,Moyenne des votes,Nombre de votants,...,Wonderland Sound and Vision,Working Title Films,XYZ Films,ZDF/Arte,dentsu,di Bonaventura Pictures,uFilm,uFund,uMedia,Annee_sortie
2513,tt4154756,299536,Avengers : Infinity War,Avengers: Infinity War,0.603239,0.61236,0.701991,en,0.7,0.814669,...,0,0,0,0,0,0,0,0,0,0.72


Index du film demandé : 2513

 Les 5 films recommandés sont:  [2476 2438 2458 2390 2375]


In [69]:
recommandations = df_films.iloc[reco_indices,:10]
display(recommandations)

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Durée,Budget,Recettes,Langue Originale,Moyenne des votes,Nombre de votants
2476,tt1825683,284054,Black Panther,Black Panther,0.54251,0.408232,0.461717,en,0.566667,0.600314
2438,tt1300854,68721,Iron Man 3,Iron Man 3,0.526316,0.408232,0.415766,en,0.483333,0.599686
2458,tt9419884,453395,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,0.510121,0.591948,0.326905,en,0.533333,0.258387
2390,tt0848228,24428,Avengers,The Avengers,0.587045,0.449058,0.519483,en,0.65,0.891018
2375,tt3498820,271110,Captain America : Civil War,Captain America: Civil War,0.59919,0.510296,0.395062,en,0.566667,0.615113


# Fonction pour l'automatisation des recommandations

In [70]:
def recommandation_films(Titre:str):
  info_film = df_films[df_films['Titre'] == Titre]
  indice_films = info_film.index[0]
  reco_indices = indices[indice_films,1:]
  df_reco = df_films.iloc[reco_indices,:22]
  return df_reco

In [71]:
recommandation_films("Avengers : Infinity War")

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Durée,Budget,Recettes,Langue Originale,Moyenne des votes,Nombre de votants,...,Lien_vidéo,Affiche du Film,Logo,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary
2476,tt1825683,284054,Black Panther,Black Panther,0.54251,0.408232,0.461717,en,0.566667,0.600314,...,https://www.youtube.com/watch?v=yevh8uctrU0,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,1,1,0,0,0,0,0
2438,tt1300854,68721,Iron Man 3,Iron Man 3,0.526316,0.408232,0.415766,en,0.483333,0.599686,...,https://www.youtube.com/watch?v=wnEr73Rq3Ac,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,1,1,0,0,0,0,0
2458,tt9419884,453395,Doctor Strange in the Multiverse of Madness,Doctor Strange in the Multiverse of Madness,0.510121,0.591948,0.326905,en,0.533333,0.258387,...,https://www.youtube.com/watch?v=J7u1bDo_4sk,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,1,1,0,0,0,0,0
2390,tt0848228,24428,Avengers,The Avengers,0.587045,0.449058,0.519483,en,0.65,0.891018,...,https://www.youtube.com/watch?v=b-kTeJhHOhc,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,1,0,0,0,0,0,0
2375,tt3498820,271110,Captain America : Civil War,Captain America: Civil War,0.59919,0.510296,0.395062,en,0.566667,0.615113,...,https://www.youtube.com/watch?v=gEGp7oQnzQU,https://image.tmdb.org/t/p/w600_and_h900_bestv...,https://image.tmdb.org/t/p/w300/hUzeosd33nzE5M...,1,0,0,0,0,0,0


# Essais avec renvoi sur films

J'ai changé en faisant un renvoi sur le dataframe d'origine dont les valeurs ne sont pas indiciés, complet dans les informations et directement lisible pour une personne lambda. 

In [72]:
def recommandation_films(Titre:str):
  info_film = df_films[df_films['Titre'] == Titre]
  indice_films = info_film.index[0]
  reco_indices = indices[indice_films,1:]
  df_reco = films.iloc[reco_indices,:21]
  return df_reco

In [73]:
recommandation_films("Nobody's Baby")

Unnamed: 0,tconst,Id_TMDB,Titre,Titre Original,Date de sortie,Durée,Budget,Recettes,Pays d'origine,Langue Originale,...,Nombre de votants,Popularité,Résumé,Lien_vidéo,Affiche du Film,Genres,Mots clés,Pays_production,Société_production,Logo
1751,tt1024733,4204,Un Amour de père,$5 a Day,2008-09-06,97,40000000.0,92000000.0,US,en,...,95.0,4.2,Nat est un homme qui aime profiter de la vie. ...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama",Non renseigné,United States of America,"Capitol Films, Carol Baum Productions, Goldenr...",https://image.tmdb.org/t/p/w300/aqiR1prl8N00Ff...
1797,tt0914797,13996,Bottle Shock,Bottle Shock,2008-09-05,110,5000000.0,4040588.0,US,en,...,222.0,3.7,Les débuts du vin en Californie suite au désor...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama","wine, winegrowing",United States of America,"Intellectual Properties Worldwide, Zininsa Fil...",Non renseigné
796,tt1687281,51993,Terri,Terri,2011-07-01,105,40000000.0,655802.0,US,en,...,151.0,3.6,Terri vit dans une petite ville des États-Unis...,https://www.youtube.com/watch?v=_E2tlk7BC0Q,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama","overweight child, underage drinking, urination...",United States of America,"Verisimilitude, Silverwood Films, Periscope En...",https://image.tmdb.org/t/p/w300/qEoPv39l36BaWH...
52,tt1821426,172631,Family Weekend,Family Weekend,2013-03-28,105,40000000.0,92000000.0,US,en,...,219.0,3.4,"Une jeune fille de 16 ans, agacée par le manqu...",Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama",Non renseigné,United States of America,"Footprint Features, Bedford Falls Productions",https://image.tmdb.org/t/p/w300/tjL893PyjsZ1PE...
1765,tt3881680,386501,Nouveau job pour une nouvelle vie,Waffle Street,2015-09-24,86,40000000.0,92000000.0,US,en,...,128.0,4.8,Secoué par le rôle qu’il a joué dans la crise ...,Non renseigné,https://image.tmdb.org/t/p/w600_and_h900_bestv...,"Comedy,Drama","diner, based on memoir or autobiography, waffl...",United States of America,"Side Gig Productions, 6 Foot Films",https://image.tmdb.org/t/p/w300/10F53rRqBLkfVx...


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=eb8103ec-0d85-46cb-8ff1-660cc3c3a559' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>