In [10]:
import pandas as pd

# SQL client

In [5]:
import os

# Output directory for saved figures; create it up front so later saves
# do not fail on a missing folder (no error if it already exists).
figures = "./figures/"
os.makedirs(name=figures, exist_ok=True)

In [6]:
def save(fig, filename, path=figures):
    """Save a matplotlib figure to disk.

    Parameters
    ----------
    fig : object exposing ``savefig`` (e.g. a matplotlib ``Figure``)
        The figure to write.
    filename : str
        Name of the output file, including its extension.
    path : str, optional
        Directory to write into. Defaults to the notebook-level
        ``figures`` directory.
    """
    # os.path.join inserts the separator only when needed, so `path` works
    # with or without a trailing slash (plain concatenation did not).
    fig.savefig(os.path.join(path, filename), bbox_inches="tight")

# Data

Les données sont extraites de la base de données Azure : exécuter le script `extraction.py`. On suppose que les données ont été téléchargées dans le dossier `./data/`.

In [5]:
# Tab-separated extract, relative to the notebook's working directory.
data_path = "./data/data_ext.tsv"

Chargement des données :

In [8]:
# quoting=3 is csv.QUOTE_NONE: quote characters in the file are kept as
# literal text instead of being interpreted as field delimiters.
df = pd.read_csv(
    data_path,
    sep="\t",
    quoting=3,
)

In [9]:
# Preview the first rows to check how the file was parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,tconst,genre_1,genre_2,genre_3,primaryTitle,isAdult,startYear,runtimeMinutes,actor_1,actor_2,actor_3,director_1,director_2,director_3,averageRating,numVotes
0,0,tt6265394,Comedy,,,What If by SnG Comedy,False,2016,,nm5106779,nm5022186,nm8595438,,,,6.6,9.0
1,1,tt8473584,Reality-TV,,,Swapping Amish,False,2019,,nm5819047,nm7017294,,,,,,
2,2,tt7161172,Adventure,Comedy,Fantasy,TableTalk,False,2017,,nm5652225,nm4435805,nm4139879,,,,,
3,3,tt6423374,Animation,,,Mpampoulas Ae,False,2012,,nm0080900,,,,,,,
4,4,tt7607712,Fantasy,,,O Filme de Schrödinger,False,2017,,nm9401958,nm9402248,nm9401952,,,,,


# Système de Recommandation

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [11]:
# Column groups handed to the ColumnTransformer, one list per kind of handling.
numeric_features = [
    "isAdult",
    "startYear",
    "runtimeMinutes",
    "averageRating",
    "numVotes",
]
# genre_1 .. genre_3
categorical_features = [f"genre_{rank}" for rank in range(1, 4)]
# tconst, then actor_1 .. actor_3, then director_1 .. director_3
text_features = ["tconst"] + [
    f"{role}_{rank}" for role in ("actor", "director") for rank in range(1, 4)
]

In [15]:
# Numeric columns: fill missing values with the column median.
# (strategy='constant' with fill_value='median' would have used the literal
# string 'median' as the replacement value, which is not a valid numeric fill.)
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
# Categorical and text columns: replace missing values with an empty string.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=''))])
textual_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=''))])

In [16]:
# Route each column group to its own transformer. Columns not listed in any
# group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        # Fixed: the text branch must receive the text columns; it was
        # previously given the categorical columns a second time.
        ("txt", textual_transformer, text_features),
    ]
)

In [17]:
# Show the assembled preprocessor as the cell's output.
preprocessor

In [18]:
# Import from the public namespace: the underscore-prefixed module is private
# and its location is not guaranteed to stay stable between releases.
from sklearn.utils import estimator_html_repr

In [19]:
# Returns the estimator's HTML representation as a plain string (raw markup,
# not rendered), so the cell output is the HTML source itself.
estimator_html_repr(preprocessor)

'<style>#sk-container-id-2 {\n  /* Definition of color scheme common for light and dark mode */\n  --sklearn-color-text: #000;\n  --sklearn-color-text-muted: #666;\n  --sklearn-color-line: gray;\n  /* Definition of color scheme for unfitted estimators */\n  --sklearn-color-unfitted-level-0: #fff5e6;\n  --sklearn-color-unfitted-level-1: #f6e4d2;\n  --sklearn-color-unfitted-level-2: #ffe0b3;\n  --sklearn-color-unfitted-level-3: chocolate;\n  /* Definition of color scheme for fitted estimators */\n  --sklearn-color-fitted-level-0: #f0f8ff;\n  --sklearn-color-fitted-level-1: #d4ebff;\n  --sklearn-color-fitted-level-2: #b3dbfd;\n  --sklearn-color-fitted-level-3: cornflowerblue;\n\n  /* Specific color for light theme */\n  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n  --sklearn-color-bord