In [1]:
import pandas as pd
import numpy as np
from sklearn.utils._estimator_html_repr import estimator_html_repr
import matplotlib.pyplot as plt

# Data

Les données extraites de la base de données Azure. `Executer le script extraction.py`. On suppose que les données ont été téléchargée dans le dossier `./data/`.

In [2]:
data_path = './data/data_ext_v4.tsv'

Chargement des données :

In [3]:
df = pd.read_csv(data_path, sep='\t', quoting=3)

In [4]:
df.head()

Unnamed: 0,tconst,titleType,genre_1,genre_2,genre_3,primaryTitle,isAdult,startYear,runtimeMinutes,actor_1,actor_2,actor_3,director_1,director_2,director_3,averageRating,numVotes
0,tt2291966,movie,Biography,Documentary,Family,Farming 101,False,2012,80.0,nm5174409,,,nm0709664,,,,
1,tt2378115,movie,,,,Cecilia y Juan,False,2012,,nm1570600,nm5021870,,nm3913456,,,,
2,tt2530242,tvMovie,Comedy,,,Das Millionen Rennen,False,2012,89.0,nm0695127,nm0517885,nm0499671,nm1054437,,,6.1,68.0
3,tt27505330,movie,Drama,,,Nevada,False,2023,89.0,nm14752449,nm1235719,nm15159468,nm3807259,nm6281196,,6.2,10.0
4,tt22871554,movie,Drama,Mystery,Thriller,A Haunting Winter's Tale,False,2022,65.0,nm13391370,nm13392830,nm13392832,nm13390267,,,7.7,6.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669273 entries, 0 to 669272
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          669273 non-null  object 
 1   titleType       669273 non-null  object 
 2   genre_1         633878 non-null  object 
 3   genre_2         197851 non-null  object 
 4   genre_3         90136 non-null   object 
 5   primaryTitle    669271 non-null  object 
 6   isAdult         669273 non-null  bool   
 7   startYear       669273 non-null  int64  
 8   runtimeMinutes  407667 non-null  float64
 9   actor_1         444242 non-null  object 
 10  actor_2         394240 non-null  object 
 11  actor_3         366097 non-null  object 
 12  director_1      392268 non-null  object 
 13  director_2      36476 non-null   object 
 14  director_3      5918 non-null    object 
 15  averageRating   320451 non-null  float64
 16  numVotes        320451 non-null  float64
dtypes: bool(1)

In [6]:
df = df[df['averageRating'].notna()].reset_index()
df.shape

(320451, 18)

In [7]:
df.fillna({'genre_1' : 'None1', 'genre_2' : 'None2', 'genre_3': 'None3'}, inplace=True)

In [8]:
df['isAdult'] = df['isAdult'].astype(int)

In [11]:
from unidecode import unidecode
import re
from nltk.stem import PorterStemmer
def preprocess_title(title):
    """
    Preprocess movie/show titles for use in recommendation systems.
    
    Args:
        title: The primary title to preprocess. Can be string or float (np.nan)
        
    Returns:
        str: The preprocessed title
    """
    # Handle missing values
    if isinstance(title, float) and np.isnan(title):
        return "notitle"
    
    # Convert to string if not already
    title = str(title)
    
    # Convert to lowercase
    title = title.lower()
    
    # Normalize unicode characters
    title = unidecode(title)

    
    # Remove special characters and extra whitespace
    title = re.sub(r'[^\w\s]', ' ', title)  # Replace special chars with space
    title = re.sub(r'\s+', ' ', title)       # Replace multiple spaces with single space
    
    # Remove common words that don't add meaning
    stop_words = {'the', 's', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'}
    title_words = title.split()
    title_words = [word for word in title_words if word not in stop_words]
    
    # Remove year patterns (e.g., "(1999)" or "- 2020")
    title_words = [word for word in title_words if not re.match(r'^\d{4}$', word)]
    
    # Apply stemming
    stemmer = PorterStemmer()
    title_words = [stemmer.stem(word) for word in title_words]
    
    # Join words back together
    title = ' '.join(title_words)
    
    # Strip leading/trailing whitespace
    title = title.strip()
    
    return title

In [12]:
df["title"] = df['primaryTitle'].apply(preprocess_title)
df["logVotes"] = df['numVotes'].apply(np.log)

In [13]:
df.head()

Unnamed: 0,index,tconst,titleType,genre_1,genre_2,genre_3,primaryTitle,isAdult,startYear,runtimeMinutes,actor_1,actor_2,actor_3,director_1,director_2,director_3,averageRating,numVotes,title,logVotes
0,2,tt2530242,tvMovie,Comedy,None2,None3,Das Millionen Rennen,0,2012,89.0,nm0695127,nm0517885,nm0499671,nm1054437,,,6.1,68.0,da millionen rennen,4.219508
1,3,tt27505330,movie,Drama,None2,None3,Nevada,0,2023,89.0,nm14752449,nm1235719,nm15159468,nm3807259,nm6281196,,6.2,10.0,nevada,2.302585
2,4,tt22871554,movie,Drama,Mystery,Thriller,A Haunting Winter's Tale,0,2022,65.0,nm13391370,nm13392830,nm13392832,nm13390267,,,7.7,6.0,haunt winter tale,1.791759
3,5,tt6854792,movie,Drama,Romance,None3,Sriramudinta Srikrishnudanta,0,2017,129.0,nm8970446,nm8970447,nm10962932,nm8970443,,,7.1,44.0,sriramudinta srikrishnudanta,3.78419
4,6,tt10429264,movie,Documentary,None2,None3,Fabulous,0,2019,52.0,nm8838626,,,nm5091578,,,6.5,16.0,fabul,2.772589


# Système de Recommandation

Utilization d'un countvectorizer au lieu d'un MultiLabelBinarizer car Multilabel Binarizer n'est pas fait pour fonctionner dans une pipeline (*cf. [sckit-learn github issues 11309](https://github.com/scikit-learn/scikit-learn/issues/11309)*).

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [50]:
class RecSysCosine :

    def __init__(self, n=5, data_path='./data/vectorized_db3.npy'):
        self._n = n
        self.load_data(data_path)
    
    def load_data(self, data_path):
        self._X = np.load(data_path, allow_pickle=True)
        print(self._X.shape)
    
    def recommand(self, idx):
        Y = self._X[idx,:].reshape(1,-1)
        result = cosine_similarity(np.delete(self._X, idx, axis=0), Y).reshape(-1)
        
        idxs = np.argpartition(result, -self._n)[-self._n:]
        idxs = idxs[np.argsort(result[idxs])][::-1]

        return idxs

class RecSysKNN :

    def __init__(self, n=5, data_path='./data/vectorized_db3.npy'):
        self._n = n+1
        self.load_data(data_path)
    
    def load_data(self, data_path):
        self._X = np.load(data_path, allow_pickle=True)
        self.nbrs = NearestNeighbors(n_neighbors=self._n).fit(self._X)
        print(self._X.shape)
    
    def recommand(self, idx):
        _, idxs = self.nbrs.kneighbors(self._X[idx,:].reshape(1, -1))
    
        return idxs[0][1:]

In [41]:
numeric_features     = ["startYear", "averageRating", "logVotes"]
categorical_features = ['genre_1', 'genre_2', 'genre_3']
text_features        = ["title" ,"actor_1", "actor_2", "actor_3", "director_1", "director_2", "director_3"]

In [42]:
numeric_transformer     = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)), ('std', MinMaxScaler())])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='None')), ('vec', CountVectorizer(analyzer=list))])
textual_transformer     = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='None')), ('vec', TfidfVectorizer(analyzer=list, min_df=2, max_features=10000)), ('svd', TruncatedSVD(n_components=200)), ('std', MinMaxScaler())])

In [43]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("typ", categorical_transformer, ['titleType']),
        ("cat", categorical_transformer, categorical_features),
        ("txt", textual_transformer, text_features)
    ]
)

In [44]:
preproc = Pipeline(steps=[('prec', preprocessor)])
preproc

In [45]:
preproc_output = preproc.fit_transform(df)
filename = './data/vectorized_db3.npy'
np.save(filename, preproc_output)

# Tests

In [None]:
rec_cos = RecSysCosine(5, filename)
rec_knn = RecSysKNN(5, filename)

In [56]:
cols = ['tconst', 'startYear', 'titleType','genre_1', 'genre_2', 'genre_3', 'primaryTitle', 'averageRating', 'numVotes']
wild_idx = df[df['primaryTitle'].fillna('').str.contains('The Wild Robot')].index.values[0]
harry_idx = df[df['primaryTitle'].fillna('').str.contains('Azkaban')].index.values[0]

## The Wild Robot

In [57]:
idx = wild_idx
df.loc[idx, cols].reset_index().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
index,tconst,startYear,titleType,genre_1,genre_2,genre_3,primaryTitle,averageRating,numVotes
233185,tt29623480,2024,movie,Animation,Sci-Fi,None3,The Wild Robot,8.2,118794.0


### Cosine Similarity

In [53]:
df.loc[rec_cos.recommand(idx), cols]

Unnamed: 0,tconst,startYear,titleType,genre_1,genre_2,genre_3,primaryTitle,averageRating,numVotes
121013,tt28288396,2023,movie,Animation,Sci-Fi,None3,The Missing,7.2,318.0
76199,tt5332274,2015,movie,Animation,Sci-Fi,None3,Arpeggio of Blue Steel: Ars Nova - Cadenza,7.2,105.0
316904,tt1579541,2009,movie,Romance,None2,None3,Evaraina Eppudaina,4.8,64.0
151263,tt13726852,2020,movie,Animation,Sci-Fi,None3,The Intruder,8.1,17.0
223093,tt20562862,2021,movie,Animation,Sci-Fi,None3,Bigfoot vs Krampus,2.9,81.0


### KNN

In [54]:
df.loc[rec_knn.recommand(wild_idx), cols]

Unnamed: 0,tconst,startYear,titleType,genre_1,genre_2,genre_3,primaryTitle,averageRating,numVotes
121013,tt28288396,2023,movie,Animation,Sci-Fi,None3,The Missing,7.2,318.0
76199,tt5332274,2015,movie,Animation,Sci-Fi,None3,Arpeggio of Blue Steel: Ars Nova - Cadenza,7.2,105.0
316905,tt7504716,2017,movie,Animation,Sci-Fi,None3,ChäoS;Child: Silent Sky,6.5,68.0
151263,tt13726852,2020,movie,Animation,Sci-Fi,None3,The Intruder,8.1,17.0
223093,tt20562862,2021,movie,Animation,Sci-Fi,None3,Bigfoot vs Krampus,2.9,81.0


## Harry Potter

In [58]:
idx = harry_idx
df.loc[idx, cols].reset_index().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
index,tconst,startYear,titleType,genre_1,genre_2,genre_3,primaryTitle,averageRating,numVotes
91963,tt0304141,2004,movie,Adventure,Family,Fantasy,Harry Potter and the Prisoner of Azkaban,7.9,719001.0


### Cosine Similarity

In [60]:
df.loc[rec_cos.recommand(idx), cols]

Unnamed: 0,tconst,startYear,titleType,genre_1,genre_2,genre_3,primaryTitle,averageRating,numVotes
177136,tt8634368,2018,movie,Comedy,Drama,None3,Jatt vs. Ielts,4.4,121.0
22143,tt0295297,2002,movie,Adventure,Family,Fantasy,Harry Potter and the Chamber of Secrets,7.4,719337.0
240306,tt12058584,2021,movie,Drama,Thriller,War,Chess Story,6.8,5069.0
31984,tt0926084,2010,movie,Adventure,Family,Fantasy,Harry Potter and the Deathly Hallows: Part 1,7.7,622383.0
85464,tt1201607,2011,movie,Adventure,Family,Fantasy,Harry Potter and the Deathly Hallows: Part 2,8.1,982979.0


### KNN

In [62]:
df.loc[rec_knn.recommand(idx), cols]

Unnamed: 0,tconst,startYear,titleType,genre_1,genre_2,genre_3,primaryTitle,averageRating,numVotes
177137,tt0330373,2005,movie,Adventure,Family,Fantasy,Harry Potter and the Goblet of Fire,7.7,707778.0
22143,tt0295297,2002,movie,Adventure,Family,Fantasy,Harry Potter and the Chamber of Secrets,7.4,719337.0
240307,tt0241527,2001,movie,Adventure,Family,Fantasy,Harry Potter and the Sorcerer's Stone,7.7,892057.0
31984,tt0926084,2010,movie,Adventure,Family,Fantasy,Harry Potter and the Deathly Hallows: Part 1,7.7,622383.0
85464,tt1201607,2011,movie,Adventure,Family,Fantasy,Harry Potter and the Deathly Hallows: Part 2,8.1,982979.0
