In [20]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import seaborn as sns ;
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [21]:

def data_load():
    load_dotenv('BDD_URL.env')
    BDD_URL = os.environ['BDD_URL']
    engine = create_engine(BDD_URL)


    SQL= """
    SELECT  "tconst", "primaryTitle", "titleType", "isAdult", "startYear", "runtimeMinutes", "genres", "averageRating", "directors", "writers", "actor", "producer", "cinematographer", "composer", "editor", "production_designer", "self", "archive_footage", "archive_sound"
    from "castview"
    where "titleType" = 'movie' and "runtimeMinutes" Is NOT null and "averageRating" is NOT NULL and "genres" is NOT NULL and "startYear" is NOT NULL and "isAdult" is NOT NULL  and "directors" is NOT NULL and  "writers" is NOT NULL  and  "actor" is NOT NULL  and  "producer" is NOT NULL
    ORDER BY "tconst" desc
    limit 100000;
    """
    df = pd.read_sql(SQL, engine)
    engine.dispose()
    return df
df = data_load()

In [22]:
df

Unnamed: 0,tconst,primaryTitle,titleType,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,actor,producer,cinematographer,composer,editor,production_designer,self,archive_footage,archive_sound
0,tt9916730,6 Gunn,movie,0,2017,116,Drama,7.6,nm10538612,nm10538612,"nm0059461,nm13233318,nm4852679,nm6096005","nm10538613,nm10538614",nm1957275,,nm9785908,,,,
1,tt9916538,Kuambil Lagi Hatiku,movie,0,2019,123,Drama,8.6,nm4457074,"nm4843252,nm4900525,nm2679404","nm10041459,nm1266058,nm1417182,nm8678236",nm1290982,,nm4700236,,,,,
2,tt9916362,Coven,movie,0,2020,92,"Drama,History",6.4,nm1893148,"nm1893148,nm3471432","nm0107165,nm0266723,nm10678594,nm3766704","nm1086949,nm2970042,nm4065853",,nm5813626,,,,,
3,tt9916270,Il talento del calabrone,movie,0,2020,84,Thriller,5.8,nm1480867,"nm1480867,nm10538402","nm0144812,nm2063290,nm3080119,nm9428255","nm0656465,nm1799384",nm0130846,nm2747888,,,,,
4,tt9916190,Safeguard,movie,0,2020,95,"Action,Adventure,Thriller",3.7,nm7308376,nm7308376,"nm0302466,nm2018573,nm2952127,nm7477011",nm10299811,"nm5785503,nm8262250",nm7879007,nm4877791,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,tt0128371,No. 3,movie,0,1997,108,"Action,Comedy,Crime",6.4,nm0814288,nm0814288,"nm0051906,nm0497870,nm0814280,nm3769061","nm0453490,nm0784580",nm0661937,nm0158611,nm0661824,,,,
99996,tt0128370,Nô,movie,0,1998,85,"Comedy,Drama",6.9,nm0503215,"nm0503215,nm0603820","nm0128212,nm0296982,nm0317650,nm0551885",nm0423395,nm0005798,"nm2119227,nm2122018",,,,,
99997,tt0128364,This Transient Life,movie,0,1970,143,Drama,7.7,nm0423228,nm0410945,"nm0645402,nm0765783,nm0848793,nm0875360",nm5491542,"nm0408361,nm0619919",nm0299448,,,,,
99998,tt0128348,Mijlocas la deschidere,movie,0,1980,97,"Drama,Sport",6.5,nm0849116,nm8809269,"nm0187656,nm3627653,nm3627840,nm3627972",nm2255791,nm0890481,nm3624216,nm6166279,,,,


In [23]:
def liste_en_texte(lst):
    if isinstance(lst, list):
        return ' '.join(lst)
    else:
        return lst
    
def cleanText(df):
    df.fillna('missing', inplace=True)
    df=df.str.replace(',', ' ')
    return df

In [24]:
columns_to_clean = ['primaryTitle', 'titleType', 'genres', 'directors', 'writers', 
                    'actor', 'producer', 'cinematographer', 'composer', 'editor', 
                    'production_designer', 'self', 'archive_footage', 'archive_sound']

for column in columns_to_clean:
    df[column] = cleanText(df[column])

In [25]:
def BooleanToText (df):
    return df.apply(lambda x: 'True' if x == 1 else 'False')

In [26]:
def DateToCategory (df):
    
    df.fillna(df.mean(), inplace=True) # a valider
    
    bins = list(range(1800, 2056, 5))  # Intervalles de 5
    labels = [f"between{start}and{start+4}" for start in range(1800, 2051, 5)]

    return pd.cut(df, bins=bins, labels=labels, right=False)


In [27]:
def RuntimeToCategory (df):
    df=df.astype(int)
    df.fillna(df.mean(), inplace=True) # a valider
    
    bins = list(range(0, 615, 15))  # Intervalles de 10h
    labels = [f"runtime_Between{start}and{start+15}" for start in range(0, 600, 15)]

    return pd.cut(df, bins=bins, labels=labels, right=False) #qcut 

In [28]:
def RatingToCategory (df):
    
    df.fillna(df.mean(), inplace=True) # a valider
    
    bins = list(range(0, 12, 2))  
    labels = ['*','**','***','****','*****']

    return pd.cut(df, bins=bins, labels=labels, right=False)

In [29]:
def listTostr (df):
    return df.apply(lambda x: ' '.join(map(str, x)))

In [30]:
def crewmod (x, type):
    return ' '.join([type +'_'+ name for name in x.split()])

In [31]:
df

Unnamed: 0,tconst,primaryTitle,titleType,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,actor,producer,cinematographer,composer,editor,production_designer,self,archive_footage,archive_sound
0,tt9916730,6 Gunn,movie,0,2017,116,Drama,7.6,nm10538612,nm10538612,nm0059461 nm13233318 nm4852679 nm6096005,nm10538613 nm10538614,nm1957275,missing,nm9785908,missing,missing,missing,missing
1,tt9916538,Kuambil Lagi Hatiku,movie,0,2019,123,Drama,8.6,nm4457074,nm4843252 nm4900525 nm2679404,nm10041459 nm1266058 nm1417182 nm8678236,nm1290982,missing,nm4700236,missing,missing,missing,missing,missing
2,tt9916362,Coven,movie,0,2020,92,Drama History,6.4,nm1893148,nm1893148 nm3471432,nm0107165 nm0266723 nm10678594 nm3766704,nm1086949 nm2970042 nm4065853,missing,nm5813626,missing,missing,missing,missing,missing
3,tt9916270,Il talento del calabrone,movie,0,2020,84,Thriller,5.8,nm1480867,nm1480867 nm10538402,nm0144812 nm2063290 nm3080119 nm9428255,nm0656465 nm1799384,nm0130846,nm2747888,missing,missing,missing,missing,missing
4,tt9916190,Safeguard,movie,0,2020,95,Action Adventure Thriller,3.7,nm7308376,nm7308376,nm0302466 nm2018573 nm2952127 nm7477011,nm10299811,nm5785503 nm8262250,nm7879007,nm4877791,missing,missing,missing,missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,tt0128371,No. 3,movie,0,1997,108,Action Comedy Crime,6.4,nm0814288,nm0814288,nm0051906 nm0497870 nm0814280 nm3769061,nm0453490 nm0784580,nm0661937,nm0158611,nm0661824,missing,missing,missing,missing
99996,tt0128370,Nô,movie,0,1998,85,Comedy Drama,6.9,nm0503215,nm0503215 nm0603820,nm0128212 nm0296982 nm0317650 nm0551885,nm0423395,nm0005798,nm2119227 nm2122018,missing,missing,missing,missing,missing
99997,tt0128364,This Transient Life,movie,0,1970,143,Drama,7.7,nm0423228,nm0410945,nm0645402 nm0765783 nm0848793 nm0875360,nm5491542,nm0408361 nm0619919,nm0299448,missing,missing,missing,missing,missing
99998,tt0128348,Mijlocas la deschidere,movie,0,1980,97,Drama Sport,6.5,nm0849116,nm8809269,nm0187656 nm3627653 nm3627840 nm3627972,nm2255791,nm0890481,nm3624216,nm6166279,missing,missing,missing,missing


In [32]:
df['feature'] = df['primaryTitle'] + ' '

df['feature'] += 'titleType_'+df['titleType'] + ' '

df['feature'] += 'Rating_'+RatingToCategory(df['averageRating']).astype(str) + ' '

df['feature'] += 'startYear_'+DateToCategory(df['startYear']).astype(str) + ' '

df['feature'] += RuntimeToCategory (df['runtimeMinutes']).astype(str)+ ' '

df['feature'] += df['genres'] + ' '

df['feature'] += 'ADULT_'+BooleanToText (df['isAdult']).astype(str)+' '

df['feature'] += df['directors'].apply(crewmod, type='directors').astype(str)+' '
df['feature'] += df['writers'].apply(crewmod, type='writers').astype(str)+' '
df['feature'] += df['actor'].apply(crewmod, type='actor').astype(str)+' '
df['feature'] += df['producer'].apply(crewmod, type='producer').astype(str)+' '

df['feature'][0]

'6 Gunn titleType_movie Rating_**** startYear_between2015and2019 runtime_Between105and120 Drama ADULT_False directors_nm10538612 writers_nm10538612 actor_nm0059461 actor_nm13233318 actor_nm4852679 actor_nm6096005 producer_nm10538613 producer_nm10538614 '

In [33]:
cv = CountVectorizer(analyzer="word")
count_vect = cv.fit_transform(df['feature'])

In [34]:
def findfilm(index):
    if index < len(df):
        return df.iloc[index][['tconst', 'primaryTitle']].tolist()
    else:
        return None  


def getindex(filmm):
    index_list = df[df['primaryTitle'] == filmm].index
    if len(index_list) > 0:
        return index_list[0]
    else:
        return None



In [44]:
def index_list(similarity_measure, Nbfilm):
    sorted_indexes = sorted(enumerate(similarity_measure), key=lambda x: x[1], reverse=True)
    top_indexes = [index for index, _ in sorted_indexes[:Nbfilm]]
    return top_indexes

In [52]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

def recommend(film='', Nbfilm=5, method='cosine'):
    indexfilm = getindex(film)
    if indexfilm is None:
        return []

    if method == 'cosine':
        similarity_measure = cosine_similarity(count_vect, count_vect[indexfilm])
        indexes = index_list(similarity_measure, Nbfilm)
        indexes = indexes[1:]
    elif method == 'knn':
        
        nbrs = NearestNeighbors(n_neighbors=Nbfilm+1, algorithm='auto').fit(count_vect)
        distances, indexes = nbrs.kneighbors(count_vect[indexfilm])
        indexes = indexes.flatten()[1:]  
    else:
        raise ValueError("Invalid method. Choose either 'cosine' or 'knn'.")

    recommendations = [findfilm(idx) for idx in indexes]
    return recommendations





In [55]:
recommend(film='Succede', Nbfilm=10, method='knn')

[['tt11428870', 'Tsarli'],
 ['tt5820148', 'Afterlov'],
 ['tt5966224', 'Relics'],
 ['tt5922786', 'Nightshot'],
 ['tt8211258', 'Operation C.H.I.M.P'],
 ['tt7488602', 'Ayaz'],
 ['tt7769308', '798Ten'],
 ['tt4096952', 'Lamia'],
 ['tt13647276', 'Ama Khando'],
 ['tt10583170', 'Vlastníci']]