# Projet netfloox creation du modèle de recommendation


In [21]:
# Load data
import pandas as pd
url = "https://datasets.imdbws.com/title.principals.tsv.gz"
df_principals = pd.read_csv(url,delimiter='\t',dtype='str',na_values=['NaN','\\N','N'], usecols=["tconst","nconst","category"]).query('category in ("actor","actress","director","composer")')#.iloc[:200000]
df_principals.head(5)

Unnamed: 0,tconst,nconst,category
1,tt0000001,nm0005690,director
3,tt0000002,nm0721526,director
4,tt0000002,nm1335271,composer
5,tt0000003,nm0721526,director
7,tt0000003,nm1335271,composer


In [22]:
#Tout raporter à laa maille film cela implique un pivot de la table avec index tconst les catégories auront chacunes leurs colonnes

principals = pd.pivot_table(df_principals, index=['tconst'], values='nconst',columns= 'category',
                      aggfunc=lambda x: ','.join(x), fill_value='')
principals.head(5)

category,actor,actress,composer,director
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0000001,,,,nm0005690
tt0000002,,,nm1335271,nm0721526
tt0000003,,,nm1335271,nm0721526
tt0000004,,,nm1335271,nm0721526
tt0000005,"nm0443482,nm0653042",,,nm0005690


In [23]:
#On ajoute les données title_basics
url = "https://datasets.imdbws.com/title.basics.tsv.gz"
df_film = pd.read_csv(url,delimiter='\t',dtype='str',na_values=['NaN','\\N','N'], usecols=["tconst","originalTitle","startYear","runtimeMinutes","genres","isAdult","titleType"]).query('titleType in ("movie")')#.iloc[:200000]
df_film.fillna("", inplace = True)
df_film.head(5)

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,0,1894,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,0,1897,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,0,1905,100,
570,tt0000574,movie,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
587,tt0000591,movie,L'enfant prodigue,0,1907,90,Drama


In [24]:
#
df_film['tconst'].nunique()

635637

In [25]:
#Join des tables sur le tconst en s'assurant qu'il soit unique
principals.index.nunique()

6684051

In [26]:
#Join des tables sur le tconst
df_recommendation = pd.merge(principals, df_film, how='inner', left_on = 'tconst', right_on = 'tconst')
df_recommendation.head(5)

Unnamed: 0,tconst,actor,actress,composer,director,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000009,"nm0183823,nm1309758",nm0063086,,nm0085156,movie,Miss Jerry,0,1894,45,Romance
1,tt0000147,,,,nm0714557,movie,The Corbett-Fitzsimmons Fight,0,1897,100,"Documentary,News,Sport"
2,tt0000502,"nm0215752,nm0252720",,,nm0063413,movie,Bohemios,0,1905,100,
3,tt0000574,"nm0846894,nm1431224,nm3002376",nm0846887,nm2421834,nm0846879,movie,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
4,tt0000591,"nm0906197,nm0332182","nm1323543,nm1759558",,nm0141150,movie,L'enfant prodigue,0,1907,90,Drama


In [27]:
#replace \N par ''
df_recommendation.replace("\\N", '', inplace=True)
df_recommendation.head(5)

Unnamed: 0,tconst,actor,actress,composer,director,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000009,"nm0183823,nm1309758",nm0063086,,nm0085156,movie,Miss Jerry,0,1894,45,Romance
1,tt0000147,,,,nm0714557,movie,The Corbett-Fitzsimmons Fight,0,1897,100,"Documentary,News,Sport"
2,tt0000502,"nm0215752,nm0252720",,,nm0063413,movie,Bohemios,0,1905,100,
3,tt0000574,"nm0846894,nm1431224,nm3002376",nm0846887,nm2421834,nm0846879,movie,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
4,tt0000591,"nm0906197,nm0332182","nm1323543,nm1759558",,nm0141150,movie,L'enfant prodigue,0,1907,90,Drama


In [9]:
#shape df_recommendation
df_recommendation.shape

(584899, 11)

In [28]:
#Ajout du ratting et numVote
recommendation=df_recommendation.drop(['originalTitle'], axis=1)
recommendation['reco_vector'] = recommendation.iloc[:, 1:].apply(' '.join, axis=1)
recommendation.head(5)

Unnamed: 0,tconst,actor,actress,composer,director,titleType,isAdult,startYear,runtimeMinutes,genres,reco_vector
0,tt0000009,"nm0183823,nm1309758",nm0063086,,nm0085156,movie,0,1894,45,Romance,"nm0183823,nm1309758 nm0063086 nm0085156 movie..."
1,tt0000147,,,,nm0714557,movie,0,1897,100,"Documentary,News,Sport","nm0714557 movie 0 1897 100 Documentary,News..."
2,tt0000502,"nm0215752,nm0252720",,,nm0063413,movie,0,1905,100,,"nm0215752,nm0252720 nm0063413 movie 0 1905 100"
3,tt0000574,"nm0846894,nm1431224,nm3002376",nm0846887,nm2421834,nm0846879,movie,0,1906,70,"Action,Adventure,Biography","nm0846894,nm1431224,nm3002376 nm0846887 nm2421..."
4,tt0000591,"nm0906197,nm0332182","nm1323543,nm1759558",,nm0141150,movie,0,1907,90,Drama,"nm0906197,nm0332182 nm1323543,nm1759558 nm014..."


In [29]:
#on ajoute le nom du film
recommendation = recommendation.join(df_recommendation['originalTitle'])
recommendation.drop(columns=['actor',	'actress',	'composer',	'director',	'titleType',	'isAdult',	'startYear',	'runtimeMinutes',	'genres'], axis=1, inplace=True)
recommendation.head(3)

Unnamed: 0,tconst,reco_vector,originalTitle
0,tt0000009,"nm0183823,nm1309758 nm0063086 nm0085156 movie...",Miss Jerry
1,tt0000147,"nm0714557 movie 0 1897 100 Documentary,News...",The Corbett-Fitzsimmons Fight
2,tt0000502,"nm0215752,nm0252720 nm0063413 movie 0 1905 100",Bohemios


In [30]:
# Import des librairies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
# Déclaration de la méthode de vectorisation et application cv = CountVectorizer() test avec 100 film
count_matrix = cv.fit_transform(recommendation['reco_vector'].iloc[:100])

In [44]:
count_matrix

<100x291 sparse matrix of type '<class 'numpy.int64'>'
	with 609 stored elements in Compressed Sparse Row format>

In [17]:
# Calcul des similarités
cosine_sim = cosine_similarity(count_matrix)

In [18]:
cosine_sim.shape

(100, 100)

In [46]:
cosine_sim

array([[0.10206207],
       [0.10910895],
       [0.11785113],
       ...,
       [0.09622504],
       [0.10206207],
       [0.11785113]])

In [47]:
#fonction qui retrouve le titre du film en fonction de l'index
def find_title_from_index(index):
  return recommendation[recommendation.index == index]["originalTitle"].values[0]
#fonction qui retrouve l'index du film en fonction du titre
def find_index_from_title(title):
  return recommendation[recommendation.originalTitle == title].index.values[0]

In [119]:
movie = "Star Wars: Episode I - The Phantom Menace"
movie_index= find_index_from_title(movie)
movie_index

81980

In [120]:
# Déclaration de la méthode de vectorisation et application
cv = CountVectorizer()
count_matrix = cv.fit_transform(recommendation['reco_vector'])

In [121]:
count_matrix[movie_index]

<1x992690 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [122]:
# Calcul des similarités uniquement pour le film en question pour limiter la taille de la matrice
cosine_sim = cosine_similarity(count_matrix, Y= count_matrix[movie_index])

In [123]:
cosine_sim.shape

(584899, 1)

In [124]:
cosine_sim[55008]

array([0.5])

In [125]:
#choper les index et les coefficients de similarité cos
similar_movies = list(enumerate(cosine_sim))

In [126]:
#tri des film similaires
sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)[1:6]
sorted_similar_movies

[(82357, array([0.66666667])),
 (82356, array([0.6092718])),
 (204579, array([0.51639778])),
 (224920, array([0.51639778])),
 (225164, array([0.51639778]))]

In [127]:
# nous avons les index plus qu'a retrouver les titres de films
for id_film in sorted_similar_movies:
  print(find_title_from_index(id_film[0]))

Star Wars: Episode III - Revenge of the Sith
Star Wars: Episode II - Attack of the Clones
Voltron
Tomb Raider 2
The God Gene
