In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('all_movies_ratings.csv', iterator = True, chunksize = 250000)

In [3]:
df = pd.concat(data, ignore_index=True)

In [4]:
df.head()

Unnamed: 0,movieId,title,year,genres,userId,rating,timestamp
0,1,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517


In [5]:
df.tail()

Unnamed: 0,movieId,title,year,genres,userId,rating,timestamp
25000090,209157,We,2018.0,Drama,119571,1.5,1574280748
25000091,209159,Window of the Soul,2001.0,Documentary,115835,3.0,1574280985
25000092,209163,Bad Poems,2018.0,Comedy|Drama,6964,4.5,1574284913
25000093,209169,A Girl Thing,2001.0,(no genres listed),119571,3.0,1574291826
25000094,209171,Women of Devil's Island,1962.0,Action|Adventure|Drama,119571,3.0,1574291937


In [6]:
df.shape

(25000095, 7)

In [7]:
df.dtypes

movieId        int64
title         object
year         float64
genres        object
userId         int64
rating       float64
timestamp      int64
dtype: object

In [8]:
df.isna().sum()

movieId          0
title            0
year         11740
genres           0
userId           0
rating           0
timestamp        0
dtype: int64

In [9]:
df['rating'].unique()

array([3.5, 4. , 3. , 5. , 2. , 4.5, 2.5, 1. , 1.5, 0.5])

In [10]:
df['genres'].unique()

array(['Adventure|Animation|Children|Comedy|Fantasy',
       'Adventure|Children|Fantasy', 'Comedy|Romance', ...,
       'Children|Comedy|Fantasy|Mystery', 'Children|Comedy|Crime|Fantasy',
       'Comedy|Horror|Mystery|Sci-Fi|Western'], dtype=object)

In [11]:
df['genres'] = df['genres'].replace('(no genres listed)', 'Not specified')

# 1st iteration on sample data

In [57]:
df_s = df.sample(500)

In [58]:
df_s['genres'] = df_s['genres'].apply(lambda row: row.split('|'))

In [59]:
dummies = pd.get_dummies(df_s['genres'].apply(pd.Series).stack()).groupby(level=0).sum()

In [60]:
df_s = pd.concat([df_s, dummies], axis=1)

In [61]:
df_s = df_s.drop('genres', axis=1)

**Collaborative filtering**

In [62]:
from sklearn.neighbors import NearestNeighbors

In [63]:
user_movie_matrix = df_s.pivot_table(index='userId', columns='title', values='rating', fill_value=0)

In [64]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_movie_matrix.T)

In [65]:
def recommend_similar_movies(data, knn_model, movie_name, n_recommendations=5):
    movie_idx = data.index.get_loc(movie_name)
    distances, indices = knn_model.kneighbors(data.iloc[movie_idx, :].values.reshape(1, -1), n_neighbors=n_recommendations + 1)
    similar_movies = [(data.index[i], distances[0, j]) for j, i in enumerate(indices.flatten()) if i != movie_idx]
    return similar_movies

In [66]:
movie = 'Fight Club'

In [67]:
recommendations = recommend_similar_movies(user_movie_matrix.T, knn_model, movie)

In [68]:
print(f"Recommandations de films similaires à {movie}:")
for movies, similarity in recommendations:
    print(movies)

Recommandations de films similaires à Fight Club:
Planet of the Apes
Pleasantville
Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Piano, The


**Collabprative filtering + content recommandation**

In [69]:
movie_user_matrix = user_movie_matrix.T

In [70]:
movie_genre_matrix = df_s.pivot_table(index='title', values = df_s.columns[6:], fill_value=0)

In [71]:
movie_user_genre_matrix = pd.concat([movie_user_matrix, movie_genre_matrix], axis=1)

In [72]:
movie_user_genre_matrix.columns = [str(elements) for elements in movie_user_genre_matrix.columns]

In [73]:
#n_jobs = -1 -> parallélisation du calcul sur tous les processeurs de la machine
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5, n_jobs=-1)
knn_model.fit(movie_user_genre_matrix.values)

In [74]:
def recommend_similar_movies_with_genre(data, knn_model, movie_name, n_recommendations=5):
    movie_idx = data.index.get_loc(movie_name)
    distances, indices = knn_model.kneighbors(data.iloc[movie_idx, :].values.reshape(1, -1), n_neighbors=n_recommendations + 1)
    similar_movies = [(data.index[i], distances[0, j]) for j, i in enumerate(indices.flatten()) if i != movie_idx]
    return similar_movies

In [75]:
movie = 'Fight Club'

In [76]:
recommendations = recommend_similar_movies_with_genre(movie_user_genre_matrix, knn_model, movie)

In [77]:
print(f"Recommandations de films similaires à {movie} :")
for movies, similarity in recommendations:
    print(movies)

Recommandations de films similaires à Fight Club :
First Blood (Rambo: First Blood)
Bad Boys II
Flightplan
3000 Miles to Graceland
Mulholland Falls


**La recommandation reste imparfaite, il faut enrichir la base de données et obtenir le résumé de chaque film pour améliorer la précision du modèle**

In [33]:
df_test = df_s.copy()

**IMDB API**

In [34]:
import imdb

In [35]:
ia = imdb.IMDb()

In [36]:
search = ia.search_movie('Fight Club')[1]

In [37]:
movie = ia.get_movie(search.movieID)

In [38]:
print(movie.get('synopsis'))

None


**Scraping**