<!-- image from image -->
![image](images/exemple.png)

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from graphdatascience import GraphDataScience


connect to Neo4j database and test

In [2]:
# Neo4j connection details
DB_ULR = 'bolt://localhost:7687'
DB_USER = 'neo4j' 
DB_PASS = 'test1234'
gds = GraphDataScience(DB_ULR, auth=(DB_USER, DB_PASS))
gds.version()

'2.13.3'

Test

In [33]:
nodes = gds.run_cypher(
    '''
    MATCH (n)
    RETURN COUNT(n)
    '''
)
nodes.head()

Unnamed: 0,COUNT(n)
0,9923


Load data from MovieLens

In [4]:
movies = pd.read_csv('ml-1m/movies.dat',sep='::',encoding = 'ISO-8859-1', names = ['MovieID','Title','Genres'])
ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',encoding = 'ISO-8859-1', names = ['UserID','MovieID','Rating','Timestamp'])
users = pd.read_csv('ml-1m/users.dat',sep='::',encoding = 'ISO-8859-1', names = ['UserID','Gender','Age','Occupation','Zip_code'])

  movies = pd.read_csv('ml-1m/movies.dat',sep='::',encoding = 'ISO-8859-1', names = ['MovieID','Title','Genres'])
  ratings = pd.read_csv('ml-1m/ratings.dat',sep='::',encoding = 'ISO-8859-1', names = ['UserID','MovieID','Rating','Timestamp'])
  users = pd.read_csv('ml-1m/users.dat',sep='::',encoding = 'ISO-8859-1', names = ['UserID','Gender','Age','Occupation','Zip_code'])


In [5]:
display(movies.head())
display(ratings.head())
display(users.head())

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Unnamed: 0,UserID,Gender,Age,Occupation,Zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


Insert data to graph DB

In [32]:
# create user nodes
def create_user_nodes():
    """
    Crée les nœuds User dans la base de données Neo4j à partir du DataFrame users.
    Retourne :
        pd.DataFrame : Résultat de la requête de création.
    """
    gds.run_cypher('create constraint if not exists for (n:User) require (n.id) is node key')
    create_customer_res = gds.run_cypher(
        '''
            unwind $data as row
            merge (n:User{id: row.UserID})
            set n.Gender = row.Gender
            set n.Age = row.Age
            return count(*) as custmers_created
        ''', params={'data': users.to_dict('records')}
    )
    return create_customer_res

create_customer_res = create_user_nodes()
create_customer_res.head()

Unnamed: 0,custmers_created
0,6040


In [17]:
# create movies nodes
def create_movie_nodes():
    """
    Crée les nœuds Movie dans la base de données Neo4j à partir du DataFrame movies.
    Retourne :
        pd.DataFrame : Résultat de la requête de création.
    """
    gds.run_cypher('create constraint if not exists for (n:Movie) require (n.id) is node key')
    create_movie_res = gds.run_cypher(
        '''
            unwind $data as row
            merge (n:Movie{id: row.MovieID})
            set n.Title = row.Title
            set n.Genres = row.Genres
            return count(*) as movies_created
        ''', params={'data': movies.to_dict('records')}
    )
    return create_movie_res

create_movie_res = create_movie_nodes()
create_movie_res.head()

Unnamed: 0,movies_created
0,3883


In [8]:
# create ratings relationship
def create_rating_relationship():
    """ 
    Crée les relations RATED entre les nœuds User et Movie dans la base de données Neo4j à partir du DataFrame ratings.
    Retourne :
        pd.DataFrame : Résultat de la requête de création.
    """
    i = 1
    for chunk in np.array_split(ratings, 200):
        if i%10 == 0:
            print(i)
        create_rated = gds.run_cypher(
        '''
            unwind $data as row
            match (u:User{id:row.UserID}) , (n:Movie{id:row.MovieID})
            merge (u)-[r:RATED]->(n)
            set r.Rating = row.Rating
            return count(*) as create_rated
        ''',params = {'data':chunk.to_dict('records')}
    )
        i = i+1
    create_rated.head()

create_rating_relationship()

  return bound(*args, **kwds)


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200


Unnamed: 0,create_rated
0,5001


<!-- image from image -->
![image](images/view_relation.png)

In [9]:
# CALL apoc.meta.schema()
# CALL .schema.visualization()

****Recommandation****
Recommandation of similar movie
A simple way to find similar movie for specific is to count how many paths of the from
(movie1)-[]->(user)-[]->(movie2)
where movie1 is the movie we want to find similar movie for
we order the movie2 based on the number of path that connect the two movie. The idea is that movie will receive
excellente rating by the same user

In [10]:
def find_similar_movies(title, rating=5, top_n=10):
    """
    Trouve les films similaires à un titre donné selon les utilisateurs ayant donné une note spécifique.
    Args:
        title (str): Le titre du film à rechercher.
        rating (int): La note à considérer pour la similarité (par défaut : 5).
        top_n (int): Nombre de résultats à retourner (par défaut : 10).
    Returns:
        pd.DataFrame: DataFrame des films similaires.
    """
    query = '''
        MATCH (n1:Movie)-[r1]-(u:User)-[r2]-(n2:Movie)
        WHERE n1.Title CONTAINS $title
            AND n2.Title<>$title
            AND r1.Rating = $rating AND r2.Rating = $rating
        RETURN n2.Title, n2.Genres, count(DISTINCT(u)) as common_users
        ORDER BY common_users DESC
    '''
    result = gds.run_cypher(query, params={'title': title, 'rating': rating})
    return result.head(top_n)

Unnamed: 0,n2.Title,n2.Genres,common_users
0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,401
1,Toy Story 2 (1999),Animation|Children's|Comedy,385
2,Raiders of the Lost Ark (1981),Action|Adventure,373
3,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,346
4,"Shawshank Redemption, The (1994)",Drama,327


AND PARAMETRIC QUERY

In [31]:
# check the similar movies
def find_similar_movies(title, rating=5, top_n=10):
    """
    Trouve les films similaires à un titre donné selon les utilisateurs ayant donné une note spécifique.
    Args:
        title (str): Le titre du film à rechercher.
        rating (int): La note à considérer pour la similarité (par défaut : 5).
        top_n (int): Nombre de résultats à retourner (par défaut : 10).
    Returns:
        pd.DataFrame: DataFrame des films similaires.
    """
    query = '''
        MATCH (n1:Movie)-[r1]-(u:User)-[r2]-(n2:Movie)
        WHERE n1.Title CONTAINS $title
            AND n2.Title<>$title
            AND r1.Rating = $rating AND r2.Rating = $rating
        RETURN n2.Title, n2.Genres, count(DISTINCT(u)) as common_users
        ORDER BY common_users DESC
    '''
    result = gds.run_cypher(query, params={'title': title, 'rating': rating})
    return result.head(top_n)

similar_movies = find_similar_movies('Toy Story (1995)', rating=5, top_n=10)
similar_movies

Unnamed: 0,n2.Title,n2.Genres,common_users
0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,401
1,Toy Story 2 (1999),Animation|Children's|Comedy,385
2,Raiders of the Lost Ark (1981),Action|Adventure,373
3,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,346
4,"Shawshank Redemption, The (1994)",Drama,327
5,American Beauty (1999),Comedy|Drama,323
6,"Matrix, The (1999)",Action|Sci-Fi|Thriller,316
7,Schindler's List (1993),Drama|War,311
8,"Sixth Sense, The (1999)",Thriller,303
9,Saving Private Ryan (1998),Action|Drama|War,300


Use base of recommandatio n

In [12]:
gds.run_cypher("CALL gds.graph.drop('myGraph', false)")

# create a projection
create_projection = gds.run_cypher(
    '''
        CALL gds.graph.project(
            'myGraph',
            ['User', 'Movie'],
            {
                RATED: {
                    properties: 'Rating'
                }
            }
        );
    ''')

create_projection.head()


Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'User': {'label': 'User', 'properties': {}}, ...","{'RATED': {'aggregation': 'DEFAULT', 'orientat...",myGraph,9923,1000209,1835


In [34]:
def get_user_similarity(graph_name='myGraph', top_n=10):
    """
    Calcule la similarité entre utilisateurs à partir du graphe projeté.
    Args:
        graph_name (str): Nom du graphe projeté dans Neo4j.
        top_n (int): Nombre de lignes à retourner (par défaut 10).
    Returns:
        pd.DataFrame: DataFrame des similarités utilisateur-utilisateur.
    """
    user_similarity = gds.run_cypher(
        f'''
        CALL gds.nodeSimilarity.stream('{graph_name}')
        YIELD node1, node2, similarity
        RETURN gds.util.asNode(node1).id AS UserID1,
               gds.util.asNode(node2).id AS UserID2,
               similarity
        ORDER BY similarity DESC, UserID1,  UserID2
        '''
    )
    return user_similarity.head(top_n)

user_similarity = get_user_similarity(top_n=10)
user_similarity


Unnamed: 0,UserID1,UserID2,similarity
0,4725,4808,0.755415
1,4808,4725,0.755415
2,1122,2126,0.632
3,2126,1122,0.632
4,1272,2837,0.601852
5,2837,1272,0.601852
6,4344,4508,0.549313
7,4508,4344,0.549313
8,5281,5287,0.547541
9,5287,5281,0.547541


In [40]:
# afficher des utilisateurs centraux
central_users = gds.run_cypher("MATCH (u:User) RETURN u ORDER BY u.rating DESC LIMIT 10")
for user in central_users:
    print(user)

u




In [14]:
# create similar relationship
i = 1
for chunk in np.array_split(user_similarity.query('UserID1 > UserID2'), 10):
    if i%10 == 0:
        print(i)
    create_rated = gds.run_cypher(
        '''
            unwind $data as row
            match (u1:User{id:row.UserID1}) , (u2:User{id:row.UserID2})
            merge (u1)-[r:SIMILAR]->(u2)
            set r.similarity = row.similarity
            return count(*) as create_rated
        ''',params = {'data':chunk.to_dict('records')}
    )
    i = i+1
create_rated.head()

  return bound(*args, **kwds)


10


Unnamed: 0,create_rated
0,3179


<!-- image from image -->
![image](images/view_relation2.png)

In [30]:
# check the similar movies
def recommend_movies_for_user(user_id, top_n=31):
    """
    Recommande des films pour un utilisateur en se basant sur les notes des utilisateurs similaires.
    Args:
        user_id (int): L'identifiant de l'utilisateur pour lequel recommander des films.
        top_n (int): Nombre de recommandations à retourner.
    Returns:
        pd.DataFrame: DataFrame des films recommandés.
    """
    query = '''
        MATCH (u1:User)-[r1:SIMILAR]-(u2)-[r2:RATED]-(m:Movie)
        WHERE id(u1) = $id
           AND NOT ((u1)-[]-(m))
        RETURN m.Title, m.Genres, Sum(r1.similarity * r2.Rating) / Sum(r1.similarity)+ log(count(r2)) as score
        ORDER BY score DESC
    '''
    result = gds.run_cypher(query, params={'id': user_id})
    return result.head(top_n)

similar_movies_for_user = recommend_movies_for_user(4725, top_n=31)
similar_movies_for_user

Unnamed: 0,m.Title,m.Genres,score
0,Schindler's List (1993),Drama|War,6.864803
1,Toy Story (1995),Animation|Children's|Comedy,6.497486
2,October Sky (1999),Drama,6.390129
3,Almost Famous (2000),Comedy|Drama,6.389066
4,Bulworth (1998),Comedy,6.283263
5,Boys Don't Cry (1999),Drama,6.259083
6,Apollo 13 (1995),Drama,6.209804
7,Boogie Nights (1997),Drama,6.169459
8,GoodFellas (1990),Crime|Drama,6.156693
9,"Simple Plan, A (1998)",Crime|Thriller,6.120458


In [29]:
# cherk actual movies
def get_actual_movies_for_user(user_id, top_n=10):
    """
    Récupère les films réellement notés par un utilisateur, triés par note décroissante.
    Args:
        user_id (int): L'identifiant de l'utilisateur.
        top_n (int): Nombre de films les mieux notés à retourner.
    Returns:
        pd.DataFrame: DataFrame des films notés par l'utilisateur.
    """
    result = gds.run_cypher(
        '''
        MATCH (u1:User)-[r:RATED]-(m:Movie)
        WHERE id(u1) = $id
        RETURN m.Title, m.Genres, r.Rating as rating
        ORDER BY rating DESC
        ''', params={'id': user_id}
    )
    return result.head(top_n)

actual_movies = get_actual_movies_for_user(4725, top_n=15)
actual_movies

Unnamed: 0,m.Title,m.Genres,rating
0,Twelve Monkeys (1995),Drama|Sci-Fi,5
1,Babe (1995),Children's|Comedy|Drama,5
2,Dead Man Walking (1995),Drama,5
3,Braveheart (1995),Action|Drama|War,5
4,Little Women (1994),Drama,5
5,"Little Princess, A (1995)",Children's|Drama,5
6,Pulp Fiction (1994),Crime|Drama,5
7,Tank Girl (1995),Action|Comedy|Musical|Sci-Fi,5
8,"Corrina, Corrina (1994)",Comedy|Drama|Romance,5
9,Forrest Gump (1994),Comedy|Romance|War,5


***Recommendation par rapport au genre***

  Recommande des films à un utilisateur basé sur ses genres préférés.

    Params :
    - user_id : ID de l'utilisateur dans Neo4j.
    - gds : Instance GDS connectée.
    - top_genres : nombre de genres favoris à considérer.
    - top_recommendations : nombre de films recommandés à retourner.

    Retour :
    - DataFrame des recommandations.
    """

In [37]:
def recommend_by_genre(user_id, top_genres=3, top_recommendations=10):
    """ 
    Recommande des films à un utilisateur en se basant sur les genres des films qu'il a notés positivement.
    Args:
        user_id (int): L'identifiant de l'utilisateur pour lequel recommander des films.
        top_genres (int): Nombre de genres à considérer pour la recommandation.
        top_recommendations (int): Nombre de recommandations à retourner.
        Returns:
        pd.DataFrame: DataFrame des films recommandés.
    """
    query = f"""
    MATCH (u:User {{id: $id}})-[r:RATED]->(m:Movie)
    WHERE r.Rating >= 4
    WITH u, split(m.Genres, '|') AS genres
    UNWIND genres AS genre
    WITH u, genre, count(*) AS genreScore
    ORDER BY genreScore DESC
    WITH u, collect(genre)[..{top_genres}] AS topGenres

    MATCH (rec:Movie)
    WHERE any(g IN split(rec.Genres, '|') WHERE g IN topGenres)
      AND NOT EXISTS {{
        MATCH (u)-[:RATED]->(rec)
      }}

    RETURN DISTINCT rec.Title AS Films_recommander, rec.Genres AS Genres_recommander
    ORDER BY rec.Title
    LIMIT {top_recommendations}
    """
    try:
        result = gds.run_cypher(query, params={"id": user_id})
        return result
    except Exception as e:
        print(f"Erreur pendant la recommandation : {e}")
        return pd.DataFrame()

recommendations = recommend_by_genre(user_id=75)
print(recommendations)


                             Films_recommander  \
0                       $1,000,000 Duck (1971)   
1                           'burbs, The (1989)   
2                              20 Dates (1998)   
3          20,000 Leagues Under the Sea (1954)   
4                        200 Cigarettes (1999)   
5               24 7: Twenty Four Seven (1997)   
6                               28 Days (2000)   
7  3 Ninjas: High Noon On Mega Mountain (1998)   
8                             3 Strikes (2000)   
9                           8 1/2 Women (1999)   

                    Genres_recommander  
0                    Children's|Comedy  
1                               Comedy  
2                               Comedy  
3  Adventure|Children's|Fantasy|Sci-Fi  
4                         Comedy|Drama  
5                         Comedy|Drama  
6                               Comedy  
7                    Action|Children's  
8                               Comedy  
9                               Comedy 