<!-- image from image -->
![image](images/exemple.png)

In [1]:
import pandas as pd
import numpy as np
from graphdatascience import GraphDataScience

In [2]:
# Neo4j Connection details
DB_ULR = 'bolt://localhost:7687'
DB_USER = 'neo4j' 
DB_PASS = 'test1234'
gds = GraphDataScience(DB_ULR, auth=(DB_USER, DB_PASS))
gds.version()

'2.13.3'

In [3]:
nodes = gds.run_cypher('''
    MATCH (n)
    RETURN COUNT(n)
''') 
nodes.head()

Unnamed: 0,COUNT(n)
0,224964


In [17]:
movies = pd.read_csv('datasets/movies.dat', sep='::', encoding="ISO-8859-1", names=['MovieID','Title','Genres', 'Actors', 'Realizations', 'Date'], engine='python')
ratings = pd.read_csv('datasets/ratings.dat', sep='::', encoding="ISO-8859-1",names=['UserID','MovieID','Rating','Timestamp'], engine='python')
users = pd.read_csv('datasets/users.dat', sep='::', encoding="ISO-8859-1",names=['UserID','Name','Gender','Age','Occupation','Zip_code'], engine='python')

In [18]:
print("************** Les films *******")
display(movies.head())
print("************** Les utilisateurs *******")
display(users.head())
print("************** Les notes *******")
display(ratings.head())

************** Les films *******


Unnamed: 0,MovieID,Title,Genres,Actors,Realizations,Date
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,Tina Stewart|Jessica Smith,Jeremy Hendricks|Sonya Edwards,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,Richard Sanchez|David Walker|Carol Rodriguez|J...,Edward Williams|Keith Hudson,1995.0
2,3,Grumpier Old Men,Comedy|Romance,Nicholas Ramsey|Donna Williams|David Huerta|Ch...,Kyle Luna|Daniel Gonzalez,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,Curtis Wright,Derrick Campbell,1995.0
4,5,Father of the Bride Part II,Comedy,David Rogers|Stanley Galloway,Howard Martinez,1995.0


************** Les utilisateurs *******


Unnamed: 0,UserID,Name,Gender,Age,Occupation,Zip_code
0,1,Mary Campbell,F,35,27,92606
1,2,Larry Shaw,M,15,22,6265
2,3,Suzanne Quinn,F,27,27,54865
3,4,Monique Hughes,F,62,2,1871
4,5,Justin Stone,M,47,21,99193


************** Les notes *******


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [19]:
def create_user_nodes(gds, users_df):
    '''
    Crée des noeuds d'utilisateur dans la base de données Neo4j.
        :param gds: Instance de GraphDataScience
        :param users_df: DataFrame contenant les informations des utilisateurs
        :return: Résultat de la requête Cypher
    '''
    # Ajouter la contrainte (si elle n'existe pas encore)
    gds.run_cypher('''
        CREATE CONSTRAINT IF NOT EXISTS 
        FOR (n:User) 
        REQUIRE n.id IS NODE KEY
    ''')

    # Inserer les Users
    result = gds.run_cypher('''
        UNWIND $data AS row
        MERGE (u:User {id: row.UserID})
        SET u.gender = row.Gender,
            u.age = row.Age
        RETURN count(*) AS users_created
    ''', params={'data': users_df.to_dict('records')})

    return result


def create_movie_nodes(gds, movies_df):
    '''
    Crée des noeuds de film dans la base de données Neo4j.
        :param gds: Instance de GraphDataScience
        :param movies_df: DataFrame contenant les informations des films
        :return: Résultat de la requête Cypher
    '''
    # Ajouter la contrainte (si elle n'existe pas encore)
    gds.run_cypher('''
        CREATE CONSTRAINT IF NOT EXISTS 
        FOR (n:Movie) 
        REQUIRE n.id IS NODE KEY
    ''')

    # Inserer les Movies
    result = gds.run_cypher('''
        UNWIND $data AS row
        MERGE (m:Movie {id: row.MovieID})
        SET m.title = row.Title,
            m.genres = row.Genres
        RETURN count(*) AS movies_created
    ''', params={'data': movies_df.to_dict('records')})

    return result



In [20]:
# Créer les utilisateurs
res_users = create_user_nodes(gds, users)
print(res_users)

# Créer les films
res_movies = create_movie_nodes(gds, movies)
print(res_movies)


   users_created
0         162541
   movies_created
0           62423


In [3]:
nodes = gds.run_cypher('''
    MATCH (u:User) RETURN count(u);
''') 
nodes.head()

Unnamed: 0,count(u)
0,162541


In [4]:
nodes = gds.run_cypher('''
    MATCH (m:Movie) RETURN count(m);
''')
nodes.head()

Unnamed: 0,count(m)
0,62423


In [23]:
def create_rated_relationships(gds, ratings_df, chunk_size=200):
    i = 1
    for chunk in np.array_split(ratings_df, chunk_size):
        print(f"Chunk {i}/{chunk_size}")
        result = gds.run_cypher('''
            UNWIND $data AS row
            MATCH (u:User {id: row.UserID}), (m:Movie {id: row.MovieID})
            MERGE (u)-[r:RATED]->(m)
            SET r.rating = row.Rating
            RETURN count(*) AS created
        ''', params={'data': chunk.to_dict('records')})
        print(result.head())
        i += 1
    return result

In [24]:
# Créer les relations RATED
res_rated = create_rated_relationships(gds, ratings)
print(res_rated)

  return bound(*args, **kwds)


Chunk 1/200
   created
0   125001
Chunk 2/200
   created
0   125001
Chunk 3/200
   created
0   125001
Chunk 4/200
   created
0   125001
Chunk 5/200
   created
0   125001
Chunk 6/200
   created
0   125001
Chunk 7/200
   created
0   125001
Chunk 8/200
   created
0   125001
Chunk 9/200
   created
0   125001
Chunk 10/200
   created
0   125001
Chunk 11/200
   created
0   125001
Chunk 12/200
   created
0   125001
Chunk 13/200
   created
0   125001
Chunk 14/200
   created
0   125001
Chunk 15/200
   created
0   125001
Chunk 16/200
   created
0   125001
Chunk 17/200
   created
0   125001
Chunk 18/200
   created
0   125001
Chunk 19/200
   created
0   125001
Chunk 20/200
   created
0   125001
Chunk 21/200
   created
0   125001
Chunk 22/200
   created
0   125001
Chunk 23/200
   created
0   125001
Chunk 24/200
   created
0   125001
Chunk 25/200
   created
0   125001
Chunk 26/200
   created
0   125001
Chunk 27/200
   created
0   125001
Chunk 28/200
   created
0   125001
Chunk 29/200
   created
0   1

In [5]:
nodes = gds.run_cypher('''
    MATCH (u:User)-[r:RATED]->(m:Movie) RETURN count(r);
''')
nodes.head()

Unnamed: 0,count(r)
0,25000095


<!-- image from image -->
![image](images/view_relation.png)

In [5]:
def get_similar_movies(title):
    """
    Récupère la liste des films similaires à un film donné en fonction des utilisateurs 
    ayant noté les deux films avec la note maximale (5).

    Cette fonction interroge la base de données Neo4j pour trouver les films qui partagent 
    des utilisateurs en commun ayant attribué une note de 5 à ces films et au film cible.

    Args:
        title (str): Le titre exact du film pour lequel trouver des films similaires.

    Returns:
        pd.DataFrame: Un DataFrame contenant :
            - title (str): Le titre des films similaires.
            - genres (str): Les genres des films similaires.
            - common_users (int): Le nombre d'utilisateurs ayant noté les deux films avec la note 5.
    """
    
    query = '''
    MATCH (m1:Movie)-[r1:RATED]-(u:User)-[r2:RATED]-(m2:Movie)
    WHERE m1.title = $title
      AND m2.title <> $title
      AND r1.rating = 5 
      AND r2.rating = 5
    RETURN m2.title AS title, m2.genres AS genres, count(DISTINCT u) AS common_users
    ORDER BY common_users DESC
    '''

    result = gds.run_cypher(query, params={'title': title})
    return result


similar_movies = get_similar_movies('1408')
similar_movies.head()


Unnamed: 0,title,genres,common_users
0,Fight Club,Action|Crime|Drama|Thriller,109
1,Matrix The,Action|Sci-Fi|Thriller,107
2,Dark Knight The,Action|Crime|Drama|IMAX,102
3,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,96
4,Sixth Sense The,Drama|Horror|Mystery,94


In [6]:
def get_similar_movies(title, min_rating=5):
    """
    Retourne les films similaires basés sur les utilisateurs ayant noté les deux films avec une note minimale.

    Args:
        title (str): Le titre du film pour lequel on veut trouver des films similaires.
        min_rating (int): La note minimale que les utilisateurs doivent avoir donnée pour être considérés.

    Returns:
        pd.DataFrame: Liste des films similaires avec le nombre d'utilisateurs en commun.
    """
    
    query = '''
    MATCH (m1:Movie)-[r1:RATED]-(u:User)-[r2:RATED]-(m2:Movie)
    WHERE m1.title = $title
      AND m2.title <> $title
      AND r1.rating >= $min_rating 
      AND r2.rating >= $min_rating
    RETURN m2.title AS title, m2.genres AS genres, count(DISTINCT u) AS common_users
    ORDER BY common_users DESC
    '''

    result = gds.run_cypher(query, params={'title': title, 'min_rating': min_rating})
    return result


similar_movies = get_similar_movies('I Am Legend')
similar_movies.head()


Unnamed: 0,title,genres,common_users
0,Matrix The,Action|Sci-Fi|Thriller,562
1,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,508
2,Shawshank Redemption The,Crime|Drama,492
3,Dark Knight The,Action|Crime|Drama|IMAX,467
4,Lord of the RingsThe Return of the King The,Action|Adventure|Drama|Fantasy,459


In [7]:
def get_similar_movies(gds, title, min_rating=5):
    """
    Récupère la liste des films similaires à un film donné en fonction des utilisateurs
    ayant attribué une note égale ou supérieure à la note minimale spécifiée.

    Cette fonction utilise une requête Cypher pour identifier les films qui sont connectés
    par des utilisateurs ayant attribué une note >= min_rating à la fois au film cible
    et aux autres films. Les résultats sont triés par le nombre d'utilisateurs communs.

    Args:
        gds (GraphDataScience): L'instance de connexion au serveur Neo4j via GraphDataScience.
        title (str): Le titre exact du film pour lequel rechercher des films similaires.
        min_rating (int, optional): La note minimale que les utilisateurs doivent avoir attribuée pour que le film soit considéré comme similaire. Par défaut à 5.

    Returns:
        pd.DataFrame: Un DataFrame contenant :
            - title (str): Le titre des films similaires.
            - genres (str): Le ou les genres des films similaires.
            - common_users (int): Le nombre d'utilisateurs en commun ayant donné une note >= min_rating aux deux films.
    """

    query = '''
    MATCH (m1:Movie)-[r1:RATED]-(u:User)-[r2:RATED]-(m2:Movie)
    WHERE m1.title = $title
      AND m2.title <> $title
      AND r1.rating >= $min_rating
      AND r2.rating >= $min_rating
    RETURN m2.title AS title, m2.genres AS genres, count(DISTINCT u) AS common_users
    ORDER BY common_users DESC
    '''
    
    result = gds.run_cypher(query, params={'title': title, 'min_rating': min_rating})
    return result


similar_movies = get_similar_movies(gds, "I Am Legend", min_rating=4)
similar_movies.head()

Unnamed: 0,title,genres,common_users
0,Matrix The,Action|Sci-Fi|Thriller,4311
1,Shawshank Redemption The,Crime|Drama,3878
2,Dark Knight The,Action|Crime|Drama|IMAX,3837
3,Lord of the RingsThe Fellowship of the Ring The,Adventure|Fantasy,3702
4,Lord of the RingsThe Return of the King The,Action|Adventure|Drama|Fantasy,3681


In [None]:
gds.run_cypher("CALL gds.graph.drop('myGraph', false)")

create_projection = gds.run_cypher('''
CALL gds.graph.project(
  'myGraph',
  ['User', 'Movie'],
  {
    RATED: {
      properties: 'rating'
    }
  }
);
''')

create_projection.head()


In [24]:
gds.run_cypher("CALL gds.graph.drop('myGraphFiltered', false)")

create_projection = gds.run_cypher('''
CALL gds.graph.project.cypher(
  'myGraphFiltered',
  '
  MATCH (u:User)
  WHERE COUNT { (u)-[:RATED]->() } >= 2000
  RETURN id(u) AS id
  ',
  '
  MATCH (u1:User)-[r:RATED]->(m:Movie)
  RETURN id(u1) AS source, id(m) AS target, r.rating AS rating
  ',
  {validateRelationships: false}
)
''')

create_projection.head()


Unnamed: 0,nodeQuery,relationshipQuery,graphName,nodeCount,relationshipCount,projectMillis
0,MATCH (u:User)\n WHERE COUNT { (u)-[:RATED]->...,MATCH (u1:User)-[r:RATED]->(m:Movie)\n RETURN...,myGraphFiltered,436,0,6448


In [None]:
# MATCH (u:User {id: 1})-[:RATED]->(m1:Movie)
# WITH collect(DISTINCT m1.genres) AS genres_list
# UNWIND genres_list AS genre
# MATCH (m2:Movie)
# WHERE genre IN m2.genres AND NOT EXISTS {
#     MATCH (:User {id: 1})-[:RATED]->(m2)
# }
# RETURN m2.title, m2.genres
# LIMIT 5



In [25]:
users_similarity = gds.run_cypher('''
CALL gds.nodeSimilarity.stream('myGraphFiltered', {
  topK: 5,
  similarityCutoff: 0.6
})
YIELD node1, node2, similarity
RETURN 
  gds.util.asNode(node1).id AS UserID1,
  gds.util.asNode(node2).id AS UserID2,
  similarity
ORDER BY similarity DESCENDING, UserID1, UserID2
LIMIT 50
''')

users_similarity.head()


ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.nodeSimilarity.stream`: Caused by: java.lang.IllegalStateException: Memory required to run Node Similarity (97808b) exceeds available memory (-85895359152b)}

In [9]:
# Create Similar relationship
i=1
for chunk in np.array_split(users_similarity.query('UserID1>UserID2'),10):
  print(i)
  create_similar = gds.run_cypher('''
    unwind $data as row
    match (u1:User{id: row.UserID1}), (u2:User{id: row.UserID2})
    merge (u1)-[r:SIMILAR]->(u2)
    set r.Similarity=row.similarity
    return count(*) as create_rated
    ''', params = {'data': chunk.to_dict('records')})
  i = i+1
create_similar.head()

1
2
3
4
5
6
7
8
9
10


  return bound(*args, **kwds)


Unnamed: 0,create_rated
0,0


In [None]:
# Check similar movies
similar_movies_for_user = gds.run_cypher('''
    MATCH (u1:User)-[r1:SIMILAR]-(u2)-[r2:RATED]-(m:Movie)
    WHERE id(u1)=$id
    AND NOT ( (u1)-[]-(m))
    RETURN m.Title,m.Genres,Sum(r1.Similarity*r2.Rating)/sum(r1.Similarity)+log(count(r2)) as score
    ORDER BY score DESC
''',params = {'id':4725})
similar_movies_for_user.head(10)

In [None]:
# Check actual movies
movies_for_user = gds.run_cypher('''
    MATCH (u1:User)-[r:RATED]-(m:Movie)
  WHERE id(u1)=$id
  RETURN m.Title,m.Genres,r.Rating as rating
  ORDER BY rating DESC
''',params = {'id':4725})
movies_for_user.head(10)