In [1]:
%load_ext autoreload
%autoreload 2

# Neo4j v2.0.1

**Descarga**
- [Descargar Neo4j Desktop v2.0.1 (macOS)](https://neo4j.com/download/neo4j-desktop/?edition=desktop&flavour=osx&release=2.0.1&offline=false)

**Configuración**
1. Instalar Neo4j Desktop.
2. Abrir la app y crear un nuevo proyecto con "Add".
3. Dentro del proyecto, seleccionar "Add Graph" → "Local DBMS".
4. Configurar:
   - Name: `BDNR_ml-25m`
   - Password: `bdnr2025`
   - Versión usada: `2025.05.0`
5. Hacer clic en "Create".

**Conexión**
```
neo4j://127.0.0.1:7687
```

In [4]:
import json
from src.benchmark.utils import get_system_info
import py2neo

print("py2neo Version:", py2neo.__version__)
print("System Info:\n", json.dumps(get_system_info(), indent=3))

py2neo Version: 2021.2.4
System Info:
 {
   "python_version": "3.12.1",
   "system": {
      "os": "Darwin",
      "release": "24.5.0",
      "machine": "x86_64",
      "processor": "i386"
   },
   "cpu": {
      "physical_cores": 8,
      "total_cores": 8,
      "frequency_mhz": 2400
   },
   "memory": {
      "total_ram_gb": 16.0,
      "available_ram_gb": 0.76
   }
}


# Data Load

In [10]:
from py2neo import Graph

graph = Graph("bolt://localhost:7687", auth=("neo4j", "bdnr2025"))

In [12]:
try:
    graph.run("RETURN 1").evaluate()
    print("Successfully connected to Neo4j")
except Exception as e:
    print(f"Connection failed: {e}")

Successfully connected to Neo4j


In [13]:
databases = graph.run("SHOW DATABASES").data()
print("Databases:", [db["name"] for db in databases])

Databases: ['neo4j', 'system']


In [27]:
from pprint import pprint

result = graph.run("SHOW DATABASES").data()
for db in result:
    print(f"\n{db['name']}")
    pprint(db, indent=2, width=100, depth=3, sort_dicts=False)


neo4j
{ 'name': 'neo4j',
  'type': 'standard',
  'aliases': [],
  'access': 'read-write',
  'address': 'localhost:7687',
  'role': 'primary',
  'writer': True,
  'requestedStatus': 'online',
  'currentStatus': 'online',
  'statusMessage': '',
  'default': True,
  'home': True,
  'constituents': []}

system
{ 'name': 'system',
  'type': 'system',
  'aliases': [],
  'access': 'read-write',
  'address': 'localhost:7687',
  'role': 'primary',
  'writer': True,
  'requestedStatus': 'online',
  'currentStatus': 'online',
  'statusMessage': '',
  'default': False,
  'home': False,
  'constituents': []}


In [28]:
import pandas as pd

# Read CSV files
base_path = "/Users/efaliveni//Downloads/ml-25m"

ratings = pd.read_csv(f"{base_path}/ratings.csv")
movies = pd.read_csv(f"{base_path}/movies.csv")
tags = pd.read_csv(f"{base_path}/tags.csv")
genome_tags = pd.read_csv(f"{base_path}/genome-tags.csv")
genome_scores = pd.read_csv(f"{base_path}/genome-scores.csv")

In [30]:
# Filter Data
SAMPLE_SIZE = 500

selected_users = ratings['userId'].unique()[:SAMPLE_SIZE]
filtered_ratings = ratings[ratings['userId'].isin(selected_users)]
filtered_movie_ids = filtered_ratings['movieId'].unique()
filtered_movies = movies[movies['movieId'].isin(filtered_movie_ids)]
filtered_tags = tags[tags['movieId'].isin(filtered_movie_ids)]
filtered_scores = genome_scores[genome_scores['movieId'].isin(filtered_movie_ids)]

In [31]:
filtered_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
60581,203649,Beats (2019),Children|Drama
60937,204542,It: Chapter Two (2019),Horror
61002,204692,In the Tall Grass,Drama|Horror|Thriller
61005,204698,Joker (2019),Crime|Drama|Thriller


In [None]:
import re

def extract_year(title):
    match = re.search(r"\((\d{4})\)$", title)
    return int(match.group(1)) if match else None

# Process Data 
filtered_movies.loc[:, "genres"] = filtered_movies["genres"].fillna("").apply(
    lambda x: x.split("|") if x else []
)
filtered_movies.loc[:, "year"] = filtered_movies["title"].apply(
    extract_year
).astype("Int64")

### Batch Load: UNWIND & MERGE

In [33]:
def load_ratings(ratings_df: pd.DataFrame, batch_size: int = 1000) -> None:
    """Creates User and Movie nodes with RATED relationships."""
    rating_data = ratings_df.to_dict("records")
    
    query = """
    UNWIND $batch AS row
    MERGE (u:User {userId: row.userId})
    MERGE (m:Movie {movieId: row.movieId})
    MERGE (u)-[r:RATED]->(m)
    SET r.rating = row.rating, 
        r.timestamp = datetime({epochSeconds: row.timestamp})
    """
    
    # Process in batches to avoid memory issues
    for i in range(0, len(rating_data), batch_size):
        batch = rating_data[i:i + batch_size]
        graph.run(query, batch=batch)


def load_movies(movies_df: pd.DataFrame, batch_size: int = 1000) -> None:
    """Creates Movie nodes."""
    movie_data = movies_df.to_dict("records")
      
    query = """
    UNWIND $batch AS row
    MERGE (m:Movie {movieId: row.movieId})
    SET m.title = row.title,
        m.releaseYear = row.year,
        m.genres = CASE 
            WHEN row.genres IS NULL THEN []
            WHEN row.genres = '' THEN []
            ELSE [genre IN split(row.genres, '|') WHERE genre <> '']
        END
    """
    
    for i in range(0, len(movie_data), batch_size):
        batch = movie_data[i:i + batch_size]
        graph.run(query, batch=batch)


def load_tags(tags_df: pd.DataFrame, batch_size: int = 1000) -> None:
    """Creates TAGGED relationships between existing Users and Movies"""
    # Clean data - remove null tags
    tag_data = tags_df.dropna(subset=["tag"]).to_dict("records")
    
    query = """
    UNWIND $batch AS row
    MATCH (u:User {userId: row.userId})
    MATCH (m:Movie {movieId: row.movieId})
    MERGE (u)-[t:TAGGED]->(m)
    SET t.tag = row.tag
    """
    
    for i in range(0, len(tag_data), batch_size):
        batch = tag_data[i:i + batch_size]
        graph.run(query, batch=batch)


def load_genome_tags(genome_tags_df: pd.DataFrame, batch_size: int = 1000) -> None:
    """Load genome tags as GenomeTag nodes"""
    genome_tag_data = genome_tags_df.to_dict("records")
    
    query = """
    UNWIND $batch AS row
    MERGE (gt:GenomeTag {tagId: row.tagId})
    SET gt.tag = row.tag
    """
    
    for i in range(0, len(genome_tag_data), batch_size):
        batch = genome_tag_data[i:i + batch_size]
        graph.run(query, batch=batch)


def load_relevance_scores(
        scores_df: pd.DataFrame, 
        threshold: float = 0.8, 
        batch_size: int = 1000
) -> None:
    """Load relevance scores between movies and genome tags"""
    # Filter by relevance threshold
    filtered_scores = scores_df[scores_df['relevance'] >= threshold]
    score_data = filtered_scores.to_dict("records")
    
    query = """
    UNWIND $batch AS row
    MATCH (m:Movie {movieId: row.movieId})
    MATCH (gt:GenomeTag {tagId: row.tagId})
    MERGE (m)-[r:HAS_RELEVANCE]->(gt)
    SET r.score = row.relevance
    """
    
    for i in range(0, len(score_data), batch_size):
        batch = score_data[i:i + batch_size]
        graph.run(query, batch=batch)

In [37]:
# graph.run("MATCH (n) DETACH DELETE n")

In [38]:
import time

start_time = time.time()

stats = {
    'movies_loaded': 0,
    'users_loaded': 0,
    'tags_created': 0,
    'execution_time': 0,
    'genome_tags_created': 0,
    'relevance_relations_created': 0
}

try:
    print("Starting data loading process...")
    
    # Load movies (foundational nodes)
    print("\nLoading movies...")
    movie_start = time.time()
    load_movies(filtered_movies)
    stats['movies_loaded'] = len(filtered_movies)
    print(f"Movies loaded in {time.time() - movie_start:.2f}s")
    
    # Load users through ratings
    print("\nLoading ratings (creates users)...")
    ratings_start = time.time()
    load_ratings(filtered_ratings)
    stats['users_loaded'] = len(filtered_ratings)
    print(f"Ratings loaded in {time.time() - ratings_start:.2f}s")
    
    # Load tags
    print("\nLoading tags...")
    tags_start = time.time()
    load_tags(filtered_tags)
    stats['tags_created'] = len(filtered_tags)
    print(f"Tags loaded in {time.time() - tags_start:.2f}s")
    
   # Load genome tags
    print("\nLoading genome tags...")
    genome_tags_start = time.time()
    load_genome_tags(genome_tags)
    stats['genome_tags_created'] = len(genome_tags)
    print(f"Genome tags loaded in {time.time() - genome_tags_start:.2f}s")
    
    # Load relevance scores
    print("\nLoading relevance scores...")
    relevance_start = time.time()
    load_relevance_scores(filtered_scores)
    stats['relevance_relations_created'] = len(filtered_scores)
    print(f"Relevance scores loaded in {time.time() - relevance_start:.2f}s")


except Exception as e:
    print(f"\nError during data loading: {str(e)}")
    raise
finally:
    stats['execution_time'] = time.time() - start_time
    print(f"\nTotal processing time: {stats['execution_time']:.2f} seconds")
    print(f"Summary: {stats}")

Starting data loading process...

Loading movies...
Movies loaded in 4.52s

Loading ratings (creates users)...
Ratings loaded in 75.67s

Loading tags...
Tags loaded in 57.65s

Loading genome tags...
Genome tags loaded in 0.16s

Loading relevance scores...
Relevance scores loaded in 82.77s

Total processing time: 220.76 seconds
Summary: {'movies_loaded': 7141, 'users_loaded': 62834, 'tags_created': 811156, 'execution_time': 220.7614848613739, 'genome_tags_created': 1128, 'relevance_relations_created': 7641072}


In [41]:
# Total de nodos por tipo
def contar_nodos():
    query = """
    MATCH (n)
    RETURN labels(n)[0] AS tipo, count(*) AS cantidad
    """
    return graph.run(query).to_table()

# Total de relaciones por tipo
def contar_relaciones():
    query = """
    MATCH ()-[r]->()
    RETURN type(r) AS relacion, count(*) AS cantidad
    """
    return graph.run(query).to_table()

# Usuarios sin ratings
def usuarios_sin_rating():
    query = """
    MATCH (u:User)
    WHERE NOT (u)-[:RATED]->()
    RETURN count(u) AS usuarios_sin_rating
    """
    return graph.run(query).to_table()

# Películas sin género ni rating ni genome tags
def peliculas_huerfanas():
    query = """
    MATCH (m:Movie)
    WHERE NOT (m)-[:HAS_GENRE]->()
      AND NOT (m)<-[:RATED]-()
      AND NOT (m)-[:HAS_RELEVANCE]->()
    RETURN count(m) AS peliculas_sin_relaciones
    """
    return graph.run(query).to_table()


In [42]:
print("Nodos por tipo:")
print(contar_nodos())

print("\nRelaciones por tipo:")
print(contar_relaciones())

print("\nUsuarios sin ratings:")
print(usuarios_sin_rating())

print("\nPelículas sin ningún vínculo:")
print(peliculas_huerfanas())


Nodos por tipo:
 tipo      | cantidad 
-----------|----------
 Movie     |     7141 
 User      |      500 
 GenomeTag |     1128 


Relaciones por tipo:
 relacion      | cantidad 
---------------|----------
 HAS_RELEVANCE |    83312 
 RATED         |    62834 
 TAGGED        |      313 


Usuarios sin ratings:
 usuarios_sin_rating 
---------------------
                   0 


Películas sin ningún vínculo:
 peliculas_sin_relaciones 
--------------------------
                        0 

