### 1. Introduction
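This notebook loads the Netflix titles dataset and uses it to build a graph database in Neo4j AuraDB: movie, type, director, actor, and genre nodes, the relationships between them, and a vector index over the movie description embeddings.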

### 2. Import dependencies

In [None]:
import os
import streamlit as st
import pandas as pd
from langchain_neo4j import Neo4jGraph
from neo4j import GraphDatabase
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.graphs.graph_document import Node, Relationship
from datetime import datetime


### 3. Import dataset

In [None]:
# Location of the Netflix titles CSV. The default is the path this notebook has
# always used; set the NETFLIX_TITLES_CSV environment variable to load the file
# from another location without editing the notebook.
file_path = os.environ.get("NETFLIX_TITLES_CSV", "/src/netflix_db/netflix_titles.csv")
dataset = pd.read_csv(file_path)

In [3]:
# Peek at the first five rows to confirm the columns loaded as expected
dataset.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# Replace every missing value with the placeholder string "Unknown"
dataset = dataset.fillna("Unknown")

In [5]:
# Count the missing values left in each column after the fill
dataset.isna().sum(axis=0)

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

### 4. Create Graph

In [6]:
# Open the Neo4j connection; URI, username and password come from Streamlit secrets
secrets = st.secrets
graph = Neo4jGraph(
    url=secrets["NEO4J_URI"],
    username=secrets["NEO4J_USERNAME"],
    password=secrets["NEO4J_PASSWORD"],
)

# Embedding client used to vectorise text; the API key is read from the environment
embedding_provider = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)


In [48]:
# Destructive: deletes every node (DETACH also removes their relationships)
# so the graph can be rebuilt from scratch.
wipe_graph_query = "MATCH (n) DETACH DELETE n"
graph.query(wipe_graph_query)

[]

In [7]:
# Normalize data helper function
def normalize_list(data):
    """Return the string entries of ``data`` stripped and lower-cased.

    Args:
        data: Expected to be a list of strings (for example the result of
            ``str.split``). Any other type yields an empty list.

    Returns:
        A new list with every string stripped of surrounding whitespace and
        lower-cased. Entries that are not strings, or that are empty once
        stripped, are left out so they cannot end up as blank-named values.
    """
    if not isinstance(data, list):
        return []

    cleaned = []
    for item in data:
        # Skip None/NaN/other non-text values instead of raising AttributeError.
        if not isinstance(item, str):
            continue
        value = item.strip().lower()
        # A stray separator (e.g. a trailing ", ") produces an empty piece; drop it.
        if value:
            cleaned.append(value)
    return cleaned

# Embedding helper function
def list_embedding(list_of_items, embedding_provider):
    """Embed each item on its own and return the vectors in input order.

    Args:
        list_of_items: Iterable of items to embed.
        embedding_provider: Object exposing ``embed_query(item)``.

    Returns:
        A list holding the ``embed_query`` result for every item.
    """
    vectors = []
    for entry in list_of_items:
        vectors.append(embedding_provider.embed_query(entry))
    return vectors


In [None]:
# Create the uniqueness constraints.
# IF NOT EXISTS makes this cell safe to re-run: without it Neo4j raises an error
# when an equivalent constraint is already present.
constraint_statements = [
    # The ingestion loop in this notebook MERGEs Movie nodes on `id`, so that key
    # gets a uniqueness constraint (and the index that backs it) as well.
    "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Movie) REQUIRE m.id IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Movie) REQUIRE m.title IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:MovieType) REQUIRE n.movieType IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (d:Director) REQUIRE d.directorName IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (a:Actor) REQUIRE a.actorName IS UNIQUE",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (g:Genre) REQUIRE g.genre IS UNIQUE",
]

for statement in constraint_statements:
    graph.query(statement)



In [49]:
# Iterate over the dataset and create the graph.
#
# One Cypher statement is sent per CSV row. It upserts the Movie node, stores
# the description embedding on it, and links the movie to its type, directors,
# actors and genres.
#
# NOTE(review): missing values were filled with "Unknown" earlier in this
# notebook, so rows without a director/cast/country are linked to a shared
# "unknown" node/value — confirm this is the intended modelling.
for index, row in dataset.iterrows():
    print("Processing - ", index)

    # Query parameters. Multi-valued CSV fields are ", "-separated strings and
    # are split into normalized (stripped, lower-cased) lists.
    movie_properties = {
        "show_id": row['show_id'],
        "title": row['title'],
        "country": normalize_list(row['country'].split(", ")),
        "release_year": row['release_year'],
        "duration": row['duration'],
        "description": row['description'],
        "type": row['type'].lower(),
        "director": normalize_list(row['director'].split(", ")),
        "cast": normalize_list(row['cast'].split(", ")),
        "listed_in": normalize_list(row['listed_in'].split(", ")),
        # One embedding request per row for the description text.
        "descriptionEmbedding": embedding_provider.embed_query(row['description']),
    }

    # FOREACH is used for the list-valued fields instead of chained
    # WITH ... UNWIND stages: chained UNWINDs multiply the row stream (the cast
    # MERGEs ran once per director, the genre MERGEs once per director x actor)
    # and an empty list in one stage would silently skip every later stage.
    graph.query(
        """
    MERGE (m:Movie {id: $show_id})
    ON CREATE SET
        m.title = $title,
        m.description = $description,
        m.country = $country,
        m.release_year = $release_year,
        m.duration = $duration
    WITH m, $descriptionEmbedding AS descriptionEmbedding
    CALL db.create.setNodeVectorProperty(m, 'descriptionEmbedding', descriptionEmbedding)

    MERGE (mt:MovieType {movieType: $type})
    MERGE (m)-[:IS_TYPE]->(mt)

    FOREACH (directorName IN $director |
        MERGE (d:Director {directorName: directorName})
        MERGE (m)-[:DIRECTED_BY]->(d)
    )

    FOREACH (actorName IN $cast |
        MERGE (a:Actor {actorName: actorName})
        MERGE (a)-[:ACTED_IN]->(m)
    )

    FOREACH (genreName IN $listed_in |
        MERGE (g:Genre {genre: genreName})
        MERGE (m)-[:IN_GENRE]->(g)
    )
    """,
        movie_properties,
    )



Processing -  0
Processing -  1
Processing -  2
Processing -  3
Processing -  4
Processing -  5
Processing -  6
Processing -  7
Processing -  8
Processing -  9
Processing -  10
Processing -  11
Processing -  12
Processing -  13
Processing -  14
Processing -  15
Processing -  16
Processing -  17
Processing -  18
Processing -  19
Processing -  20
Processing -  21
Processing -  22
Processing -  23
Processing -  24
Processing -  25
Processing -  26
Processing -  27
Processing -  28
Processing -  29
Processing -  30
Processing -  31
Processing -  32
Processing -  33
Processing -  34
Processing -  35
Processing -  36
Processing -  37
Processing -  38
Processing -  39
Processing -  40
Processing -  41
Processing -  42
Processing -  43
Processing -  44
Processing -  45
Processing -  46
Processing -  47
Processing -  48
Processing -  49
Processing -  50
Processing -  51
Processing -  52
Processing -  53
Processing -  54
Processing -  55
Processing -  56
Processing -  57
Processing -  58
Process

In [None]:
# Build (only if missing) the vector index over Movie.descriptionEmbedding:
# 1536-dimensional vectors compared with cosine similarity.
movie_vector_index_query = """
    CREATE VECTOR INDEX MovieVector IF NOT EXISTS
    FOR (m:Movie)
    ON m.descriptionEmbedding
    OPTIONS {indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
            }}
        """
graph.query(movie_vector_index_query)

[]