<a href="https://colab.research.google.com/github/conker84/gc-2k22-spark/blob/main/neo4j_spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Leveraging Neo4j with Apache Spark

## Environment Configuration

In [None]:
# Install a headless OpenJDK 8 for Spark to run on. Output is suppressed
# (-qq plus the redirect to /dev/null). JAVA_HOME is pointed at this JDK
# in a later cell.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz

In [None]:
# Unpack the Spark distribution into the current working directory.
# SPARK_HOME is later set to /content/spark-3.2.1-bin-hadoop3.2, so this
# cell is expected to run from /content (the Colab default — confirm if
# running elsewhere).
!tar xf spark-3.2.1-bin-hadoop3.2.tgz

In [None]:
!pip install -q findspark

In [None]:
import os

# Tell Spark which JDK to use and where the unpacked distribution lives.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
})

In [None]:
# Initialise findspark so that `pyspark` can be imported in the cells below.
# NOTE(review): findspark.init() is called without arguments, so it presumably
# relies on the SPARK_HOME environment variable set in the previous cell.
import findspark
findspark.init()

### Download the Dataset

In [None]:
# Fetch the sample CSV archive (quietly, -q) into the current directory.
!wget -q https://s3.amazonaws.com/dev.assets.neo4j.com/wp-content/uploads/desktop-csv-import.zip

In [None]:
!unzip desktop-csv-import.zip

### Configure the Spark Environment

In [None]:
# Ask for the Neo4j connection URL. Surrounding whitespace (easy to pick up
# when pasting) is stripped so it does not end up inside the URL.
neo4j_url = input('Neo4j URL: ').strip()

In [None]:
# Ask for the Neo4j username; stray leading/trailing whitespace is removed.
neo4j_user = input('Neo4j User: ').strip()

In [None]:
from getpass import getpass

# Read the password with getpass so it is not echoed into the cell output,
# where it would be saved and shared together with the notebook.
neo4j_password = getpass('Neo4j Password: ')

In [None]:
from pyspark.sql import SparkSession

# Session-wide settings, applied in the order listed.
spark_settings = {
    'spark.ui.port': '4050',
    # Just to show dataframes as tables
    'spark.sql.repl.eagerEval.enabled': True,
    # Neo4j Connector for Apache Spark, pulled in as a Spark package
    'spark.jars.packages': 'org.neo4j:neo4j-connector-apache-spark_2.12:4.1.2_for_spark_3',
    # The notebook always talks to the same database instance, so the
    # connection details are defined once on the session instead of being
    # repeated on every read/write.
    "neo4j.url": neo4j_url,
    "neo4j.authentication.type": "basic",
    "neo4j.authentication.basic.username": neo4j_user,
    "neo4j.authentication.basic.password": neo4j_password,
}

session_builder = SparkSession.builder.master('local[*]')
session_builder = session_builder.appName('Leverage Neo4j with Apache Spark')
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

## Explore the Dataset

In [None]:
# Load the products CSV and let Spark infer the column types. No header
# option is set, so the columns get Spark's default positional names.
df = spark.read.csv('/content/desktop-csv-import/products.csv', inferSchema=True)

In [None]:
# Inspect the inferred schema. The columns still carry the default
# _c0/_c1/_c2 names at this point; a later cell renames them.
df.printSchema()

In [None]:
# Preview the first rows of the raw data as a plain-text table.
df.show()

In [None]:
# Replace the positional default column names with meaningful ones,
# one column at a time.
df = df.withColumnRenamed('_c0', 'id')
df = df.withColumnRenamed('_c1', 'name')
df = df.withColumnRenamed('_c2', 'price')

In [None]:
# Check the schema again after the rename to id / name / price.
df.printSchema()

In [None]:
# Bare last expression: the session was created with
# spark.sql.repl.eagerEval.enabled so DataFrames are shown as tables.
df