In [1]:
import sparknlp

from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
# Alternative bootstrap (not used here): spark = sparknlp.start()
#   - GPU training: sparknlp.start(gpu=True)
#   - Spark 2.3:    sparknlp.start(spark23=True)
# The session is configured by hand instead: local master with 2 threads,
# 8G driver memory, unlimited driver result size, a 2000M Kryo buffer ceiling,
# and the PostgreSQL JDBC driver requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[2]")
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.5.0")
    .getOrCreate()
)

# Report the library versions in use for this run.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 4.2.2
Apache Spark version: 3.3.1


In [2]:
# Load the keyword-frequency Parquet dataset and preview the first rows.
df_keyword = spark.read.format("parquet").load('./keywordFrequency.parquet/')
df_keyword.show()

+------------------+-----+
|               col|count|
+------------------+-----+
|            waters|  151|
|         first day|   46|
|  noah and entered|    2|
|        great deep|    4|
|             still|  276|
|        may canaan|    2|
|          ashkenaz|    3|
|men moved eastward|    1|
|             serug|    6|
|            spared|   24|
| anything too hard|    2|
|    sake of twenty|    1|
|      two men said|    1|
|        swept away|   10|
|         ammonites|   78|
|              hazo|    1|
|       master made|    1|
|           jewelry|   11|
|         go toward|    1|
|        loved esau|    1|
+------------------+-----+
only showing top 20 rows



In [3]:
from pyspark.sql.functions import *
   
# Attach a generated 64-bit "ID" column to every keyword row.
# monotonically_increasing_id() yields unique, increasing values, but they are
# not guaranteed to be consecutive once the data spans several partitions.
row_id_expr = monotonically_increasing_id()
df_keyword = df_keyword.withColumn("ID", row_id_expr)

# Preview the rows, then confirm the resulting schema.
df_keyword.show()
df_keyword.printSchema()

+------------------+-----+---+
|               col|count| ID|
+------------------+-----+---+
|            waters|  151|  0|
|         first day|   46|  1|
|  noah and entered|    2|  2|
|        great deep|    4|  3|
|             still|  276|  4|
|        may canaan|    2|  5|
|          ashkenaz|    3|  6|
|men moved eastward|    1|  7|
|             serug|    6|  8|
|            spared|   24|  9|
| anything too hard|    2| 10|
|    sake of twenty|    1| 11|
|      two men said|    1| 12|
|        swept away|   10| 13|
|         ammonites|   78| 14|
|              hazo|    1| 15|
|       master made|    1| 16|
|           jewelry|   11| 17|
|         go toward|    1| 18|
|        loved esau|    1| 19|
+------------------+-----+---+
only showing top 20 rows

root
 |-- col: string (nullable = true)
 |-- count: long (nullable = true)
 |-- ID: long (nullable = false)



In [6]:
# Project the keyword table onto the column names used by the database write below:
# col -> name, count -> frequency, ID -> id.
df_keyword2 = df_keyword.select(
    df_keyword['col'].alias('name'),
    df_keyword['count'].alias('frequency'),
    df_keyword['ID'].alias('id'),
)
df_keyword2.show()

+------------------+---------+---+
|              name|frequency| id|
+------------------+---------+---+
|            waters|      151|  0|
|         first day|       46|  1|
|  noah and entered|        2|  2|
|        great deep|        4|  3|
|             still|      276|  4|
|        may canaan|        2|  5|
|          ashkenaz|        3|  6|
|men moved eastward|        1|  7|
|             serug|        6|  8|
|            spared|       24|  9|
| anything too hard|        2| 10|
|    sake of twenty|        1| 11|
|      two men said|        1| 12|
|        swept away|       10| 13|
|         ammonites|       78| 14|
|              hazo|        1| 15|
|       master made|        1| 16|
|           jewelry|       11| 17|
|         go toward|        1| 18|
|        loved esau|        1| 19|
+------------------+---------+---+
only showing top 20 rows



In [7]:

import os  # local import: only needed for the connection overrides in this cell

# Persist the renamed keyword table to PostgreSQL over JDBC.
# The connection URL, user and password can be supplied through environment
# variables so that credentials do not have to live in the notebook; the
# fallbacks reproduce the values this cell used before, so existing runs
# behave the same.
# NOTE(review): the fallback password is still embedded here - drop the default
# once PGPASSWORD is provisioned wherever this notebook runs.
jdbc_options = {
    "url": os.environ.get("PG_JDBC_URL", "jdbc:postgresql://192.168.1.39:5432/aiknowledge"),
    "dbtable": "public.articles_keyword",
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
}

# 'overwrite' mode replaces whatever the target table currently holds.
df_keyword2.write.format("jdbc").options(**jdbc_options).mode('overwrite').save()

In [None]:
from py2neo import Graph, Node, Relationship
import time
from tqdm import tqdm

import os  # local import: only needed for the Neo4j connection settings below

# `graph` was never created anywhere in the notebook, so open the connection here.
# NOTE(review): the URI and user fallbacks are assumptions - confirm them against
# the real Neo4j instance. The password has no fallback on purpose, so a missing
# NEO4J_PASSWORD fails loudly with a KeyError instead of guessing a credential.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ["NEO4J_PASSWORD"]),
)

# Create one `bible_keyword` node per keyword row, carrying its frequency.
# The rows are collected to the driver so tqdm can show a total.
for row in tqdm(df_keyword.collect()):
    # The keyword text lives in the `col` column; the frame has no `keyword` column.
    keyword = Node('bible_keyword', name=row['col'])
    keyword['count'] = row['count']
    graph.create(keyword)


  


### Write to Neo4j