In [1]:
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType, DataType,ArrayType
from pyspark.sql.functions import udf, struct
from pyspark.ml import Pipeline
from IPython.display import display, HTML
import re

In [2]:
import sparknlp

from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
# spark = sparknlp.start() # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[2]")\
    .config("spark.driver.memory","8G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.3,org.postgresql:postgresql:42.5.0")\
    .getOrCreate()
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 4.2.2
Apache Spark version: 3.3.1


In [3]:
### bible keyword frequency
df_keyword=spark.read.parquet('./bibleKeyword2.parquet/')       
df_keyword.show()
df_keyword=df_keyword.selectExpr('*','unique_keywords as keywords')                  
df_keyword.show()      


+--------+----+-------+-----+--------------------+--------------------+------+--------+-------------+------+
|      ID|book|chapter|verse|                text|     unique_keywords|OsisID|BookName|TotalChapters|Volume|
+--------+----+-------+-----+--------------------+--------------------+------+--------+-------------+------+
| Gen_1_1|   1|      1|    1|In the beginning ...|[beginning, god, ...|   Gen| Genesis|           50|    OT|
| Gen_1_2|   1|      1|    2|Now the earth was...|[earth, formless,...|   Gen| Genesis|           50|    OT|
| Gen_1_3|   1|      1|    3|And God said, "Le...|[god, said, light...|   Gen| Genesis|           50|    OT|
| Gen_1_4|   1|      1|    4|God saw that the ...|[god, saw, light,...|   Gen| Genesis|           50|    OT|
| Gen_1_5|   1|      1|    5|God called the li...|[god, called, lig...|   Gen| Genesis|           50|    OT|
| Gen_1_6|   1|      1|    6|And God said, "Le...|[god, said, expan...|   Gen| Genesis|           50|    OT|
| Gen_1_7|   1|    

In [4]:
from pyspark.sql.functions import *

length=df_keyword.count()
length     
df_keyword = df_keyword.withColumn("number", monotonically_increasing_id())
df_keyword.select('text','number').show()

+--------------------+------+
|                text|number|
+--------------------+------+
|In the beginning ...|     0|
|Now the earth was...|     1|
|And God said, "Le...|     2|
|God saw that the ...|     3|
|God called the li...|     4|
|And God said, "Le...|     5|
|So God made the e...|     6|
|God called the ex...|     7|
|And God said, "Le...|     8|
|God called the dr...|     9|
|Then God said, "L...|    10|
|The land produced...|    11|
|And there was eve...|    12|
|And God said, "Le...|    13|
|and let them be l...|    14|
|God made two grea...|    15|
|God set them in t...|    16|
|to govern the day...|    17|
|And there was eve...|    18|
|And God said, "Le...|    19|
+--------------------+------+
only showing top 20 rows



In [5]:
df_keyword.select('ID','text','number').show()

+--------+--------------------+------+
|      ID|                text|number|
+--------+--------------------+------+
| Gen_1_1|In the beginning ...|     0|
| Gen_1_2|Now the earth was...|     1|
| Gen_1_3|And God said, "Le...|     2|
| Gen_1_4|God saw that the ...|     3|
| Gen_1_5|God called the li...|     4|
| Gen_1_6|And God said, "Le...|     5|
| Gen_1_7|So God made the e...|     6|
| Gen_1_8|God called the ex...|     7|
| Gen_1_9|And God said, "Le...|     8|
|Gen_1_10|God called the dr...|     9|
|Gen_1_11|Then God said, "L...|    10|
|Gen_1_12|The land produced...|    11|
|Gen_1_13|And there was eve...|    12|
|Gen_1_14|And God said, "Le...|    13|
|Gen_1_15|and let them be l...|    14|
|Gen_1_16|God made two grea...|    15|
|Gen_1_17|God set them in t...|    16|
|Gen_1_18|to govern the day...|    17|
|Gen_1_19|And there was eve...|    18|
|Gen_1_20|And God said, "Le...|    19|
+--------+--------------------+------+
only showing top 20 rows



In [6]:

df_keyword=df_keyword.select('*',transform("keywords", lambda x: trim(x)).alias("keywords2"))

In [13]:
df_keyword.select('text','keywords2').show()

+--------------------+--------------------+
|                text|           keywords2|
+--------------------+--------------------+
|In the beginning ...|[beginning, god, ...|
|Now the earth was...|[earth, formless,...|
|And God said, "Le...|[god, said, light...|
|God saw that the ...|[god, saw, light,...|
|God called the li...|[god, called, lig...|
|And God said, "Le...|[god, said, expan...|
|So God made the e...|[god, made, expan...|
|God called the ex...|[god, called, exp...|
|And God said, "Le...|[god, said, water...|
|God called the dr...|[god, called, dry...|
|Then God said, "L...|[god, said, land,...|
|The land produced...|[land, produced, ...|
|And there was eve...|[evening, third, ...|
|And God said, "Le...|[god, said, light...|
|and let them be l...|[let, lights, exp...|
|God made two grea...|[god, made, two, ...|
|God set them in t...|[god, set, expans...|
|to govern the day...|[govern, day, nig...|
|And there was eve...|[evening, fourth,...|
|And God said, "Le...|[god, said

In [16]:

df2 = df_keyword.select('*',explode(df_keyword.keywords2))
df2.printSchema()
df2.select('text','keywords2','col').show()

root
 |-- ID: string (nullable = true)
 |-- book: integer (nullable = true)
 |-- chapter: integer (nullable = true)
 |-- verse: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- unique_keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- OsisID: string (nullable = true)
 |-- BookName: string (nullable = true)
 |-- TotalChapters: integer (nullable = true)
 |-- Volume: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- number: long (nullable = false)
 |-- keywords2: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- col: string (nullable = true)

+--------------------+--------------------+--------------------+
|                text|           keywords2|                 col|
+--------------------+--------------------+--------------------+
|In the beginning ...|[beginning, god, ...|           beginning|
|In the beginning ...|[beginning, god, ...|   

In [17]:
df2.select('ID','number','text','keywords2','col').distinct().orderBy('number').show()

+-------+------+--------------------+--------------------+--------------------+
|     ID|number|                text|           keywords2|                 col|
+-------+------+--------------------+--------------------+--------------------+
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|             created|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|       beginning god|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|         god created|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...| created the heavens|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|               earth|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|             heavens|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|                 god|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|beginning god cre...|
|Gen_1_1|     0|In the beginning ...|[beginning, god, ...|           beginning|
|Gen_1_2|     1|Now the earth was...|[ea

In [18]:
df2.count()
df3=df2.select('*').groupBy('col').count()



In [19]:
df3.describe().show()

+-------+-------+-----------------+
|summary|    col|            count|
+-------+-------+-----------------+
|  count|  96024|            96024|
|   mean|   null|4.269016079313505|
| stddev|   null|38.47582950956845|
|    min|  aaron|                1|
|    max|zuzites|             6183|
+-------+-------+-----------------+



In [20]:
df3.orderBy('count',ascending=False).show()
# df_keyword.printSchema()
# df_keyword.createTempView('keyword')     
# df_keyword.describe().show()

+------+-----+
|   col|count|
+------+-----+
|  lord| 6183|
|   god| 3263|
|  said| 2903|
|   one| 2130|
|people| 1934|
|   man| 1804|
|  king| 1795|
|   son| 1601|
|israel| 1569|
|   men| 1566|
|  land| 1219|
|  come| 1216|
|  like| 1205|
| jesus| 1176|
|   day| 1169|
|  went| 1161|
|  came| 1148|
|    go| 1073|
|  made| 1034|
|    us|  975|
+------+-----+
only showing top 20 rows



In [21]:
df_keyword.write.format("jdbc")\
    .option("url", "jdbc:postgresql://192.168.1.39:5432/aiknowledge")\
    .option("dbtable", "public.bible_keyword")\
    .option("user", "postgres")\
    .option("password", "postgres")\
    .option("driver", "org.postgresql.Driver") \
    .mode('overwrite').save()

In [23]:
df3.show()

+------------------+-----+
|               col|count|
+------------------+-----+
|            waters|  151|
|         first day|   46|
|  noah and entered|    2|
|        great deep|    4|
|             still|  276|
|        may canaan|    2|
|          ashkenaz|    3|
|men moved eastward|    1|
|             serug|    6|
|            spared|   24|
| anything too hard|    2|
|    sake of twenty|    1|
|      two men said|    1|
|        swept away|   10|
|         ammonites|   78|
|              hazo|    1|
|       master made|    1|
|           jewelry|   11|
|         go toward|    1|
|        loved esau|    1|
+------------------+-----+
only showing top 20 rows



In [27]:
df3.where("col like '%god%'").orderBy(column('count').desc()).limit(100).show()

+-----------------+-----+
|              col|count|
+-----------------+-----+
|              god| 3263|
|    lord your god|  345|
|             gods|  207|
|    god of israel|  157|
|            o god|   91|
|     lord our god|   71|
|         lord god|   70|
|   kingdom of god|   55|
|         god said|   53|
|       man of god|   50|
|      word of god|   38|
|     god almighty|   35|
|   lord their god|   31|
|      lord my god|   28|
|     house of god|   27|
|    god is giving|   25|
|       ark of god|   25|
|lord god almighty|   25|
|         god came|   24|
|       son of god|   24|
+-----------------+-----+
only showing top 20 rows



In [28]:
df3.write.mode('overwrite').parquet('./keywordFrequency.parquet')