# 8 Keyword Extraction with YAKE

In [1]:
# Pins match the runtime reported by this notebook (Spark NLP 4.2.2 on Apache Spark 3.3.1);
# spark-nlp-jsl 4.2.2 requires spark-nlp==4.2.2, so any other pin triggers a resolver conflict.
# %pip install -q pyspark==3.3.1 spark-nlp==4.2.2

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spark-nlp-jsl 4.2.2 requires spark-nlp==4.2.2, but you have spark-nlp 4.2.0 which is incompatible.[0m[31m
[0m

In [7]:
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType, DataType,ArrayType
from pyspark.sql.functions import udf, struct
from pyspark.ml import Pipeline
from IPython.display import display, HTML
import re

In [8]:
import sparknlp

from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
# spark = sparknlp.start() # for GPU training >> sparknlp.start(gpu = True) # for Spark 2.3 =>> sparknlp.start(spark23 = True)
# The Spark NLP jar version is taken from the installed Python package so the
# JVM side and the Python side always run the same release (a hard-coded jar
# version silently drifts from the pip-installed one).
spark_jars = ",".join([
    f"com.johnsnowlabs.nlp:spark-nlp_2.12:{sparknlp.version()}",
    "org.postgresql:postgresql:42.5.0",  # JDBC driver used for the PostgreSQL export
])

spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[2]")
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", spark_jars)
    .getOrCreate()
)
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 4.2.2
Apache Spark version: 3.3.1


In [9]:
stopwords = StopWordsCleaner().getStopWords()

In [10]:
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

## YAKE Keyword Extractor

YAKE is an unsupervised, corpus-independent, domain- and language-independent, single-document keyword extraction algorithm.

Extracting keywords from texts has become a challenge for individuals and organizations as the information grows in complexity and size. The need to automate this task so that text can be processed in a timely and adequate manner has led to the emergence of automatic keyword extraction tools. YAKE is a novel feature-based system for multilingual keyword extraction, which supports texts of different sizes, domains and languages. Unlike other approaches, YAKE relies on neither dictionaries nor thesauri, nor is it trained against any corpus. Instead, it follows an unsupervised approach that builds upon features extracted from the text itself, making it applicable to documents written in different languages without the need for further knowledge. This can be beneficial for a large number of tasks and a plethora of situations where access to training corpora is either limited or restricted.


The algorithm makes use of the position of a sentence and token. Therefore, to use the annotator, the text should be first sent through a Sentence Boundary Detector and then a tokenizer.

You can tweak the following parameters to get the best result from the annotator.

- *setMinNGrams(int)* Select the minimum length (in words) of an extracted keyword
- *setMaxNGrams(int)* Select the maximum length (in words) of an extracted keyword
- *setNKeywords(int)* Extract the top N keywords
- *setStopWords(list)* Set the list of stop words
- *setThreshold(float)* Each keyword is given a score greater than 0 (the lower the score, the better the keyword). Use this method to set an upper bound on the keyword score.
- *setWindowSize(int)* YAKE constructs a co-occurrence matrix. Use this method to set the window size for building it, e.g. windowSize=2 looks at two words to both the left and the right of a candidate word.


<b>References</b>

Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C. and Jatowt, A. (2020). YAKE! Keyword Extraction from Single Documents using Multiple Local Features. In Information Sciences Journal. Elsevier, Vol 509, pp 257-289. [pdf](https://doi.org/10.1016/j.ins.2019.09.013)

In [11]:
# Stage 1: wrap the raw "text" column into Spark NLP document annotations.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences.
sentenceDetector = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# Stage 3: tokenize the sentences; the listed punctuation marks are the
# tokenizer's context characters.
token = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
    .setContextChars(["(", ")", "?", "!", ".", ","])
)

# Stage 4: YAKE over the tokens -- keywords of 1 to 3 words, top 20 requested,
# with the default stop-word list loaded earlier.
keywords = (
    YakeKeywordExtraction()
    .setInputCols("token")
    .setOutputCol("keywords")
    .setMinNGrams(1)
    .setMaxNGrams(3)
    .setNKeywords(20)
    .setStopWords(stopwords)
)

yake_pipeline = Pipeline(stages=[document, sentenceDetector, token, keywords])

# Fit on a one-row frame holding an empty string to obtain a PipelineModel
# that LightPipeline can wrap.
empty_df = spark.createDataFrame([['']]).toDF("text")

yake_Model = yake_pipeline.fit(empty_df)

In [12]:
# LightPipeline
# Wrap the fitted pipeline so a single Python string can be annotated directly,
# without building a Spark DataFrame first.

light_model = LightPipeline(yake_Model)

# Sample verse used to sanity-check the extractor (note the leading newline
# inside the triple-quoted literal).
text = '''
Then the LORD said, "The outcry against Sodom and Gomorrah is so great and their sin so grievous'''

# fullAnnotate returns one result per input; take the first (and only) one.
# The result maps each pipeline output column to its list of annotations.
light_result = light_model.fullAnnotate(text)[0]

# Show (sentence index, sentence text) pairs to confirm sentence splitting.
[(s.metadata['sentence'], s.result) for s in light_result['sentence']]

[('0',
  'Then the LORD said, "The outcry against Sodom and Gomorrah is so great and their sin so grievous')]

In [13]:
light_result.keys()


dict_keys(['document', 'sentence', 'token', 'keywords'])

In [20]:
# df = spark.createDataFrame(data=light_result,schema=[['document', 'sentence', 'token', 'keywords']])
# df.printSchema()
# df.show(truncate=False)


In [12]:
# import pandas as pd

# keys_df = pd.DataFrame([(k.result, k.begin, k.end, k.metadata['score'],  k.metadata['sentence']) for k in light_result['keywords']],
#                        columns = ['keywords','begin','end','score','sentence'])
# keys_df['score'] = keys_df['score'].astype(float)

# # ordered by relevance 
# keys_df.sort_values(['sentence','score']).head(30)

Unnamed: 0,keywords,begin,end,score,sentence
7,grievous,89,96,0.393326,0
8,lord said,10,18,0.440864,0
0,lord,10,13,0.47587,0
3,sodom,41,45,0.47587,0
4,gomorrah,51,58,0.47587,0
1,said,15,18,0.642974,0
2,outcry,26,31,0.642974,0
5,great,66,70,0.642974,0
6,sin,82,84,0.642974,0
10,sodom and gomorrah,41,58,0.907923,0


### Getting keywords from a DataFrame

In [39]:
# Load the verse table. No `header` option is passed, so Spark assigns
# positional column names (_c0 ... _c3); they are renamed in a later cell.
# inferSchema makes Spark detect the column types instead of reading
# everything as string.
df = spark.read.csv("../data/bibleNIV.csv", inferSchema=True)

df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)



In [40]:
df.show()

+---+---+---+--------------------+
|_c0|_c1|_c2|                 _c3|
+---+---+---+--------------------+
|  1|  1|  1|In the beginning ...|
|  1|  1|  2|Now the earth was...|
|  1|  1|  3|And God said, "Le...|
|  1|  1|  4|God saw that the ...|
|  1|  1|  5|God called the li...|
|  1|  1|  6|And God said, "Le...|
|  1|  1|  7|So God made the e...|
|  1|  1|  8|God called the ex...|
|  1|  1|  9|And God said, "Le...|
|  1|  1| 10|God called the dr...|
|  1|  1| 11|Then God said, "L...|
|  1|  1| 12|The land produced...|
|  1|  1| 13|And there was eve...|
|  1|  1| 14|And God said, "Le...|
|  1|  1| 15|and let them be l...|
|  1|  1| 16|God made two grea...|
|  1|  1| 17|God set them in t...|
|  1|  1| 18|to govern the day...|
|  1|  1| 19|And there was eve...|
|  1|  1| 20|And God said, "Le...|
+---+---+---+--------------------+
only showing top 20 rows



In [41]:
# Give the positional CSV columns meaningful names.
df2 = df.select(
    F.col('_c0').alias('book'),
    F.col('_c1').alias('chapter'),
    F.col('_c2').alias('verse'),
    F.col('_c3').alias('text'),
)
df2.show()

+----+-------+-----+--------------------+
|book|chapter|verse|                text|
+----+-------+-----+--------------------+
|   1|      1|    1|In the beginning ...|
|   1|      1|    2|Now the earth was...|
|   1|      1|    3|And God said, "Le...|
|   1|      1|    4|God saw that the ...|
|   1|      1|    5|God called the li...|
|   1|      1|    6|And God said, "Le...|
|   1|      1|    7|So God made the e...|
|   1|      1|    8|God called the ex...|
|   1|      1|    9|And God said, "Le...|
|   1|      1|   10|God called the dr...|
|   1|      1|   11|Then God said, "L...|
|   1|      1|   12|The land produced...|
|   1|      1|   13|And there was eve...|
|   1|      1|   14|And God said, "Le...|
|   1|      1|   15|and let them be l...|
|   1|      1|   16|God made two grea...|
|   1|      1|   17|God set them in t...|
|   1|      1|   18|to govern the day...|
|   1|      1|   19|And there was eve...|
|   1|      1|   20|And God said, "Le...|
+----+-------+-----+--------------

In [42]:
result = yake_pipeline.fit(df2).transform(df2)

In [43]:
result = result.withColumn('unique_keywords', F.array_distinct("keywords.result"))

In [44]:
result.show()

+----+-------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|book|chapter|verse|                text|            document|            sentence|               token|            keywords|     unique_keywords|
+----+-------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   1|      1|    1|In the beginning ...|[{document, 0, 54...|[{document, 0, 54...|[{token, 0, 1, In...|[{chunk, 7, 15, b...|[beginning, god, ...|
|   1|      1|    2|Now the earth was...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 2, No...|[{chunk, 8, 12, e...|[earth, formless,...|
|   1|      1|    3|And God said, "Le...|[{document, 0, 55...|[{document, 0, 55...|[{token, 0, 2, An...|[{chunk, 4, 6, go...|[god, said, light...|
|   1|      1|    4|God saw that the ...|[{document, 0, 77...|[{document, 0, 77...|[{token, 0, 2, Go...|[{chunk, 0, 2,

In [20]:
def highlight(text, keywords):
    """Wrap every whole-word keyword occurrence in ``text`` in a yellow ``<span>``.

    Args:
        text: The string to decorate. ``None`` or an empty string is returned
            unchanged, so null rows do not crash the Spark UDF.
        keywords: Iterable of keyword strings (case-insensitive). ``None``,
            an empty iterable, and empty/null entries are ignored.

    Returns:
        ``text`` with each match wrapped in
        ``<span style="background-color: yellow;">...</span>``; the original
        casing of the matched text is preserved.
    """
    if not text or not keywords:
        return text

    # Longest keywords first so that a multi-word keyword ("lord said") wins
    # over one of its parts ("lord") inside the alternation; the secondary
    # sort key only makes the pattern deterministic.
    terms = sorted({k for k in keywords if k}, key=lambda k: (-len(k), k))
    if not terms:
        return text

    # re.escape: keywords are literal text, not regex fragments.
    # The lookarounds behave like \b for ordinary words, but also work when a
    # keyword starts or ends with a non-word character.
    pattern = r'(?<!\w)(%s)(?!\w)' % '|'.join(re.escape(k) for k in terms)

    # A single pass over the original text: substituting keyword by keyword
    # would let later keywords (e.g. "yellow", "style") match inside the
    # markup inserted for earlier ones.
    return re.sub(pattern, r'<span style="background-color: yellow;">\1</span>', text, flags=re.IGNORECASE)

In [21]:
highlight_udf = udf(highlight, StringType())


In [45]:
result = result.withColumn("highlighted_keywords",highlight_udf('text','unique_keywords'))

In [46]:
result.printSchema()

root
 |-- book: integer (nullable = true)
 |-- chapter: integer (nullable = true)
 |-- verse: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata

In [47]:
result.write.mode('overwrite').json('bibleKeyword.json')

In [27]:
# pandas_df=result.selectExpr('text','unique_keywords').pandas_api()
# pandas_df.head()

In [48]:
result_db=result.selectExpr('book','chapter','verse','text','unique_keywords')

In [49]:
result_db.show()

+----+-------+-----+--------------------+--------------------+
|book|chapter|verse|                text|     unique_keywords|
+----+-------+-----+--------------------+--------------------+
|   1|      1|    1|In the beginning ...|[beginning, god, ...|
|   1|      1|    2|Now the earth was...|[earth, formless,...|
|   1|      1|    3|And God said, "Le...|[god, said, light...|
|   1|      1|    4|God saw that the ...|[god, saw, light,...|
|   1|      1|    5|God called the li...|[god, called, lig...|
|   1|      1|    6|And God said, "Le...|[god, said, expan...|
|   1|      1|    7|So God made the e...|[god, made, expan...|
|   1|      1|    8|God called the ex...|[god, called, exp...|
|   1|      1|    9|And God said, "Le...|[god, said, water...|
|   1|      1|   10|God called the dr...|[god, called, dry...|
|   1|      1|   11|Then God said, "L...|[god, said, land,...|
|   1|      1|   12|The land produced...|[land, produced, ...|
|   1|      1|   13|And there was eve...|[evening, thir

### Get Bible book names

In [38]:
bibleBooks=spark.read.csv('../data/bibleBooks.csv',header=True,inferSchema=True)
bibleBooks.show()

+------+------+------------+-------------+------+
|BookID|OsisID|    BookName|TotalChapters|Volume|
+------+------+------------+-------------+------+
|     1|   Gen|     Genesis|           50|    OT|
|     2|  Exod|      Exodus|           40|    OT|
|     3|   Lev|   Leviticus|           27|    OT|
|     4|   Num|     Numbers|           36|    OT|
|     5|  Deut| Deuteronomy|           34|    OT|
|     6|  Josh|      Joshua|           24|    OT|
|     7|  Judg|      Judges|           21|    OT|
|     8|  Ruth|        Ruth|            4|    OT|
|     9|  1Sam|    1 Samuel|           31|    OT|
|    10|  2Sam|    2 Samuel|           24|    OT|
|    11|  1Kgs|     1 Kings|           22|    OT|
|    12|  2Kgs|     2 Kings|           25|    OT|
|    13|  1Chr|1 Chronicles|           29|    OT|
|    14|  2Chr|2 Chronicles|           36|    OT|
|    15|  Ezra|        Ezra|           10|    OT|
|    16|   Neh|    Nehemiah|           13|    OT|
|    17|  Esth|      Esther|           10|    OT|


In [50]:
result_db.printSchema()

root
 |-- book: integer (nullable = true)
 |-- chapter: integer (nullable = true)
 |-- verse: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- unique_keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [59]:
# Attach the book metadata to every verse; BookID duplicates `book`
# after the join, so it is dropped.
df_join = (
    result_db
    .join(bibleBooks, on=result_db.book == bibleBooks.BookID, how="inner")
    .drop('BookID')
)
df_join.show()

+----+-------+-----+--------------------+--------------------+------+--------+-------------+------+
|book|chapter|verse|                text|     unique_keywords|OsisID|BookName|TotalChapters|Volume|
+----+-------+-----+--------------------+--------------------+------+--------+-------------+------+
|   1|      1|    1|In the beginning ...|[beginning, god, ...|   Gen| Genesis|           50|    OT|
|   1|      1|    2|Now the earth was...|[earth, formless,...|   Gen| Genesis|           50|    OT|
|   1|      1|    3|And God said, "Le...|[god, said, light...|   Gen| Genesis|           50|    OT|
|   1|      1|    4|God saw that the ...|[god, saw, light,...|   Gen| Genesis|           50|    OT|
|   1|      1|    5|God called the li...|[god, called, lig...|   Gen| Genesis|           50|    OT|
|   1|      1|    6|And God said, "Le...|[god, said, expan...|   Gen| Genesis|           50|    OT|
|   1|      1|    7|So God made the e...|[god, made, expan...|   Gen| Genesis|           50|    OT|


In [65]:
from pyspark.sql.functions import concat_ws,col
# Build a readable verse key "<OsisID>_<chapter>_<verse>" (e.g. Gen_1_1)
# and place it in front of all existing columns.
verse_id = concat_ws('_', col('OsisID'), col('chapter'), col('verse')).alias("ID")
df3 = df_join.select(verse_id, '*')
df3.show()

+--------+----+-------+-----+--------------------+--------------------+------+--------+-------------+------+
|      ID|book|chapter|verse|                text|     unique_keywords|OsisID|BookName|TotalChapters|Volume|
+--------+----+-------+-----+--------------------+--------------------+------+--------+-------------+------+
| Gen_1_1|   1|      1|    1|In the beginning ...|[beginning, god, ...|   Gen| Genesis|           50|    OT|
| Gen_1_2|   1|      1|    2|Now the earth was...|[earth, formless,...|   Gen| Genesis|           50|    OT|
| Gen_1_3|   1|      1|    3|And God said, "Le...|[god, said, light...|   Gen| Genesis|           50|    OT|
| Gen_1_4|   1|      1|    4|God saw that the ...|[god, saw, light,...|   Gen| Genesis|           50|    OT|
| Gen_1_5|   1|      1|    5|God called the li...|[god, called, lig...|   Gen| Genesis|           50|    OT|
| Gen_1_6|   1|      1|    6|And God said, "Le...|[god, said, expan...|   Gen| Genesis|           50|    OT|
| Gen_1_7|   1|    

In [72]:
# Persist the keyed verse table as Parquet. Overwrite mode keeps the cell
# re-runnable: the default save mode raises once the output path exists.
df3.write.mode('overwrite').parquet('bibleKeyword2.parquet')

In [70]:
import os
from getpass import getpass

# Connection settings are read from the environment so that no credential is
# stored in the notebook. URL and user fall back to the previous local values;
# the password is never defaulted -- it is prompted for when PG_PASSWORD is unset.
pg_url = os.environ.get("PG_JDBC_URL", "jdbc:postgresql://192.168.0.8:5432/aiknowledge")
pg_user = os.environ.get("PG_USER", "postgres")
pg_password = os.environ.get("PG_PASSWORD") or getpass("PostgreSQL password: ")

# Export the keyed verse table; overwrite mode replaces the contents of
# public.bible on every run.
(
    df3.write.format("jdbc")
    .option("url", pg_url)
    .option("dbtable", "public.bible")
    .option("user", pg_user)
    .option("password", pg_password)
    .option("driver", "org.postgresql.Driver")
    .mode("overwrite")
    .save()
)

In [69]:
# First verse of every book (up to 100 rows displayed).
df3.filter('chapter=1 and verse=1').show(100)

+----------+----+-------+-----+--------------------+--------------------+------+---------------+-------------+------+
|        ID|book|chapter|verse|                text|     unique_keywords|OsisID|       BookName|TotalChapters|Volume|
+----------+----+-------+-----+--------------------+--------------------+------+---------------+-------------+------+
|   Gen_1_1|   1|      1|    1|In the beginning ...|[beginning, god, ...|   Gen|        Genesis|           50|    OT|
|  Exod_1_1|   2|      1|    1|These are the nam...|[names, sons, isr...|  Exod|         Exodus|           40|    OT|
|   Lev_1_1|   3|      1|    1|The LORD called t...|[lord, called, mo...|   Lev|      Leviticus|           27|    OT|
|   Num_1_1|   4|      1|    1|The LORD spoke to...|[lord, spoke, mos...|   Num|        Numbers|           36|    OT|
|  Deut_1_1|   5|      1|    1|These are the wor...|[words, moses, sp...|  Deut|    Deuteronomy|           34|    OT|
|  Josh_1_1|   6|      1|    1|After the death o...|[dea

In [34]:
# Render the first ten highlighted verses as HTML, separated by blank lines.
for row in result.select("highlighted_keywords").limit(10).collect():
    rendered = HTML(row["highlighted_keywords"])
    display(rendered)
    print("\n\n")



















































In [22]:
from neo4j import GraphDatabase
import time
from tqdm import tqdm

In [23]:
import os
from getpass import getpass
from py2neo import Graph

# Connection settings are read from the environment so that no credential is
# stored in the notebook. URI, user and database fall back to the previous
# local values; the password is prompted for when NEO4J_PASSWORD is unset.
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
graph = Graph(
    os.environ.get("NEO4J_URI", "neo4j://localhost:7687"),
    auth=(os.environ.get("NEO4J_USER", "daniel"), neo4j_password),
    name=os.environ.get("NEO4J_DATABASE", "aiknowledge"),
)

# Smoke test: a trivial query confirms the connection works.
graph.run("UNWIND range(1, 3) AS n RETURN n, n * n as n_sq")

n,n_sq
1,1
2,4
3,9


In [53]:
# from py2neo import Graph, Node, Relationship
# for word in keywords_bible2:
#     graph.create(Node("bible",name = word))

22/11/18 21:31:13 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1699617 ms exceeds timeout 120000 ms
22/11/18 21:31:13 WARN SparkContext: Killing executors is not supported by current scheduler.


In [25]:
# const_ners = 'CREATE CONSTRAINT ners IF NOT EXISTS ON (n:NER) ASSERT n.name IS UNIQUE'
# graph.run(const_ners)

In [39]:
spark.stop()