In [1]:
!pip install sparknlp pymongo

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import pyspark

# Build the Spark configuration in one chained expression: the Maven packages to
# load (MongoDB connector, spark-xml, Spark NLP) and the driver memory.
conf = (
    pyspark.SparkConf()
    .set('spark.jars.packages',
         "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,com.databricks:spark-xml_2.12:0.18.0,com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.3")
    .set('spark.driver.memory', '8g')
)

# Create the context from that configuration and wrap it in an SQLContext.
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.SQLContext.getOrCreate(sc)
spark

In [52]:
import os

# Per-user file under /scratch; its first line is used as the MongoDB address.
user = os.environ['USER']
PATH = f'/scratch/{user}/personal/ip/ip_address.txt'

# Load the file as a text DataFrame and take the `value` field of its first row.
ip_df = spark.read.text(PATH)
first_row = ip_df.head(1)[0]
uri = first_row.value
uri

'10.32.32.6:27040'

In [41]:
# Display the SparkSession attached to `spark`.
# NOTE(review): `spark` is bound to an SQLContext by the setup cell and later
# rebound by sparknlp.start(); which object this runs against depends on cell
# execution order — confirm.
spark.sparkSession

In [53]:
from pymongo import MongoClient

# Connect to the MongoDB server at the host:port string held in `uri`.
client = MongoClient(uri)

In [54]:
# Fetch the server's version/build information; this also exercises the connection.
client.server_info()

{'version': '7.0.4',
 'gitVersion': '38f3e37057a43d2e9f41a39142681a76062d582e',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [7, 0, 4, 0],
 'openssl': {'running': 'OpenSSL 3.0.2 15 Mar 2022',
  'compiled': 'OpenSSL 3.0.2 15 Mar 2022'},
 'buildEnvironment': {'distmod': 'ubuntu2204',
  'distarch': 'x86_64',
  'cc': '/opt/mongodbtoolchain/v4/bin/gcc: gcc (GCC) 11.3.0',
  'ccflags': '-Werror -include mongo/platform/basic.h -ffp-contract=off -fasynchronous-unwind-tables -g2 -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -gdwarf-5 -fno-omit-frame-pointer -fno-strict-aliasing -O2 -march=sandybridge -mtune=generic -mprefer-vector-width=128 -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-const-variable -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -gdwarf64 -Wa,--nocompress-debug-sections -fno-builtin-memcmp -Wimplicit-fallthrough=5',
  'cxx': '/opt

In [56]:
# Handle to the `test` database; the notebook reads and writes its `newspapers` collection.
db=client.test

In [57]:
# Display the database handle.
db

Database(MongoClient(host=['10.32.32.6:27040'], document_class=dict, tz_aware=False, connect=True), 'test')

In [11]:
import os
from zipfile import ZipFile

# Unpack one Boston Globe archive from the shared ProQuest directory into the
# local `zip_tmp` folder.
source_path = "/scratch/work/public/proquest/proquest_hnp/BostonGlobe/BG_20151210212722_00001.zip"

# The handle is named `archive` rather than `zip`: in a notebook the `with`
# target lives in the global namespace, so `as zip` would replace the builtin
# zip() with a closed ZipFile object for every cell that runs afterwards.
with ZipFile(source_path, "r") as archive:
    archive.extractall('zip_tmp')

In [27]:
# Read the extracted XML under zip_tmp through the "xml" data source, using
# `Record` as both rootTag and rowTag and with recursive file lookup enabled.
xml_reader = (
    spark.read
    .format("xml")
    .option('rootTag', 'Record')
    .option('rowTag', 'Record')
    .option('recursiveFileLookup', 'true')
)
df = xml_reader.load("zip_tmp")

                                                                                

In [17]:
# Peek at one document stored in the `newspapers` collection.
# NOTE(review): this cell sits before the cell that writes `result` to MongoDB,
# so on a fresh top-to-bottom run the collection may be empty or hold data from
# an earlier session — confirm the intended cell order.
db.newspapers.find_one()

{'_id': ObjectId('662432274db4621eb50e699e'),
 'ActionCode': 'change',
 'AlphaPubDate': 'Nov 5, 1927',
 'DateTimeStamp': 20151210160011,
 'FullText': '                                                                                                JOrWues ye\'                 9 Open BEGmms                 ITH wild grass turned                 trown and were, the beasts of field and wood must turn to something else to stay the pangs of hunger. Hooted animals Ike the deer so widely distributed and the moose and caribou of Northern New England are eaters of grass and the bumble rabbit and bare must be included as lovers of this plain but succulent forage.                 These animate have the browsing habit. Necessity compels it. They find their living. not only on wild grasses, but upon the shrub.                 bery bordering waters and in the woods. Browsing is the act of anipping off the tender shoots of shrubs and various trees.                 Browse is of an excessively nutritious

In [29]:
import sparknlp
from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *

class YakePipeline(Pipeline):
    """
    A pipeline for extracting keywords using YAKE.

    The stages run in this order:
    DocumentAssembler -> SentenceDetector -> Tokenizer -> YakeKeywordExtraction.
    They write the columns "document", "sentence", "token" and "keywords".

    Parameters
    ----------
    input_col : str, default "FullText"
        Name of the text column fed to the DocumentAssembler.
    n_keywords : int, default 20
        Value passed to YakeKeywordExtraction.setNKeywords.
    min_ngrams : int, default 1
        Value passed to YakeKeywordExtraction.setMinNGrams.
    max_ngrams : int, default 3
        Value passed to YakeKeywordExtraction.setMaxNGrams.

    Calling ``YakePipeline()`` with no arguments builds the same pipeline as
    before these parameters were added.

    Example:
    pipeline = YakePipeline()
    processed_df = pipeline.fit(df).transform(df)
    """
    def __init__(self, input_col="FullText", n_keywords=20, min_ngrams=1, max_ngrams=3):
        super().__init__()

        # Default stop-word list of StopWordsCleaner, reused to filter YAKE candidates.
        self.stopwords = StopWordsCleaner().getStopWords()

        self.document = (
            DocumentAssembler()
            .setInputCol(input_col)
            .setOutputCol("document")
        )
        self.sentenceDetector = (
            SentenceDetector()
            .setInputCols("document")
            .setOutputCol("sentence")
        )
        self.token = (
            Tokenizer()
            .setInputCols("sentence")
            .setOutputCol("token")
            .setContextChars(["(", ")", "?", "!", ".", ","])
        )
        self.keywords = (
            YakeKeywordExtraction()
            .setInputCols("token")
            .setOutputCol("keywords")
            .setMinNGrams(min_ngrams)
            .setMaxNGrams(max_ngrams)
            .setNKeywords(n_keywords)
            .setStopWords(self.stopwords)
        )

        self.setStages([self.document, self.sentenceDetector, self.token, self.keywords])


In [30]:
# Start SparkNLP
# NOTE: this rebinds `spark`, which the setup cell assigned from
# pyspark.SQLContext.getOrCreate(sc).
# NOTE(review): a SparkContext with custom jars and driver memory already exists
# at this point; presumably sparknlp.start() attaches to it rather than applying
# its own settings — confirm.
# Variants noted by the original author:
#   for GPU training >> sparknlp.start(gpu = True)
#   for Spark 2.3   =>> sparknlp.start(spark23 = True)
spark = sparknlp.start()



In [18]:
# Report the library versions in use, one "label value" line each.
for label, version in (
    ("Spark NLP version", sparknlp.version()),
    ("Apache Spark version:", spark.version),
):
    print(label, version)

Spark NLP version 5.3.3
Apache Spark version: 3.5.1


In [31]:
# Extract keywords and append to df
yake_pipeline = YakePipeline()
yake_model = yake_pipeline.fit(df)

# Drop the intermediate annotation columns in a single call.
result = yake_model.transform(df).drop("document", "token", "sentence")

In [20]:
# `result` (not `df`) carries the pipeline's `keywords` column; the intermediate
# document/token/sentence columns were dropped when `result` was built.
result.printSchema()

root
 |-- Abstract: string (nullable = true)
 |-- ActionCode: string (nullable = true)
 |-- AlphaPubDate: string (nullable = true)
 |-- Contributor: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ContribRole: string (nullable = true)
 |    |    |-- FirstName: string (nullable = true)
 |    |    |-- LastName: string (nullable = true)
 |    |    |-- MiddleName: string (nullable = true)
 |    |    |-- NameSuffix: string (nullable = true)
 |    |    |-- OrganizationName: string (nullable = true)
 |    |    |-- OriginalForm: string (nullable = true)
 |    |    |-- PersonName: string (nullable = true)
 |    |    |-- PersonTitle: string (nullable = true)
 |-- DateTimeStamp: long (nullable = true)
 |-- FullText: string (nullable = true)
 |-- LanguageCode: string (nullable = true)
 |-- NumericPubDate: long (nullable = true)
 |-- ObjectType: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Pagination: string (nullable = true)
 |

In [45]:
# Print keywords from one article
import pandas as pd

result_collect = result.limit(2).collect()

# One row per keyword annotation of the first collected article.
keys_df = pd.DataFrame(
    [(k.result, k.begin, k.end, k.metadata['score'], k.metadata['sentence'])
     for k in result_collect[0]['keywords']],
    columns=['keywords', 'begin', 'end', 'score', 'sentence'],
)

# Annotation metadata values are strings, so both sort keys are converted to
# numbers. Left as text, sentence "10" would sort before sentence "2".
keys_df = keys_df.astype({'score': float, 'sentence': int})

# ordered by sentence index, then by ascending YAKE score
# (in YAKE a lower score marks a more relevant keyword)
keys_df.sort_values(['sentence', 'score']).head(30)

Unnamed: 0,keywords,begin,end,score,sentence
3,grass,163,167,0.23126,0
2,wild,158,161,0.260285,0
4,must,237,240,0.357272,0
0,open,126,129,0.479114,0
1,ith,154,156,0.479114,0
43,open begmms ith,126,156,0.522269,0
40,ith wild,154,161,0.597748,0
38,open begmms,126,136,0.648148,0
39,begmms ith,131,156,0.648148,0
9,grass,408,412,0.23126,1


In [32]:
# Write to mongo: replace the contents of test.newspapers with `result`.
mongo_writer = (
    result.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("overwrite")
    .option('uri', f'mongodb://{uri}/test.newspapers')
)
mongo_writer.save()

                                                                                

In [58]:
# Filter data and show document titles:
# keep documents that have 'china' or 'mergers' among their extracted keywords.
keyword_match = {
    '$match': {
        '$or': [
            {'keywords.result': 'china'},
            {'keywords.result': 'mergers'},
        ]
    }
}
data = db.newspapers.aggregate([keyword_match])

for doc in data:
    print(doc["RecordTitle"])

EARTHQUAKE CONTINUES FOR THREE HOURS, 50 MINUTES
Display Ad 20 -- No Title
LIST OF 15 FOREIGNERS HELD BY THE CHINESE BANDITS
5000 Years of China
MISSIONARY FROM CHINA GIVES ADDRESS AT TRINITY
5000 Years of China
Display Ad 12 -- No Title
MARINE REGIMENT EMBARKS FOR CHINA
SHATTUCK OFFERS LAW SCHOOL FUND $20,000
MISS WHITTEN HOME AS MRS V. G. STATEN
Travel Broadens One
REUNITING CHINA'S REVOLUTION
THREE BOSTON ADVENTISTS IN NANKING BELIEVED SAFE
EDITORIAL POINTS
5000 Years of China
INTEREST GROWS IN RAILROAD SHARES
Display Ad 46 -- No Title
MAKING MORE MONEY
BURTON HOLMES LECTURES ON PEKING
Display Ad 61 -- No Title
GOVERNMENT FAVORS RAIL MERGERS
GEORGE W. PENNIMAN DEAD IN WASHINGTON
Display Ad 20 -- No Title
DECORATIVE CHINA PLACING
ASK RETURN OF WOMEN AND MEN TO CHINESE STATIONS
IS IT UNITY?
MORE WARSHIPS, JAPAN'S REPLY
Will Rogers' Dispatch
5000 Years of China
DENIES DISSENSION IN CABINET ON CHINA
LOOKS FORWARD TO "GOLDEN AGE"
FOREIGN RELATIONS TOPIC OF UNITARIANS
Display Ad 17 -- No 