In [None]:
import os

# Install java
! java -version

# Install pyspark
! pip3 install --ignore-installed pyspark==2.4.5

# Install Spark NLP
! pip3 install --ignore-installed spark-nlp==2.4.5

# Install nltk
! pip3 install nltk

In [1]:
import sparknlp

spark = sparknlp.start()

In [2]:
from pyspark.sql import functions as F

data_path = './data/all-the-news-2-1.csv'
data = spark.read.csv(data_path, header=True)

In [3]:
data.columns

['_c0',
 'Unnamed: 0',
 'date',
 'year',
 'month',
 'day',
 'author',
 'title',
 'article',
 'url',
 'section',
 'publication']

In [4]:
data.show(1)

+---+----------+-------------------+----+-----+---+-----------+--------------------+--------------------+--------+--------------------+------------+
|_c0|Unnamed: 0|               date|year|month|day|     author|               title|             article|     url|             section| publication|
+---+----------+-------------------+----+-----+---+-----------+--------------------+--------------------+--------+--------------------+------------+
|  0|         0|2016-12-09 18:31:00|2016| 12.0|  9|Lee Drutman|We should take co...|"This post is par...| however| several critics ...| for example|
+---+----------+-------------------+----+-----+---+-----------+--------------------+--------------------+--------+--------------------+------------+
only showing top 1 row



In [5]:
text_col = 'article'
article_text = data.select(text_col).filter(F.col(text_col).isNotNull())

In [6]:
article_text.limit(5).show(truncate=90)

+------------------------------------------------------------------------------------------+
|                                                                                   article|
+------------------------------------------------------------------------------------------+
|"This post is part of Polyarchy, an independent blog produced by the political reform p...|
| The Indianapolis Colts made Andrew Luck the highest-paid player in NFL history this of...|
|DAVOS, Switzerland (Reuters) - U.S. President Donald Trump denied a report on Friday th...|
|PARIS (Reuters) - Former French president Nicolas Sarkozy published a new memoir on Thu...|
|"Paris Hilton arrived at LAX Wednesday dressed to pay her last respects to her uncle Mo...|
+------------------------------------------------------------------------------------------+



In [7]:
from sparknlp.base import DocumentAssembler

documentAssembler = DocumentAssembler() \
     .setInputCol(text_col) \
     .setOutputCol('document')

In [8]:
from sparknlp.annotator import Tokenizer

tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('tokenized')

In [9]:
from sparknlp.annotator import Normalizer

normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

In [10]:
from sparknlp.annotator import LemmatizerModel

lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemmatized')

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


IllegalArgumentException: 'Unsupported class file major version 55'

In [None]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')

In [None]:
from sparknlp.annotator import StopWordsCleaner

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('unigrams') \
     .setStopWords(eng_stopwords)

In [None]:
from sparknlp.annotator import NGramGenerator

ngrammer = NGramGenerator() \
    .setInputCols(['lemmatized']) \
    .setOutputCol('ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

In [None]:
from sparknlp.annotator import PerceptronModel

pos_tagger = PerceptronModel.pretrained('pos_anc') \
    .setInputCols(['document', 'lemmatized']) \
    .setOutputCol('pos')

In [None]:
from sparknlp.base import Finisher

finisher = Finisher() \
     .setInputCols(['unigrams', 'ngrams', 'pos']) \

In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline() \
     .setStages([documentAssembler,                  
                 tokenizer,
                 normalizer,                  
                 lemmatizer,                  
                 stopwords_cleaner, 
                 pos_tagger,
                 ngrammer,  
                 finisher])

In [None]:
processed_review = pipeline.fit(article_text).transform(article_text)