In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark-nlp") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
    .getOrCreate()
sc = spark.sparkContext

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [87]:
# get the list of stopwords from nltk
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')
eng_stopwords.append('rt')
eng_stopwords.append('qt')
eng_stopwords.append('&amp')
eng_stopwords.append('amp')
eng_stopwords.append('+')
eng_stopwords.append('w')
eng_stopwords.append('would')

In [88]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml import Pipeline

In [89]:
documentAssembler = DocumentAssembler() \
    .setInputCol('text_no_links') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemma') \

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['lemma']) \
    .setOutputCol('clean_lemma') \
    .setCaseSensitive(False) \
    .setStopWords(eng_stopwords)

# finisher converts tokens to human-readable output
finisher = Finisher() \
    .setInputCols(['clean_lemma']) \
    .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [90]:
pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,
        stopwords_cleaner,
        finisher
    ])

In [91]:
# Create sparse vector of counts for LDA input featuresCol
cv = CountVectorizer(inputCol="finished_clean_lemma",
                     outputCol="features", vocabSize=3000,
                     minDF=9)
# IDF to prepare for LDA
idf = IDF(inputCol="features", outputCol="idf")
# LDA model
lda = LDA(k=2, maxIter=50)

In [92]:
# Create Spark-NLP Pipeline
mlPipeline = Pipeline(stages=[
    cv,
    idf,
    lda
])

In [93]:
congDF = spark.read.csv("s3://502finalprojbucky/congresstweets/data/June2017.csv/*.part",header=True)

In [94]:
LDAtext = congDF.select("text")
data = LDAtext.filter(LDAtext['text'].isNull()==False)

#data.select('text').show(1,False)

In [95]:
noLinkText = data.withColumn('text_no_links',regexp_replace('text','http.*($|\s)',''))

#noLinkText.show(1,False)

In [96]:
# transform text with the pipeline
nlpPipeDF = pipeline.fit(noLinkText).transform(noLinkText).select("text","finished_clean_lemma")
#congress.show(10,False)

In [97]:
mlPipeFit = mlPipeline.fit(nlpPipeDF)

In [98]:
LDA_DF = mlPipeFit.transform(nlpPipeDF)

In [99]:
#LDA_DF.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|finished_clean_lemma|            features|                 idf|   topicDistribution|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Listen to these s...|[listen, story, j...|(3000,[1,2,18,25,...|(3000,[1,2,18,25,...|[0.94629916370678...|
|RT @HomelandDems ...|[homelanddems, ra...|(3000,[27,39,107,...|(3000,[27,39,107,...|[0.11869013408053...|
|RT @LADeptHealth ...|[ladepthealth, re...|(3000,[73,89,170,...|(3000,[73,89,170,...|[0.92531351294988...|
|A victory for the...|[victory, america...|(3000,[6,16,129,3...|(3000,[6,16,129,3...|[0.39946547459208...|
|"RT @realDonaldTr...|[realdonaldtrump,...|(3000,[118,122,19...|(3000,[118,122,19...|[0.81490647916756...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [100]:
ldaModel = mlPipeFit.stages[2]

In [101]:
ll = ldaModel.logLikelihood(LDA_DF)
lp = ldaModel.logPerplexity(LDA_DF)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -1432389.477686104
The upper bound on perplexity: 7.142911529205186


In [102]:
# Describe topics.
print("The topics described by their top-weighted terms:")
ldaModel.describeTopics(10).show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+-------------------------------------------------------------------+
|topic|termIndices|termWeights                                                        |
+-----+-----------+-------------------------------------------------------------------+
|0    |[0, 1, 2]  |[0.022815960569666137, 0.016388284228719893, 0.01516908620320191]  |
|1    |[4, 12, 38]|[0.014676640715496136, 0.008235765064459555, 0.0075458184431462265]|
+-----+-----------+-------------------------------------------------------------------+



In [104]:
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
      + " words):")

topics = ldaModel.describeTopics(10)
topics_rdd = topics.rdd

vocab = mlPipeFit.stages[0].vocabulary

topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()

for idx, topic in enumerate(topics_words):
    print("----------")
    print("topic: ", idx)
    print("----------")
    for word in topic:
        print(word)

Learned topics (as distributions over vocab of 3000 words):
----------
topic:  0
----------
bill
health
care
trumpcare
senate
american
work
medicaid
cut
gop
----------
topic:  1
----------
today
vote
law
thank
right
act
house
year
proud
trump


In [105]:
spark.stop()