In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session; the Spark NLP jar is requested through
# spark.jars.packages so the annotators used below are on the classpath.
spark = (
    SparkSession.builder
    .appName("spark-nlp")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)
# Handle to the underlying SparkContext (used by the RDD-level calls below).
sc = spark.sparkContext

In [2]:
import nltk
# Fetch the NLTK corpora into the local nltk_data directory. The calls are
# kept as two bare expressions so the cell still displays the return value
# of the last download.
# NOTE(review): only 'stopwords' is read in the cells visible here; 'words'
# may be needed elsewhere -- confirm before removing it.
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
# get the list of stopwords from nltk
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list, then add corpus-specific noise:
# Twitter markers and HTML-entity debris first, followed by frequent but
# uninformative lemmas and abbreviations.
eng_stopwords = stopwords.words('english')
eng_stopwords.extend([
    # retweet / quote-tweet markers, HTML entity debris and stray symbols
    'rt', 'qt', '&amp', 'amp', '+', 'w',
    # high-frequency lemmas that carry little topical signal
    'today', 'live', 'make', 'hear', 'meet', 'thank', 'see',
    'time', 'day', 'watch', 'get',
    # fragments and abbreviations
    'th', 'year', 'la', 'pm', 'hr', 'rep', 'come', 'last', 'dc',
])

In [4]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml import Pipeline
import pyspark.sql.functions as f
import boto3, os

In [5]:
# Spark NLP stages, listed in the order they are chained in `pipeline`.
# Each chain is wrapped in parentheses rather than backslash continuations,
# so a trailing backslash can no longer swallow the following line.

# Entry point: wraps the link-free tweet text (column 'text_no_links',
# produced in LDA_Prep) into the 'document' column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text_no_links')
    .setOutputCol('document')
)

# Splits each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Cleans up tokens. Lowercasing is requested explicitly so the result does
# not depend on the library's default; use .setLowercase(False) to keep the
# input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# The lemmatizer needs a dictionary, so the default pretrained model is
# downloaded (English; the cell output reports it as lemma_antbnc).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Drops every lemma found in eng_stopwords, ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# Finisher converts the annotations to human-readable output; downstream
# cells read its result from the 'finished_clean_lemma' column.
# setCleanAnnotations(False) is meant to leave the intermediate annotation
# columns in place -- see the Spark NLP Finisher documentation.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [6]:
# Spark NLP preprocessing chain: raw text -> document -> tokens ->
# normalized tokens -> lemmas -> stop-word-free lemmas -> plain string arrays.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
])

In [7]:
# Sparse term-count vectors built from the cleaned lemmas; this is the
# column LDA trains on. The vocabulary is capped at 3500 terms and a term
# must appear in at least 8 documents to be kept.
cv = CountVectorizer(inputCol="finished_clean_lemma",
                     outputCol="features", vocabSize=3500,
                     minDF=8)

# Inverse-document-frequency weighting of the counts, written to "idf".
# NOTE(review): LDA below reads "features" (the raw counts), so the "idf"
# column is computed but never consumed by the topic model. The feature
# column is now spelled out to make that visible; switch it to "idf" only
# if TF-IDF input is really intended.
idf = IDF(inputCol="features", outputCol="idf")

# Fixed seed so repeated runs of the notebook learn the same topics.
LDA_SEED = 42

# Topic model: 4 topics, 50 iterations.
lda = LDA(featuresCol="features", k=4, maxIter=50, seed=LDA_SEED)

In [8]:
# Spark ML modelling pipeline, applied to the output of the Spark NLP
# preprocessing pipeline: term counts -> IDF weighting -> LDA topic model.
# LDA_Eval relies on this order (stage 0 = CountVectorizer, stage 2 = LDA).
mlPipeline = Pipeline().setStages([cv, idf, lda])

In [9]:
# One DataFrame per month of 2017 (June-December). Each month is a
# directory of CSV part files with a header row.
(congDF0617, congDF0717, congDF0817, congDF0917,
 congDF1017, congDF1117, congDF1217) = [
    spark.read.csv(
        "s3://502finalprojbucky/congresstweets/data/{}2017.csv/*.part".format(month),
        header=True,
    )
    for month in ("June", "July", "Aug", "Sep", "Oct", "Nov", "Dec")
]

In [10]:
# All 2017 tweets in one DataFrame.
# DataFrame.union is used instead of sc.union(...rdd...).toDF(): it stays
# lazy, avoids serialising every row through Python, and does not need to
# sample a first row to infer the schema. Columns are matched by position,
# as they were in the RDD-based version.
congDF17 = (
    congDF0617
    .union(congDF0717)
    .union(congDF0817)
    .union(congDF0917)
    .union(congDF1017)
    .union(congDF1117)
    .union(congDF1217)
)

In [11]:
# One DataFrame per month of 2018. Each month is a directory of CSV part
# files with a header row.
(congDF0118, congDF0218, congDF0318, congDF0418,
 congDF0518, congDF0618, congDF0718, congDF0818,
 congDF0918, congDF1018, congDF1118, congDF1218) = [
    spark.read.csv(
        "s3://502finalprojbucky/congresstweets/data/{}2018.csv/*.part".format(month),
        header=True,
    )
    for month in ("Jan", "Feb", "Mar", "Apr", "May", "June",
                  "July", "Aug", "Sep", "Oct", "Nov", "Dec")
]

In [12]:
# All 2018 tweets in one DataFrame.
# DataFrame.union is used instead of sc.union(...rdd...).toDF(): it stays
# lazy, avoids serialising every row through Python, and does not need to
# sample a first row to infer the schema. Columns are matched by position,
# as they were in the RDD-based version.
congDF18 = (
    congDF0118
    .union(congDF0218)
    .union(congDF0318)
    .union(congDF0418)
    .union(congDF0518)
    .union(congDF0618)
    .union(congDF0718)
    .union(congDF0818)
    .union(congDF0918)
    .union(congDF1018)
    .union(congDF1118)
    .union(congDF1218)
)

In [13]:
# One DataFrame per month of 2019. Each month is a directory of CSV part
# files with a header row.
(congDF0119, congDF0219, congDF0319, congDF0419,
 congDF0519, congDF0619, congDF0719, congDF0819,
 congDF0919, congDF1019, congDF1119, congDF1219) = [
    spark.read.csv(
        "s3://502finalprojbucky/congresstweets/data/{}2019.csv/*.part".format(month),
        header=True,
    )
    for month in ("Jan", "Feb", "Mar", "Apr", "May", "June",
                  "July", "Aug", "Sep", "Oct", "Nov", "Dec")
]

In [14]:
# All 2019 tweets in one DataFrame.
# DataFrame.union is used instead of sc.union(...rdd...).toDF(): it stays
# lazy, avoids serialising every row through Python, and does not need to
# sample a first row to infer the schema. Columns are matched by position,
# as they were in the RDD-based version.
congDF19 = (
    congDF0119
    .union(congDF0219)
    .union(congDF0319)
    .union(congDF0419)
    .union(congDF0519)
    .union(congDF0619)
    .union(congDF0719)
    .union(congDF0819)
    .union(congDF0919)
    .union(congDF1019)
    .union(congDF1119)
    .union(congDF1219)
)

In [15]:
# Whole corpus (June 2017 - Dec 2019). The yearly frames are combined with
# the lazy, positional DataFrame.union rather than a round trip through RDDs.
congDF = congDF17.union(congDF18).union(congDF19)

In [16]:
def LDA_Prep(DF):
    """Clean a tweets DataFrame and run it through the Spark NLP pipeline.

    Rows whose ``text`` is null are dropped, the ``_c0`` column is removed
    if present, and URLs are stripped from ``text`` into a new
    ``text_no_links`` column, which is what the notebook-level ``pipeline``
    consumes.

    Only the URL itself (``http`` followed by any run of non-whitespace) is
    removed. The earlier pattern ``http.*($|\\s)`` was greedy and deleted
    everything from the first link to the end of the line, including
    ordinary words that followed the link.

    Parameters
    ----------
    DF : pyspark.sql.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        Two columns: the original ``text`` and ``finished_clean_lemma``.
    """
    data = DF.filter(DF["text"].isNotNull()).drop("_c0")
    # Raw string: the backslash belongs to the regex, not to Python.
    noLinkText = data.withColumn(
        'text_no_links', f.regexp_replace('text', r'http\S*', ''))
    nlpPipeDF = (pipeline.fit(noLinkText)
                 .transform(noLinkText)
                 .select("text", "finished_clean_lemma"))
    return nlpPipeDF

In [17]:
def LDA_Eval(mlPiped, time, ldaDF=None):
    """Summarise a fitted LDA pipeline in a text file and upload it to S3.

    The summary holds the vocabulary size, the log-likelihood and
    log-perplexity of ``ldaDF`` under the model, and the 40 highest-weighted
    terms of every topic. It is written to ``results.txt`` in the working
    directory (overwritten on each call) and then uploaded to the
    ``502finalprojbucky`` bucket as ``congressresults/results<time>.txt``,
    with spaces removed from ``time``.

    Parameters
    ----------
    mlPiped : pyspark.ml.PipelineModel
        Fitted ``mlPipeline``; stage 0 must be the CountVectorizer model and
        stage 2 the LDA model.
    time : str
        Label of the period covered, e.g. ``"June 2017"``.
    ldaDF : pyspark.sql.DataFrame, optional
        Data transformed by ``mlPiped``, used for the likelihood and
        perplexity figures. When omitted, the notebook-level global
        ``LDA_DF`` is used, which keeps the two-argument calls working.
    """
    if ldaDF is None:
        # Backward-compatible fallback to the global set by the driver loops.
        ldaDF = LDA_DF

    vocab = mlPiped.stages[0].vocabulary
    ldaModel = mlPiped.stages[2]
    ll = ldaModel.logLikelihood(ldaDF)
    lp = ldaModel.logPerplexity(ldaDF)

    # describeTopics returns one row per topic, so the index -> word lookup
    # is done on the driver instead of shipping the vocabulary to executors.
    topics_words = [[vocab[idx] for idx in row['termIndices']]
                    for row in ldaModel.describeTopics(40).collect()]

    # Single write pass; the emitted text is identical to the former
    # write-then-append sequence.
    with open('results.txt', 'w') as file:
        file.write("Learned topics for "+str(time)+" congress tweets (as distributions over vocab of "
                    + str(ldaModel.vocabSize())+ " words):\n\n")
        file.write("The lower bound on the log likelihood of the entire corpus: " + str(ll))
        file.write("\nThe upper bound on perplexity: " + str(lp))
        file.write("\n")
        for idx, topic in enumerate(topics_words):
            file.write("\n----------")
            file.write("\ntopic: "+str(idx))
            file.write("\n----------\n")
            file.write("".join(word+" | " for word in topic))
        file.write("\n========================================================================\n")

    s3 = boto3.resource('s3')
    s3.meta.client.upload_file('results.txt', '502finalprojbucky',
                               'congressresults/results'+str(time).replace(" ","")+'.txt')

In [18]:
# Monthly 2017 DataFrames and their labels, position-aligned for zip().
# October now points at congDF1017; it previously repeated congDF1117, so
# the "Oct 2017" result was computed from November's tweets.
Monthly2017 = [congDF0617,congDF0717,congDF0817,congDF0917,
               congDF1017,congDF1117,congDF1217]
Monthly17 = ["June 2017","July 2017","Aug 2017","Sep 2017",
             "Oct 2017","Nov 2017","Dec 2017"]

In [19]:
# Monthly 2018 DataFrames, January through December.
Monthly2018 = [
    congDF0118, congDF0218, congDF0318, congDF0418,
    congDF0518, congDF0618, congDF0718, congDF0818,
    congDF0918, congDF1018, congDF1118, congDF1218,
]
# Matching labels, position-aligned with Monthly2018 for zip().
Monthly18 = [
    "{} 2018".format(name)
    for name in ("Jan", "Feb", "Mar", "Apr", "May", "June",
                 "July", "Aug", "Sep", "Oct", "Nov", "Dec")
]

In [20]:
# Monthly 2019 DataFrames, January through December.
Monthly2019 = [
    congDF0119, congDF0219, congDF0319, congDF0419,
    congDF0519, congDF0619, congDF0719, congDF0819,
    congDF0919, congDF1019, congDF1119, congDF1219,
]
# Matching labels, position-aligned with Monthly2019 for zip().
Monthly19 = [
    "{} 2019".format(name)
    for name in ("Jan", "Feb", "Mar", "Apr", "May", "June",
                 "July", "Aug", "Sep", "Oct", "Nov", "Dec")
]

In [21]:
# Fit and report one topic model per 2017 month.
for month, df in zip(Monthly17, Monthly2017):
    # Null filtering, link stripping and Spark NLP preprocessing.
    prepDF = LDA_Prep(df)
    # CountVectorizer -> IDF -> LDA, fitted on this month only.
    mlPipeFit = mlPipeline.fit(prepDF)
    # LDA_Eval reads the scored data through the notebook-level name
    # LDA_DF, so it has to be assigned before the call.
    LDA_DF = mlPipeFit.transform(prepDF)
    # Writes results.txt locally and uploads it to S3 under this month's label.
    LDA_Eval(mlPipeFit, month)

In [22]:
# Fit and report one topic model per 2018 month.
for month, df in zip(Monthly18, Monthly2018):
    # Null filtering, link stripping and Spark NLP preprocessing.
    prepDF = LDA_Prep(df)
    # CountVectorizer -> IDF -> LDA, fitted on this month only.
    mlPipeFit = mlPipeline.fit(prepDF)
    # LDA_Eval reads the scored data through the notebook-level name
    # LDA_DF, so it has to be assigned before the call.
    LDA_DF = mlPipeFit.transform(prepDF)
    # Writes results.txt locally and uploads it to S3 under this month's label.
    LDA_Eval(mlPipeFit, month)

In [23]:
# Fit and report one topic model per 2019 month.
for month, df in zip(Monthly19, Monthly2019):
    # Null filtering, link stripping and Spark NLP preprocessing.
    prepDF = LDA_Prep(df)
    # CountVectorizer -> IDF -> LDA, fitted on this month only.
    mlPipeFit = mlPipeline.fit(prepDF)
    # LDA_Eval reads the scored data through the notebook-level name
    # LDA_DF, so it has to be assigned before the call.
    LDA_DF = mlPipeFit.transform(prepDF)
    # Writes results.txt locally and uploads it to S3 under this month's label.
    LDA_Eval(mlPipeFit, month)

In [None]:
#spark.stop()