In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook; the Spark NLP
# artifact is requested through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .appName("spark-nlp-prediction")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)
# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [2]:
import nltk
# Fetch the NLTK corpora used by this notebook. The 'stopwords' corpus is read
# below through nltk.corpus.stopwords.
# NOTE(review): no visible cell reads the 'words' corpus — confirm it is needed.
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
# get the list of stopwords from nltk
from nltk.corpus import stopwords

# Additional tokens removed on top of NLTK's English stopword list
# (e.g. 'rt'/'qt' — presumably retweet/quote markers — plus entity residue
# such as 'amp' and a handful of very common words).
_extra_stopwords = [
    'rt', 'qt', '&amp', 'amp', '+', 'w',
    'today', 'make', 'hear', 'meet', 'see',
    'time', 'day', 'watch', 'get', 'im',
]

eng_stopwords = stopwords.words('english')
eng_stopwords.extend(_extra_stopwords)

In [4]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as f
from operator import itemgetter
import pyspark.sql.types as T
import boto3, os, datetime

In [5]:
# Stage 1: wrap the link-free tweet text as a Spark NLP document.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text_no_links')
    .setOutputCol('document')
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3: normalize tokens. Lowercasing is requested explicitly here;
# .setLowercase(False) would keep the input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Stage 4: lemmatize with the pre-trained model (the lemmatizer needs a
# dictionary; pretrained() with no arguments uses the default model).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Stage 5: drop stopwords (case-insensitively) using the extended list
# built earlier in the notebook.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# Stage 6: the finisher converts tokens to human-readable output; its result
# is read downstream as the 'finished_clean_lemma' column.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [6]:
# Text-cleaning pipeline: assemble -> tokenize -> normalize -> lemmatize ->
# remove stopwords -> finish.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
])

In [7]:
def LDA_Prep(DF, link_pattern=r'http.*($|\s)'):
    """Strip links from tweets and run them through the NLP cleaning pipeline.

    Parameters
    ----------
    DF : DataFrame
        Must contain the columns ``tweet_text`` and ``tweet_type``.
    link_pattern : str, optional
        Regular expression removed from ``tweet_text`` before tokenizing.
        The default is the notebook's original pattern, now written as a raw
        string so that ``\\s`` is not an invalid Python escape sequence.
        NOTE(review): ``.*`` is greedy, so the default removes everything from
        the first ``http`` to the end of that line, not just the link itself.
        It is kept as the default so preprocessing is unchanged (presumably it
        matches how the loaded model's training data was prepared — confirm);
        pass e.g. ``r'http\\S+'`` to remove only the URL.

    Returns
    -------
    DataFrame
        Columns ``tweet_text``, ``finished_clean_lemma`` and ``tweet_type``.
    """
    noLinkText = DF.withColumn(
        'text_no_links',
        f.regexp_replace('tweet_text', link_pattern, ''),
    )
    # The module-level `pipeline` is fitted on the same frame it then transforms.
    nlpModel = pipeline.fit(noLinkText)
    nlpPipeDF = nlpModel.transform(noLinkText).select(
        "tweet_text", "finished_clean_lemma", "tweet_type")
    return nlpPipeDF

In [8]:
# Estimator definitions for the modelling stages.

# Sparse term-count vectors built from the cleaned lemmas.
cv = CountVectorizer(
    inputCol="finished_clean_lemma",
    outputCol="features",
    vocabSize=7500,
    minDF=15,
)

# Inverse-document-frequency weighting of the count vectors.
idf = IDF(
    inputCol="features",
    outputCol="idf",
)

# Two-topic LDA.
# NOTE(review): no featuresCol is set here, so this estimator is not
# explicitly wired to the "idf" column produced above — confirm the intent.
lda = LDA(
    k=2,
    maxIter=60,
)

In [9]:
# Load the previously fitted pipeline from S3. `load` is a classmethod, so no
# PipelineModel needs to be constructed first (and a PipelineModel should not
# be built from unfitted estimators). Later cells index the loaded stages as
# stages[0] (for .vocabulary) and stages[2] (for .describeTopics).
LDA_Model = PipelineModel.load("s3://502finalprojbucky/FullModel/model/")

In [10]:
# Read every CSV under InfOpEnglish/, then drop duplicate rows and rows with
# nulls. The tweet text is picked out later by the column name "_c3".
InfoOp = (
    spark.read
    .csv("s3://502finalprojbucky/InfOpEnglish/*.csv")
    .dropDuplicates()
    .na.drop()
)

In [11]:
# Read every CSV under YearlyCongress/, then drop duplicate rows and rows with
# nulls. The tweet text is picked out later by the column name "_c5".
congress = (
    spark.read
    .csv("s3://502finalprojbucky/YearlyCongress/*.csv")
    .dropDuplicates()
    .na.drop()
)

In [12]:
# Tag every row of this frame with the constant source label "InfoOp".
InfoOp=InfoOp.withColumn("tweet_type",f.lit("InfoOp"))

In [13]:
# Tag every row of this frame with the constant source label "Congress".
congress = congress.withColumn("tweet_type",f.lit("Congress"))

In [14]:
# Keep only the tweet text (CSV column "_c3") and the source label.
InfoOp = (
    InfoOp
    .withColumnRenamed("_c3", "tweet_text")
    .select("tweet_text", "tweet_type")
)

In [15]:
# Keep only the tweet text (CSV column "_c5") and the source label.
congress = (
    congress
    .withColumnRenamed("_c5", "tweet_text")
    .select("tweet_text", "tweet_type")
)

In [16]:
# Stack the two labelled frames. Both were reduced to the same two columns
# (tweet_text, tweet_type) in that order, so a positional DataFrame union is
# enough; this avoids converting every row to a Python RDD and re-inferring
# the schema with toDF().
SampleDF = InfoOp.union(congress)

In [17]:
# Remove links and run the NLP cleaning pipeline over the combined tweets.
PrepDF = LDA_Prep(SampleDF)

In [18]:
# Apply the loaded pipeline model; a later cell reads its "topicDistribution"
# output column.
SampleTrans = LDA_Model.transform(PrepDF)

In [19]:
# Keep only the source label and the per-topic distribution vector.
preds = SampleTrans.select("tweet_type", "topicDistribution")

In [20]:
# Attach a monotonically increasing id to each row.
# NOTE(review): the later select of ["tweet_type","T1","T2"] drops "idx", so
# this column does not appear to be used — confirm whether it can be removed.
preds= preds.withColumn("idx",f.monotonically_increasing_id())

In [21]:
def ith_(v, i):
    """Return element ``i`` of the indexable ``v`` as a float, or None.

    Parameters
    ----------
    v : indexable or None
        Vector-like value supporting ``v[i]``.
    i : int
        Position to extract.

    Returns
    -------
    float or None
        ``float(v[i])``; ``None`` when ``v`` is None, when ``i`` is out of
        range, or when the element cannot be converted to float.
    """
    # A null vector yields a null result instead of raising TypeError.
    if v is None:
        return None
    try:
        return float(v[i])
    # Out-of-range indexing raises IndexError on plain sequences and
    # NumPy-backed vectors; ValueError is kept from the original handler.
    except (ValueError, IndexError):
        return None

# Column-level wrapper around ith_ that returns a double.
ith = f.udf(ith_, returnType=T.DoubleType())

In [22]:
# Split the topic-distribution vector into one column per topic
# (T1 = element 0, T2 = element 1) and keep those with the source label.
predsDF = (
    preds
    .withColumn("T1", ith(f.col("topicDistribution"), f.lit(0)))
    .withColumn("T2", ith(f.col("topicDistribution"), f.lit(1)))
    .select("tweet_type", "T1", "T2")
)

In [23]:
# Discard rows where both topics received exactly the same weight (ties).
predsDF = predsDF.filter(f.col("T1") != f.col("T2"))

In [24]:
# Result type of the "which column holds the largest value" UDF.
# The compared values are the T1/T2 doubles produced by the DoubleType `ith`
# UDF, so 'maxval' is declared DoubleType (it was IntegerType, which does not
# match the float values the lambda returns).
schema = T.StructType([
    T.StructField('maxval', T.DoubleType()),
    T.StructField('maxval_colname', T.StringType()),
])

# Receives a struct of (value, column-name) pairs and returns the pair with
# the largest value.
maxcol = f.udf(lambda row: max(row, key=itemgetter(0)), schema)

# One (value, name) struct per topic column, i.e. every column after tweet_type.
_value_name_pairs = [f.struct(predsDF[x], f.lit(x)) for x in predsDF.columns[1:]]

# Append the name of the winning topic column as 'maxval_colname'.
maxDF = (
    predsDF
    .withColumn('maxfield', maxcol(f.struct(_value_name_pairs)))
    .select(predsDF.columns + ['maxfield.maxval_colname'])
)

In [25]:
# Count tweets per (source, winning topic) with ONE Spark job. The previous
# four separate .count() actions each re-evaluated the whole lineage behind
# maxDF.
_topic_counts = {
    (row["tweet_type"], row["maxval_colname"]): row["count"]
    for row in maxDF.groupBy("tweet_type", "maxval_colname").count().collect()
}

# Combinations absent from the data count as 0, as .count() would report.
InfT1 = _topic_counts.get(("InfoOp", "T1"), 0)
InfT2 = _topic_counts.get(("InfoOp", "T2"), 0)
CongT1 = _topic_counts.get(("Congress", "T1"), 0)
CongT2 = _topic_counts.get(("Congress", "T2"), 0)

In [26]:
# Fraction of each source's tweets that fall in that source's majority topic.
InfPercSame = max(InfT1, InfT2) / (InfT1 + InfT2)
CongPercSame = max(CongT1, CongT2) / (CongT1 + CongT2)

In [27]:
# Report the majority-topic fraction for each source.
print("Percent of Congress Tweets in Same Topic: ", CongPercSame, sep="")
print("Percent of Suspected Information Operations Tweets in Same Topic: ", InfPercSame, sep="")

Percent of Congress Tweets in Same Topic: 0.7788784543128019
Percent of Suspected Information Operations Tweets in Same Topic: 0.5663152325399131


In [28]:
# Report the per-topic tweet counts for Congress accounts.
print("Congress Tweets in Topic 1: ", CongT1, sep="")
print("Congress Tweets in Topic 2: ", CongT2, sep="")

Congress Tweets in Topic 1: 296277
Congress Tweets in Topic 2: 1043606


In [29]:
# Report the per-topic tweet counts for suspected information operations.
print("Suspected Information Operations Tweets in Topic 1: ", InfT1, sep="")
print("Suspected Information Operations Tweets in Topic 2: ", InfT2, sep="")

Suspected Information Operations Tweets in Topic 1: 3320266
Suspected Information Operations Tweets in Topic 2: 2542663


In [66]:
def LDA_Topics(mlPiped, local_path='resultsfull.txt',
               bucket='502finalprojbucky', key='FullResults/fullresults.txt'):
    """Write the topic-prediction report to a local file and upload it to S3.

    The report contains the per-topic tweet counts, the share of each tweet
    source that landed in each topic, and the top 50 words of every topic.
    Counts are read from the module-level variables ``InfT1``, ``InfT2``,
    ``CongT1`` and ``CongT2``.

    Shares are computed per topic directly from those counts. The previous
    version reused the "majority share" values, which is only right when the
    majority topic is Topic 1 for information operations and Topic 2 for
    Congress.

    Parameters
    ----------
    mlPiped : fitted pipeline model
        ``stages[0]`` must expose ``.vocabulary`` and ``stages[2]`` must
        expose ``.describeTopics``.
    local_path : str, optional
        Local file the report is written to (overwritten).
    bucket : str, optional
        Destination S3 bucket.
    key : str, optional
        Destination S3 object key.
    """
    def _pct(part, total):
        # Percentage of `total` rounded to 2 places; 0.0 for an empty group.
        return round(part / total * 100, 2) if total else 0.0

    ldaModel = mlPiped.stages[2]
    vocab = mlPiped.stages[0].vocabulary

    # Each topic is a distribution over words (matching word count vectors).
    # The describeTopics result has one row per topic, so it is collected to
    # the driver and the term indices are mapped to words locally.
    topics_words = [
        [vocab[idx] for idx in row['termIndices']]
        for row in ldaModel.describeTopics(50).collect()
    ]

    infTotal = InfT1 + InfT2
    congTotal = CongT1 + CongT2

    # Explicit UTF-8 so non-ASCII vocabulary words cannot fail on a
    # locale-dependent default encoding.
    with open(local_path, 'w', encoding='utf-8') as file:
        file.write("Results for LDA Topic Prediction on Full Set of Tweets\n")
        file.write("=========================================================================\
==============================================================================\n\n")
        file.write("Topic 1: "+
                   str(sum([CongT1,InfT1]))+" Total Tweets\n  - "+
                   str(InfT1)+" Tweets from Suspected Information Operations\n  - "+
                   str(CongT1)+" Tweets from Congress Accounts\n\n")
        # Label fixed: it previously read "... Information Operations Tweets".
        file.write("Topic 2: "+
                   str(sum([CongT2,InfT2]))+" Total Tweets\n  - "+
                   str(InfT2)+" Tweets from Suspected Information Operations\n  - "+
                   str(CongT2)+" Tweets from Congress Accounts\n\n")
        file.write("Distribution of Tweet Type by Topic:\n")
        file.write("  - Topic 1: "+str(_pct(InfT1, infTotal))+
                   "% of all Suspected Information Operations Tweets, and "+
                   str(_pct(CongT1, congTotal))+"% of all Congress Tweets\n")
        file.write("  - Topic 2: "+str(_pct(InfT2, infTotal))+
                   "% of all Suspected Information Operations Tweets, and "+
                   str(_pct(CongT2, congTotal))+"% of all Congress Tweets\n\n\n")

        file.write("Top 50 Words for Both Topics\n")
        file.write("=========================================================================\
==============================================================================\n")
        for idx, topic in enumerate(topics_words):
            file.write("\nTopic: "+str(idx+1))
            file.write("\n-----------------------------------------------------------------------\
-------------------------------------------------------------------------------\n")
            file.write("".join(word+" | " for word in topic))
            file.write("\n")

    # Upload only after the report file has been closed.
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file(local_path, bucket, key)

In [67]:
# Write the report for the loaded model and upload it to S3.
LDA_Topics(LDA_Model)

In [68]:
#spark.stop()