In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark-nlp") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
    .getOrCreate()

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
# get the list of stopwords from nltk
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')
eng_stopwords.append('rt')
eng_stopwords.append('qt')
eng_stopwords.append('&amp')
eng_stopwords.append('amp')
eng_stopwords.append('+')

In [4]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)

In [5]:
documentAssembler = DocumentAssembler() \
    .setInputCol('text_no_links') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemma') \

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['lemma']) \
    .setOutputCol('clean_lemma') \
    .setCaseSensitive(False) \
    .setStopWords(eng_stopwords)

# finisher converts tokens to human-readable output
finisher = Finisher() \
    .setInputCols(['clean_lemma']) \
    .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [6]:
pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,
        stopwords_cleaner,
        finisher
    ])

In [7]:
from pyspark.sql.functions import regexp_replace, monotonically_increasing_id, col, when
from pyspark.sql import Row
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.clustering import LDA, LDAModel

In [8]:
congDF = spark.read.csv("s3://502finalprojbucky/congresstweets/data/June2017.csv/*.part",header=True)

In [9]:
congDF = congDF.drop("_c0")
data = congDF.filter(congDF['text'].isNull()==False)

#data.select('text').show(1,False)

In [10]:
noLinks = data.withColumn('index',monotonically_increasing_id())
noLinks = noLinks.withColumn('text_no_links',regexp_replace('text','http.*($|\s)',''))

#noLinks.show(1,False)

In [11]:
# transform text with the pipeline
congress = pipeline.fit(noLinks).transform(noLinks)
#congress.select('finished_clean_lemma').show(10,False)

In [12]:
data = congress.select('finished_clean_lemma').withColumn('index',monotonically_increasing_id())

In [13]:
#TF
cv = CountVectorizer(inputCol="finished_clean_lemma",outputCol="features",
                     vocabSize=3500,minDF = 8.5)

In [14]:
# Fit TF
cvmodel = cv.fit(data)

In [15]:
# Transform
result_cv = cvmodel.transform(data)

In [16]:
num_topics=3
max_iter=100

In [17]:
lda_model = LDA(k=num_topics,maxIter=max_iter,optimizer='online').fit(result_cv.select("index","features"))

In [18]:
transform = lda_model.transform(result_cv)

In [19]:
def extractTopDist(row):
    return row.topicDistribution.toArray().tolist()
DF = transform.rdd.map(extractTopDist)

In [20]:
DF = spark.createDataFrame(DF,["Topic1","Topic2","Topic3"]).withColumn("index_1",monotonically_increasing_id())

In [21]:
CoTopDF = transform.join(DF,transform.index==DF.index_1,'inner')\
.select(["index","finished_clean_lemma","features","Topic1","Topic2","Topic3"])

In [22]:
CongressTopics = CoTopDF.rdd.map(lambda r: r.asDict())\
       .map(lambda r: Row(MaxTopic=[max([i for i in r.items() if i[0]\
                                        not in ["index","finished_clean_lemma","features"]], 
                                        key=lambda kv: kv[1])[0],
                                   max([i for i in r.items() if i[0]\
                                        not in ["index","finished_clean_lemma","features"]], 
                                      key=lambda kv: kv[1])[1]], **r) )\
       .toDF()

In [23]:
TopicsCongress = CongressTopics.withColumn("tweet_content",col("finished_clean_lemma"))\
.withColumn("Idx",col("index"))\
.drop("finished_clean_lemma").drop("index")\
.drop("Topic1").drop("Topic2").drop("Topic3")

In [24]:
#TopicsCongress.show(20,False)

In [25]:
lda_model.describeTopics(5).show(10,False)

+-----+------------------+--------------------------------------------------------------------------------------------------------------+
|topic|termIndices       |termWeights                                                                                                   |
+-----+------------------+--------------------------------------------------------------------------------------------------------------+
|0    |[1, 0, 2, 3, 5]   |[0.028399792264002622, 0.027505717915976, 0.025889878035053358, 0.024564169627953828, 0.01801591830408264]    |
|1    |[10, 35, 40, 4, 0]|[0.01360761939099871, 0.011240882646675708, 0.010539315120029126, 0.010502358277515673, 0.009872355136037902] |
|2    |[9, 4, 15, 25, 27]|[0.013221634343320065, 0.013052844421598504, 0.011492108564766785, 0.009220411810831662, 0.009127870868404582]|
+-----+------------------+--------------------------------------------------------------------------------------------------------------+



In [26]:
transform.select("topicDistribution").show(10,False)

+--------------------------------------------------------------+
|topicDistribution                                             |
+--------------------------------------------------------------+
|[0.7989402834060485,0.02524204271710266,0.17581767387684885]  |
|[0.019143224281580586,0.8948697457154637,0.0859870300029557]  |
|[0.9639917256201228,0.014853243126906527,0.021155031252970722]|
|[0.4015449790166039,0.02173458305284354,0.5767204379305526]   |
|[0.0430605522251171,0.5045839491961142,0.4523554985787687]    |
|[0.7565930029387901,0.22304899926694674,0.020357997794263286] |
|[0.6373857192880207,0.02574980565361803,0.3368644750583614]   |
|[0.034174997619897604,0.4573142754573956,0.5085107269227068]  |
|[0.40501261179828946,0.5596651107310903,0.03532227747062027]  |
|[0.0270401114004059,0.3925295678498196,0.5804303207497745]    |
+--------------------------------------------------------------+
only showing top 10 rows



In [27]:
lda_model.topicsMatrix()

DenseMatrix(3196, 3, [2029.0923, 2095.048, 1909.8921, 1812.0948, 104.6979, 1329.0313, 1079.231, 1080.4057, ..., 0.5127, 7.7615, 0.4548, 0.6301, 2.4416, 8.7418, 4.3756, 3.0019], 0)

In [38]:
cvRDD = result_cv.select("features","finished_clean_lemma").rdd

In [62]:
cvRDD.map(lambda r: Row(Term = [i for i in r[0][1]],Idx = [i for i in r[0][0].indices])).

In [73]:
cvRDD.map(lambda r: r[0].indices).take(1)

[array([   1,    2,   20,   27,  124,  255,  285, 1680], dtype=int32)]

In [78]:
cvRDD.map(lambda r: r[1][0]).take(1)

['listen']

In [33]:
#FeatDF.show(2,False)

In [28]:
#def extractFeats(row):
   #return row.features.toArray().tolist()

In [29]:
#FeatMatrix = result_cv.rdd.map(extractFeats)

In [30]:
#FeatDF = FeatMatrix.toDF().withColumn("index_2",monotonically_increasing_id())

In [31]:
Terms = cvmodel.vocabulary

In [32]:
#spark.stop()