In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for this notebook. The
# `spark.jars.packages` setting names the Spark NLP artifact to load.
spark = (
    SparkSession.builder
    .appName("spark-nlp")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)

In [2]:
import nltk
# Fetch the NLTK corpora into the local nltk_data directory; a corpus that is
# already present is simply reported as up-to-date.
# NOTE(review): the 'words' corpus is not referenced in the visible cells --
# confirm it is still needed.
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
# get the list of stopwords from nltk
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list and add five extra tokens that
# should also be dropped from the tweets.
eng_stopwords = stopwords.words('english')
eng_stopwords.extend(['rt', 'qt', '&amp', 'amp', '+'])

In [4]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
# SQLContext expects a SparkContext as its first argument; hand it the
# session's context and bind it to the existing session explicitly instead of
# passing the SparkSession object itself.
sqlContext = SQLContext(spark.sparkContext, spark)

In [5]:
# Stage 1: wrap the link-free tweet text in a Spark NLP document annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text_no_links')
    .setOutputCol('document')
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3: normalize tokens. Lowercasing is requested explicitly here;
# use .setLowercase(False) to maintain input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Stage 4: lemmatize. The lemmatizer needs a dictionary, so the default
# pre-trained model is downloaded (reported as "lemma_antbnc" when this
# cell runs).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Stage 5: drop stop words (case-insensitively) using the extended NLTK list.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# Stage 6: the finisher converts the annotations into human-readable output
# (read downstream as `finished_clean_lemma`); the annotation columns are kept.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [6]:
# Chain the six NLP stages in the order they must run.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
])

In [7]:
# Load every part file of the June 2017 tweet export; the first line of each
# file is treated as the header row.
congDF = spark.read.csv(
    "s3://502finalprojbucky/congresstweets/data/June2017.csv/*.part",
    header=True,
)

In [8]:
# Remove the "_c0" column (presumably an unnamed index column from the
# export -- confirm), then keep only the rows that actually have tweet text.
congDF = congDF.drop("_c0")
data = congDF.filter(~congDF['text'].isNull())

#data.select('text').show(1,False)

In [28]:
from pyspark.sql.functions import regexp_replace, monotonically_increasing_id, col
# Tag every tweet with a unique (not necessarily consecutive) row id.
noLinks = data.withColumn('index', monotonically_increasing_id())
# Strip URLs: remove "http" plus the following run of non-whitespace
# characters, and one trailing whitespace character if present. The previous
# pattern 'http.*($|\s)' was greedy and deleted everything from the first link
# to the end of the line, including any tweet text after the link.
noLinks = noLinks.withColumn('text_no_links', regexp_replace('text', r'http\S*\s?', ''))

#noLinks.show(1,False)

In [10]:
# Fit the NLP pipeline on the link-free tweets, then run the same frame
# through the fitted pipeline.
pipeline_model = pipeline.fit(noLinks)
congress = pipeline_model.transform(noLinks)
#congress.select('finished_clean_lemma').show(10,False)

In [11]:
# expand the "finished_clean_lemma" column so that the words are not in a list
from pyspark.sql.functions import explode, col
# One output row per cleaned lemma: unpack the list column into "exploded_text".
congress_words = congress.withColumn(
    "exploded_text",
    explode(congress["finished_clean_lemma"]),
)

In [12]:
# Number of occurrences of each distinct token across all tweets.
counts = (
    congress_words
    .groupBy('exploded_text')
    .count()
)

In [13]:
# Collect the per-token counts to the driver as a pandas DataFrame.
counts_pd = counts.toPandas()
#{counts_pd.loc[i, 'exploded_text']: counts_pd.loc[i, 'count'] for i in range(counts_pd.shape[0])}

In [14]:
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.clustering import LDA, LDAModel

In [15]:
# Keep only the cleaned lemma lists and give each row a unique id.
# Note: this rebinds `data`, which earlier held the filtered raw tweets.
data = (
    congress
    .select('finished_clean_lemma')
    .withColumn('index', monotonically_increasing_id())
)

In [16]:
# Term-frequency stage: turn each lemma list into a count vector in "features".
cv = CountVectorizer(
    inputCol="finished_clean_lemma",
    outputCol="features",
    vocabSize=5000,
    minDF=10.0,
)

In [17]:
# Fit the CountVectorizer on the cleaned lemma lists.
cvmodel = cv.fit(data)

In [18]:
# Apply the fitted vectorizer to the same rows, adding the "features" column.
result_cv = cvmodel.transform(data)

In [19]:
# LDA hyper-parameters: number of topics and maximum number of iterations.
num_topics, max_iter = 5, 100

In [20]:
# Fit the topic model on the (index, features) columns. LDA is stochastic, so
# the seed is pinned to make re-runs start from the same random state instead
# of producing different topics on every execution.
lda_model = LDA(
    k=num_topics,
    maxIter=max_iter,
    optimizer='online',
    seed=42,
).fit(result_cv.select("index", "features"))

In [21]:
# Score every row with the fitted LDA model; later cells read the resulting
# `topicDistribution` column from this frame.
transform = lda_model.transform(result_cv)

In [22]:
def extract(row):
    """Return the row's `topicDistribution` vector as a plain Python list."""
    topic_probabilities = row.topicDistribution.toArray()
    return topic_probabilities.tolist()
# Map each row of the LDA output to its list of topic probabilities,
# giving an RDD of plain Python lists.
DF = transform.rdd.map(extract)

In [23]:
# Build the per-topic probability table, keyed by each row's own `index` value.
# The ids produced by monotonically_increasing_id() depend on partitioning and
# are not guaranteed to match between two separately built DataFrames, so a
# fresh id generated here is not a safe join key against `transform.index`.
# Carrying the existing index through keeps every probability row attached to
# the tweet it came from.
# `row["index"]` is required: `row.index` would resolve to the tuple method.
DF = spark.createDataFrame(
    transform.rdd.map(lambda row: extract(row) + [row["index"]]),
    ["Topic1", "Topic2", "Topic3", "Topic4", "Topic5", "index_1"],
)

In [27]:
# Attach the per-topic probability columns to the LDA output by matching
# `index` with `index_1`, keeping only the columns needed downstream.
CoTopDF = (
    transform
    .join(DF, transform["index"] == DF["index_1"], 'inner')
    .select(
        "index", "finished_clean_lemma", "features",
        "Topic1", "Topic2", "Topic3", "Topic4", "Topic5",
    )
)

In [None]:
# DataFrame.filter() requires a condition; the bare call raised TypeError.
# TODO: add the intended predicate, e.g. CoTopDF.filter(CoTopDF["Topic1"] > 0.5).
# Until then, show the schema of the joined result so the cell runs cleanly.
CoTopDF.printSchema()

In [None]:
#spark.stop()