In [11]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import wikipedia
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
import time
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

#LocalMode
spark = SparkSession.builder.getOrCreate()

#StandaloneMode
#spark = SparkSession.builder.master('spark://localhost:7077').getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

def get_title(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    title = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split('\n', 2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        title = 'error'
    return title

def get_content(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    actual_content = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split('\n', 2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        actual_content = 'error'
    return actual_content

def clean(article):
    title = article[0]
    document = article[1]
    tokens = RegexpTokenizer(r'\w+').tokenize(document.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return (title, tokens_stemmed)

def splitByDoc(textfile):
    return list(filter(lambda x: x != '\n', textfile[1].split('</doc>')))

In [2]:
spark

In [9]:
start = time.time()

data = sc.wholeTextFiles('C:/Users/Alina/Big Data/Wikipedia Exports/all_articles_2mb/*/*')
pagesRaw = data.flatMap(splitByDoc)
pagesTitleContent = pagesRaw.map(lambda x: (get_title(x), get_content(x))).filter(lambda x: x[0] != 'error' and x[0] != '')

#RDD to DataFrame
dfPagesTitleContent = sqlContext.createDataFrame(pagesTitleContent, ['title', 'content'])

regexTokenizer = RegexTokenizer(inputCol='content', outputCol='list_of_words_raw', pattern='\\W', minTokenLength=4)

stopWordsRemover = StopWordsRemover(inputCol='list_of_words_raw', outputCol='list_of_words')
stopwordsSpark = stopWordsRemover.getStopWords()
stopwordsSpark.extend(['also'])
stopWordsRemover.setStopWords(stopwordsSpark)

countVectorizer = CountVectorizer(inputCol='list_of_words', outputCol='features')

lda = LDA(k=20, maxIter=15)
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, lda])
model = pipeline.fit(dfPagesTitleContent)

end = time.time()
print(end - start)

8.319759130477905


In [5]:
model.stages

[RegexTokenizer_6456ea0e87c3,
 StopWordsRemover_50d1df041045,
 CountVectorizerModel: uid=CountVectorizer_713da8535bfd, vocabularySize=24678,
 LocalLDAModel: uid=LDA_8e8443e7507e, k=20, numFeatures=24678]

In [6]:
cleanedPagesTitles = pagesTitleContent.map(lambda x: x[0])
cleanedPagesTitles.take(20)
#cleanedPagesTitles.count()

['Anarchism\r',
 'Autism\r',
 'Albedo\r',
 'A\r',
 'Alabama\r',
 'Achilles\r',
 'Abraham Lincoln\r',
 'Aristotle\r',
 'An American in Paris\r',
 'Academy Award for Best Production Design\r',
 'Academy Awards\r',
 'Actrius\r',
 'Animalia (book)\r',
 'International Atomic Time\r',
 'Altruism\r',
 'Ayn Rand\r',
 'Alain Connes\r',
 'Allan Dwan\r',
 'Algeria\r',
 'List of Atlas Shrugged characters\r']

In [7]:
vocab = model.stages[2].vocabulary
topics = model.stages[3].describeTopics()
topicsRdd = topics.rdd
topicsRaw = topicsRdd.map(lambda row: row['termIndices']).collect()
result = map(lambda entry: [vocab[idx] for idx in entry], topicsRaw)
print(list(result))

[['connes', 'sciences', 'academy', 'algebras', 'geometry', 'medal', 'operator', 'theory', 'aardwolf', 'differential'], ['loimios', 'pigmentation', 'american', 'caliber', 'generalization', 'biogas', 'ludwig', 'pseudonyms', 'carbocation', 'overlaps'], ['used', 'letter', 'alphabet', 'english', 'many', 'form', 'greek', 'variants', 'latin', 'cursive'], ['achilles', 'hector', 'thetis', 'patroclus', 'troy', 'odysseus', 'zeus', 'agamemnon', 'ajax', 'iliad'], ['used', 'letter', 'alphabet', 'ambiguity', 'form', 'ambiguous', 'english', 'variants', 'languages', 'cursive'], ['huxley', 'bates', 'song', 'america', 'beautiful', 'poem', 'published', 'ward', 'first', 'written'], ['ketchikan', 'famed', 'orange', 'juniper', 'collectives', 'santee', 'manually', 'mixtures', 'conducts', 'voices'], ['congress', 'states', 'articles', 'confederation', 'constitution', 'state', 'continental', 'government', 'union', 'power'], ['alaska', 'andorra', 'state', 'first', 'lincoln', 'time', 'states', 'used', 'many', 'cen

In [34]:
article_title = "Plant"
article_content_test = clean([article_title, wikipedia.page(article_title).content])[1]
article_content_test_rdd = sc.parallelize([article_content_test]).zipWithIndex()
df_txts_test  = sqlContext.createDataFrame(article_content_test_rdd, ['list_of_words', 'index'])
cv_test = CountVectorizer(inputCol='list_of_words', outputCol='features')

cvmodel_test = cv_test.fit(df_txts_test)
result_cv_test = cvmodel_test.transform(df_txts_test)

result_cv_test.select('list_of_words').rdd.flatMap(list).flatMap(list).take(20)

['plan',
 'typic',
 'diagram',
 'list',
 'step',
 'detail',
 'time',
 'resourc',
 'use',
 'achiev',
 'object',
 'someth',
 'commonli',
 'understood',
 'tempor',
 'set',
 'intend',
 'action',
 'one',
 'expect']

In [15]:
ll = model.stages[3].logLikelihood(result_cv_test)
lp = model.stages[3].logPerplexity(result_cv_test)
print('The lower bound on the log likelihood of the entire corpus: ' + str(ll))
print('The upper bound on perplexity: ' + str(lp))

The lower bound on the log likelihood of the entire corpus: -687484.6794305448
The upper bound on perplexity: 1617.6110104248114
