In [69]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from gensim.test.utils import datapath, get_tmpfile
from gensim.corpora import WikiCorpus, MmCorpus
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
import wikipedia
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer, IDF, RegexTokenizer, StopWordsRemover, Normalizer
from pyspark.sql import SQLContext
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

def get_title(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    title = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split("\n",2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        title = 'error'
    return title

def get_content(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    actual_content = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split("\n",2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        actual_content = 'error'
    return actual_content

def clean(article):
    title = article[0]
    document = article[1]
    tokens = RegexpTokenizer(r'\w+').tokenize(document.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return (title, tokens_stemmed)

def splitByDoc(textfile):
    return list(filter(lambda x: x != "\n", textfile[1].split("</doc>")))

In [70]:
spark

In [83]:
#data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/cars/*/*")
#data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/42_articles/*/*")
data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/metadaten_test1/*/*")

pagesRaw = data.flatMap(splitByDoc)
pagesTitleContent = pagesRaw.map(lambda x : (get_title(x),get_content(x)))

#RDD to DataFrame
dfPagesTitleContent = sqlContext.createDataFrame(pagesTitleContent, ["title",'content'])

regexTokenizer = RegexTokenizer(inputCol="content", outputCol="list_of_words_raw", pattern="\\W")
tokenizedWords = regexTokenizer.transform(dfPagesTitleContent)

stopWordsRemover = StopWordsRemover(inputCol="list_of_words_raw", outputCol="list_of_words")
cleanedWords = stopWordsRemover.transform(tokenizedWords)

mappedCleanedWords = cleanedWords.select("list_of_words").rdd.map(lambda x: x[0])
dictionary = corpora.Dictionary(mappedCleanedWords.collect())
corpus = mappedCleanedWords.map(lambda x: dictionary.doc2bow(x)).collect()
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=100)

print(lda_model.print_topics(num_topics=5, num_words=5))

[(0, '0.023*"wikipedia" + 0.005*"project" + 0.004*"w" + 0.004*"language" + 0.004*"articles"'), (1, '0.026*"de" + 0.024*"la" + 0.012*"d" + 0.011*"en" + 0.011*"le"'), (2, '0.013*"wikipedia" + 0.009*"page" + 0.008*"one" + 0.008*"article" + 0.007*"think"'), (3, '0.047*"gt" + 0.039*"lt" + 0.014*"br" + 0.012*"logo" + 0.011*"wikipedia"'), (4, '0.012*"wikipedia" + 0.010*"http" + 0.009*"org" + 0.008*"br" + 0.008*"lt"')]


In [72]:
cleanedPagesTitles = pagesTitleContent.map(lambda x: x[0])
cleanedPagesTitles.collect()
#cleanedPagesTitles.count()

['Audi',
 'BMW',
 'Opel',
 'Mercedes-Benz',
 'Human',
 'Nature',
 'Chair',
 'Table',
 'Plant',
 'Window',
 'Notebook',
 'BYD',
 'Volvo',
 'Å koda Auto']

In [73]:
dictionary = corpora.Dictionary(mappedCleanedWords.collect())
corpus = mappedCleanedWords.map(lambda x: dictionary.doc2bow(x)).collect()
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=100)

print(lda_model.print_topics(num_topics=5, num_words=5))

[(0, '0.020*"plants" + 0.013*"window" + 0.012*"windows" + 0.007*"plant" + 0.006*"used"'), (1, '0.015*"opel" + 0.010*"mercedes" + 0.009*"koda" + 0.008*"benz" + 0.007*"chair"'), (2, '0.009*"life" + 0.009*"earth" + 0.008*"water" + 0.007*"nature" + 0.006*"human"'), (3, '0.018*"humans" + 0.015*"human" + 0.011*"volvo" + 0.007*"years" + 0.005*"000"'), (4, '0.032*"audi" + 0.017*"bmw" + 0.007*"company" + 0.006*"cars" + 0.006*"car"')]


In [75]:
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [78]:
#article_title = "Lamborghini"
article_title = "Flower"
cleaned_article_content = clean([article_title, wikipedia.page(article_title).content])[1]
print( list( lda_model[ [dictionary.doc2bow(cleaned_article_content)] ]) )

[[(0, 0.43369463), (1, 0.15516591), (2, 0.071672365), (3, 0.21654968), (4, 0.12291738)]]
