## Gruppe 3 - Big Data WS2021

## Anwendungsfall: Topic Modeling mit LDA bei Wikipedia-Daten

### Problemstellungen:
- Warum ist Topic Modeling mit Wikipedia-Artikeln ein Big Data-Anwendungsfall?
- Welche Möglichkeiten gibt es zur Umsetzung?
- Wie wird beim Programm "Skalierung", "Parallelisierung" und "Fehlertoleranz" angegangen?
- Wo genau finden sich diese Eigenschaften?

---

In [1]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import wikipedia
import gensim
from gensim import corpora, models
import pyLDAvis.gensim
from pyspark.ml.feature import CountVectorizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
import time
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

#LocalMode
spark = SparkSession.builder.getOrCreate()

#StandaloneMode
#spark = SparkSession.builder.master('spark://localhost:7077').getOrCreate()

sc = spark.sparkContext
sqlContext = SQLContext(sc)

def get_title(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    title = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split('\n', 2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        title = 'error'
    return title

def get_content(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    actual_content = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split('\n', 2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        actual_content = 'error'
    return actual_content

def clean(article):
    title = article[0]
    document = article[1]
    tokens = RegexpTokenizer(r'\w+').tokenize(document.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return (title, tokens_stemmed)

def splitByDoc(textfile):
    return list(filter(lambda x: x != '\n', textfile[1].split('</doc>')))

In [2]:
spark

In [4]:
start = time.time()

data = sc.wholeTextFiles('C:/Users/Alina/Big Data/Wikipedia Exports/all_articles_2mb/*/*')
pagesRaw = data.flatMap(splitByDoc)
pagesTitleContent = pagesRaw.map(lambda x : (get_title(x),get_content(x))).filter(lambda x: x[0] != 'error' and x[0] != '').filter(lambda x: x[0] != 'error' and x[0] != '')

#RDD to DataFrame
dfPagesTitleContent = sqlContext.createDataFrame(pagesTitleContent, ['title','content'])

regexTokenizer = RegexTokenizer(inputCol='content', outputCol='list_of_words_raw', pattern='\\W', minTokenLength=4)
tokenizedWords = regexTokenizer.transform(dfPagesTitleContent)

stopWordsRemover = StopWordsRemover(inputCol='list_of_words_raw', outputCol='list_of_words')
stopwordsSpark = stopWordsRemover.getStopWords()
stopwordsSpark.extend(['also'])
stopWordsRemover.setStopWords(stopwordsSpark)
cleanedWords = stopWordsRemover.transform(tokenizedWords)

mappedCleanedWords = cleanedWords.select('list_of_words').rdd.map(lambda x: x[0])
dictionary = corpora.Dictionary(mappedCleanedWords.collect())
corpus = mappedCleanedWords.map(lambda x: dictionary.doc2bow(x)).collect()
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=15)

end = time.time()
print(end - start)

20.139456748962402
[(14, '0.014*"alabama" + 0.011*"state" + 0.008*"asia" + 0.005*"abacus" + 0.004*"agriculture"'), (3, '0.016*"acid" + 0.016*"alkanes" + 0.014*"algae" + 0.010*"carbon" + 0.008*"acids"'), (1, '0.011*"species" + 0.011*"amphibians" + 0.008*"frogs" + 0.008*"salamanders" + 0.008*"water"'), (12, '0.011*"space" + 0.011*"states" + 0.008*"congress" + 0.008*"first" + 0.007*"articles"'), (0, '0.032*"animation" + 0.014*"animated" + 0.009*"film" + 0.007*"films" + 0.006*"disney"')]


In [72]:
cleanedPagesTitles = pagesTitleContent.map(lambda x: x[0])
cleanedPagesTitles.take(20)
#cleanedPagesTitles.count()

['Audi',
 'BMW',
 'Opel',
 'Mercedes-Benz',
 'Human',
 'Nature',
 'Chair',
 'Table',
 'Plant',
 'Window',
 'Notebook',
 'BYD',
 'Volvo',
 'Škoda Auto']

In [5]:
print(lda_model.print_topics(num_topics=20, num_words=5))

[(0, '0.032*"animation" + 0.014*"animated" + 0.009*"film" + 0.007*"films" + 0.006*"disney"'), (1, '0.011*"species" + 0.011*"amphibians" + 0.008*"frogs" + 0.008*"salamanders" + 0.008*"water"'), (2, '0.016*"huxley" + 0.010*"ambiguity" + 0.009*"category" + 0.006*"ambiguous" + 0.004*"aardwolf"'), (3, '0.016*"acid" + 0.016*"alkanes" + 0.014*"algae" + 0.010*"carbon" + 0.008*"acids"'), (4, '0.019*"court" + 0.018*"autism" + 0.010*"appeal" + 0.006*"appellate" + 0.006*"children"'), (5, '0.018*"atomic" + 0.017*"standards" + 0.016*"number" + 0.009*"elements" + 0.008*"element"'), (6, '0.040*"achilles" + 0.006*"island" + 0.006*"greek" + 0.005*"patroclus" + 0.005*"hector"'), (7, '0.019*"algeria" + 0.010*"algerian" + 0.006*"french" + 0.004*"country" + 0.004*"government"'), (8, '0.009*"aristotle" + 0.008*"andorra" + 0.006*"used" + 0.004*"ascii" + 0.004*"first"'), (9, '0.011*"crew" + 0.011*"apollo" + 0.010*"moon" + 0.010*"lunar" + 0.009*"anarchism"'), (10, '0.042*"apollo" + 0.009*"albedo" + 0.006*"greek

In [6]:
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [7]:
article_title = 'Lamborghini'
cleaned_article_content = clean([article_title, wikipedia.page(article_title).content])[1]
print(list(lda_model[[dictionary.doc2bow(cleaned_article_content)]]))

[[(4, 0.011865782), (8, 0.18252897), (9, 0.06576394), (10, 0.018534407), (11, 0.015645297), (12, 0.04536496), (13, 0.13573109), (14, 0.12905116), (15, 0.11341942), (16, 0.18709102), (18, 0.08424152)]]
