In [1]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from gensim.test.utils import datapath, get_tmpfile
from gensim.corpora import WikiCorpus, MmCorpus
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
import wikipedia
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

def get_title(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    title = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split("\n",2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        title = 'error'
    return title

def get_content(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    actual_content = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split("\n",2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        actual_content = 'error'
    return actual_content

def check_if_person(content):
    content = content[:150]
    list1 = re.findall(r"[\d]{1,2} [ADFJMNOS]\w* [\d]{4}", content)
    list2 = re.findall(r"[ADFJMNOS]\w* [\d]{1,2}[,] [\d]{4}", content)
    if(len(list1)>0 or len(list2)>0):
        return True
    return False

def clean(article):
    title = article[0]
    document = article[1]
    tokens = RegexpTokenizer(r'\w+').tokenize(document.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return (title, tokens_stemmed)

In [2]:
spark

In [5]:
data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/cars/*/*")
#data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/42_articles/*/*")
#data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/metadaten_test1/*/*")
pagesRaw = data.flatMap(lambda x :(x[1].split('</doc>')))
#TODO:  pagesTitleContent and cleanedPages in one step
pagesTitleContent = pagesRaw.map(lambda x : (get_title(x),get_content(x)))
cleanedPages = pagesTitleContent.map(lambda x : clean(x))
article_contents = cleanedPages.map(lambda x: x[1])
dictionary = corpora.Dictionary(article_contents.collect())
corpus = article_contents.map(lambda x: dictionary.doc2bow(x)).collect()
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word = dictionary, passes=100)
print(lda_model.print_topics(num_topics=5, num_words=5))

[(0, '0.029*"bmw" + 0.019*"merced" + 0.015*"benz" + 0.012*"car" + 0.011*"model"'), (1, '0.028*"human" + 0.022*"window" + 0.007*"use" + 0.006*"popul" + 0.006*"year"'), (2, '0.022*"audi" + 0.012*"car" + 0.012*"opel" + 0.010*"chair" + 0.008*"model"'), (3, '0.013*"page" + 0.012*"notebook" + 0.007*"paper" + 0.007*"pad" + 0.006*"bound"'), (4, '0.030*"plant" + 0.008*"natur" + 0.007*"water" + 0.007*"life" + 0.006*"earth"')]


In [8]:
article_title = "Lamborghini"
#article_title = "Car"
cleaned_article_content = clean([article_title, wikipedia.page(article_title).content])[1]
print( list( lda_model[ [dictionary.doc2bow(cleaned_article_content)] ]) )

[[(0, 0.19287233), (1, 0.010782254), (2, 0.76519686), (4, 0.026896305)]]


In [9]:
cleanedPagesTitles = cleanedPages.map(lambda x: x[0])
cleanedPagesTitles.collect()
#cleanedPagesTitles.count()

['Audi',
 'BMW',
 'Opel',
 'Mercedes-Benz',
 'Human',
 'Nature',
 'Chair',
 'Table',
 'Plant',
 'Window',
 'Notebook',
 'BYD',
 'Volvo',
 'Å koda Auto',
 '']

In [7]:
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)