In [1]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import wikipedia
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql import SQLContext
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

def get_title(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    title = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split("\n",2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        title = 'error'
    return title

def get_content(content):
    # Remove any leading or lagging space if present 
    content = content.strip()
    actual_content = ''
    try:
        if(content != ''):
            # Split the content on the basis of new line
            arr = content.split("\n",2)
            # Second line is the title
            title = arr[1]
            # Rest is the actual content
            actual_content = arr[2]
    except:
        actual_content = 'error'
    return actual_content

def check_if_person(content):
    content = content[:150]
    list1 = re.findall(r"[\d]{1,2} [ADFJMNOS]\w* [\d]{4}", content)
    list2 = re.findall(r"[ADFJMNOS]\w* [\d]{1,2}[,] [\d]{4}", content)
    if(len(list1)>0 or len(list2)>0):
        return True
    return False

def clean(article):
    title = article[0]
    document = article[1]
    tokens = RegexpTokenizer(r'\w+').tokenize(document.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return (title, tokens_stemmed)

In [2]:
spark

In [3]:
data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/cars/*/*")
#data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/42_articles/*/*")
#data = sc.wholeTextFiles("C:/Users/Alina/Big Data/Wikipedia Exports/metadaten_test1/*/*")

pagesRaw = data.flatMap(lambda x :(x[1].split('</doc>')))
pagesTitleContent = pagesRaw.map(lambda x : (get_title(x),get_content(x)))
cleanedPages = pagesTitleContent.map(lambda x : clean(x))
article_contents = cleanedPages.map(lambda x: x[1]).zipWithIndex()
#article_contents = sc.parallelize(article_contents.take(14))
#list_of_words = article_contents.flatMap(lambda x: x).zipWithIndex()

In [12]:
cleanedPagesTitles = cleanedPages.map(lambda x: x[0])
cleanedPagesTitles.collect()
#cleanedPagesTitles.count()

['Audi',
 'BMW',
 'Opel',
 'Mercedes-Benz',
 'Human',
 'Nature',
 'Chair',
 'Table',
 'Plant',
 'Window',
 'Notebook',
 'BYD',
 'Volvo',
 'Škoda Auto',
 '']

In [13]:
df_txts  = sqlContext.createDataFrame(article_contents, ["list_of_words",'index'])

# TF
cv = CountVectorizer(inputCol="list_of_words", outputCol="features", vocabSize=5000, minDF=10.0)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)

# IDF
#idf = IDF(inputCol="raw_features", outputCol="features")
#idfModel = idf.fit(result_cv)
#result_tfidf = idfModel.transform(result_cv) 
#result_tfidf.select("features").take(1)

In [14]:
result_cv.show()

+--------------------+-----+--------------------+
|       list_of_words|index|            features|
+--------------------+-----+--------------------+
|[audi, ag, german...|    0|(75,[0,1,2,3,4,5,...|
|[bayerisch, motor...|    1|(75,[0,1,2,3,4,5,...|
|[opel, automobil,...|    2|(75,[0,1,2,3,4,5,...|
|[merced, benz, ge...|    3|(75,[0,1,2,3,4,5,...|
|[human, homo, sap...|    4|(75,[0,1,2,3,4,5,...|
|[natur, broadest,...|    5|(75,[0,1,2,3,4,5,...|
|[one, basic, piec...|    6|(75,[0,1,2,3,4,7,...|
|  [tabl, may, refer]|    7|(75,[12,67],[1.0,...|
|[plant, mainli, m...|    8|(75,[0,1,2,3,4,5,...|
|[window, open, wa...|    9|(75,[0,1,2,4,5,6,...|
|[notebook, also, ...|   10|(75,[0,1,2,3,4,10...|
|[byd, byd, may, r...|   11|(75,[12,67],[1.0,...|
|[volvo, group, le...|   12|(75,[0,1,2,3,4,5,...|
|[škoda, auto, com...|   13|(75,[0,1,2,3,4,5,...|
|                  []|   14|          (75,[],[])|
+--------------------+-----+--------------------+



In [15]:
lda = LDA(k=3, maxIter=15)
model = lda.fit(result_cv)

In [16]:
vocab = cvmodel.vocabulary
topics = model.describeTopics()
topicsRdd = topics.rdd
topics_words = topicsRdd.map(lambda row: row['termIndices']).map(lambda idx_list: [vocab[idx] for idx in idx_list]).collect()
topics_words

[['introduc',
  'becam',
  'system',
  'also',
  'differ',
  'remain',
  'small',
  'record',
  'call',
  'form'],
 ['year',
  'use',
  'also',
  'first',
  'develop',
  'two',
  'gener',
  'group',
  'includ',
  'new'],
 ['gener',
  'name',
  'first',
  'manufactur',
  '1',
  'also',
  'new',
  '2',
  'product',
  'produc']]

In [8]:
article_title = "Lamborghini"
#article_title = "Car"
article_content_test = clean([article_title, wikipedia.page(article_title).content])[1]

In [9]:
article_content_test_rdd = sc.parallelize([article_content_test]).zipWithIndex()
df_txts_test  = sqlContext.createDataFrame(article_content_test_rdd, ["list_of_words",'index'])

# TF
cv_test = CountVectorizer(inputCol="list_of_words", outputCol="features", vocabSize=50, minDF=1.0)
cvmodel_test = cv_test.fit(df_txts_test)
result_cv_test = cvmodel_test.transform(df_txts_test)

In [10]:
result_cv_test.show()

+--------------------+-----+--------------------+
|       list_of_words|index|            features|
+--------------------+-----+--------------------+
|[automobili, lamb...|    0|(50,[0,1,2,3,4,5,...|
+--------------------+-----+--------------------+



In [17]:
ll = model.logLikelihood(result_cv_test)
lp = model.logPerplexity(result_cv_test)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -3990.358669491167
The upper bound on perplexity: 4.118017202777262
