This document follows through examples from the spark website.

In [2]:
import pandas

In [9]:
from pyspark.ml.feature import CountVectorizer

In [3]:
# $example on$
from pyspark.ml.clustering import LDA
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("LDAExample") \
        .getOrCreate()

In [7]:
    # $example on$
   # Loads data.
dataset = spark.read.format("libsvm").load("G:/Users/Gabriel/Documents/Education/UoB/GitHubDesktop/DST-Assessment-05/Gabriel Grant/data/sample_lda_libsvm_data.txt")

In [8]:
pdf = dataset.toPandas()

In [9]:
pdf

Unnamed: 0,label,features
0,0.0,"(1.0, 2.0, 6.0, 0.0, 2.0, 3.0, 1.0, 1.0, 0.0, ..."
1,1.0,"(1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 2.0, 0.0, ..."
2,2.0,"(1.0, 4.0, 1.0, 0.0, 0.0, 4.0, 9.0, 0.0, 1.0, ..."
3,3.0,"(2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, ..."
4,4.0,"(3.0, 1.0, 1.0, 9.0, 3.0, 0.0, 2.0, 0.0, 0.0, ..."
5,5.0,"(4.0, 2.0, 0.0, 3.0, 4.0, 5.0, 1.0, 1.0, 1.0, ..."
6,6.0,"(2.0, 1.0, 0.0, 3.0, 0.0, 0.0, 5.0, 0.0, 2.0, ..."
7,7.0,"(1.0, 1.0, 1.0, 9.0, 2.0, 1.0, 2.0, 0.0, 0.0, ..."
8,8.0,"(4.0, 4.0, 0.0, 3.0, 4.0, 2.0, 1.0, 3.0, 0.0, ..."
9,9.0,"(2.0, 8.0, 2.0, 0.0, 3.0, 0.0, 2.0, 0.0, 2.0, ..."


In [6]:
print(type(dataset))
print(type(pdf))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [39]:
pdf.iloc[0,1]

SparseVector(11, {0: 1.0, 1: 2.0, 2: 6.0, 4: 2.0, 5: 3.0, 6: 1.0, 7: 1.0, 10: 3.0})

In [10]:
# Trains a LDA model.
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)

ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
transformed = model.transform(dataset)
transformed.show(truncate=False)
# $example off$

spark.stop()

The lower bound on the log likelihood of the entire corpus: -825.496506446713
The upper bound on perplexity: 3.1749865632565886
The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[6, 9, 7]  |[0.10767123632203073, 0.10199236986356748, 0.09654523704941002]|
|1    |[4, 6, 1]  |[0.10598168657220601, 0.09940520744036675, 0.09583399488877938]|
|2    |[6, 0, 3]  |[0.12208807663421029, 0.0994020256050733, 0.09443625719697706] |
|3    |[10, 4, 2] |[0.10443364261791666, 0.09949740727419742, 0.09934688732133116]|
|4    |[1, 7, 6]  |[0.10903533893585092, 0.105973084186274, 0.0972892636143374]   |
|5    |[1, 9, 4]  |[0.18840634223007793, 0.17618877869602934, 0.0920274024386615] |
|6    |[5, 4, 0]  |[0.15881607973368447, 0.14514785705282326, 0.13

In [16]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

In [17]:
sc = SparkContext(appName="LatentDirichletAllocationExample")

In [19]:
data = sc.textFile("data/sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

In [5]:
df = spark.createDataFrame(
   [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
   ["label", "raw"])

In [7]:
df.collect()

[Row(label=0, raw=['a', 'b', 'c']),
 Row(label=1, raw=['a', 'b', 'b', 'c', 'a'])]

In [10]:
cv = CountVectorizer()

In [11]:
cv.setInputCol("raw")

CountVectorizer_330c7f275071

In [13]:
cv.setOutputCol("vectors")

CountVectorizer_330c7f275071

In [14]:
model = cv.fit(df)

In [15]:
model.setInputCol("raw")

CountVectorizerModel: uid=CountVectorizer_330c7f275071, vocabularySize=3

In [16]:
model.transform(df).show(truncate=False)

+-----+---------------+-------------------------+
|label|raw            |vectors                  |
+-----+---------------+-------------------------+
|0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+-----+---------------+-------------------------+

