In [1]:
# Spark modules
from pyspark import SparkConf, SparkContext, SparkFiles

# Standard modules
import pickle
import time

# Third party modules
import gensim
import shapely

# Definition of the LDAHelper class.
# It contains helper methods to compute the per-topic
# distribution totals and to merge (aggregate) top-topic lists.
class LDAHelper:
    """
    Helper around a trained LDA model.

    It accumulates per-topic weights over a set of documents and
    merges the resulting weighted topic lists together.
    """

    def __init__(self, ntopics):
        """
        Input: ntopics, the number of topics to accumulate; it is also
        the maximum length of a list returned by merge_toptopic.
        """
        self.ntopics = ntopics

    def calculate_topic_distributions(self, model, documents):
        """
        Input: model, documents (set of bow)
        Output: a list of (topic_words, sum_of_distributions) tuples,
        one per topic index in range(ntopics), sorted by descending sum.
        topic_words is whatever model.show_topic(topic_index, 5) returns.
        """
        topic_totals = [0] * self.ntopics
        for document in documents:
            dist = model.get_document_topics(document, minimum_probability=0)
            # Accumulate on the topic id carried by each (topic_id, probability)
            # pair instead of on the position in the list: the returned list
            # can be sparse (topics below the model's probability floor are
            # left out), in which case position and topic id no longer match.
            for topic_id, probability in dist:
                # Topics outside the tracked range are ignored.
                if 0 <= topic_id < self.ntopics:
                    topic_totals[topic_id] += probability

        weighted_topics = [
            (model.show_topic(topic_id, 5), total)
            for topic_id, total in enumerate(topic_totals)
        ]
        return sorted(weighted_topics, key=lambda pair: pair[1], reverse=True)

    def merge_topic_lists(self, wtopics1, wtopics2):
        """
        Merge two lists of (topic, weight) tuples together.

        If the same topic is found in both lists, only one entry is
        kept, carrying the sum of the two weights. Topics that appear
        in only one of the lists are kept with their own weight.

        Output: a new list with the entries of wtopics1 first (in
        order), followed by the entries found only in wtopics2.
        """
        merged = []
        for topic1, weight_1 in wtopics1:
            new_w = weight_1
            for topic2, weight_2 in wtopics2:
                if topic1 == topic2:
                    new_w += weight_2
                    break
            merged.append((topic1, new_w))

        # Topics may be unhashable (e.g. lists of (word, weight) pairs), so
        # membership is checked with == on a plain list rather than a set.
        topics_in_first = [topic for topic, _ in wtopics1]
        for topic2, weight_2 in wtopics2:
            if topic2 not in topics_in_first:
                merged.append((topic2, weight_2))
        return merged

    def merge_toptopic(self, top_topic1, top_topic2):
        """
        Merge two weighted topic lists and keep the heaviest ones.

        Output: the merged (topic, weight) list sorted by descending
        weight and truncated to at most ntopics entries.
        """
        merged_top_topics = self.merge_topic_lists(top_topic1, top_topic2)
        merged_top_topics.sort(key=lambda pair: pair[1], reverse=True)
        return merged_top_topics[:self.ntopics]

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7,application_1501058268283_0012,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# LDA Helper
# NOTE(review): 15 is presumably the number of topics of the pickled
# LDA model loaded below -- confirm against the model.
lda_helper = LDAHelper(15)

# Names and location of the input files on Azure HDFS
new_grid_filename1 = "input_1_new_grid_part1.pkl"
new_grid_filename2 = "input_1_new_grid_part2.pkl"
lda_map_filename = "input_2_lda_map.pkl"
hdfs_base_path = "wasb:///mydata/"


def load_pickle_from_hdfs(filename):
    """
    Register hdfs_base_path + filename with the Spark context, then
    open the copy resolved by SparkFiles.get and return the unpickled
    object.
    """
    sc.addFile(hdfs_base_path + filename)
    # SECURITY: pickle.load can execute arbitrary code while loading;
    # only use it on files from a trusted location.
    with open(SparkFiles.get(filename), 'rb') as pickled_file:
        return pickle.load(pickled_file)


# The grid is stored in two parts: load the first one, then extend it
# in place with the second one.
new_grid = load_pickle_from_hdfs(new_grid_filename1)
new_grid += load_pickle_from_hdfs(new_grid_filename2)

lda_map = load_pickle_from_hdfs(lda_map_filename)


In [13]:
# init: define spark_grid as a flat list of (cell_id, corpus) tuples,
# one tuple per part of each grid cell; cell_id is the position of the
# cell inside new_grid.
spark_grid = []
for cell_id, cell in enumerate(new_grid):
    for part in cell['parts']:
        # NOTE(review): "* 10" repeats the corpus ten times, presumably to
        # inflate the workload for the timing below -- confirm; it also
        # scales every accumulated topic weight.
        spark_grid.append((cell_id, part['corpus'] * 10))

# define lda_map broadcast variable
# read-only shared variable
lda_map_broadcast = sc.broadcast(lda_map)

times = []
start_time = time.time()

# Spark Core
rdd_grid = spark.sparkContext.parallelize(spark_grid)
# Per part: compute the weighted topic list with the broadcast model.
topics = rdd_grid.map(lambda scell: (scell[0], lda_helper.calculate_topic_distributions(lda_map_broadcast.value, scell[1])))
# Per cell: merge the weighted topic lists of all its parts.
topics = topics.reduceByKey(lambda a, b: lda_helper.merge_toptopic(a, b))
topics = topics.sortByKey(True)
spark_result = topics.collect()

# Format the output: keep only the topic lists, ordered by cell_id.
# A comprehension (instead of zip(*...)[1]) also works when the
# collected result is empty.
spark_result = [cell_topics for _, cell_topics in spark_result]

times.append(time.time() - start_time)
# Single formatted string so the message prints as text (not as a tuple)
# regardless of the Python version of the kernel.
print("Done: %s seconds" % times[0])

('Done:', 153.42076802253723, ' seconds')