In [1]:
from __future__ import print_function

import json, codecs, os
import numpy as np
from timeit import default_timer as timer

from pyspark.ml.feature import IDF
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer

from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import collect_list

In [2]:
# --- MongoDB connection settings ---------------------------------------------
MONGO_HOST = "192.168.0.15"
MONGO_PORT = "27017"

# --- Benchmark inputs --------------------------------------------------------
# Names of the MongoDB collections to benchmark, passed to readFromCollection().
arr_collections = [
    "100", "500", "1000",
    "5000", "10000", "50000",
    "100000", "500000", "1000000",
    # "5000000", "8000000" are kept out of this list; see arr_collections_debug.
]
# The two collections excluded from the main list, kept separately.
arr_collections_debug = ["5000000", "8000000"]

# --- Mutable notebook state --------------------------------------------------
# True until the benchmark cell has discarded its first timing.
firstRun = True
# Placeholder; rebound to a Spark DataFrame by the benchmark cell.
review_df = None
# Create (or reuse) the Spark session used by every cell below. The MongoDB
# Spark connector is requested through `spark.jars.packages` so that the
# "com.mongodb.spark.sql.DefaultSource" format used by readFromCollection()
# is available to the session.
spark = (
    SparkSession.builder
    .appName("tfidf_spark")
    .master("spark://spark:7077")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

# Keep the notebook output readable: only log errors.
spark.sparkContext.setLogLevel('ERROR')

# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession. Pass the session's context explicitly and bind the context to
# this same session instead of relying on SparkSession exposing the private
# attributes SQLContext reads.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

def getCollectionReadURL(collection):
    """Build the MongoDB connection URI for one collection.

    The URI targets the `yelp_filtered_read` database on
    MONGO_HOST:MONGO_PORT and disables SSL.

    Args:
        collection: Collection name (a string) appended after the database
            name.

    Returns:
        The URI string, e.g.
        ``mongodb://<host>:<port>/yelp_filtered_read.<collection>?ssl=false``.
    """
    uri_parts = [
        "mongodb://", MONGO_HOST, ":", MONGO_PORT,
        "/yelp_filtered_read.", collection,
        "?ssl=false",
    ]
    return "".join(uri_parts)

def readFromCollection(collection, profile=False):
    """Load one MongoDB collection as a Spark DataFrame.

    Args:
        collection: Collection name, forwarded to getCollectionReadURL().
        profile: Accepted for call compatibility; not used by this function.

    Returns:
        The DataFrame produced by the MongoDB Spark connector for the
        collection's URI.
    """
    mongo_uri = getCollectionReadURL(collection)
    reader = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", mongo_uri)
    return reader.load()

In [3]:
def tokenize(review_df):
    """Return `review_df` with a `words` column derived from its `text` column.

    The new column is produced by a pyspark.ml Tokenizer configured with
    inputCol="text" and outputCol="words".
    """
    return Tokenizer(inputCol="text", outputCol="words").transform(review_df)

def vectorize(review_df):
    """Fit a CountVectorizer on `review_df` and apply it to the same frame.

    Reads the `words` column and writes the `vectorizer` column. The
    vectorizer is configured with minDF=3.

    Returns:
        `review_df` transformed by the fitted model.
    """
    cv_params = dict(inputCol='words', outputCol='vectorizer', minDF=3)
    fitted_cv = CountVectorizer(**cv_params).fit(review_df)
    return fitted_cv.transform(review_df)

def idf(review_df):
    """Fit an IDF model on `review_df` and apply it to the same frame.

    Reads the `vectorizer` column and writes the `tfidf_features` column.

    Returns:
        `review_df` transformed by the fitted model.
    """
    estimator = IDF(inputCol="vectorizer", outputCol="tfidf_features")
    idf_model = estimator.fit(review_df)
    return idf_model.transform(review_df)

def process(review_df):
    """Run the TF-IDF pipeline over reviews grouped by business.

    Steps, in order:
      1. Group rows by `business_id` and collect each group's `text` values.
      2. Join the collected texts into one space-separated string per business.
      3. tokenize() -> vectorize() -> idf().

    Returns:
        The grouped DataFrame after all three feature stages have been applied.
    """
    per_business = (
        review_df
        .groupBy("business_id")
        .agg(collect_list('text').alias("text"))
        .withColumn("text", concat_ws(" ", col("text")))
    )
    return idf(vectorize(tokenize(per_business)))

In [4]:
# Benchmark: for every collection, time process() N_RUNS times and store the
# mean wall-clock duration (seconds, rounded to 3 decimals) under the
# collection name in arr_collection_timings.
#
# NOTE(review): process() ends with a transform(), which Spark evaluates
# lazily, so the measured time appears to cover the fit() jobs but not the
# materialisation of `tfidf_features` -- confirm this is the intended
# measurement.
N_RUNS = 30  # timed repetitions per collection

arr_collection_timings = {}
# Iterate the main collection list: it is the one the recorded results
# ('100' ... '1000000') correspond to. arr_collections_debug holds only the
# two largest collections.
for collection in arr_collections:
    review_df = readFromCollection(collection)
    count = review_df.count()

    # One-off warm-up: the very first process() call of the session is timed
    # and reported but excluded from the statistics.
    if firstRun:
        starttime = timer()
        grouped_df = process(review_df)
        endtime = timer()
        print("Ignored: " + str(endtime-starttime))
        firstRun = False

    arr_timings = []
    for i in range(N_RUNS):
        starttime = timer()
        grouped_df = process(review_df)
        endtime = timer()

        elapsed = endtime - starttime
        arr_timings.append(elapsed)
        print("["+"{:02d}".format(i+1)+"] "+ str(count) + ": " + str(round(elapsed, 3)) + " segundos")

    arr_collection_timings[collection] = round(np.mean(arr_timings), 3)

Ignored: 10.537348199999997
[01] 100: 2.615 segundos
[02] 100: 2.034 segundos
[03] 100: 1.665 segundos
[04] 100: 1.539 segundos
[05] 100: 1.74 segundos
[06] 100: 1.44 segundos
[07] 100: 1.416 segundos
[08] 100: 1.782 segundos
[09] 100: 1.303 segundos
[10] 100: 1.542 segundos
[11] 100: 1.409 segundos
[12] 100: 1.223 segundos
[13] 100: 1.249 segundos
[14] 100: 1.148 segundos
[15] 100: 1.247 segundos
[16] 100: 1.238 segundos
[17] 100: 1.148 segundos
[18] 100: 1.197 segundos
[19] 100: 1.136 segundos
[20] 100: 1.065 segundos
[21] 100: 1.106 segundos
[22] 100: 1.063 segundos
[23] 100: 1.029 segundos
[24] 100: 1.112 segundos
[25] 100: 1.143 segundos
[26] 100: 1.183 segundos
[27] 100: 1.052 segundos
[28] 100: 1.111 segundos
[29] 100: 1.038 segundos
[30] 100: 1.125 segundos
[01] 500: 2.105 segundos
[02] 500: 1.925 segundos
[03] 500: 1.781 segundos
[04] 500: 1.775 segundos
[05] 500: 1.797 segundos
[06] 500: 1.875 segundos
[07] 500: 1.805 segundos
[08] 500: 1.804 segundos
[09] 500: 1.827 segundos

In [5]:
# Preview the most recently loaded collection: first 20 rows, long cell values
# truncated (the defaults of DataFrame.show, spelled out).
review_df.show(n=20, truncate=True)

+--------------------+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|                 _id|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|[60120699d445e70d...|-MhfebM0QIsKt87iD...|   0|2015-04-15 05:21:16|    0|xQY8N_XvtGbearJ5X...|    2|As someone who ha...|     5|OwjRMXRC0KyPrIlcj...|
|[60120699d445e70d...|lbrU8StCq3yDfr-QM...|   0|2013-12-07 03:16:52|    1|UmFMZ8PyXZTY2Qcwz...|    1|I am actually hor...|     1|nIJD_7ZXHq-FX8byP...|
|[60120699d445e70d...|HQl28KMwrEKHqhFrr...|   0|2015-12-05 03:18:11|    0|LG2ZaYiOgpr2DK_90...|    5|I love Deagan's. ...|     1|V34qejxNsCbcgD8C0...|
|[60120699d445e70d...|5JxlZaqCnk1MnbgRi...|   0|2011-05-27 05:30:52|    0|i6g_oA9Yf9Y31qt0w...

In [6]:
# Bellagio Gallery of Fine Art
#company_df1 = review_df[review_df['business_id'] == "-MhfebM0QIsKt87iDN-FNw"]

#The Empanadas House
#company_df2 = review_df[review_df['business_id'] == "pQeaRpvuhoEqudo3uymHIQ"]

In [7]:
#company_df1.select("vectorizer").collect()

In [8]:
#company_df2.select("vectorizer").collect()

In [9]:
# Display the benchmark summary: mean process() time per collection, in
# seconds rounded to 3 decimals, keyed by collection name.
arr_collection_timings

{'100': 1.337,
 '500': 1.76,
 '1000': 2.072,
 '5000': 3.198,
 '10000': 4.088,
 '50000': 9.33,
 '100000': 14.823,
 '500000': 47.715,
 '1000000': 91.201}