In [15]:
from __future__ import print_function

import os
import json
import codecs
import numpy as np
import pandas as pd
import textblob as tb
import cProfile, pstats, sys
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from pyspark import SparkContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import VectorAssembler

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

%matplotlib inline

In [2]:
MONGO_URL_INPUT = "mongodb://192.168.0.15:27017/yelp.review?ssl=false"
MONGO_URL_OUTPUT = "mongodb://192.168.0.15:27017/yelp.teste"

review_df = None
profiler = cProfile.Profile()
spark = SparkSession.builder \
    .appName("kmeans-spark") \
    .master("spark://spark:7077") \
    .config("spark.mongodb.input.uri", MONGO_URL_INPUT) \
    .config("spark.mongodb.output.uri", MONGO_URL_OUTPUT) \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.0')\
    .getOrCreate()
sqlContext = SQLContext(spark)

def profile():
    profiler.disable()
    ps = pstats.Stats(profiler, stream=sys.stdout)
    ps.print_stats()

In [22]:
def readDataset():
    return spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

def tokenize(review_df):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    return tokenizer.transform(review_df)

def vectorize(review_df):
    vectorizer = CountVectorizer(inputCol='words', outputCol='vectorizer').fit(review_df)
    return vectorizer.transform(review_df)

def idf(review_df):
    idf = IDF(inputCol="vectorizer", outputCol="tfidf_features")
    idf_model = idf.fit(review_df)
    return idf_model.transform(review_df)

def polarity(text):
    blob = tb.TextBlob(text)
    return blob.sentiment.polarity

def subjectivity(text):
    blob = tb.TextBlob(text)
    return blob.sentiment.subjectivity

def get_pol_sub():
    review_df['polarity'] = review_df['text'].apply(polarity)
    review_df['subjectivity'] = review_df['text'].apply(subjectivity)
    
    print('\nMean Polarity: ' + str(review_df['polarity'].mean())\
          + '\nMean Subjectivity: ' + str(review_df['subjectivity'].mean()))

In [5]:
# A tokenizer that converts the input string to lowercase and then splits it by white spaces.
profiler.enable()

review_df = readDataset()
review_df = review_df[review_df['business_id'] == 'HQl28KMwrEKHqhFrrDqVNQ']

review_df = tokenize(review_df)
review_df = vectorize(review_df)
review_df = idf(review_df)

profile()

         22424 function calls (22048 primitive calls) in 84.054 seconds

   Random listing order was used

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 {method 'get' of 'mappingproxy' objects}
      247    0.002    0.000   84.021    0.340 {method 'readline' of '_io.BufferedReader' objects}
      247    0.000    0.000    0.000    0.000 {method '_checkClosed' of '_io._IOBase' objects}
      247    0.000    0.000    0.000    0.000 {method '_checkReadable' of '_io._IOBase' objects}
      247    0.000    0.000    0.000    0.000 {method 'append' of 'collections.deque' objects}
      247    0.000    0.000    0.000    0.000 {method 'pop' of 'collections.deque' objects}
        5    0.000    0.000    0.000    0.000 {built-in method posix.urandom}
        6    0.000    0.000    0.000    0.000 {built-in method builtins.compile}
       10    0.001    0.000    0.001    0.000 {built-in method builtins.dir}
        6    0.000   

      247    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:973(_get_connection)
      247    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:988(_give_back_connection)
      247    0.001    0.000   84.037    0.340 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:1010(send_command)
      247    0.002    0.000   84.036    0.340 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:1178(send_command)
      123    0.000    0.000    0.000    0.000 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:1221(__init__)
   116/99    0.000    0.000    0.009    0.000 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:1240(_get_args)
      116    0.000    0.000    0.007    0.000 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:1266(<listcomp>)
   116/99    0.000    0.000    0.016    0.000 /opt/conda/lib/python3.6/site-packages/py4j/java_gateway.py:1258(_build_args)


In [25]:
kmeans = KMeans().setK(3).setFeaturesCol("tfidf_features")
model = kmeans.fit(review_df)
predictions = model.transform(review_df)

In [18]:
review_df.count()

453