In [None]:
from __future__ import print_function

import os
import json
import codecs
import cProfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_blobs

from pyspark import SparkContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

%matplotlib inline

In [2]:
# MongoDB Spark connector URIs, registered below as the session's input and
# output locations.
MONGO_URL_INPUT = "mongodb://192.168.0.20:27017/yelp.review?ssl=false"
MONGO_URL_OUTPUT = "mongodb://192.168.0.20:27017/yelp.teste"

# Build (or reuse) the session against the standalone master and pull in the
# MongoDB connector package.
spark = (
    SparkSession.builder
    .appName("kmeans-spark")
    .master("spark://spark:7077")
    .config("spark.mongodb.input.uri", MONGO_URL_INPUT)
    .config("spark.mongodb.output.uri", MONGO_URL_OUTPUT)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.0')
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession: pass the session's context and bind the session explicitly.
sqlContext = SQLContext(spark.sparkContext, spark)

In [None]:
def getDataFrame():
    """Load a DataFrame through the MongoDB Spark connector.

    No explicit options are passed to the reader, so the source is whatever
    the `spark` session was configured with.

    Returns:
        The DataFrame produced by the connector's `load()` call.
    """
    return spark.read.format("com.mongodb.spark.sql.DefaultSource").load()


# Profile.runcall() profiles the call and hands back its return value;
# cProfile.run('getDataFrame()') would discard it and leave `df` undefined.
profiler = cProfile.Profile()
df = profiler.runcall(getDataFrame)
profiler.print_stats()
df.head()

### Spark's `withColumn()` is a DataFrame method used to add a new column to a DataFrame, change the value of an existing column, convert the data type of a column, or derive a new column from an existing one.

In [None]:
# Columns fed to the clustering step; every other column is discarded.
# NOTE(review): several of these names (review_count, fans, average_stars)
# look like Yelp *user* fields while the configured input is yelp.review --
# confirm they exist in the loaded collection.
FEATURES_COL = ['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']

# One select() projection does the same job as calling withColumn(name, cast)
# for each feature and drop(name) for everything else, without stacking a new
# projection per column. Column order follows df.columns, as before.
feature_casts = [
    df[name].cast('float').alias(name)
    for name in df.columns
    if name in FEATURES_COL
]
df = df.select(*feature_casts)

# Discard rows that have a null in any remaining feature column.
df = df.na.drop()
df.head()

### Spark's implementation of KMeans is a bit different from, for example, scikit-learn's version. We need to store all features as an array of floats, and store this array as a column called "features". Since we no longer need the original columns, we filter them out with a select statement.

In [None]:
# Pack the individual feature columns into one vector column named "features".
vecAssembler = VectorAssembler(
    outputCol="features",
    inputCols=FEATURES_COL,
)
df_kmeans = vecAssembler.transform(df)

# Print a preview of the assembled frame.
df_kmeans.show()

In [None]:
# Number of clusters to fit.
k = 10

# Fixed seed so repeated runs start from the same initialisation.
kmeans = KMeans(k=k, seed=1, featuresCol="features")
model = kmeans.fit(df_kmeans)

# One centre per fitted cluster.
centers = model.clusterCenters()

In [None]:
# Apply the fitted model to the assembled frame.
transformed = model.transform(df_kmeans)

# NOTE(review): collect() brings every row back to the driver; kept because
# the next cell consumes `rows`.
rows = transformed.collect()

# Peek at the first three collected rows.
print(rows[0:3])

In [None]:
# Rebuild a DataFrame from the collected rows through the SparkSession;
# SQLContext is a deprecated entry point that only forwards to the session.
df_pred = spark.createDataFrame(rows)
df_pred.show()

In [None]:
# Convert the Spark DataFrame to pandas for plotting, then preview it.
pddf_pred = df_pred.toPandas()
pddf_pred.head(5)

In [None]:
# Figure.gca() no longer accepts keyword arguments on current Matplotlib
# (deprecated in 3.4), so the 3D axes is created with add_subplot instead.
fig = plt.figure(figsize=(12, 10))
threedee = fig.add_subplot(111, projection='3d')

# One point per row, coloured by the `prediction` column.
threedee.scatter(pddf_pred.average_stars, pddf_pred.cool, pddf_pred.fans, c=pddf_pred.prediction)

# Label each axis with the column it actually shows.
threedee.set_xlabel('average_stars')
threedee.set_ylabel('cool')
threedee.set_zlabel('fans')
plt.show()