In [1]:
# spark must be first found using findspark package
import findspark
findspark.init()

In [2]:
# establish a spark session with 50 executors
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("testing") \
        .config("spark.executor.instances", "70") \
        .config("spark.executor.memory","4g") \
        .config("spark.driver.memory","30g") \
        .config("spark.executor.cores",'1') \
        .config("spark.scheduler.mode","FIFO") \
        .getOrCreate()

In [3]:
# import the review dataset
dataset = spark.read.json('/yelp/review.json').repartition(300).cache()

In [4]:
# take a look at the dataset 
# 1) look at the schema of the data
# 2) count the number of rows
# 3) show a few rows of the dataset
# 4) use the take function to look at the actual values
dataset.printSchema()
print(dataset.count())
dataset.show(2)
dataset.take(1)

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)

4736897
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|      date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
|bn_QrUHgavXKIk_lT...|   0|2016-06-10|    0|w2dz2KDnTlH0R90zi...|    5|I needed a ride f...|     2|sD2wQrkWlQIJDJQCE...|
|WRj8rLeJpU49wqU9E...|   2|2012-11-04|    3|aoRWVXGzobfYg0lBR...|    2|Hm.
I guess I don...|     4|JjicSN4Fyyi40DH7h...|
+--------------------+----+----------+-----+--

[Row(business_id='bn_QrUHgavXKIk_lTASErQ', cool=0, date='2016-06-10', funny=0, review_id='w2dz2KDnTlH0R90ziLughQ', stars=5, text='I needed a ride from Vegas to Tehachapi. Safe ride comfortable. Wifi worked great. Orange line. 62$ on a Saturday. Driver excellent!  Would do again to get to my daughter to visit. Little pricey but NO stress. Bus only had about 15 passengers.', useful=2, user_id='sD2wQrkWlQIJDJQCE6NffQ')]

In [5]:
import time
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec
start = time.time()

# tokenize
tokenizer = Tokenizer(inputCol="text", outputCol="words")
dataset = tokenizer.transform(dataset)

# drop original text column
dataset = dataset.drop("text")

# Stop word removal
stopremove = StopWordsRemover(inputCol='words',outputCol='cleaned')
dataset = stopremove.transform(dataset)

dataset = dataset.drop('words')

#fit a word2vec model 
word2Vec = Word2Vec(vectorSize=30, minCount=5, numPartitions=300, inputCol="cleaned", outputCol="word2vec")
model = word2Vec.fit(dataset)
dataset = model.transform(dataset).drop('cleaned')

end = time.time()

In [6]:
dataset=dataset.drop('review_id').cache()

In [7]:
from pyspark.ml.feature import PCA

pca = PCA(k=3, inputCol= "word2vec",outputCol="pca_text")
pca_model = pca.fit(dataset)
result = pca_model.transform(dataset).select("pca_text")

In [8]:
print(end-start)

220.94469118118286


In [11]:
toDF(result.rdd)

AttributeError: 'SparkSession' object has no attribute 'toDF'