In [5]:
# Locate the Spark installation with the findspark package.
# This has to run before the pyspark imports in the following cells.
import findspark
findspark.init()

In [6]:
# Build (or reuse) the YARN-backed Spark session for this notebook:
# 50 executors with 6g each, a 5g driver, and FIFO job scheduling.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("yarn")
    .appName("testing")
    .config("spark.executor.instances", "50")
    .config("spark.executor.memory", "6g")
    .config("spark.driver.memory", "5g")
    .config("spark.scheduler.mode", "FIFO")
    .getOrCreate()
)

In [7]:
# Load the review JSON file and spread it across 100 partitions.
dataset = (
    spark.read
    .json('/yelp/review.json')
    .repartition(100)
)

In [8]:
# Quick inspection of the loaded reviews.

# 1) column names and types
dataset.printSchema()

# 2) total number of rows
print(dataset.count())

# 3) tabular preview of the first two rows (long values are truncated)
dataset.show(n=2)

# 4) one full Row object, so the untruncated values are visible
dataset.take(num=1)

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)

4736897
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|      date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+----------+-----+--------------------+-----+--------------------+------+--------------------+
|YRYO4dkcA26k-cmCY...|   0|2010-07-30|    0|fCXtOiW4XPHlHG-RJ...|    5|I know Booster Ju...|     0|SPhMDt2vOjENbXhdK...|
|YseUrxQw78h919da8...|   1|2015-01-02|    1|veUCW3aYX49gAdZIJ...|    3|Neat experience f...|     1|aOAqs5leMNiPSbszv...|
+--------------------+----+----------+-----+--

[Row(business_id='O9K_xqrua6sKD9xFlENymg', cool=0, date='2016-01-28', funny=0, review_id='rS6mp7mSO9tsSDRlNYdmPg', stars=5, text="Steve is a great attorney seems a little laid back but that's just him and not the way he conducts business.  I am very happy with my results,  he answers all email,  calls and text promptly and doesn't make you feel like you're bothering him.  If you need an attorney try him it's free to consult with him initially you'll be glad you did.  Glenn", useful=1, user_id='XBut-ocMiT3oyZhm7sJ19g')]

In [9]:
import time
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec

# Word2Vec hyperparameters, named so they can be tuned in one place.
# NOTE(review): minCount=0 keeps every distinct token in the vocabulary and
# numPartitions=150 is high (the Spark docs advise a small number for
# accuracy). These may be contributing to the executor loss seen during
# fit() -- confirm against the YARN / NodeManager logs before changing them.
W2V_VECTOR_SIZE = 15
W2V_MIN_COUNT = 0
W2V_NUM_PARTITIONS = 150

start = time.time()

# Each stage gets its own name instead of overwriting `dataset`, so that a
# failure part-way through (e.g. in fit) leaves `dataset` untouched and the
# cell can simply be re-run without reloading the data.

# tokenize the review text, then drop the original text column
tokenizer = Tokenizer(inputCol="text", outputCol="words")
reviews_tokenized = tokenizer.transform(dataset).drop("text")

# remove stop words, then drop the raw token column
stopremove = StopWordsRemover(inputCol='words', outputCol='cleaned')
reviews_cleaned = stopremove.transform(reviews_tokenized).drop('words')

# fit a word2vec model on the cleaned tokens
word2Vec = Word2Vec(
    vectorSize=W2V_VECTOR_SIZE,
    minCount=W2V_MIN_COUNT,
    numPartitions=W2V_NUM_PARTITIONS,
    inputCol="cleaned",
    outputCol="word2vec",
)
model = word2Vec.fit(reviews_cleaned)

# Only now rebind `dataset`: from here on it holds the review metadata plus
# the `word2vec` column. transform() and cache() are lazy, so nothing is
# materialised until the first action on `dataset`.
dataset = model.transform(reviews_cleaned).drop('cleaned').cache()

end = time.time()

# The measured time is dominated by fit(); the lazy transform above has not
# executed yet.
print("word2vec fit finished in {:.1f}s".format(end - start))



Py4JJavaError: An error occurred while calling o146.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 146 in stage 13.0 failed 4 times, most recent failure: Lost task 146.3 in stage 13.0 (TID 780, slave-3, executor 9): ExecutorLostFailure (executor 9 exited caused by one of the running tasks) Reason: Container marked as failed: container_1507441462461_0004_01_000010 on host: slave-3. Exit status: -100. Diagnostics: Container released on a *lost* node
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.mllib.feature.Word2Vec$$anonfun$doFit$1.apply$mcVI$sp(Word2Vec.scala:438)
	at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
	at org.apache.spark.mllib.feature.Word2Vec.doFit(Word2Vec.scala:358)
	at org.apache.spark.mllib.feature.Word2Vec.fit(Word2Vec.scala:319)
	at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:187)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [4]:
# Stop the Spark session once the work in this notebook is finished.
spark.stop()