In [1]:
import findspark
findspark.init()

import pyspark
import random

print('init')

sc = pyspark.SparkContext(appName="Pi")
num_samples = 10000

def inside(p):     
  x, y = random.random(), random.random()
  return x*x + y*y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()
print(sc)

pi = 4 * count / num_samples
print(pi)

sc.stop()


init
<SparkContext master=local[*] appName=Pi>
3.1492


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

spark

In [6]:
# spark is an existing SparkSession
df = spark.read.csv("../nlp/stanfordSentimentTreebank/dictionary_sm.txt")
# Displays the content of the DataFrame to stdout
df.show()

df2 = spark.read.csv("../nlp/stanfordSentimentTreebank/sentiment_labels_sm.txt")
df2.show()


+--------------------+
|                 _c0|
+--------------------+
|             text|id|
|                 !|0|
|           ! '|22935|
|          ! ''|18235|
|       ! Alas|179257|
|   ! Brilliant|22936|
| ! Brilliant !|40532|
|! Brilliant ! '|2...|
|       ! C'mon|60624|
|! Gollum 's ` per...|
|               ! Oh |
|  ! Romething|140882|
|        ! Run|179259|
|   ! The Movie|60625|
|! The camera twir...|
|! True Hollywood ...|
|        ! Wow|179261|
|     ! Zoom !|179262|
|           !?|220445|
|         !? '|220446|
+--------------------+
only showing top 20 rows

+------------+
|         _c0|
+------------+
|id|sentiment|
|       0|0.5|
|       1|0.5|
|   2|0.44444|
|       3|0.5|
|   4|0.42708|
|     5|0.375|
|   6|0.41667|
|   7|0.54167|
|   8|0.33333|
|   9|0.45833|
|  10|0.47222|
|  11|0.59722|
|  12|0.33333|
|  13|0.93056|
|  14|0.80556|
|  15|0.81944|
|  16|0.76389|
|      17|0.5|
|      18|0.5|
+------------+
only showing top 20 rows



In [18]:
df = spark.read.load("../nlp/stanfordSentimentTreebank/dictionary_sm.txt",
                     format="csv", sep="|", inferSchema="true", header="true")
df.show()


+--------------------+------+
|                text|    id|
+--------------------+------+
|                   !|     0|
|                 ! '| 22935|
|                ! ''| 18235|
|              ! Alas|179257|
|         ! Brilliant| 22936|
|       ! Brilliant !| 40532|
|     ! Brilliant ! '| 22937|
|             ! C'mon| 60624|
|! Gollum 's ` per...| 13402|
|! Oh , look at th...|179258|
|         ! Romething|140882|
|               ! Run|179259|
|         ! The Movie| 60625|
|! The camera twir...|179260|
|! True Hollywood ...|140883|
|               ! Wow|179261|
|            ! Zoom !|179262|
|                  !?|220445|
|                !? '|220446|
|                   #| 60626|
+--------------------+------+
only showing top 20 rows



In [14]:
df.printSchema()
#df.createGlobalTempView("sentences")
spark.sql("SELECT * FROM global_temp.sentences").show()


root
 |-- _c0: string (nullable = true)

+--------------------+
|                 _c0|
+--------------------+
|             text|id|
|                 !|0|
|           ! '|22935|
|          ! ''|18235|
|       ! Alas|179257|
|   ! Brilliant|22936|
| ! Brilliant !|40532|
|! Brilliant ! '|2...|
|       ! C'mon|60624|
|! Gollum 's ` per...|
|               ! Oh |
|  ! Romething|140882|
|        ! Run|179259|
|   ! The Movie|60625|
|! The camera twir...|
|! True Hollywood ...|
|        ! Wow|179261|
|     ! Zoom !|179262|
|           !?|220445|
|         !? '|220446|
+--------------------+
only showing top 20 rows



In [17]:
from pyspark.sql.types import *

sc = spark.sparkContext

# Load a text file and convert each line to a Row.
lines = sc.textFile("../nlp/stanfordSentimentTreebank/dictionary_sm.txt")
parts = lines.map(lambda l: l.split("|"))
# Each line is converted to a tuple.
wordid = parts.map(lambda p: (p[0].strip(), p[1].strip()))

# The schema is encoded in a string.
schemaString = "word id"

fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)

# Apply the schema to the RDD.
schemaWord = spark.createDataFrame(wordid, schema)

# Creates a temporary view using the DataFrame
schemaWord.createOrReplaceTempView("word")

# SQL can be run over DataFrames that have been registered as a table.
results = spark.sql("SELECT * FROM word")

results.show()


+--------------------+------+
|                word|    id|
+--------------------+------+
|                text|    id|
|                   !|     0|
|                 ! '| 22935|
|                ! ''| 18235|
|              ! Alas|179257|
|         ! Brilliant| 22936|
|       ! Brilliant !| 40532|
|     ! Brilliant ! '| 22937|
|             ! C'mon| 60624|
|! Gollum 's ` per...| 13402|
|! Oh , look at th...|179258|
|         ! Romething|140882|
|               ! Run|179259|
|         ! The Movie| 60625|
|! The camera twir...|179260|
|! True Hollywood ...|140883|
|               ! Wow|179261|
|            ! Zoom !|179262|
|                  !?|220445|
|                !? '|220446|
+--------------------+------+
only showing top 20 rows



In [27]:
#df.write.saveAsTable("saved_words")

spark.sql("show databases").show()
spark.sql("show tables").show()
spark.sql("select * from saved_words").show()

+------------+
|databaseName|
+------------+
|     default|
+------------+

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
| default|saved_words|      false|
|        |       word|       true|
+--------+-----------+-----------+

+--------------------+------+
|                text|    id|
+--------------------+------+
|                   !|     0|
|                 ! '| 22935|
|                ! ''| 18235|
|              ! Alas|179257|
|         ! Brilliant| 22936|
|       ! Brilliant !| 40532|
|     ! Brilliant ! '| 22937|
|             ! C'mon| 60624|
|! Gollum 's ` per...| 13402|
|! Oh , look at th...|179258|
|         ! Romething|140882|
|               ! Run|179259|
|         ! The Movie| 60625|
|! The camera twir...|179260|
|! True Hollywood ...|140883|
|               ! Wow|179261|
|            ! Zoom !|179262|
|                  !?|220445|
|                !? '|220446|
|                   #| 60626|
+--------------------+-

In [33]:
df.write.mode("overwrite").parquet("words.parquet")
parquetFile = spark.read.parquet("words.parquet")
parquetFile.createOrReplaceTempView("parquetFile")
words = spark.sql("SELECT * FROM parquetFile ")
words.show()

+--------------------+------+
|                text|    id|
+--------------------+------+
|                   !|     0|
|                 ! '| 22935|
|                ! ''| 18235|
|              ! Alas|179257|
|         ! Brilliant| 22936|
|       ! Brilliant !| 40532|
|     ! Brilliant ! '| 22937|
|             ! C'mon| 60624|
|! Gollum 's ` per...| 13402|
|! Oh , look at th...|179258|
|         ! Romething|140882|
|               ! Run|179259|
|         ! The Movie| 60625|
|! The camera twir...|179260|
|! True Hollywood ...|140883|
|               ! Wow|179261|
|            ! Zoom !|179262|
|                  !?|220445|
|                !? '|220446|
|                   #| 60626|
+--------------------+------+
only showing top 20 rows



In [1]:
# from pyspark import SparkContext, SparkConf
# conf = SparkConf()
# conf.setMaster('spark://192.168.1.28:7077')
# conf.setAppName('mynewapp2')
# sc2 = SparkContext(conf=conf)
# print (sc2)

# def mod(x):
#     import numpy as np
#     return (x, np.mod(x, 2))
# rdd = sc2.parallelize(range(100)).map(mod).take(10)
# rdd

<SparkContext master=spark://192.168.1.28:7077 appName=mynewapp2>


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, 192.168.1.28, executor 0): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
	at java.net.ServerSocket.implAccept(ServerSocket.java:545)
	at java.net.ServerSocket.accept(ServerSocket.java:513)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 14 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
	at java.net.ServerSocket.implAccept(ServerSocket.java:545)
	at java.net.ServerSocket.accept(ServerSocket.java:513)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
	... 14 more
