In [1]:
import findspark
findspark.init()

In [2]:
import sys
from operator import add

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [5]:
filename = './test_data.txt'

### WordCount

In [13]:
spark = SparkSession\
        .builder\
        .master("local[*]")\
        .appName("WCount")\
        .getOrCreate()

In [14]:
sc = spark.sparkContext

In [15]:
# .filter(lambda line: len(line)>1) \

#### RDD

 > RDD - these are the elements that run and operate on multiple nodes to do parallel processing on a cluster

In [16]:
lines = spark.read.text(filename).rdd.map(lambda r: r[0]) # RDD

In [17]:
type(lines)

pyspark.rdd.PipelinedRDD

In [18]:
lines.take(2)

['Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. At ultrices mi tempus imperdiet nulla malesuada pellentesque. Vitae congue mauris rhoncus aenean vel elit. Arcu dui vivamus arcu felis bibendum ut tristique et egestas. Nunc non blandit massa enim nec dui nunc mattis. Amet facilisis magna etiam tempor orci eu lobortis elementum. Quis risus sed vulputate odio ut enim. Leo vel orci porta non pulvinar neque laoreet suspendisse. In tellus integer feugiat scelerisque. Diam sit amet nisl suscipit adipiscing bibendum est ultricies integer. Lacinia quis vel eros donec ac odio tempor. Sit amet consectetur adipiscing elit ut aliquam purus sit. Aliquet eget sit amet tellus cras. Eget est lorem ipsum dolor sit amet consectetur adipiscing elit. Ut ornare lectus sit amet est.',
 '']

In [32]:
%%time
reduced_data = lines.flatMap(lambda x: x.split(' ')) \ # flatmap returns RDD
    .map(lambda x: (x, 1)) \
     .reduceByKey(add)

Wall time: 165 ms


In [37]:
type(reduced_data)

pyspark.rdd.PipelinedRDD

In [34]:
%%time
output = reduced_data.collect()

Wall time: 751 ms


In [38]:
type(output)

list

In [39]:
output

[('Lorem', 570),
 ('ipsum', 3420),
 ('consectetur', 3990),
 ('sed', 12540),
 ('do', 570),
 ('labore', 570),
 ('magna', 3420),
 ('At', 1710),
 ('mi', 2280),
 ('nulla', 2850),
 ('malesuada', 4560),
 ('congue', 570),
 ('rhoncus', 3420),
 ('aenean', 2280),
 ('elit.', 2280),
 ('Arcu', 1710),
 ('felis', 3420),
 ('bibendum', 4560),
 ('Nunc', 2850),
 ('nunc', 4560),
 ('etiam', 1140),
 ('lobortis', 1710),
 ('elementum.', 1710),
 ('vulputate', 3420),
 ('odio', 6270),
 ('enim.', 1140),
 ('Leo', 1140),
 ('tellus', 5130),
 ('integer', 1710),
 ('feugiat', 3420),
 ('scelerisque.', 1140),
 ('Diam', 570),
 ('amet', 9690),
 ('suscipit', 1140),
 ('est', 3420),
 ('quis', 8550),
 ('Sit', 3990),
 ('purus', 5700),
 ('sit.', 3420),
 ('Aliquet', 2280),
 ('eget', 6840),
 ('ornare', 3420),
 ('lectus', 6840),
 ('', 5759),
 ('Non', 1710),
 ('interdum', 2850),
 ('Dui', 1140),
 ('Purus', 1140),
 ('faucibus', 5130),
 ('nisi', 2850),
 ('lacus', 2850),
 ('sapien', 2280),
 ('pellentesque', 8550),
 ('habitant', 570),
 ('

In [123]:
spark.stop()

## Join Method ✔

In [5]:
spark = SparkSession\
        .builder\
        .master("local[*]")\
        .appName("GroupByMat")\
        .getOrCreate()

In [6]:
sc = spark.sparkContext

In [7]:
def splitter(l):
    i,j,v = l.strip().split(' ')
    return tuple([int(i),(int(j),float(v))])

In [249]:
# working dope
# mat_rdd = sc.textFile('./small_numbers.txt').map(lambda l: splitter(l))

In [9]:
# working dope
mat_rdd = sc.textFile('./test_data_jester2.txt').map(lambda l: splitter(l))

In [8]:
# mat_rdd = sc.textFile('./test_jester2_big2.txt').map(lambda l: splitter(l))

In [30]:
mat_rdd.take(2)

[(0, (0, 0.0)), (0, (1, 0.8109999999999999))]

In [211]:
# best working
# join_results = mat_rdd\
#     .map(lambda l: (l[0],(l[1],l[2])))\
#     .join(mat_rdd2.map(lambda l: (l[0],(l[1],l[2]))))\
#     .map(lambda t: ((t[1][0][0],t[1][1][0]), t[1][0][1]*t[1][1][1]))\
#     .reduceByKey(add)

In [31]:
# best working
join_results = mat_rdd\
    .join(mat_rdd2.map(lambda l: l))\
    .map(lambda t: ((t[1][0][0],t[1][1][0]), t[1][0][1]*t[1][1][1]))\
    .reduceByKey(add)

In [32]:
%%time
collected_results = join_results.collect()

Wall time: 9min 35s


In [213]:
collected_results = sorted(collected_results)

In [216]:
collected_results[0]

((0, 0), 4508.473709000027)

In [None]:
# ((0, 0), 4508.473709000027)

In [215]:
fileName = './res_test_jester_data.txt'
with open(fileName, 'w') as f:
    for t in collected_results:
        (i,j),v = t
        f.write(f"{i} {j} {v}\n")

### Results
- (23500,100) --> (100,100)
- rows in txt files 2350000
- test_data_jester2.txt: 40MB
- Time to collect results: 1m 30s

---------------------------------------------
- (47000,100) --> (100,100)
- rows in txt file 4700000
- test_jester2_big2.txt --> 80MB
- Time to collect results: 9m 35s

## Probabilistic Method

In [40]:
import numpy as np

In [99]:
def redfn(l):
    x = np.array([x[1][1] for x in l[1]])
    return (x.sum()**2 - x.dot(x)) / 2

In [100]:
mat_rdd.take(2)

[(0, 0, 0.13383441772821036), (0, 1, 0.7468473923116297)]

In [None]:
x = (0, (0, 0.13383441772821036))
y = (0, (0, 0.13383441772821036))

v -> x[1][1] * y[1][1]
k -> (x[1][0], y[1][0])

In [101]:
def customRed(a,b):
#     if a[0]==b[0]:
    return ((a[1][0],b[1][0]),a[1][1]*b[1][0])

In [None]:
[[ind,val], [ind,val], ]

In [None]:
mat_rdd.reduceByKey()

In [None]:
# 0,0,1
# 0,1,2
# 0,2,3

In [113]:
mat_rdd.map(lambda l: (l[0],tuple(l[1:]))).reduceByKey(lambda x,y: x+y).collect()

[(0, (0, 0.13383441772821036, 1, 0.7468473923116297, 2, 0.887473518814142)),
 (2, (0, 0.6743332231395642, 1, 0.5619214520690802, 2, 0.5996560847181328)),
 (1, (0, 0.528287706368358, 1, 0.6165535215356, 2, 0.5567147819096118))]

In [92]:
# mat_rdd.map(lambda l: (l[0],(l[1][0],l[1][1]))).reduceByKey(lambda x,y: y[1]).collect()
# mat_rdd.groupByKey().mapValues(list).collect()
# mat_rdd.groupByKey().reduceByKey(lambda x: )

[(0,
  [(0, 0.13383441772821036), (1, 0.7468473923116297), (2, 0.887473518814142)]),
 (2,
  [(0, 0.6743332231395642), (1, 0.5619214520690802), (2, 0.5996560847181328)]),
 (1, [(0, 0.528287706368358), (1, 0.6165535215356), (2, 0.5567147819096118)])]

In [47]:
# mat_rdd.groupBy(lambda x: x[0]).reduceByKey(lambda l: redfn(l)).collect() # mapValues(list)

[(0, <pyspark.resultiterable.ResultIterable at 0x1e4de5667b8>),
 (2, <pyspark.resultiterable.ResultIterable at 0x1e4de5664a8>),
 (1, <pyspark.resultiterable.ResultIterable at 0x1e4de566898>)]

In [None]:
num_rows = mat_rdd.groupBy(lambda x: x[0]).count()

In [117]:
num_rows

3

In [128]:
res_arr = []

In [135]:
%%time
mat_rdd.filter(lambda x: x[0]==0).collect()

Wall time: 968 ms


[(0, 0, 0.13383441772821036),
 (0, 1, 0.7468473923116297),
 (0, 2, 0.887473518814142)]

In [141]:
xv.collect()

[(1, 0, 0.528287706368358),
 (1, 1, 0.6165535215356),
 (1, 2, 0.5567147819096118)]

In [142]:
xv.cartesian(xv).collect()

[((1, 0, 0.528287706368358), (1, 0, 0.528287706368358)),
 ((1, 0, 0.528287706368358), (1, 1, 0.6165535215356)),
 ((1, 1, 0.6165535215356), (1, 0, 0.528287706368358)),
 ((1, 1, 0.6165535215356), (1, 1, 0.6165535215356)),
 ((1, 0, 0.528287706368358), (1, 2, 0.5567147819096118)),
 ((1, 1, 0.6165535215356), (1, 2, 0.5567147819096118)),
 ((1, 2, 0.5567147819096118), (1, 0, 0.528287706368358)),
 ((1, 2, 0.5567147819096118), (1, 1, 0.6165535215356)),
 ((1, 2, 0.5567147819096118), (1, 2, 0.5567147819096118))]

In [139]:
xv = mat_rdd.filter(lambda x: x[0]==1)
tres = xv.cartesian(xv)\
        .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
        .reduceByKey(lambda x,y: x+y)

In [140]:
%%time
tres.collect()

Wall time: 8.32 s


[((1, 1), 0.3801382449179495),
 ((0, 2), 0.2941055752363894),
 ((2, 0), 0.2941055752363894),
 ((0, 1), 0.3257176457453761),
 ((1, 2), 0.34324445927729463),
 ((0, 0), 0.27908790069994044),
 ((2, 2), 0.3099313483966666),
 ((1, 0), 0.3257176457453761),
 ((2, 1), 0.34324445927729463)]

In [144]:
%%time
for i in range(num_rows):
    g = mat_rdd.filter(lambda x: x[0]==i)
    res_arr.append(g.cartesian(g)\
    .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
    .reduceByKey(lambda x,y: x+y))

Wall time: 61.8 ms


In [145]:
%%time
[x.collect() for x in res_arr]

Wall time: 41.2 s


[[((0, 2), 0.11877450163969665),
  ((2, 0), 0.11877450163969665),
  ((1, 1), 0.5577810274026813),
  ((0, 1), 0.09995388588185924),
  ((1, 2), 0.662807283271968),
  ((0, 0), 0.01791165136864911),
  ((2, 2), 0.7876092465963553),
  ((1, 0), 0.09995388588185924),
  ((2, 1), 0.662807283271968)],
 [((1, 1), 0.3801382449179495),
  ((0, 2), 0.2941055752363894),
  ((2, 0), 0.2941055752363894),
  ((0, 1), 0.3257176457453761),
  ((1, 2), 0.34324445927729463),
  ((0, 0), 0.27908790069994044),
  ((2, 2), 0.3099313483966666),
  ((1, 0), 0.3257176457453761),
  ((2, 1), 0.34324445927729463)],
 [((0, 2), 0.4043680203832301),
  ((2, 0), 0.4043680203832301),
  ((1, 1), 0.3157557182954236),
  ((0, 1), 0.378922303925007),
  ((1, 2), 0.33695961786687256),
  ((0, 0), 0.4547252958297933),
  ((2, 2), 0.35958741993948046),
  ((1, 0), 0.378922303925007),
  ((2, 1), 0.33695961786687256)],
 [((0, 2), 0.11877450163969665),
  ((2, 0), 0.11877450163969665),
  ((1, 1), 0.5577810274026813),
  ((0, 1), 0.099953885881859

## Trial

In [99]:
spark = SparkSession\
        .builder\
        .master("local[*]")\
        .appName("MultMat")\
        .getOrCreate()

In [100]:
sc = spark.sparkContext

In [101]:
def splitter(l):
    i,j,v = l.strip().split(' ')
    return tuple([int(i),int(j),float(v)])

In [102]:
splitter('0 0 0.6556')

(0, 0, 0.6556)

In [103]:
mat_rdd = sc.textFile('./sample_numbers.txt').map(lambda l: splitter(l))

In [104]:
mat_rdd.take(2)

[(0, 0, 0.7608220245799869), (0, 1, 0.6460457520796447)]

In [105]:
mat_res = mat_rdd.cartesian(mat_rdd).filter(lambda x: x[0][0] == x[1][0])\
    .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
    .reduceByKey(lambda x,y: x+y)

In [49]:
# mat_results = mat_res.collect()

In [50]:
# mat_results

In [58]:
# %%time
# mat_res.take(2)

In [42]:
# mat_res = mat_res.map(lambda x: ' '.join([str(p) for p in [x[0][0], x[0][1], x[1]]]))

In [60]:
# mat_res.take(2)

#### For a (3000,10) matrix
time collect: 1m 32s

#### For a (5000,25) matrix
collect time: Erro

In [97]:
mat_res.take(2)

[((0, 2), 750.3056621030034), ((0, 6), 748.7303530580234)]

In [106]:
%%time
mat_res.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 1.0 failed 1 times, most recent failure: Lost task 1.0 in stage 1.0 (TID 2, LAPTOP-C83P3CPF, executor driver): java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:323)
	at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:350)
	at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:803)
	at java.base/java.net.Socket$SocketInputStream.read(Socket.java:982)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:391)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:628)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.net.SocketException: Connection reset
	at java.base/sun.nio.ch.NioSocketImpl.implRead(NioSocketImpl.java:323)
	at java.base/sun.nio.ch.NioSocketImpl.read(NioSocketImpl.java:350)
	at java.base/sun.nio.ch.NioSocketImpl$1.read(NioSocketImpl.java:803)
	at java.base/java.net.Socket$SocketInputStream.read(Socket.java:982)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:391)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:628)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:488)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)


#### For a (2000,20) matrix
Time taken: to write to disk -> 2m 43s

In [59]:
# %%time
# mat_res.coalesce(1,True).saveAsTextFile("file:///D:/Sem 6/Data Science/Post Midsem/project/test-result/result.txt")

Wall time: 2min 43s


#### For a (100,10) matrix
Time taken to write to disk -> 18.2s

In [85]:
# %%time
# mat_res.saveAsTextFile("file:///D:/Sem 6/Data Science/Post Midsem/project/test-result/result.txt")

Wall time: 18.2 s


#### For a (3000, 30) matrix
Time take: to write to disk -> 

In [98]:
spark.stop()

### Work

In [158]:
spark = SparkSession\
        .builder\
        .master("local[*]")\
        .appName("MultMat")\
        .getOrCreate()

In [159]:
fn = lambda x: list(x)

In [160]:
sc = spark.sparkContext

In [None]:
# rows_rdd = spark.read.text('./jester2.txt').rdd.map(lambda l: map(float, l.strip().split(' ')))

In [162]:
rows = spark.read.text('./test_data_jester2.txt').rdd.map(lambda r: r[0])
# rows = sc.textFile('./test_data_jester2.txt')

In [163]:
type(rows)

pyspark.rdd.PipelinedRDD

In [167]:
# rows_rdd = rows.map(lambda l: map(float, l.strip().split(' ')))
rows_rdd = rows.map(lambda l: tuple(float(x) for x in l.strip().split(' ')))

In [168]:
rows_rdd.take(2)

[(0.0, 0.0, 0.0), (0.0, 1.0, 0.8109999999999999)]

In [102]:
rows_rdd.take(3)

[(0.0, 0.0, 0.0), (0.0, 1.0, 0.8109999999999999), (0.0, 2.0, 0.0)]

In [None]:
temp_arr = '0 0 1\n0 1 3\n1 0 4\n 1 1 7'

In [198]:
temp_rdd = sc.textFile('./test_data_jester2.txt').map(lambda l: tuple(float(x) for x in l.strip().split(' ')))

In [199]:
temp_rdd.take(2)

[(0.0, 0.0, 0.0), (0.0, 1.0, 0.8109999999999999)]

In [200]:
type(temp_rdd)

pyspark.rdd.PipelinedRDD

In [201]:
temp_res = temp_rdd\
    .cartesian(temp_rdd).filter(lambda x: x[0][0] == x[1][0])\
    .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
    .reduceByKey(lambda x,y: x+y)

In [203]:
%%time
temp_ress = temp_res.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 81.0 failed 1 times, most recent failure: Lost task 3.0 in stage 81.0 (TID 2150, LAPTOP-C83P3CPF, executor driver): java.net.SocketException: Connection reset by peer
	at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:420)
	at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:440)
	at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:826)
	at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1052)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:106)
	at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
	at org.apache.spark.api.python.PythonRDD$.writeUTF(PythonRDD.scala:465)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:285)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:295)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.net.SocketException: Connection reset by peer
	at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:420)
	at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:440)
	at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:826)
	at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1052)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:81)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:127)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:106)
	at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
	at org.apache.spark.api.python.PythonRDD$.writeUTF(PythonRDD.scala:465)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:285)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:295)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)


In [204]:
%%time
rdd_temp = sc.parallelize(temp_rdd.collect())

Wall time: 17.2 s


In [205]:
rdd_temp.take(2)

[(0.0, 0.0, 0.0), (0.0, 1.0, 0.8109999999999999)]

In [206]:
rdd_temp_res = rdd_temp\
    .cartesian(rdd_temp).filter(lambda x: x[0][0] == x[1][0])\
    .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
    .reduceByKey(lambda x,y: x+y)

In [None]:
rdd_tress = rdd_temp_res.collect()

In [103]:
ex_rdd = sc.parallelize([(0,0,1), (0,1, 3), (1,0, 4), (1,1,7)])

In [157]:
type(ex_rdd)

pyspark.rdd.RDD

In [85]:
vv = (((0, 0), 1), ((0, 0), 1))

In [91]:
vv[1][0]

(0, 0)

In [105]:
# sorted(ex_rdd.cartesian(ex_rdd).collect())
sorted(ex_rdd.cartesian(ex_rdd).filter(lambda x: x[0][0] == x[1][0]).collect())

[((0, 0, 1), (0, 0, 1)),
 ((0, 0, 1), (0, 1, 3)),
 ((0, 1, 3), (0, 0, 1)),
 ((0, 1, 3), (0, 1, 3)),
 ((1, 0, 4), (1, 0, 4)),
 ((1, 0, 4), (1, 1, 7)),
 ((1, 1, 7), (1, 0, 4)),
 ((1, 1, 7), (1, 1, 7))]

In [None]:
((1,2), 5), ((2,4),6)

In [117]:
ex_res = ex_rdd\
    .cartesian(ex_rdd).filter(lambda x: x[0][0] == x[1][0])\
    .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
    .reduceByKey(lambda x,y: x+y)

In [118]:
ex_res.collect()

[((1, 1), 58), ((0, 0), 17), ((1, 0), 31), ((0, 1), 31)]

In [169]:
rows_rdd.take(4)

[(0.0, 0.0, 0.0),
 (0.0, 1.0, 0.8109999999999999),
 (0.0, 2.0, 0.0),
 (0.0, 3.0, 0.0)]

In [137]:
ex_res.map(lambda xv: (xv[0][0], xv[0][1], xv[1])).saveAsTextFile('./ex_res.txt')

In [154]:
ex_res.take(2)

[((1, 1), 58), ((0, 0), 17)]

In [141]:
ress = ex_res.map(lambda xv: (xv[0][0], xv[0][1], xv[1])).collect()

In [148]:
' '.join([str(x) for x in ress[0]])

'1 1 58'

In [150]:
with open('./exres.txt', 'w') as f:
    for t in ress:
        f.write(' '.join([str(x) for x in t])+'\n')

In [138]:
read_res = sc.textFile("./ex_res.txt")

In [139]:
read_res.collect()

['(1, 1, 58)', '(0, 0, 17)', '(1, 0, 31)', '(0, 1, 31)']

In [170]:
%%time
res_rdd = rows_rdd.cartesian(rows_rdd)\
    .filter(lambda x: x [0][0] == x[1][0])\
    .map(lambda x: ((x[0][1],x[1][1]), x[0][2]*x[1][2]))\
    .reduceByKey(lambda x,y: x+y)

Wall time: 27.3 ms


In [171]:
results = res_rdd.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 68.0 failed 1 times, most recent failure: Lost task 0.0 in stage 68.0 (TID 2121, LAPTOP-C83P3CPF, executor driver): java.net.SocketException: Connection reset by peer
	at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:420)
	at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:440)
	at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:826)
	at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1052)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:123)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:106)
	at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:283)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:295)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:148)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: java.net.SocketException: Connection reset by peer
	at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:420)
	at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:440)
	at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:826)
	at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1052)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:123)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:106)
	at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:108)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:283)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:295)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:148)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)


In [16]:
# rows_rdd = rows.map(lambda l: map(float, l.strip().split(' ')))
rows_rdd = rows.map(lambda l: l.flatMap())

In [17]:
rows_rdd.take(2)

[<map at 0x1d4c8c512e8>, <map at 0x1d4c8c51358>]

In [13]:
type(rows_rdd)

pyspark.rdd.PipelinedRDD