In [1]:
from pyspark import SparkContext, SparkConf

# Initialize Spark
conf = SparkConf().setAppName("MatrixVectorMultiplication")
sc = SparkContext(conf=conf)

# Sample data
matrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]
vector = [2, 3, 4]

# Create an RDD from the matrix
matrix_rdd = sc.parallelize(matrix)

# Broadcast the vector to all worker nodes
# Broadcasting is an optimization to efficiently share data across all nodes
broadcast_vector = sc.broadcast(vector)

# MAP OPERATION
# For each row in the matrix, multiply each element by the corresponding element in the vector
def element_wise_product(row):
    vector_value = broadcast_vector.value
    return [row[i] * vector_value[i] for i in range(len(row))]

# Apply the map function to get element-wise products
element_wise_products = matrix_rdd.map(element_wise_product)

# MAP OPERATION (to get dot products)
# Sum the products for each row to complete the matrix-vector multiplication
dot_products = element_wise_products.map(lambda row: sum(row))

# REDUCE OPERATION
# Combine all results to get a single value (e.g., sum of all dot products)
total_sum = dot_products.reduce(lambda x, y: x + y)

# Display results
print("Original Matrix:")
for row in matrix:
    print(row)

print("\nVector:", vector)

print("\nElement-wise Products:")
for row in element_wise_products.collect():
    print(row)

print("\nMatrix-Vector Multiplication Result:")
print(dot_products.collect())

print("\nSum of All Dot Products:", total_sum)



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/17 13:13:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/17 13:13:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/17 13:13:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
                                                                                

Original Matrix:
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]

Vector: [2, 3, 4]

Element-wise Products:
[2, 6, 12]
[8, 15, 24]
[14, 24, 36]

Matrix-Vector Multiplication Result:
[20, 47, 74]

Sum of All Dot Products: 141


In [2]:
dot_products.collect()

[20, 47, 74]

In [4]:
import numpy as np 
np.random.seed(1225)   

n=5 # number of features
N=1000 # number of samples
Y= np.random.normal(loc=0, scale=1, size= n).reshape(n,1)
X= np.random.normal(loc=0, scale=1, size=N * n).reshape(N, n)
I=np.arange(N).reshape(N,1)

P=X@Y
P[:5]
broadcastY = sc.broadcast(Y)
rddX=sc.parallelize(np.hstack((I,X)))
rddProduct=rddX.map(lambda x: np.dot(x[1:],broadcastY.value))
rddProduct.take(5)
def mapf(x):
    n=len(x)-1
    outm=[]
    for i in np.arange(n):
        outm.append((int(x[0]),float(x[i+1]*broadcastY.value[i])))
    return(outm)
rddProduct=rddX.map(lambda x: mapf(x))
rddProduct.take(2)

[[(0, 0.4860302674032222),
  (0, -2.9223853950644703),
  (0, -0.6073669829256055),
  (0, -0.7761487545467428),
  (0, 0.534491719201233)],
 [(1, -1.795105026731332),
  (1, 2.265246181738276),
  (1, 0.8447886373244555),
  (1, -0.31181171114138806),
  (1, -0.2362335879465642)]]

In [5]:
rddProduct.reduce( lambda x, y: (x[0],x[1:]+y[1:])).collect()

25/03/17 13:16:21 ERROR Executor: Exception in task 15.0 in stage 6.0 (TID 161)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 1922, in func
    yield reduce(f, iterator, initial)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^

	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at

	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)

Driver stacktrace:)
25/03/17 13:16:22 WARN TaskSetManager: Lost task 3.0 in stage 6.0 (TID 149) (iprodegianluca.info.ulb.ac.be executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 6.0 failed 1 times, most recent failure: Lost task 15.0 in stage 6.0 (TID 161) (iprodegianluca.info.ulb.ac.be executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 6.0 failed 1 times, most recent failure: Lost task 15.0 in stage 6.0 (TID 161) (iprodegianluca.info.ulb.ac.be executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 1922, in func
    yield reduce(f, iterator, initial)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/var/folders/nz/_89k7w_x1kgcxrnw8r7j7qyh0000gn/T/ipykernel_32197/4206972321.py", line 1, in <lambda>
TypeError: can only concatenate tuple (not "list") to tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 1922, in func
    yield reduce(f, iterator, initial)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/var/folders/nz/_89k7w_x1kgcxrnw8r7j7qyh0000gn/T/ipykernel_32197/4206972321.py", line 1, in <lambda>
TypeError: can only concatenate tuple (not "list") to tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	... 1 more


ecutor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)

Driver stacktrace:)
25/03/17 13:16:22 WARN TaskSetManager: Lost task 17.0 in stage 6.0 (TID 163) (iprodegianluca.info.ulb.ac.be executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 6.0 failed 1 times, most recent failure: Lost task 15.0 in stage 6.0 (TID 161) (iprodegianluca.info.ulb.ac.be executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_st

Driver stacktrace:)
25/03/17 13:16:22 WARN TaskSetManager: Lost task 27.0 in stage 6.0 (TID 173) (iprodegianluca.info.ulb.ac.be executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 15 in stage 6.0 failed 1 times, most recent failure: Lost task 15.0 in stage 6.0 (TID 161) (iprodegianluca.info.ulb.ac.be executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/p

In [8]:
# Initialize Spark
conf = SparkConf().setAppName("MatrixVectorMultiplication")
sc = SparkContext(conf=conf)

# Sample data
matrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]
vector = [2, 3, 4]

# Create an RDD of key-value pairs: (row_index, row_values)
indexed_matrix_rdd = sc.parallelize([(i, row) for i, row in enumerate(matrix)])

# Broadcast the vector to all worker nodes
broadcast_vector = sc.broadcast(vector)

# MAP OPERATION
# For each row, calculate element-wise products with the vector
def multiply_row_with_vector(row_tuple):
    row_idx, row = row_tuple
    vector_value = broadcast_vector.value
    
    # For each element in the row, multiply by the corresponding vector element
    # Return (row_index, [products])
    return (row_idx, [row[i] * vector_value[i] for i in range(len(row))])

# Apply the map function
element_wise_products = indexed_matrix_rdd.map(multiply_row_with_vector)

# MAP OPERATION to convert element-wise products to row sums (dot products)
# This transforms (row_index, [products]) to (row_index, sum_of_products)
dot_products = element_wise_products.mapValues(sum)

# We can also demonstrate the use of reduceByKey by restructuring our data
# First, let's explode the matrix into (i, j, value) format
matrix_elements = []
for i, row in enumerate(matrix):
    for j, value in enumerate(row):
        matrix_elements.append((i, j, value))

# Create RDD of ((row_idx, col_idx), matrix_value)
matrix_element_rdd = sc.parallelize([((i, j), value) for i, j, value in matrix_elements])

# Map each matrix element to ((row_idx, col_idx), matrix_value * vector_value[col_idx])
def multiply_with_vector_element(element):
    (row_idx, col_idx), value = element
    return (row_idx, value * broadcast_vector.value[col_idx])

# Apply the multiplication
product_elements = matrix_element_rdd.map(multiply_with_vector_element)

# REDUCE OPERATION
# Use reduceByKey to sum up all products for each row
result_by_key = product_elements.reduceByKey(lambda x, y: x + y)

# Display results
print("Original Matrix:")
for row in matrix:
    print(row)

print("\nVector:", vector)

print("\nMatrix-Vector Multiplication Result (using mapValues):")
print(dot_products.collect())

print("\nMatrix-Vector Multiplication Result (using reduceByKey):")
print(result_by_key.collect())

# Convert the result to a list in the original order
final_result = [value for _, value in sorted(result_by_key.collect())]
print("\nResult as a list:", final_result)

# Stop Spark context



25/03/17 13:19:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/17 13:19:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Original Matrix:
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]

Vector: [2, 3, 4]

Matrix-Vector Multiplication Result (using mapValues):


                                                                                

[(0, 20), (1, 47), (2, 74)]

Matrix-Vector Multiplication Result (using reduceByKey):
[(0, 20), (1, 47), (2, 74)]

Result as a list: [20, 47, 74]


In [10]:
product_elements.take(20)

[(0, 2), (0, 6), (0, 12), (1, 8), (1, 15), (1, 24), (2, 14), (2, 24), (2, 36)]

In [11]:
matrix_element_rdd.take(3)

[((0, 0), 1), ((0, 1), 2), ((0, 2), 3)]