In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import col, expr, sqrt
import numpy as np
import math
import time

spark = SparkSession.builder.master("spark://spark-master:7077").appName("Test").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

def backward_substitution(A_dict, y):
    n = len(A_dict)
    b = np.zeros(n)

    for i in range(n - 1, -1, -1):
        row = A_dict.get(i)
        if row is None:
            raise ValueError(f"No row found in A_rdd for index {i}")
        if row[i] == 0:
            raise ValueError("Matrix A contains a zero on the diagonal; no unique solution possible.")
        
        b[i] = (y[i] - np.dot(row[i+1:], b[i+1:])) / row[i]

    return b

def gram_schmidt(X_df):
    X_rdd = X_df.rdd.cache()
    m = len(X_df.first().features)
    Q = []
    R = np.zeros((m, m))

    for j in range(m):
        v = X_rdd.map(lambda row: row.features[j]).collect()
        
        for i in range(j):
            R[i, j] = np.dot(Q[i], v)
            v -= R[i, j] * Q[i]

        R[j, j] = np.linalg.norm(v)
        Q.append(v / R[j, j])
    
    R_dict = {i: R[i, :] for i in range(m)}

    Q_df = spark.createDataFrame([Row(features=DenseVector(q)) for q in zip(*Q)])

    return Q_df, R_dict

def create_data(n, p, beta_true):
    np.random.seed(42)
    X = np.random.rand(n, p)
    X = np.column_stack([np.ones(X.shape[0]), X])
    y = X @ beta_true + np.random.randn(n) * 0.1
    data = [Row(y=float(y_i), **{f"x{i}": float(x_i) for i, x_i in enumerate(x)}) for x, y_i in zip(X, y)]
    df = spark.createDataFrame(data)
    feature_columns = [f"x{i}" for i in range(p + 1)]  # Include intercept column
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    return assembler.transform(df).select("features", "y")

def linear_regression_manual_qr(X, y):
    start_time = time.time()
    Q, R = gram_schmidt(X)
    Qt_y = np.array(Q.rdd.map(lambda row: sum(row.features[i] * y[i] for i in range(len(row.features)))).collect())
    beta = backward_substitution(R, Qt_y)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return beta, elapsed_time

def run_benchmark(n_list, repetitions=5):
    results = []
    beta_true = [-8, -1.6, 4.1, -10, -9.2, 1.3, 1.6, 2.3]
    p = 7

    for n in n_list:
        times = []
        for _ in range(repetitions):
            X_y_df = create_data(n, p, beta_true)
            X = X_y_df.select("features")
            y = X_y_df.select("y").rdd.flatMap(lambda x: x).collect()
            beta, elapsed_time = linear_regression_manual_qr(X, y)
            times.append(elapsed_time)

        avg_time = sum(times) / repetitions
        std_time = (sum((x - avg_time) ** 2 for x in times) / repetitions) ** 0.5
        results.append([n, avg_time, std_time, beta])
        print("\nData Rows:", n)
        print("Average Run Time:", avg_time)

    return results

n_values = [200000, 500000, 1000000]
benchmark_results = run_benchmark(n_values)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/11 17:24:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                


Data Rows: 200000
Average Run Time: 13.37696352005005


ERROR:root:Exception while sending command.                      (11 + 16) / 27]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=57>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 40888)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 271, in accum_updates
   