In [1]:
from pyspark.sql import SparkSession
from scipy.stats import t, f
from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark.ml.linalg import DenseMatrix, DenseVector
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F
from pyspark.sql import Row
import numpy as np
import time

# Connect to the standalone cluster master and obtain (or reuse) the session
# that every function below accesses through the module-level name `spark`.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Test")
    .getOrCreate()
)
# Only show errors from Spark's own logging in the output.
spark.sparkContext.setLogLevel("ERROR")

def create_data(n, p, beta_true):
    """Generate a synthetic linear-regression data set as a Spark DataFrame.

    The design matrix consists of a leading column of ones (intercept)
    followed by ``p`` columns drawn with ``np.random.rand``. The response is
    ``design @ beta_true`` plus standard-normal noise scaled by 0.1. The NumPy
    global seed is reset to 42 on every call, so repeated calls with the same
    arguments produce the same data.

    Args:
        n: Number of rows to generate.
        p: Number of random predictor columns (the intercept is extra).
        beta_true: Coefficients of length ``p + 1``, intercept first.

    Returns:
        A DataFrame with two columns: ``features`` (assembled vector of
        ``x0`` .. ``x{p}``) and ``y``.
    """
    np.random.seed(42)
    predictors = np.random.rand(n, p)
    intercept = np.ones(predictors.shape[0])
    design = np.column_stack([intercept, predictors])
    noise = np.random.randn(n) * 0.1
    response = design @ beta_true + noise

    # One Row per observation, with fields y, x0, x1, ..., x{p}.
    rows = []
    for features, target in zip(design, response):
        fields = {f"x{idx}": float(value) for idx, value in enumerate(features)}
        rows.append(Row(y=float(target), **fields))
    frame = spark.createDataFrame(rows)

    # x0 is the intercept column, so p + 1 inputs go into the vector.
    input_cols = [f"x{idx}" for idx in range(p + 1)]
    to_vector = VectorAssembler(inputCols=input_cols, outputCol="features")
    assembled = to_vector.transform(frame)
    return assembled.select("features", "y")

def run_benchmark(n_list, repetitions=5):
    """Time the manual regression for several data-set sizes.

    For every ``n`` in ``n_list`` a data set is generated and fitted
    ``repetitions`` times; the mean and standard deviation of the measured
    run times are printed and recorded.

    Args:
        n_list: Iterable of row counts to benchmark.
        repetitions: Number of timed runs per row count; must be at least 1.

    Returns:
        A list with one entry ``[n, avg_time, std_time, beta]`` per row
        count, where ``beta`` is the coefficient list from the last run.

    Raises:
        ValueError: If ``repetitions`` is smaller than 1. Without at least
            one run there is no timing to average and no ``beta`` to report.
    """
    if repetitions < 1:
        raise ValueError("repetitions must be at least 1")

    results = []
    beta_true = [-8, -1.6, 4.1, -10, -9.2, 1.3, 1.6, 2.3]
    # The first coefficient is the intercept; the rest belong to predictors.
    p = len(beta_true) - 1

    for n in n_list:
        times = []
        beta = None
        for _ in range(repetitions):
            X_y_df = create_data(n, p, beta_true)
            X = X_y_df.select("features").rdd
            y = X_y_df.select("y").rdd
            beta, elapsed_time = linear_regression_manual(X, y)
            times.append(elapsed_time)

        avg_time = np.mean(times)
        std_time = np.std(times)
        results.append([n, avg_time, std_time, beta])
        print("\n Datenzeilen: ", n)
        print("Laufzeit: ", avg_time)

    return results

def matrix_outer(x):
    """Return the outer product of ``x`` with itself as a nested list.

    Entry ``[i][j]`` of the result equals ``x[i] * x[j]``.
    """
    outer = []
    for left in x:
        row = []
        for right in x:
            row.append(left * right)
        outer.append(row)
    return outer

def linear_regression_manual(X, Y):
    """Fit ordinary least squares on Spark RDDs via the normal equations.

    ``X'X`` and ``X'y`` are accumulated on the cluster, the small system is
    solved on the driver, and the usual inference statistics (standard
    errors, t/F tests, R squared) are computed so that their cost is part of
    the measured run time. Only the coefficients and the elapsed time are
    returned.

    Args:
        X: RDD whose rows hold the feature vector at index 0. The vector is
            expected to already contain the intercept column, as produced by
            ``create_data``.
        Y: RDD whose rows hold the response value at index 0. It is zipped
            with ``X``, so both must line up partition by partition.

    Returns:
        A tuple ``(beta, elapsed_time)``: the list of estimated coefficients
        and the elapsed time in seconds.

    Raises:
        numpy.linalg.LinAlgError: If ``X'X`` is singular.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # wall-clock adjustments while the job runs.
    start_time = time.perf_counter()

    X_rdd = X.map(lambda row: [float(x) for x in row[0]])
    Y_rdd = Y.map(lambda row: float(row[0]))

    n = Y.count()
    # Number of predictors, not counting the intercept column.
    k = len(X_rdd.first()) - 1
    f_residuals = n - k - 1

    # Normal equations: sum the per-row outer products / cross products.
    XtX = X_rdd.map(matrix_outer).reduce(
        lambda a, b: [
            [a_ij + b_ij for a_ij, b_ij in zip(a_row, b_row)]
            for a_row, b_row in zip(a, b)
        ]
    )
    XtY = (
        X_rdd.zip(Y_rdd)
        .map(lambda pair: [x_i * pair[1] for x_i in pair[0]])
        .reduce(lambda a, b: [a_i + b_i for a_i, b_i in zip(a, b)])
    )
    XtX_inv = np.linalg.inv(XtX)
    beta = list(XtX_inv @ np.asarray(XtY))

    beta_broadcast = spark.sparkContext.broadcast(beta)

    Y_pred_rdd = X_rdd.map(
        lambda x: sum(b_i * x_i for b_i, x_i in zip(beta_broadcast.value, x))
    )
    residuals_rdd = Y_rdd.zip(Y_pred_rdd).map(
        lambda pair: float(pair[0]) - float(pair[1])
    )

    Y_mean = Y_rdd.sum() / n
    Y_mean_broadcast = spark.sparkContext.broadcast(Y_mean)

    SSE = residuals_rdd.map(lambda res: res ** 2).sum()
    SST = Y_rdd.map(lambda y: (y - Y_mean_broadcast.value) ** 2).sum()
    # Explained sum of squares: fitted values around the response mean.
    # Computed on the cluster so the response never has to be collected
    # onto the driver.
    SSR = Y_pred_rdd.map(
        lambda y_hat: (float(y_hat) - Y_mean_broadcast.value) ** 2
    ).sum()

    sigma_hat = np.sqrt(SSE / f_residuals)
    cov_matrix = sigma_hat ** 2 * XtX_inv
    se = np.sqrt(np.diag(cov_matrix))

    # The statistics below are not returned; they are evaluated so that the
    # benchmark covers a complete regression summary. There are only k + 1
    # coefficients, so plain NumPy on the driver replaces a Spark job.
    t_values = np.asarray(beta) / se
    # sf (= 1 - cdf) keeps precision for very small p-values.
    p_values = 2 * t.sf(np.abs(t_values), df=f_residuals)

    f_statistics = (SSR / k) / (SSE / f_residuals)
    f_p_value = f.sf(f_statistics, k, f_residuals)

    r_squared = 1 - (SSE / SST)
    adjusted_r_squared = 1 - (((n - 1) / f_residuals) * (SSE / SST))

    elapsed_time = time.perf_counter() - start_time

    # All actions that read the broadcasts have finished; release the copies
    # held by the executors so repeated benchmark runs do not accumulate them.
    beta_broadcast.unpersist()
    Y_mean_broadcast.unpersist()

    return beta, elapsed_time

# Row counts to benchmark, from tiny to one million observations.
n_values = [100, 500, 1000, 5000, 10000, 50000, 100000, 200000, 500000, 1000000]

try:
    benchmark_results = run_benchmark(n_values)
finally:
    # Stop the session even when a run fails, so a crashed benchmark does not
    # leave the application registered on the cluster.
    spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/09 15:51:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                


 Datenzeilen:  100
Laufzeit:  17.89895396232605


ERROR:root:Exception while sending command.                       (0 + 27) / 27]
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=57>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe

----------------------------------------                                        
Exception occurred during processing of request from ('127.0.0.1', 47490)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumula