In [21]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that talks to the standalone master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Test")
    .getOrCreate()
)

# Register the zipped numpy archive as a Python dependency of this SparkContext.
spark.sparkContext.addPyFile('numpy.zip')

In [22]:
import requests
import csv
from io import StringIO

# Direct-download link of the CSV data set.
url = 'https://drive.usercontent.google.com/download?id=1z_AfT9UYwG7XqExz95MZOr_Ix4jDUXFw&export=download'

# Fail fast: the timeout keeps the cell from hanging indefinitely, and
# raise_for_status() prevents an HTTP error page from being parsed as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

csv_data = response.text
csv_reader = csv.reader(StringIO(csv_data))

# The first line holds the column names; every following line is one record.
header = next(csv_reader)
# csv.reader yields an empty list for blank lines; skip those so every tuple
# has the same length as the header.
rows = [tuple(row) for row in csv_reader if row]

# Build the Spark DataFrame from the rows parsed on the driver.
# csv.reader returns every field as a string, so values are still text here;
# the regression cells convert them to float themselves.
df = spark.createDataFrame(rows, schema=header)

# Show the DataFrame
df.show()

In [23]:
from scipy.stats import t, f
from pyspark.ml.linalg import DenseMatrix, DenseVector
from pyspark.sql import functions as F
from pyspark.sql import Row
import numpy as np

def matrix_outer(x):
    """Return the outer product of ``x`` with itself as a nested list.

    Entry ``[i][j]`` of the result equals ``x[i] * x[j]``.
    """
    outer = []
    for left in x:
        outer.append([left * right for right in x])
    return outer

# Distributed ordinary least squares via the normal equations.
# The first column of `df` is the response, all remaining columns are
# regressors. The cluster accumulates X'X and X'y; only those small results
# are brought back to the driver and inverted there.

# NOTE(review): X_df is not referenced again in this cell; kept in case
# another cell relies on it.
X_df = df.withColumn("intercept", F.lit(1))

n = df.count()
k = len(df.columns) - 1

# Build features and response in ONE pass so that each feature vector stays
# paired with its own response value. (RDD.zip only works when both sides have
# identical partitioning and per-partition counts.) Cached because several
# actions below traverse the same data.
xy_rdd = df.rdd.map(
    lambda row: ([1.0] + [float(v) for v in row[1:]], float(row[0]))
).cache()
X_rdd = xy_rdd.keys()    # [1.0, x_1, ..., x_k] per observation
Y_rdd = xy_rdd.values()  # response per observation

# X'X: sum of the per-row outer products.
XtX = X_rdd.map(matrix_outer).reduce(
    lambda a, b: [
        [a_ij + b_ij for a_ij, b_ij in zip(a_row, b_row)]
        for a_row, b_row in zip(a, b)
    ]
)
# X'y: sum of each feature vector scaled by its response.
XtY = xy_rdd.map(lambda xy: [x_i * xy[1] for x_i in xy[0]]).reduce(
    lambda a, b: [a_i + b_i for a_i, b_i in zip(a, b)]
)
XtX_inv = np.linalg.inv(XtX)
# beta = (X'X)^-1 X'y, kept as a plain list so it can be indexed per element.
beta = list(np.dot(XtX_inv, XtY))

# Ship the coefficients to the executors once instead of with every task.
beta_broadcast = spark.sparkContext.broadcast(beta)

Y_pred_rdd = X_rdd.map(
    lambda x: sum(b_i * x_i for b_i, x_i in zip(beta_broadcast.value, x))
)
# Residual = observed - fitted, computed from the paired RDD.
residuals_rdd = xy_rdd.map(
    lambda xy: float(xy[1])
    - float(sum(b_i * x_i for b_i, x_i in zip(beta_broadcast.value, xy[0])))
)
f_residuals = n - k - 1  # residual degrees of freedom
Y_mean = Y_rdd.reduce(lambda a, b: a + b) / n
SSE = residuals_rdd.map(lambda res: res ** 2).sum()
# Disabled in the original notebook:
#SST = Y_rdd.map(lambda y: (y - Y_mean) ** 2).reduce(lambda a, b: a + b)

#sigma_hat = np.sqrt(SSE / f_residuals)
#cov_matrix = [[sigma_hat ** 2 * XtX_inv[i][j] for j in range(len(XtX_inv[0]))] for i in range(len(XtX_inv))]

Y_mean

In [65]:
from scipy.stats import t, f
from pyspark.ml.linalg import DenseMatrix, DenseVector
from pyspark.sql import functions as F
from pyspark.sql import Row

# Ordinary least squares on the driver with numpy, plus the usual summary
# statistics (standard errors, t/F tests, residual quantiles, R^2).
# The first column of `df` is the response, the remaining columns are regressors.
n = df.count()
k = len(df.columns) - 1

# Design matrix with a leading intercept column, and the response vector.
X = np.array(df.select(df.columns[1:]).rdd.map(lambda row: [1] + list(row)).collect(), dtype=float)
Y = np.array(df.select(df.columns[:1]).rdd.map(lambda row: row[0]).collect(), dtype=float)


# Normal equations: beta = (X'X)^-1 X'y
XtX = np.dot(X.T, X)
XtY = np.dot(X.T, Y)
XtX_inv = np.linalg.inv(XtX)

beta = np.dot(XtX_inv, XtY)

Y_pred = np.dot(X, beta)

residuals = Y - Y_pred
f_residuals = n - k - 1  # residual degrees of freedom

SSE = np.sum(residuals ** 2)
# Explained sum of squares: deviation of the FITTED VALUES from the mean of Y
# (the coefficient vector must not be used here).
SSR = np.sum((Y_pred - np.mean(Y)) ** 2)
SST = SSR + SSE

sigma_hat = np.sqrt(SSE / f_residuals)
cov_matrix = (sigma_hat ** 2) * XtX_inv
se = np.sqrt(np.diag(cov_matrix))
t_werte = beta / se
# sf() is the survival function 1 - cdf, evaluated without the cancellation
# error that 1 - cdf(...) suffers for very small p-values.
p_werte = 2 * t.sf(np.abs(t_werte), f_residuals)
f_statistics = (SSR / k) / (SSE / f_residuals)
f_p_wert = f.sf(f_statistics, k, f_residuals)
# Five-number summary of the residuals: min, Q1, median, Q3, max.
quantile1_3 = np.quantile(residuals, (0.25, 0.75))
quantile = [np.min(residuals), quantile1_3[0], np.median(residuals), quantile1_3[1], np.max(residuals)]
r_quadrat = 1 - (SSE / np.sum((Y - np.mean(Y)) ** 2))
adjusted_r_quadrat = 1 - ((n - 1) / f_residuals) * (SSE / np.sum((Y - np.mean(Y)) ** 2))

In [20]:
# Shut down the Spark session.
# NOTE(review): in file order the next cell still collects data from `df`;
# with the session stopped that will presumably fail on a top-to-bottom run.
# Consider moving this cell to the very end of the notebook.
spark.stop()

In [36]:
import numpy as np

# Collect the regressor columns to the driver (requires a live Spark session),
# prepend an intercept column, and display the Gram matrix X'X.
X = np.array(
    df.select(df.columns[1:])
    .rdd
    .map(lambda record: [1] + list(record))
    .collect(),
    dtype=float,
)
XtX = X.T.dot(X)
XtX