In [14]:
from pyspark.sql import SparkSession

# Build the session step by step: point the builder at the standalone master,
# label the application, then create the session (or reuse a live one).
# Later cells access the session through the name `spark`.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Test")
    .getOrCreate()
)

In [15]:
import requests
import csv
from io import StringIO

# Direct-download link of the CSV data set hosted on Google Drive.
url = 'https://drive.usercontent.google.com/download?id=1z_AfT9UYwG7XqExz95MZOr_Ix4jDUXFw&export=download'

# Download with a timeout so an unreachable host cannot hang the kernel, and
# fail loudly on an HTTP error instead of parsing an error page as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the payload locally with the csv module (Spark is not involved yet).
csv_data = response.text
csv_reader = csv.reader(StringIO(csv_data))

# The first record holds the column names; every later record is a data row.
header = next(csv_reader)
# Skip blank lines: csv.reader yields an empty list for them, which cannot
# form a row matching the header.
rows = [tuple(row) for row in csv_reader if row]

# Turn the parsed rows into a Spark DataFrame. csv.reader only produces
# strings, so every column is a string column at this point.
df = spark.createDataFrame(rows, schema=header)

# Show the DataFrame
df.show()

In [19]:
from scipy.stats import t, f
import numpy as np
from pyspark.sql.functions import length

# Ordinary least squares on the driver. The first column of `df` is used as
# the response Y; all remaining columns are used as predictors.

# Collect the rows once so that response and predictors come from the same
# rows in the same order, instead of relying on two separate Spark jobs
# returning rows identically ordered. dtype=float converts the collected
# values (strings when `df` was built from parsed CSV text) to numbers.
data_matrix = np.array([list(row) for row in df.collect()], dtype=float)

n = data_matrix.shape[0]          # number of observations
k = len(df.columns) - 1           # number of predictors (without intercept)
f_residuals = n - k - 1           # residual degrees of freedom

# Without predictors or without spare degrees of freedom the statistics below
# would divide by zero, so stop with a clear message instead.
if k < 1 or f_residuals <= 0:
    raise ValueError(
        f"Cannot fit the model: n={n} observations, k={k} predictors, "
        f"residual degrees of freedom={f_residuals}"
    )

# Design matrix with a leading column of ones for the intercept.
Y = data_matrix[:, 0]
X = np.column_stack((np.ones(n), data_matrix[:, 1:]))

# Normal equations: beta = (X'X)^-1 X'Y. The explicit inverse is kept because
# it is reused for the coefficient covariance matrix below.
XtX = np.dot(X.T, X)
XtY = np.dot(X.T, Y)
XtX_inv = np.linalg.inv(XtX)

beta = np.dot(XtX_inv, XtY)

Y_pred = np.dot(X, beta)

residuals = Y - Y_pred

# Sums of squares. SSR must be built from the fitted values, not from the
# coefficient vector; SST is the total variation of Y around its mean (equal
# to SSR + SSE for a least-squares fit that includes an intercept).
Y_mean = np.mean(Y)
SSE = np.sum(residuals ** 2)
SSR = np.sum((Y_pred - Y_mean) ** 2)
SST = np.sum((Y - Y_mean) ** 2)

# Residual standard error and coefficient standard errors.
sigma_hat = np.sqrt(SSE / f_residuals)
cov_matrix = (sigma_hat ** 2) * XtX_inv
se = np.sqrt(np.diag(cov_matrix))

# Two-sided t-tests per coefficient. The survival function is used instead of
# 1 - cdf because it stays accurate for very small p-values.
t_werte = beta / se
p_werte = 2 * t.sf(np.abs(t_werte), f_residuals)

# Overall F-test of the model.
f_statistics = (SSR / k) / (SSE / f_residuals)
f_p_wert = f.sf(f_statistics, k, f_residuals)

# Five-number summary of the residuals: [min, Q1, median, Q3, max].
quantile1_3 = np.quantile(residuals, (0.25, 0.75))
quantile = [np.min(residuals), quantile1_3[0], np.median(residuals), quantile1_3[1], np.max(residuals)]

# Coefficient of determination and its degrees-of-freedom adjusted variant.
r_quadrat = 1 - (SSE / SST)
adjusted_r_quadrat = 1 - ((n - 1) / f_residuals) * (SSE / SST)

In [11]:
import pandas as pd
import numpy as np

# Assemble the regression report from values computed in the OLS cell. The
# following names must already exist in the kernel:
# beta, se, t_werte, p_werte, residuals, quantile, sigma_hat, f_residuals,
# r_quadrat, adjusted_r_quadrat, f_statistics, f_p_wert, n, k, Y, Y_pred, X,
# df, cov_matrix

# One row per model coefficient, in the order of `beta`.
coefficients_table = pd.DataFrame({
    'Estimate': np.round(beta, 5),
    'Std_Error': np.round(se, 5),
    't_value': np.round(t_werte, 3),
    'p_value': np.round(p_werte, 6),
})

# Single-row five-number summary of the residuals. `quantile` already holds
# [min, Q1, median, Q3, max] in exactly this order.
residual_labels = ('Min', 'Quantil1', 'Med', 'Quantil3', 'Max')
residuals_table = pd.DataFrame({
    label: [np.round(value, 5)]
    for label, value in zip(residual_labels, quantile)
})

# Single-row table of the overall model properties.
properties_table = pd.DataFrame({
    'n': [n],
    'Sigma': [np.round(sigma_hat, 4)],
    'df': [f_residuals],
    'R_squared': [np.round(r_quadrat, 4)],
    'R_squared_a': [np.round(adjusted_r_quadrat, 4)],
    'F_statistic': [np.round(f_statistics, 1)],
    'P_value': [f_p_wert],
})

# 'summary' carries the rounded, display-ready tables; 'data' carries the raw
# inputs and intermediate results they were derived from.
result = {
    'summary': {
        'Coefficients': coefficients_table,
        'Residuals': residuals_table,
        'Properties': properties_table,
    },
    'data': {
        'n': n,
        'k': k,
        'Y': Y,
        'Y_Predicted': Y_pred,
        'X': X,
        'Residuals': residuals,
        'DataFrame': df,
        'Beta': beta,
        'Cov_Matrix': cov_matrix,
    },
}
result

In [23]:
# Shut down the Spark session held in `spark`.
# NOTE(review): Spark-backed objects such as `df` are presumably unusable
# after this call — confirm before adding cells that still need them.
spark.stop()

In [20]:
# Display X'X, the product of the transposed design matrix with itself.
XtX

In [21]:
# Display X'Y, the product of the transposed design matrix with the response.
XtY

In [22]:
# Display the inverse of X'X, which is used for beta and the covariance matrix.
XtX_inv