In [19]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("spark://spark-master:7077").appName("Test").getOrCreate()

In [20]:
import requests
import csv
from io import StringIO

url = 'https://drive.usercontent.google.com/download?id=1z_AfT9UYwG7XqExz95MZOr_Ix4jDUXFw&export=download'

response = requests.get(url)

csv_data = response.text
csv_reader = csv.reader(StringIO(csv_data))

header = next(csv_reader)
rows = [tuple(row) for row in csv_reader]

# Read the CSV using PySpark
df = spark.createDataFrame(rows, schema=header)

# Show the DataFrame
df.show()

                                                                                

+---+---+---+---+---+---+---+
|  Y| X1| X2| X3| X4| X5| X6|
+---+---+---+---+---+---+---+
| 43| 51| 30| 39| 61| 92|45 |
| 63| 64| 51| 54| 63| 73|47 |
| 71| 70| 68| 69| 76| 86|48 |
| 61| 63| 45| 47| 54| 84|35 |
| 81| 78| 56| 66| 71| 83|47 |
| 43| 55| 49| 44| 54| 49|34 |
| 58| 67| 42| 56| 66| 68|35 |
| 71| 75| 50| 55| 70| 66|41 |
| 72| 82| 72| 67| 71| 83|31 |
| 67| 61| 45| 47| 62| 80|41 |
| 64| 53| 53| 58| 58| 67|34 |
| 67| 60| 47| 39| 59| 74|41 |
| 69| 62| 57| 42| 55| 63|25 |
| 68| 83| 83| 45| 59| 77|35 |
| 77| 77| 54| 72| 79| 77|46 |
| 81| 90| 50| 72| 60| 54|36 |
| 74| 85| 64| 69| 79| 79|63 |
| 65| 60| 65| 75| 55| 80|60 |
| 65| 70| 46| 57| 75| 85|46 |
| 50| 58| 68| 54| 64| 78|52 |
+---+---+---+---+---+---+---+
only showing top 20 rows



In [22]:
from scipy.stats import t, f
from pyspark.ml.linalg import DenseMatrix, DenseVector
from pyspark.sql import functions as F
from pyspark.sql import Row
import numpy as np

def matrix_outer(x):
    return [[x_i * x_j for x_j in x] for x_i in x]

n = df.count()
k = len(df.columns) - 1

X_rdd = df.select(df.columns[1:]).rdd.map(lambda row: [1.0] + [float(x) for x in row])
Y_rdd = df.select(df.columns[:1]).rdd.map(lambda row: float(row[0]))

XtX = X_rdd.map(lambda x: matrix_outer(x)).reduce(
    lambda a, b: [[a[i][j] + b[i][j] for j in range(len(a[0]))] for i in range(len(a))]
)
XtY = X_rdd.zip(Y_rdd).map(lambda x: [x_i * x[1] for x_i in x[0]]).reduce(lambda a, b: [a_i + b_i for a_i, b_i in zip(a, b)])
XtX_inv = np.linalg.inv(XtX)
beta = [sum(XtX_inv[i][j] * XtY[j] for j in range(len(XtY))) for i in range(len(XtX_inv))]

beta_broadcast = spark.sparkContext.broadcast(beta)

Y_pred_rdd = X_rdd.map(lambda x: sum(beta_broadcast.value[i] * x[i] for i in range(len(x))))
residuals_rdd = Y_rdd.zip(Y_pred_rdd).map(lambda x: float(x[0]) - float(x[1]))
f_residuals = n - k - 1

Y_mean = Y_rdd.reduce(lambda a, b: a + b) / n
Y_mean_broadcast = spark.sparkContext.broadcast(Y_mean)

SSE = residuals_rdd.map(lambda res: res ** 2).sum()
SST = Y_rdd.map(lambda y: (y - Y_mean_broadcast.value) ** 2).sum()

sigma_hat = np.sqrt(SSE / f_residuals)
cov_matrix = sigma_hat ** 2 * XtX_inv

se = np.sqrt(np.diag(cov_matrix))
se_broadcast = spark.sparkContext.broadcast(se)

t_values_rdd = spark.sparkContext.parallelize(range(len(beta))).map(
    lambda i: beta_broadcast.value[i] / se_broadcast.value[i]
)
p_values_rdd = t_values_rdd.map(
    lambda t_value: 2 * (1 - t.cdf(np.abs(t_value), df=n - k - 1))
)

p_values = p_values_rdd.collect()

SSR = np.sum((beta - np.mean(Y_rdd.collect())) ** 2)
f_statistics = (SSR / k) / (SSE / f_residuals)
f_p_value = 1 - f.cdf(f_statistics, k, f_residuals)

r_squared = 1 - (SSE / SST)
adjusted_r_squared = 1 - (((n - 1) / f_residuals) * (SSE / SST))

                                                                                

In [23]:
import pandas as pd
import numpy as np

# Assuming the following variables are defined:
# beta, se, t_werte, p_werte, residuals, quantile, sigma_hat, df_residuals,
# r_quadrat, adjusted_r_quadrat, f_statistiken, f_p_wert, n, k, Y, y_pred, X, dataframe, cov_matrix

# Step 1: Create the summary dictionary with nested DataFrames
result = {
    'summary': {
        'Coefficients': pd.DataFrame({
            'Estimate': np.round(beta, 5),
            'Std_Error': np.round(se, 5),
            't_value': np.round(t_values, 3),
            'p_value': np.round(p_values, 6)
        }),
        'Properties': pd.DataFrame({
            'n': [n],
            'Sigma': [np.round(sigma_hat, 4)],
            'df': [f_residuals],
            'R_squared': [np.round(r_squared, 4)],
            'R_squared_a': [np.round(adjusted_r_squared, 4)],
            'F_statistic': [np.round(f_statistics, 1)],
            'P_value': [f_p_value]
        })
    },
    'data': {
        'n': n,
        'k': k,
        'Y': Y_rdd,
        'Y_Predicted': Y_pred_rdd,
        'X': X_rdd,
        'Residuals': residuals_rdd,
        'DataFrame': df,
        'Beta': beta,
        'Cov_Matrix': cov_matrix
    }
}
result

{'summary': {'Coefficients':    Estimate  Std_Error  t_value   p_value
  0  10.78708   11.58926    0.931  0.361634
  1   0.61319    0.16098    3.809  0.000903
  2  -0.07305    0.13572   -0.538  0.595594
  3   0.32033    0.16852    1.901  0.069925
  4   0.08173    0.22148    0.369  0.715480
  5   0.03838    0.14700    0.261  0.796334
  6  -0.21706    0.17821   -1.218  0.235577,
  'Properties':     n  Sigma  df  R_squared  R_squared_a  F_statistic       P_value
  0  30  7.068  23     0.7326       0.6628         93.0  5.884182e-15},
 'data': {'n': 30,
  'k': 6,
  'Y': PythonRDD[59] at collect at /tmp/ipykernel_31/2237910587.py:50,
  'Y_Predicted': PythonRDD[60] at RDD at PythonRDD.scala:53,
  'X': PythonRDD[61] at RDD at PythonRDD.scala:53,
  'Residuals': PythonRDD[62] at RDD at PythonRDD.scala:53,
  'DataFrame': DataFrame[Y: string, X1: string, X2: string, X3: string, X4: string, X5: string, X6: string],
  'Beta': [np.float64(10.787076385734736),
   np.float64(0.6131876078096674),
   np.

In [16]:
spark.stop()