In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("spark://spark-master:7077").appName("Test").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/02 12:34:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import requests
import csv
from io import StringIO

url = 'https://drive.usercontent.google.com/download?id=1z_AfT9UYwG7XqExz95MZOr_Ix4jDUXFw&export=download'

response = requests.get(url)

csv_data = response.text
csv_reader = csv.reader(StringIO(csv_data))

header = next(csv_reader)
rows = [tuple(row) for row in csv_reader]

# Read the CSV using PySpark
df = spark.createDataFrame(rows, schema=header)

# Show the DataFrame
df.show()

                                                                                

+---+---+---+---+---+---+---+
|  Y| X1| X2| X3| X4| X5| X6|
+---+---+---+---+---+---+---+
| 43| 51| 30| 39| 61| 92|45 |
| 63| 64| 51| 54| 63| 73|47 |
| 71| 70| 68| 69| 76| 86|48 |
| 61| 63| 45| 47| 54| 84|35 |
| 81| 78| 56| 66| 71| 83|47 |
| 43| 55| 49| 44| 54| 49|34 |
| 58| 67| 42| 56| 66| 68|35 |
| 71| 75| 50| 55| 70| 66|41 |
| 72| 82| 72| 67| 71| 83|31 |
| 67| 61| 45| 47| 62| 80|41 |
| 64| 53| 53| 58| 58| 67|34 |
| 67| 60| 47| 39| 59| 74|41 |
| 69| 62| 57| 42| 55| 63|25 |
| 68| 83| 83| 45| 59| 77|35 |
| 77| 77| 54| 72| 79| 77|46 |
| 81| 90| 50| 72| 60| 54|36 |
| 74| 85| 64| 69| 79| 79|63 |
| 65| 60| 65| 75| 55| 80|60 |
| 65| 70| 46| 57| 75| 85|46 |
| 50| 58| 68| 54| 64| 78|52 |
+---+---+---+---+---+---+---+
only showing top 20 rows



In [3]:
from scipy.stats import t, f
import numpy as np
from pyspark.sql.functions import length

n = df.count()
k = len(df.columns) - 1

X = np.array(df.select(df.columns[1:]).rdd.map(lambda row: [1] + list(row)).collect(), dtype=float)
Y = np.array(df.select(df.columns[:1]).rdd.map(lambda row: row[0]).collect(), dtype=float)

XtX = np.dot(X.T, X)
XtY = np.dot(X.T, Y)
XtX_inv = np.linalg.inv(XtX)

beta = np.dot(XtX_inv, XtY)

Y_pred = np.dot(X, beta)

residuals = Y - Y_pred
f_residuals = n - k - 1

SSE = np.sum(residuals ** 2)
SSR = np.sum((beta - np.mean(Y)) ** 2)
SST = SSR + SSE

sigma_hat = np.sqrt(SSE / f_residuals)
cov_matrix = (sigma_hat ** 2) * XtX_inv
se = np.sqrt(np.diag(cov_matrix))
t_werte = beta / se
p_werte = 2 * (1 - t.cdf(np.abs(t_werte), f_residuals))
f_statistics = (SSR / k) / (SSE / f_residuals)
f_p_wert = 1 - f.cdf(f_statistics, k, f_residuals)
quantile1_3 = np.quantile(residuals, (0.25, 0.75))
quantile = [np.min(residuals), quantile1_3[0], np.median(residuals), quantile1_3[1], np.max(residuals)]
r_quadrat = 1 - (np.sum((Y - Y_pred) ** 2) / np.sum((Y - np.mean(Y)) ** 2))
adjusted_r_quadrat = 1 - ((n - 1) / f_residuals) * (np.sum((Y - Y_pred) ** 2) / np.sum((Y - np.mean(Y)) ** 2))

                                                                                

In [4]:
import pandas as pd
import numpy as np

result = {
    'summary': {
        'Coefficients': pd.DataFrame({
            'Estimate': np.round(beta, 5),
            'Std_Error': np.round(se, 5),
            't_value': np.round(t_werte, 3),
            'p_value': np.round(p_werte, 6)
        }),
        'Residuals': pd.DataFrame({
            'Min': [np.round(min(residuals), 5)],
            'Quantil1': [np.round(quantile[1], 5)],
            'Med': [np.round(np.median(residuals), 5)],
            'Quantil3': [np.round(quantile[3], 5)],
            'Max': [np.round(max(residuals), 5)]
        }),
        'Properties': pd.DataFrame({
            'n': [n],
            'Sigma': [np.round(sigma_hat, 4)],
            'df': [f_residuals],
            'R_squared': [np.round(r_quadrat, 4)],
            'R_squared_a': [np.round(adjusted_r_quadrat, 4)],
            'F_statistic': [np.round(f_statistics, 1)],
            'P_value': [f_p_wert]
        })
    },
    'data': {
        'n': n,
        'k': k,
        'Y': Y,
        'Y_Predicted': Y_pred,
        'X': X,
        'Residuals': residuals,
        'DataFrame': df,
        'Beta': beta,
        'Cov_Matrix': cov_matrix
    }
}
result

{'summary': {'Coefficients':    Estimate  Std_Error  t_value   p_value
  0  10.78708   11.58926    0.931  0.361634
  1   0.61319    0.16098    3.809  0.000903
  2  -0.07305    0.13572   -0.538  0.595594
  3   0.32033    0.16852    1.901  0.069925
  4   0.08173    0.22148    0.369  0.715480
  5   0.03838    0.14700    0.261  0.796334
  6  -0.21706    0.17821   -1.218  0.235577,
  'Residuals':         Min  Quantil1      Med  Quantil3       Max
  0 -10.94185  -4.35552  0.31583   5.54255  11.59897,
  'Properties':     n  Sigma  df  R_squared  R_squared_a  F_statistic       P_value
  0  30  7.068  23     0.7326       0.6628         93.0  5.884182e-15},
 'data': {'n': 30,
  'k': 6,
  'Y': array([43., 63., 71., 61., 81., 43., 58., 71., 72., 67., 64., 67., 69.,
         68., 77., 81., 74., 65., 65., 50., 50., 64., 53., 40., 63., 66.,
         78., 48., 85., 82.]),
  'Y_Predicted': array([51.11029526, 61.3527663 , 69.93944113, 61.22684164, 74.45379903,
         53.94184988, 67.14841399, 70.0970

In [5]:
spark.stop()