# Imports

In [1]:
#!pip install statsmodels matplotlib

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import numpy as np
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import DoubleType
import time
import numpy as np
import pandas as pd
import statsmodels.api as sm
import scipy.stats as scipy_stats
from scipy.stats import t, f
import time
import matplotlib.pyplot as plt  # Diesen Import hinzufügen
from numpy.linalg import LinAlgError


# Initialize Spark session

In [3]:
spark = SparkSession.builder.master("spark://spark-master:7077").appName("ManualQRDecomposition").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/30 17:24:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 1) Funktionen für Lineare Regression mit QR Decomposition

## 1.1) Lösen eines LGS durch Rückwärts Einsetzen

In [4]:
def backward_substitution(R, b):
    """
    Solves the upper triangular system Rx = b for x using backward substitution.
    
    Parameters
    ----------
    R : ndarray
        Upper triangular matrix from QR decomposition
    b : ndarray
        The right-hand side vector
        
    Returns
    -------
    ndarray
        Solution vector x that satisfies Rx = b
        
    Notes
    -----
    - Implements classic backward substitution algorithm
    - Includes numerical stability check (threshold of 1e-10)
    - Returns zero for numerically unstable solutions
    """
    n = len(b)
    x = np.zeros(n)
    
    for i in range(n-1, -1, -1):
        sum_val = 0
        for j in range(i+1, n):
            sum_val += R[i, j] * x[j]
        
        if abs(R[i, i]) > 1e-10:  # Check for numerical stability
            x[i] = (b[i] - sum_val) / R[i, i]
        else:
            x[i] = 0
            
    return x

## 1.2) QR Zerlegung

### 1.2.1) Gram Schmidt ohne Sparkoptimierung

In [5]:
def manual_qr_decomposition(X_array):
    """
    Performs QR decomposition using modified Gram-Schmidt with numerical stability handling.
    
    Parameters
    ----------
    X_array : ndarray
        Input matrix to decompose, shape (n, m)
        
    Returns
    -------
    Q : ndarray
        Orthogonal matrix, shape (n, m)
    R : ndarray
        Upper triangular matrix, shape (m, m)
        
    Notes
    -----
    - Implements modified Gram-Schmidt process
    - Includes numerical stability threshold (1e-12)
    - Returns zeros for numerically unstable columns
    """
    n, m = X_array.shape
    Q = np.zeros((n, m))
    R = np.zeros((m, m))
    
    for j in range(m):
        v = X_array[:, j]
        for i in range(j):
            R[i, j] = np.dot(Q[:, i], X_array[:, j])
            v = v - R[i, j] * Q[:, i]
        norm = np.linalg.norm(v)
        if norm > 1e-12:  # Only normalize if norm is significant
            Q[:, j] = v / norm
            R[j, j] = norm
        else:
            R[j, j] = 0
    return Q, R

### 1.2.1) Gram Schmidt mit Sparkoptimierung

In [6]:
def gram_schmidt(X):
    """
    Performs classical Gram-Schmidt orthogonalization.
    
    Parameters
    ----------
    X : ndarray
        Input matrix to orthogonalize, shape (n, m)
        
    Returns
    -------
    Q : ndarray
        Orthogonal matrix, shape (n, m)
    R : ndarray
        Upper triangular matrix, shape (m, m)
        
    Notes
    -----
    - Implements classical Gram-Schmidt orthogonalization
    - No explicit numerical stability checks
    - May be less stable than modified Gram-Schmidt for ill-conditioned matrices
    """
    n, m = X.shape
    Q = np.zeros((n, m))
    R = np.zeros((m, m))

    for j in range(m):
        v = X[:, j]
        
        for i in range(j):
            R[i, j] = np.dot(Q[:, i], X[:, j])
            v = v - R[i, j] * Q[:, i]

        R[j, j] = np.linalg.norm(v)
        Q[:, j] = v / R[j, j]

    return Q, R

## 1.3) Datensatz simulieren

### 1.3.1) Datensatz simulieren ohne Sparkoptimierung

In [7]:
def create_data_numpy(n, p, beta_true):
    """
    Generates synthetic regression data using NumPy.
    
    Parameters
    ----------
    n : int
        Number of samples
    p : int
        Number of features (excluding intercept)
    beta_true : ndarray
        True coefficient values including intercept
        
    Returns
    -------
    X : ndarray
        Design matrix with intercept column, shape (n, p+1)
    y : ndarray
        Target variable vector, shape (n,)
        
    Notes
    -----
    - Uses fixed random seed (42) for reproducibility
    - Adds constant term (intercept) to X
    - Adds Gaussian noise with std=0.1 to y
    """
    np.random.seed(42)
    X = np.random.rand(n, p)
    X = np.column_stack([np.ones(X.shape[0]), X])
    y = X @ beta_true + np.random.randn(n) * 0.3
    return X, y

### 1.3.2) Datensatz simulieren mit Sparkoptimierung

In [8]:
def create_data_spark(n_samples, n_features, beta_true, noise_std=0.3, partition_size=10000):
    """
    Generates distributed synthetic regression data optimized for Spark.
    
    Parameters
    ----------
    n_samples : int
        Total number of observations
    n_features : int
        Number of features (excluding intercept)
    beta_true : ndarray
        True coefficient values including intercept
    noise_std : float, optional
        Standard deviation of Gaussian noise, default=0.1
    partition_size : int, optional
        Number of samples per partition, default=10000
        
    Returns
    -------
    DataFrame
        Spark DataFrame with features and target columns
        
    Notes
    -----
    - Generates data in parallel across partitions
    - Uses unique random seeds per partition
    - Optimized for distributed processing
    """
    def generate_partition(partition_index, partition_size):
        np.random.seed(42 + partition_index)  # Unique seed for each partition to avoid duplicate data
        X = np.random.randn(partition_size, n_features)
        X = np.column_stack([np.ones(partition_size), X])  # Add intercept
        y = X @ beta_true + np.random.normal(0, noise_std, partition_size)
        return [(Vectors.dense(x), float(y_i)) for x, y_i in zip(X, y)]
    
    num_partitions = max(n_samples // partition_size, spark.sparkContext.defaultParallelism)
    samples_per_partition = n_samples // num_partitions
    
    # Parallelize the data generation across partitions
    rdd = (spark.sparkContext
           .parallelize(range(num_partitions), num_partitions)
           .flatMap(lambda i: generate_partition(i, samples_per_partition)))
    
    return spark.createDataFrame(rdd, ["features", "y"])


## 1.4) Funktion zur Durchführung der linearen Regression 

- simuliert einen Datensatz
- führt die lineare Regression durch
- berechnet alle Metriken, die auch statmodels `summary()` ausgibt
- misst die Zeit

### 1.4.1) Funktion zur Berechnung der Statistik

In [9]:
def compute_statistics_numpy(X, y, beta, residuals):
    """
    Computes comprehensive regression statistics using NumPy.
    
    Parameters
    ----------
    X : ndarray
        Design matrix including intercept
    y : ndarray
        Target variable vector
    beta : ndarray
        Estimated coefficients
    residuals : ndarray
        Model residuals (y - X @ beta)
        
    Returns
    -------
    dict
        Dictionary containing various statistics including:
        - R-squared and Adjusted R-squared
        - F-statistic and p-value
        - Standard errors and t-statistics
        - AIC and BIC
        - Durbin-Watson statistic
        - Jarque-Bera test results
        
    Notes
    -----
    - Computes full suite of regression diagnostics
    - Includes both model fit and residual diagnostics
    """
    n, k = X.shape
    SSE = np.sum(residuals ** 2)
    SST = np.sum((y - np.mean(y)) ** 2)
    SSR = SST - SSE
    df_residuals = n - k
    df_model = k - 1
    
    r_squared = 1 - (SSE / SST)
    adj_r_squared = 1 - ((1 - r_squared) * (n - 1) / df_residuals)
    
    MSE = SSE / df_residuals
    MSR = SSR / df_model
    f_statistic = MSR / MSE
    f_p_value = scipy_stats.f.sf(f_statistic, df_model, df_residuals)
    
    sigma_squared = MSE
    XtX_inv = np.linalg.inv(X.T @ X)
    se = np.sqrt(np.diag(sigma_squared * XtX_inv))
    t_values = beta / se
    p_values = 2 * (1 - scipy_stats.t.cdf(np.abs(t_values), df_residuals))
    
    skewness = scipy_stats.skew(residuals)
    kurtosis = scipy_stats.kurtosis(residuals, fisher=False)
    log_likelihood = -0.5 * n * (np.log(2 * np.pi * sigma_squared) + 1)
    AIC = 2 * k - 2 * log_likelihood
    BIC = n * np.log(SSE / n) + k * np.log(n)
    
    dw_statistic = np.sum(np.diff(residuals) ** 2) / SSE
    
    jarque_bera_stat = (n / 6) * (skewness**2 + (kurtosis - 3)**2 / 4)
    prob_jb = 1 - scipy_stats.chi2.cdf(jarque_bera_stat, df=2)
    
    return {
        'r_squared': r_squared,
        'Adjusted R-squared': adj_r_squared,
        'F-statistic': f_statistic,
        'Prob (F-statistic)': f_p_value,
        'Log-Likelihood': log_likelihood,
        'AIC': AIC,
        'BIC': BIC,
        'coef': beta,
        'std err': se,
        't': t_values,
        'P>|t|': p_values,
        'Skew': skewness,
        'Kurtosis': kurtosis,
        'Durbin-Watson': dw_statistic,
        'Jarque-Bera (JB)': jarque_bera_stat,
        'Prob(JB)': prob_jb,
        'mse': MSE
    }

In [10]:
def compute_statistics_partition(iterator, beta):
    """
    Computes partial statistics for a partition of data in Spark.
    
    Parameters
    ----------
    iterator : iterator
        Iterator over partition rows
    beta : ndarray
        Estimated coefficients
        
    Returns
    -------
    tuple
        (SSE_local, SST_local, SSR_local, y_sum_local, count_local)
        Local statistics for the partition
        
    Notes
    -----
    - Designed for distributed computation in Spark
    - Handles empty partitions gracefully
    - Computes sufficient statistics for later aggregation
    """
    rows = list(iterator)
    SSE_local, SST_local, SSR_local = 0, 0, 0
    y_sum_local, count_local = 0, 0

    if len(rows) == 0:
        print("Empty partition in compute_statistics_partition.")
        return (0.0, 0.0, 0.0, 0.0, 0)

    # First pass to get mean
    for row in rows:
        y = row.y
        y_sum_local += y
        count_local += 1

    if count_local > 0:
        y_mean_local = y_sum_local / count_local

        # Second pass for calculations
        for row in rows:
            X = row.features.toArray()
            y = row.y
            y_pred = np.dot(X, beta)
            residual = y - y_pred
            SSE_local += residual ** 2
            SST_local += (y - y_mean_local) ** 2

        SSR_local = SST_local - SSE_local

    return (SSE_local, SST_local, SSR_local, y_sum_local, count_local)

In [11]:
def aggregate_statistics(rdd_stats, n, beta_length, XtX_inv, residuals, beta):
    """
    Aggregates statistics across all partitions in Spark.
    
    Parameters
    ----------
    rdd_stats : RDD
        RDD containing partial statistics from each partition
    n : int
        Total number of samples
    beta_length : int
        Number of coefficients (including intercept)
        
    Returns
    -------
    dict
        Aggregated statistics including:
        - R-squared and Adjusted R-squared
        - F-statistic
        - AIC and BIC
        - Sample size and feature count
        
    Notes
    -----
    - Implements safe reduction operations
    - Handles edge cases and potential errors
    - Includes comprehensive error checking
    """
    def safe_reduce(a, b):
        try:
            return (
                a[0] + b[0],
                a[1] + b[1],
                a[2] + b[2],
                a[3] + b[3],
                a[4] + b[4]
            )
        except (IndexError, TypeError) as e:
            print(f"Error in safe_reduce: {str(e)}")
            return a

    try:
        (SSE, SST, SSR, y_sum_total, total_count) = rdd_stats.reduce(safe_reduce)
    except Exception as e:
        print(f"Error in aggregating statistics: {str(e)}")
        return {}

    # Mean of y over all partitions
    y_mean = y_sum_total / total_count if total_count > 0 else 0

    # Calculate R-squared and Adjusted R-squared
    r_squared = 1 - (SSE / SST) if SST > 0 else 0
    adj_r_squared = (
        1 - ((1 - r_squared) * (total_count - 1) / (total_count - beta_length))
        if total_count > beta_length else 0
    )

    # Calculate Mean Squared Error (MSE) and F-statistic
    MSE = SSE / (total_count - beta_length) if total_count > beta_length else 0
    MSR = SSR / (beta_length - 1) if beta_length > 1 else 0
    df_model = beta_length - 1
    df_residuals = total_count - beta_length
    f_statistic = MSR / MSE if MSE > 0 else float('inf')
    f_p_value = scipy_stats.f.sf(f_statistic, df_model, df_residuals)
    log_likelihood = -0.5 * total_count * (np.log(2 * np.pi * MSE) + 1) if MSE > 0 else float('-inf')

    # Calculate AIC and BIC
    AIC = total_count * np.log(SSE / total_count) + 2 * beta_length if total_count > 0 else 0
    BIC = total_count * np.log(SSE / total_count) + beta_length * np.log(total_count) if total_count > 0 else 0

    sigma_squared = MSE
    XtX_inv = XtX_inv  # Ensure X and beta are available in scope
    std_err = np.sqrt(np.diag(sigma_squared * XtX_inv)) if XtX_inv is not None else [float('nan')] * beta_length
    dw_statistic = np.sum(np.diff(residuals) ** 2) / SSE
    skewness = scipy_stats.skew(residuals)
    kurtosis = scipy_stats.kurtosis(residuals, fisher=False)
    jb_statistic = (n / 6) * (skewness**2 + (kurtosis - 3)**2 / 4)
    prob_jb = 1 - scipy_stats.chi2.cdf(jb_statistic, df=2)
    skew = scipy_stats.skew(residuals)
    kurtosis = scipy_stats.kurtosis(residuals, fisher=False)
    t_values = beta / std_err
    p_values = 2 * (1 - scipy_stats.t.cdf(np.abs(t_values), df_residuals))

    # Compile final statistics
    stats = {
    'r_squared': r_squared,
    'Adjusted R-squared': adj_r_squared,
    'F-statistic': f_statistic,
    'Prob (F-statistic)': f_p_value,
    'Log-Likelihood': log_likelihood,
    'AIC': AIC,
    'BIC': BIC,
    'SSE': SSE,
    'SSR': SSR,
    'SST': SST,
    'mse': MSE,
    'n_samples': total_count,
    'n_features': beta_length,
    'std err': std_err,
    't': t_values,
    'P>|t|': p_values,
    'Durbin-Watson': dw_statistic,
    'Jarque-Bera (JB)': jb_statistic,
    'Prob(JB)': prob_jb,
    'Skew': skew,
    'Kurtosis': kurtosis
}


    return stats

### 1.4.1 Lineare Regression ohne Sparkoptimierung

In [12]:
def linear_regression_manual_qr(X, y):
    """
    Fits linear regression using manual QR decomposition.
    
    Parameters
    ----------
    X : ndarray
        Design matrix including intercept
    y : ndarray
        Target variable vector
        
    Returns
    -------
    dict
        Regression results including coefficients and statistics
    float
        Computation time in seconds
        
    Notes
    -----
    - Uses Gram-Schmidt QR decomposition
    - Computes comprehensive statistics
    - Includes timing information
    """
    start_time = time.time()
    Q, R = gram_schmidt(X)
    beta = backward_substitution(R, Q.T @ y)

    y_pred = np.dot(X, beta) # mit den durch QR berechneten Betas die vorhergesagten y-Werte des linReg Modells bestimmen
    residuals = y - y_pred

    result = compute_statistics_numpy(X, y, beta, residuals)
    
    end_time = time.time()
    elapsed_time = end_time - start_time

    return result, elapsed_time

### 1.4.2 Optimiert für Spark mit manueller QR Zerlegung

In [13]:
def fit_ols_manual2(df):
    """
    Fits distributed OLS regression using manual QR decomposition in Spark.
    
    Parameters
    ----------
    df : DataFrame
        Spark DataFrame with features and target columns
        
    Returns
    -------
    ndarray
        Estimated coefficients
    dict
        Regression statistics
        
    Notes
    -----
    - Implements distributed QR decomposition
    - Handles computation in partitions
    - Includes timing information
    """
    start_time = time.time()

    # Step 1: Calculate combined QR decomposition across partitions
    def process_partition(iterator):
        X_local, y_local = [], []
        for row in iterator:
            X_local.append(row.features.toArray())
            y_local.append(row.y)

        if not X_local:
            print("Empty partition encountered in process_partition.")
            return [(np.zeros((1, 1)), np.zeros(1))]  # Return a default structure to avoid empty lists

        X_local = np.vstack(X_local)
        y_local = np.array(y_local)

        # Perform local QR decomposition
        Q_local, R_local = manual_qr_decomposition(X_local)

        # Calculate local Q^T y
        Qty_local = np.dot(Q_local.T, y_local)

        return [(R_local, Qty_local)]

    # Aggregate QR decomposition results
    results = df.rdd.mapPartitions(process_partition).filter(lambda x: len(x) > 0).reduce(
        lambda a, b: (
            a[0] + b[0] if isinstance(a[0], np.ndarray) and isinstance(b[0], np.ndarray) else np.zeros((1, 1)),
            a[1] + b[1] if isinstance(a[1], np.ndarray) and isinstance(b[1], np.ndarray) else np.zeros(1)
        )
    )

    R_total, Qty_total = results

    # Step 2: Final QR on combined R_total
    _, R_final = manual_qr_decomposition(R_total)
    beta = backward_substitution(R_final, Qty_total)

    # Step 3: Distributed calculation of statistics with additional checks
    rdd_stats = df.rdd.mapPartitions(lambda iter: [compute_statistics_partition(iter, beta)])
    X = np.array(df.select("features").rdd.map(lambda row: row[0].toArray()).collect())  # Or however X is obtained
    XtX_inv = np.linalg.inv(np.dot(X.T, X))
    y_pred = np.dot(X, beta)  # Predicted values based on beta
    residuals = df.select("y").rdd.map(lambda row: row[0]).collect() - y_pred  # Calculate residuals

# Pass residuals to aggregate_statistics
    stats = aggregate_statistics(rdd_stats, n=df.count(), beta_length=len(beta), XtX_inv=XtX_inv, residuals=residuals, beta=beta)

    # Add computation time to stats
    computation_time = time.time() - start_time
    stats['computation_time'] = computation_time
    
    return beta, stats

### Optimiert für Spark mit Built-In Functions für die QR Zerlegung

In [14]:
def fit_ols_spark(df):
    """
    Fits linear regression using Spark's built-in implementation.
    
    Parameters
    ----------
    df : DataFrame
        Spark DataFrame with features and target columns
        
    Returns
    -------
    LinearRegressionModel
        Fitted Spark linear regression model
    LinearRegressionTrainingSummary
        Training summary with model statistics
        
    Notes
    -----
    - Uses Spark's MLlib implementation
    - Uses QR decomposition solver
    - Provides standard Spark model metrics
    """
    lr = LinearRegression(featuresCol="features", labelCol="y", solver="normal")
    model = lr.fit(df)
    
    # Get the training summary to extract information like R^2, RMSE, etc.
    training_summary = model.summary
    
    return model, training_summary


## 1.5) Funktion für Durchführung des Benchmarks

### 1.5.1 summary

In [15]:
def format_summary(beta, statistics, computation_time, X):
    """
    Formats regression results to match statsmodels summary output.
    
    Parameters
    ----------
    beta : ndarray
        Estimated coefficients
    statistics : dict
        Dictionary of computed statistics
    computation_time : float
        Model fitting time in seconds
    X : ndarray
        Design matrix used in the regression
        
    Returns
    -------
    str
        Formatted summary string
    """
    n_obs = statistics['n_samples']
    df_residuals = n_obs - len(beta)
    df_model = len(beta) - 1
    
    # Header
    summary = []
    summary.append("=" * 78)
    summary.append("{: ^78}".format("OLS Regression Results"))
    summary.append("=" * 78)
    
    # Format current date and time
    current_time = time.localtime()
    formatted_date = time.strftime("%a, %d %b %Y", current_time)
    formatted_time = time.strftime("%H:%M:%S", current_time)
    
    # Left and right columns with exact spacing
    summary.append("{:<20} {:<12} {:<20} {: >20.3f}".format(
        "Dep. Variable:", "y",
        "R-squared:", statistics['r_squared']))
    
    summary.append("{:<20} {:<12} {:<20} {: >20.3f}".format(
        "Model:", "OLS",
        "Adj. R-squared:", statistics['Adjusted R-squared']))
    
    summary.append("{:<20} {:<12} {:<20} {: >20.3e}".format(
        "Method:", "Least Squares",
        "F-statistic:", statistics['F-statistic']))
    
    summary.append("{:<20} {:<12} {:<20} {: >20.3e}".format(
        "Date:", formatted_date,
        "Prob (F-statistic):", statistics['Prob (F-statistic)']))
    
    # Log-likelihood (already provided in statistics)
    summary.append("{:<20} {:<12} {:<20} {: >20.3f}".format(
        "Time:", formatted_time,
        "Log-Likelihood:", statistics['Log-Likelihood']))
    
    summary.append("{:<20} {:<12} {:<20} {: >20.3f}".format(
        "No. Observations:", str(n_obs),
        "AIC:", statistics['AIC']))
    
    summary.append("{:<20} {:<12} {:<20} {: >20.3f}".format(
        "Df Residuals:", str(df_residuals),
        "BIC:", statistics['BIC']))
    
    summary.append("{:<20} {:<12}".format(
        "Df Model:", str(df_model)))
    
    summary.append("Covariance Type:            nonrobust")
    
    # Parameter estimates table with exact spacing
    summary.append("=" * 78)
    summary.append("{:>16} {:>10} {:>10} {:>10} {:>12} {:>10} {:>10}".format(
        "", "coef", "std err", "t", "P>|t|", "[0.025", "0.975]"))
    summary.append("-" * 78)
    
    # Use standard errors, t-stats, p-values, and confidence intervals from statistics
    var_names = ["const"] + [f"x{i+1}" for i in range(len(beta)-1)]
    for i, name in enumerate(var_names):
        conf_int_lower = beta[i] - 1.96 * statistics['std err'][i]
        conf_int_upper = beta[i] + 1.96 * statistics['std err'][i]
        summary.append("{:>16} {: >9.4f} {: >9.3f} {: >10.3f} {: >11.3f} {: >9.3f} {: >9.3f}".format(
            name,
            beta[i],
            statistics['std err'][i],
            statistics['t'][i],
            statistics['P>|t|'][i],
            conf_int_lower,
            conf_int_upper
        ))
    
    # Diagnostics
    summary.append("=" * 78)
    summary.append("{:<21} {: >10.3f} {:<24} {: >19.3f}".format(
        "Durbin-Watson:", statistics['Durbin-Watson'],
        "Jarque-Bera (JB):", statistics['Jarque-Bera (JB)']))
    
    summary.append("{:<21} {: >10.3f} {:<24} {: >19.3f}".format(
        "Prob(JB):", statistics['Prob(JB)'],
        "Skew:", statistics['Skew']))
    
    summary.append("{:<21} {: >10.3f} {:<24} {: >19.3f}".format(
        "Kurtosis:", statistics['Kurtosis'],
        "Cond. No.:", np.linalg.cond(X)))
    
    summary.append("=" * 78)
    summary.append("[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.")
    
    return "\n".join(summary)


### Berechnung für Numpy

In [16]:
def run_benchmark_numpy(n_list, beta_true, p, repetitions=10):
    """
    Runs benchmarking for different implementations of linear regression.
    
    Parameters
    ----------
    n_list : list
        Sample sizes to test
    p : int
        Number of features
    beta_true : ndarray
        True coefficients for data generation
    repetitions : int, optional
        Number of benchmark repetitions, default=10
        
    Returns
    -------
    DataFrame
        Benchmark results including:
        - Sample sizes
        - Mean and std of computation times
        - Mean and std of R-squared values
        - Success rates
        
    Notes
    -----
    - Includes error handling
    - Prints progress information
    - Generates comprehensive summaries
    """
    results = []  # Use a list to accumulate results across sample sizes

    for n in n_list:
        times = []
        r_squared_values = []
        print(f"\nBenchmarking for n={n:,} samples...")

        for i in range(repetitions):
            # Generate synthetic data for each sample size
            X = np.hstack([np.ones((n, 1)), np.random.randn(n, p)])  # Add intercept
            y = X @ beta_true + np.random.normal(0, 0.1, n)  # Generate y with some noise

            # Run the manual QR-based regression and measure time
            result, elapsed_time = linear_regression_manual_qr(X, y)
            times.append(elapsed_time)
            r_squared_values.append(result['r_squared'])

        # Compute average computation time and R-squared statistics
        avg_time = np.mean(times)
        avg_r_squared = np.mean(r_squared_values)
        std_time = np.std(times)
        std_r_squared = np.std(r_squared_values)

        # Prepare the statistics dictionary for format_summary
        avg_statistics = {
            'n_samples': n,
            'SSE': np.sum((y - X @ result['coef']) ** 2),
            'r_squared': result['r_squared'],
            'Adjusted R-squared': result['Adjusted R-squared'],
            'F-statistic': result['F-statistic'],
            'Prob (F-statistic)': result['Prob (F-statistic)'],
            'Log-Likelihood': result['Log-Likelihood'],
            'AIC': result['AIC'],
            'BIC': result['BIC'],
            'coef': result['coef'],
            'std err': result['std err'],
            't': result['t'],
            'P>|t|': result['P>|t|'],
            'Skew': result['Skew'],
            'Kurtosis': result['Kurtosis'],
            'Durbin-Watson': result['Durbin-Watson'],
            'Jarque-Bera (JB)': result['Jarque-Bera (JB)'],
            'Prob(JB)': result['Prob(JB)'],
            'mse': result['mse']
        }

        # Print a formatted summary for the last repetition
        print("\nSummary for final run:")
        summary_str = format_summary(result['coef'], avg_statistics, avg_time, X)

        print(summary_str)

        # Append results for this sample size to the results list
        results.append({
            'n_samples': n,
            'n_features': p,
            'mean_time': avg_time,
            'std_time': std_time,
            'mean_r_squared': avg_r_squared,
            'std_r_squared': std_r_squared
        })
        
        print(f"\nSummary for n={n:,}:")
        print(f"  Mean time: {avg_time:.2f}s ± {std_time:.2f}s")
        print(f"  Mean R²: {avg_r_squared:.4f} ± {std_r_squared:.4f}")

    # Return DataFrame with summary of all results
    return pd.DataFrame(results)


### Berechnung für einen festen Wert

In [17]:
def run_benchmark_point(n, p, true_coefficients, repetitions=10):
    """
    Runs benchmarking for different implementations of linear regression.
    
    Parameters
    ----------
    n : int
        Sample size(s) to test
    p : int
        Number of features
    true_coefficients : ndarray
        True coefficients for data generation
    repetitions : int, optional
        Number of benchmark repetitions, default=10
        
    Returns
    -------
    DataFrame
        Benchmark results including:
        - Sample sizes
        - Mean and std of computation times
        - Mean and std of R-squared values
        - Success rates
    """
    times = []
    r_squares = []
    successful_runs = 0

    print(f"\nBenchmarking n={n:,} samples...")
    
    for i in range(repetitions):
        try:
            # Profile data creation time
            start_data_creation = time.time()
            df = create_data_spark(n, p, true_coefficients)
            end_data_creation = time.time()

            # Extract feature matrix X from DataFrame
            X = np.array(df.select("features").rdd.map(lambda row: row[0].toArray()).collect())

            # Profile model fitting time
            start_fit = time.time()
            beta, summary = fit_ols_manual2(df)  # Replace with fit_ols_manual if needed
            end_fit = time.time()

            # Measure total run time
            elapsed_time = end_fit - start_data_creation
            times.append(elapsed_time)
            r_squares.append(summary['r_squared'])
            successful_runs += 1
            
            print(f"  Run {i+1}: time={elapsed_time:.2f}s, R²={summary['r_squared']:.4f}")
            
        except Exception as e:
            print(f"  Run {i+1} failed: {str(e)}")
    
    if successful_runs > 0:
        result = {
            'n_samples': n,
            'n_features': p,
            'mean_time': np.mean(times),
            'std_time': np.std(times),
            'mean_r_squared': np.mean(r_squares),
            'std_r_squared': np.std(r_squares),
            'successful_runs': successful_runs
        }
        print(format_summary(beta, summary, result['mean_time'], X))
        print(f"\nSummary for n={n:,}:")
        print(f"  Mean time: {result['mean_time']:.2f}s ± {result['std_time']:.2f}s")
        print(f"  Mean R²: {result['mean_r_squared']:.4f} ± {result['std_r_squared']:.4f}")
    
    return pd.DataFrame([result])


### Berechnung für Spark Functions

In [18]:
def run_benchmark_spark(n, p, beta_true, repetitions=10):
    """
    Runs benchmarking for different implementations of linear regression.
    
    Parameters
    ----------
    n : int
        Sample size(s) to test
    p : int
        Number of features
    beta_true : ndarray
        True coefficients for data generation
    repetitions : int, optional
        Number of benchmark repetitions, default=10
        
    Returns
    -------
    DataFrame
        Benchmark results including:
        - Sample sizes
        - Mean and std of computation times
        - Mean and std of R-squared values
        - Success rates
        
    Notes
    -----
    - Includes error handling
    - Prints progress information
    - Generates comprehensive summaries
    """
    times = []
    r_squares = []
    successful_runs = 0
    results = []

    print(f"\nBenchmarking for n={n:,} samples...")

    for i in range(repetitions):
        try:
            # Generate synthetic data
            start_data_creation = time.time()
            df = create_data_spark(n, p, beta_true)
            end_data_creation = time.time()

            # Fit the model
            start_fit = time.time()
            model, summary = fit_ols_spark(df)
            end_fit = time.time()

            # Measure total elapsed time
            elapsed_time = end_fit - start_data_creation
            times.append(elapsed_time)
            r_squares.append(summary.r2)
            successful_runs += 1

            print(f"  Run {i+1}: time={elapsed_time:.2f}s, R²={summary.r2:.4f}, Adjusted R²={summary.r2adj:.4f}")

        except Exception as e:
            print(f"  Run {i+1} failed: {str(e)}")

    
    if successful_runs > 0:
        result = {
        'n_samples': n,
        'n_features': p,
        'mean_time': np.mean(times),
        'std_time': np.std(times),
        'mean_r_squared': np.mean(r_squares),
        'std_r_squared': np.std(r_squares),
        'successful_runs': successful_runs
    }

        print(f"\nSummary for n={n:,}:")
        print(f"  Mean time: {result['mean_time']:.2f}s ± {result['std_time']:.2f}s")
        print(f"  Mean R²: {result['mean_r_squared']:.4f} ± {result['std_r_squared']:.4f}")
    
    return pd.DataFrame([result])


# 2) Durchführung des Benchmarks und Ausgeben der Ergebnisse

## 2.1) Durchführung

In [19]:
n_values = [200000, 500000, 1000000, 5000000, 10000000]
n_features = 10
beta_true = np.random.randn(n_features + 1)

In [20]:
n_values2 = [15000000, 20000000, 25000000, 30000000, 35000000]

### 2.1.1 Numpy

In [21]:
results_numpy = run_benchmark_numpy(n_list=n_values, beta_true=beta_true, p=n_features) 


Benchmarking for n=200,000 samples...

Summary for final run:
                            OLS Regression Results                            
Dep. Variable:       y            R-squared:                          0.996
Model:               OLS          Adj. R-squared:                     0.996
Method:              Least Squares F-statistic:                    5.241e+06
Date:                Wed, 30 Oct 2024 Prob (F-statistic):             0.000e+00
Time:                14:20:42     Log-Likelihood:                176861.927
No. Observations:    200000       AIC:                          -353701.853
Df Residuals:        199989       BIC:                          -921176.000
Df Model:            10          
Covariance Type:            nonrobust
                       coef    std err          t        P>|t|     [0.025     0.975]
------------------------------------------------------------------------------
           const    3.2920     0.000  14731.398       0.000     3.292     3.292
     

In [22]:
results_numpy

Unnamed: 0,n_samples,n_features,mean_time,std_time,mean_r_squared,std_r_squared
0,200000,10,0.628908,0.675015,0.99621,1.2e-05
1,500000,10,0.482134,0.031184,0.996214,1e-05
2,1000000,10,0.952605,0.112846,0.996215,9e-06
3,5000000,10,4.790818,0.350156,0.996212,4e-06
4,10000000,10,8.715023,0.215088,0.996214,2e-06


In [23]:
results_numpy2 = run_benchmark_numpy(n_list=n_values2, beta_true=beta_true, p=n_features)


Benchmarking for n=15,000,000 samples...

Summary for final run:
                            OLS Regression Results                            
Dep. Variable:       y            R-squared:                          0.996
Model:               OLS          Adj. R-squared:                     0.996
Method:              Least Squares F-statistic:                    3.949e+08
Date:                Wed, 30 Oct 2024 Prob (F-statistic):             0.000e+00
Time:                14:27:44     Log-Likelihood:              13255065.561
No. Observations:    15000000     AIC:                        -26510109.123
Df Residuals:        14999989     BIC:                        -69078116.360
Df Model:            10          
Covariance Type:            nonrobust
                       coef    std err          t        P>|t|     [0.025     0.975]
------------------------------------------------------------------------------
           const    3.2924     0.000 127516.145       0.000     3.292     3.292
  

In [24]:
results_numpy_df = pd.concat([results_numpy, results_numpy2], ignore_index=True)

In [25]:
results_numpy_df

Unnamed: 0,n_samples,n_features,mean_time,std_time,mean_r_squared,std_r_squared
0,200000,10,0.628908,0.675015,0.99621,1.154623e-05
1,500000,10,0.482134,0.031184,0.996214,1.025484e-05
2,1000000,10,0.952605,0.112846,0.996215,9.006029e-06
3,5000000,10,4.790818,0.350156,0.996212,3.67175e-06
4,10000000,10,8.715023,0.215088,0.996214,1.577912e-06
5,15000000,10,12.493335,0.12473,0.996214,2.164683e-06
6,20000000,10,16.336071,0.22827,0.996215,2.053358e-06
7,25000000,10,19.804174,0.144288,0.996213,1.766141e-06
8,30000000,10,23.787051,0.263583,0.996214,1.898277e-06
9,35000000,10,30.125023,0.865177,0.996213,9.051791e-07


### Benchmark Spark mit manueller QR-Zerlegung

In [None]:
results_point = []
results_point = [run_benchmark_point(n=n, p=n_features, true_coefficients=beta_true) for n in n_values]


Benchmarking n=200,000 samples...


                                                                                

  Run 1: time=18.89s, R²=0.9864


                                                                                

  Run 2: time=10.76s, R²=0.9864


                                                                                

In [None]:
results_point2 = [run_benchmark_point(n=n, p=n_features, true_coefficients=beta_true) for n in n_values2]

In [None]:
results_point_df = pd.concat(results_point , ignore_index=True)

In [None]:
results_point_df2 = pd.concat(results_point2, ignore_index=True)
results_point_df2

In [None]:
results_point_final =  pd.concat([results_point_df, results_point_df2], ignore_index=True)

In [None]:
results_point_final

### Benchmark Spark

In [None]:
results_spark = []
results_spark = [run_benchmark_spark(n=n, p=n_features, beta_true=beta_true) for n in n_values]

In [None]:
results_spark2 = [run_benchmark_spark(n=n, p=n_features, beta_true=beta_true) for n in n_values2]

In [None]:
results_spark_df = pd.concat(results_spark, ignore_index=True)
results_spark_df2 = pd.concat(results_spark2, ignore_index=True)

In [None]:
results_spark_final =  pd.concat([results_spark_df, results_spark_df2], ignore_index=True)

In [None]:
results_spark_final

## 2.2) Ausgeben der Ergebnisse

In [None]:
import pandas as pd

def extract_info(dataframes, labels=None):
    """
    Extracts and compares R² values and computation times from multiple DataFrames,
    structured for easier comparison between approaches.
    
    Parameters
    ----------
    dataframes : list of pd.DataFrame
        List of DataFrames containing benchmarking results with columns 'n_samples', 
        'n_features', 'mean_time', and 'mean_r_squared'.
    
    labels : list of str, optional
        List of labels for each DataFrame to identify the source/approach (default is None).
    
    Returns
    -------
    pd.DataFrame
        Combined DataFrame with columns 'n_samples', 'n_features', and the mean R² values 
        and computation times for each approach in separate columns, structured for 
        side-by-side comparison.
    """
    if labels and len(labels) != len(dataframes):
        raise ValueError("Length of labels must match length of dataframes")

    # Start with the first dataframe and rename columns for merging
    merged_df = dataframes[0].rename(columns={'mean_time': f'{labels[0]}_mean_time', 
                                              'mean_r_squared': f'{labels[0]}_mean_r_squared'})
    
    # Merge each subsequent dataframe
    for i, df in enumerate(dataframes[1:], start=1):
        label = labels[i]
        df = df.rename(columns={'mean_time': f'{label}_mean_time', 
                                'mean_r_squared': f'{label}_mean_r_squared'})
        merged_df = pd.merge(merged_df, df[['n_samples', 'n_features', f'{label}_mean_time', f'{label}_mean_r_squared']],
                             on=['n_samples', 'n_features'], how='outer')

    return merged_df

In [None]:
dataframes = [results_numpy_df, results_point_final, results_spark_final]
labels = ["Numpy", "Spark and manual QR decomposition", "Spark Linear Regressions"]

# Extract and compare information
comparison_df = extract_info(dataframes, labels)


In [None]:
comparison_df

## 2.3) Plotten der Ergebnisse

In [None]:
import matplotlib.pyplot as plt

def plot_benchmark_results(n_values, avg_times, std_times):
    """
    Plots benchmark results of computation times for a QR decomposition-based regression model 
    across varying sample sizes, displaying mean computation time with error bars representing 
    standard deviation.

    Parameters
    ----------
    n_values : list
        List of sample sizes (number of observations) for which benchmarks were performed.
    avg_times : list
        List of average computation times (in seconds) corresponding to each sample size in n_values.
    std_times : list
        List of standard deviations of computation times (in seconds) for each sample size in n_values.
    
    Returns
    -------
    None
        The function directly displays a plot, with no return value.
    
    Notes
    -----
    - The plot uses a logarithmic scale on both axes to improve visibility of differences across a range of sample sizes.
    - Error bars represent the standard deviation of computation times, offering a visual indication of variability.
    - Plot labels, legend, and title are set in English.
    """
    plt.figure(figsize=(10, 6))
    plt.errorbar(n_values, avg_times, yerr=std_times, fmt='o', ecolor='r', capsize=5,
                 label='Average Computation Time with Standard Deviation', linestyle='None')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Number of Observations (n)')
    plt.ylabel('Computation Time (s)')
    plt.title('Computation Time of QR Decomposition-Based Regression for Increasing n')
    plt.grid(True)
    plt.legend()
    plt.show()


In [None]:
plot_benchmark_results(results_numpy_df["n_samples"], results_numpy_df["mean_time"], results_numpy_df["std_time"])

In [None]:
plot_benchmark_results(results_point_final["n_samples"], results_point_final["mean_time"], results_point_final["std_time"])

In [None]:
results_point_final

In [None]:
results_spark_final

In [None]:
plot_benchmark_results(results_spark_final["n_samples"], results_spark_final["mean_time"], results_spark_final["std_time"])