In [2]:
import numpy as np
from scipy.stats import t, f
import time

def create_data(n, p, beta_true):
    """Generate a reproducible synthetic data set for linear regression.

    The global NumPy random generator is re-seeded with 42 on every call,
    so identical arguments always produce identical data.

    Args:
        n: Number of rows (observations) to generate.
        p: Number of feature columns drawn uniformly from [0, 1).
        beta_true: Coefficient vector multiplied with the design matrix;
            the first entry pairs with the all-ones intercept column.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the design matrix with a leading
        column of ones followed by the ``p`` feature columns, and ``y`` is
        ``X @ beta_true`` plus Gaussian noise scaled by 0.1.
    """
    np.random.seed(42)

    features = np.random.rand(n, p)
    intercept = np.ones((features.shape[0], 1))
    design = np.hstack((intercept, features))

    signal = design @ beta_true
    noise = np.random.randn(n) * 0.1
    return design, signal + noise

def run_benchmark(n_list, repetitions=5):
    """Time ``linear_regression_manual`` on synthetic data of several sizes.

    For every sample size the regression is run ``repetitions`` times on the
    same data set, and the mean and standard deviation of the elapsed times
    reported by ``linear_regression_manual`` are recorded and printed.

    Args:
        n_list: Iterable of sample sizes (number of rows) to benchmark.
        repetitions: Number of timed runs per sample size; must be >= 1.

    Returns:
        A list with one entry ``[n, avg_time, std_time, beta]`` per sample
        size, where ``beta`` is the coefficient vector from the last run.

    Raises:
        ValueError: If ``repetitions`` is smaller than 1 (there would be no
            timing and no coefficients to report).
    """
    if repetitions < 1:
        raise ValueError("repetitions must be at least 1")

    beta_true = [-8, -1.6, 4.1, -10, -9.2, 1.3, 1.6, 2.3]
    # One coefficient belongs to the intercept column added by create_data.
    p = len(beta_true) - 1

    results = []
    for n in n_list:
        # create_data re-seeds the RNG on every call, so it returns the same
        # data each time; generate it once per sample size instead of once
        # per repetition. The regression only reads X and y.
        X, y = create_data(n, p, beta_true)

        times = []
        for _ in range(repetitions):
            beta, elapsed_time = linear_regression_manual(X, y)
            times.append(elapsed_time)

        avg_time = np.mean(times)
        std_time = np.std(times)
        results.append([n, avg_time, std_time, beta])
        print("\n Datenzeilen: ", n)
        print("Laufzeit: ", avg_time)

    return results

def linear_regression_manual(X, Y, *, return_stats=False):
    """Fit ordinary least squares via the normal equations and time the fit.

    Besides the coefficients, the usual regression summary (standard errors,
    t and F tests, residual quantiles, R squared) is computed inside the
    timed region, as in the original benchmark.

    Args:
        X: Design matrix of shape ``(n, k + 1)``. ``k`` is taken to be
            ``X.shape[1] - 1``, i.e. one column is assumed to be the
            intercept.
        Y: Response vector of length ``n``.
        return_stats: When true, additionally return a dict with the
            summary statistics. Defaults to false, which keeps the original
            two-element return value.

    Returns:
        ``(beta, elapsed_time)`` or, with ``return_stats=True``,
        ``(beta, elapsed_time, stats)``. ``elapsed_time`` is in seconds.

    Raises:
        numpy.linalg.LinAlgError: If ``X.T @ X`` is singular.
    """
    # perf_counter is monotonic and high resolution; time.time() can jump
    # and is too coarse for the sub-millisecond fits on small inputs.
    start_time = time.perf_counter()

    X = np.asarray(X)
    Y = np.asarray(Y)
    n = len(Y)
    k = X.shape[1] - 1  # predictors, not counting the intercept column

    XtX = X.T @ X
    XtY = X.T @ Y
    # The explicit inverse is needed below for the coefficient covariance.
    XtX_inv = np.linalg.inv(XtX)
    beta = XtX_inv @ XtY

    Y_pred = X @ beta
    residuals = Y - Y_pred
    f_residuals = n - k - 1  # residual degrees of freedom

    y_mean = np.mean(Y)
    SSE = np.sum(residuals ** 2)
    # Explained sum of squares uses the fitted values, not the coefficients.
    SSR = np.sum((Y_pred - y_mean) ** 2)
    SST = np.sum((Y - y_mean) ** 2)

    sigma_hat = np.sqrt(SSE / f_residuals)
    cov_matrix = (sigma_hat ** 2) * XtX_inv
    se = np.sqrt(np.diag(cov_matrix))
    t_werte = beta / se
    # sf(x) == 1 - cdf(x) but without cancellation for tiny tail values.
    p_werte = 2 * t.sf(np.abs(t_werte), f_residuals)
    f_statistics = (SSR / k) / (SSE / f_residuals)
    f_p_wert = f.sf(f_statistics, k, f_residuals)

    # Minimum, first quartile, median, third quartile, maximum.
    quantile = list(np.quantile(residuals, (0.0, 0.25, 0.5, 0.75, 1.0)))

    r_quadrat = 1 - SSE / SST
    adjusted_r_quadrat = 1 - ((n - 1) / f_residuals) * (SSE / SST)

    elapsed_time = time.perf_counter() - start_time

    if not return_stats:
        return beta, elapsed_time

    stats = {
        "df_residuals": f_residuals,
        "SSE": SSE,
        "SSR": SSR,
        "SST": SST,
        "sigma_hat": sigma_hat,
        "se": se,
        "t_values": t_werte,
        "p_values": p_werte,
        "f_statistic": f_statistics,
        "f_p_value": f_p_wert,
        "residual_quantiles": quantile,
        "r_squared": r_quadrat,
        "adjusted_r_squared": adjusted_r_quadrat,
    }
    return beta, elapsed_time, stats

# Sample sizes (number of data rows) to benchmark, in increasing order.
n_values = [
    100,
    500,
    1_000,
    5_000,
    10_000,
    50_000,
    100_000,
    200_000,
    500_000,
    1_000_000,
    5_000_000,
    10_000_000,
    50_000_000,
]
# Run the benchmark with the default number of repetitions per size.
benchmark_results = run_benchmark(n_values)


 Datenzeilen:  100
Laufzeit:  0.0007869243621826172

 Datenzeilen:  500
Laufzeit:  0.00032362937927246096

 Datenzeilen:  1000
Laufzeit:  0.00024166107177734376

 Datenzeilen:  5000
Laufzeit:  0.0004180908203125

 Datenzeilen:  10000
Laufzeit:  0.0006337642669677734

 Datenzeilen:  50000
Laufzeit:  0.0028793811798095703

 Datenzeilen:  100000
Laufzeit:  0.0045257568359375

 Datenzeilen:  200000
Laufzeit:  0.009740829467773438

 Datenzeilen:  500000
Laufzeit:  0.024773120880126953

 Datenzeilen:  1000000
Laufzeit:  0.040907430648803714

 Datenzeilen:  5000000
Laufzeit:  0.20268149375915528

 Datenzeilen:  10000000
Laufzeit:  0.41918277740478516

 Datenzeilen:  50000000
Laufzeit:  2.3308142185211183
