In [14]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression

def rolling_regression(df, window_size, dependent_var, independent_vars):
    """
    Performs rolling ordinary-least-squares regression in parallel using joblib.

    For every row position ``i >= window_size - 1`` a ``LinearRegression`` is
    fitted on the ``window_size`` consecutive rows ending at ``i`` (inclusive).
    Windows are defined by row position, not by index values. A window is
    skipped when any of its dependent or independent values is NaN.

    Args:
        df: Pandas DataFrame containing the data.
        window_size: Size of the rolling window (number of rows, >= 1).
        dependent_var: Name of the dependent variable column.
        independent_vars: List of names of independent variable columns.

    Returns:
        Pandas DataFrame with one row per fitted window. The index (named
        'index') holds the label of the last row of each window; the columns
        are 'intercept' followed by one scalar coefficient column per
        independent variable.
        Returns None if no window could be fitted (fewer than ``window_size``
        rows, or every window contains a NaN).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    # Extract the regression inputs once instead of re-slicing the DataFrame
    # for every window. The target is kept 1-D on purpose: with a 1-D target
    # sklearn exposes ``intercept_`` as a scalar and ``coef_`` as a 1-D array,
    # so every output cell is a plain number rather than a 1-element array.
    X_all = df[independent_vars].to_numpy(dtype=float)
    y_all = df[dependent_var].to_numpy(dtype=float)
    n_rows = len(df)

    # A row poisons every window it belongs to if any of its inputs is NaN.
    row_has_nan = np.isnan(X_all).any(axis=1) | np.isnan(y_all)

    # Positions of the last row of every complete, NaN-free window.
    window_ends = [
        end
        for end in range(window_size - 1, n_rows)
        if not row_has_nan[end - window_size + 1:end + 1].any()
    ]

    if not window_ends:  # Too few rows, or every window contains a NaN
        return None

    def _fit_window(X, y):
        """Fit one window and return ``(intercept, coefficients)``."""
        model = LinearRegression()
        model.fit(X, y)
        return float(model.intercept_), model.coef_

    # Only the window slices are shipped to the workers; joblib returns the
    # results in the same order as the submitted tasks.
    fits = Parallel(n_jobs=-1)(
        delayed(_fit_window)(
            X_all[end - window_size + 1:end + 1],
            y_all[end - window_size + 1:end + 1],
        )
        for end in window_ends
    )

    # Label every fit with the index value of its window's last row so the
    # result can be aligned with / merged back onto the input frame.
    rows = [
        {'index': df.index[end], 'intercept': intercept, **dict(zip(independent_vars, coefs))}
        for end, (intercept, coefs) in zip(window_ends, fits)
    ]

    return pd.DataFrame(rows).set_index('index')

In [7]:
# --- Demo 1: rolling regression on clean synthetic data ----------------------
np.random.seed(42)  # fixed seed so the printed coefficients are repeatable
data = {
    'A': np.random.rand(100),
    'B': np.random.rand(100),
    'C': np.random.rand(100),
    'Y': 2*np.random.rand(100) + 0.5,
}
df = pd.DataFrame(data)

# Regression settings shared by both demos below.
window_size = 20
dependent_var = 'Y'
independent_vars = ['A', 'B', 'C']

results_df = rolling_regression(df, window_size, dependent_var, independent_vars)

# rolling_regression returns None when no window could be fitted.
if results_df is None:
    print("Rolling regression could not be performed due to data issues.")
else:
    print(results_df.head())
    print(results_df.tail())


# --- Demo 2: same data with a gap of NaNs -------------------------------------
# Rows 5-9 of column 'B' are blanked out on a copy, leaving `df` untouched.
df_nan = df.copy()
df_nan.iloc[5:10, df_nan.columns.get_loc('B')] = np.nan
results_df_nan = rolling_regression(df_nan, window_size, dependent_var, independent_vars)

if results_df_nan is None:
    print("\nRolling regression could not be performed due to data issues (NaNs).")
else:
    print("\nResults with NaNs (some rows will be missing):")
    print(results_df_nan.head())

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression

# rolling_regression is defined earlier in this notebook and is reused unchanged below.

# Example data with ID and DATE

In [15]:
# Synthetic panel data: one row per calendar day starting 2023-01-01, each row
# randomly assigned to one of three IDs.
np.random.seed(42)
n_rows = 100

# Keyword arguments are evaluated left to right, so the order below fixes
# which random draws end up in which column.
data = dict(
    ID=np.random.choice(['A', 'B', 'C'], n_rows),
    DATE=pd.to_datetime('2023-01-01') + pd.to_timedelta(np.arange(n_rows), unit='D'),
    A=np.random.rand(n_rows),
    B=np.random.rand(n_rows),
    C=np.random.rand(n_rows),
    Y=2 * np.random.rand(n_rows) + 0.5,
)

# Index by DATE so each regression result is labelled with the date of the
# last row of its window.
df = pd.DataFrame(data).set_index('DATE')

# Regression settings read by apply_rolling_regression below.
window_size, dependent_var = 20, 'Y'
independent_vars = ['A', 'B', 'C']

def apply_rolling_regression(group):
    """Run ``rolling_regression`` on the rows of a single group.

    Meant to be used as the callback of ``DataFrame.groupby(...).apply``.
    The window size and the column names are not parameters: they are read
    from the module-level ``window_size``, ``dependent_var`` and
    ``independent_vars`` at call time.

    Args:
        group: DataFrame holding the rows of one group.

    Returns:
        Whatever ``rolling_regression`` returns for that group.
    """
    return rolling_regression(
        df=group,
        window_size=window_size,
        dependent_var=dependent_var,
        independent_vars=independent_vars,
    )

# Fit the rolling regression separately within each ID.
# Only the columns the regression actually reads are selected before `apply`,
# so the 'ID' grouping column is never handed to the callback. Passing grouping
# columns to `apply` is deprecated in recent pandas releases; the explicit
# selection yields the same result on old and new versions alike.
results_grouped = (
    df.groupby('ID')[[dependent_var, *independent_vars]]
    .apply(apply_rolling_regression)
)