### read data and calc returns

In [48]:
import pandas as pd


df = pd.read_csv('data.csv', parse_dates = True).set_index(['ID','DATE'])

In [49]:
df = df[~df.index.duplicated()].reset_index() # causes multiindex issues later

In [50]:
df.shape

(150, 18)

### functions - ret calc and makeready

In [51]:
def calculate_returns(df):
    """Helper function to calculate returns and excess returns"""
    df = df.copy()
    df['stock_return'] = df.groupby('ID')['px_last_splits'].pct_change()
    df['spy_return'] = df.groupby('ID')['spy_splits'].pct_change()
    df['stock_excess_return'] = df['stock_return'] - df['spy_return']

    return df

### functions - rolling reg / var

In [52]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression

def rolling_residual_variance(df, window_size, dependent_var, independent_vars):
    """
    Performs rolling regression in parallel using joblib.

    Args:
        df: Pandas DataFrame containing the data.
        window_size: Size of the rolling window.
        dependent_var: Name of the dependent variable column.
        independent_vars: List of names of independent variable columns.

    Returns:
        Pandas DataFrame with the regression coefficients for each window.
        Returns None if there are issues.
    """

    n_rows = len(df)
    results = []

    def _regress_window(i):
        if i < window_size -1:
            return None # Handle edge cases at beginning of dataframe
        window_data = df.iloc[i - window_size + 1:i + 1]

        X = window_data[independent_vars].values
        y = window_data[dependent_var].values.reshape(-1,1) # Reshape y for sklearn

        if len(window_data) < window_size or np.any(np.isnan(X)) or np.any(np.isnan(y)):
            return None  # Handle cases where the window is incomplete or contains NaNs.
        
        model = LinearRegression()
        model.fit(X, y)
        y_pred = model.predict(X)

        # Step 3: Calculate the residuals
        residuals = y - y_pred

        # Step 4: Compute the residual variance
        residual_variance = np.var(residuals, ddof=1) 

        return {'index': df.index[i], 'residual_variance': residual_variance} # Include index for proper merging


    results = Parallel(n_jobs=-1)(delayed(_regress_window)(i) for i in range(n_rows))
    
    

    # Filter out None results (from edge cases or NaN windows)
    valid_results = [r for r in results if r is not None]

    if not valid_results: # Check if all results are invalid
        return None

    results_df = pd.DataFrame(valid_results)#.set_index('index')
    
    # Handle multiindex
    if isinstance(results_df['index'].iloc[0], tuple):
        results_df[list(df.index.names)] = results_df['index'].apply(pd.Series)
        results_df = results_df.drop(columns='index').set_index(list(df.index.names))
    else:
        results_df = results_df.set_index('index')
        
    return results_df

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from joblib import Parallel, delayed

# # def rolling_regression(df, window_size, dependent_var, independent_vars, reg_type='OLS', alpha=1.0):
# #     """
# #     Performs rolling regression in parallel using joblib, with options for OLS, Ridge, and Lasso.

# #     Args:
# #         df: Pandas DataFrame containing the data.
# #         window_size: Size of the rolling window.
# #         dependent_var: Name of the dependent variable column.
# #         independent_vars: List of names of independent variable columns.
# #         reg_type: Type of regression to perform ('OLS', 'Ridge', 'Lasso'). Default is 'OLS'.
# #         alpha: Regularization strength for Ridge and Lasso. Default is 1.0.

# #     Returns:
# #         Pandas DataFrame with the regression coefficients for each window, indexed by the original DataFrame's index.
# #         Returns None if there are issues.
# #     """

# #     n_rows = len(df)
# #     results = []

# #     def _regress_window(i):
# #         if i < window_size -1:
# #             return None # Handle edge cases at beginning of dataframe
# #         window_data = df.iloc[i - window_size + 1:i + 1]

# #         X = window_data[independent_vars].values
# #         y = window_data[dependent_var].values.reshape(-1,1) # Reshape y for sklearn

# #         if len(window_data) < window_size or np.any(np.isnan(X)) or np.any(np.isnan(y)):
# #             return None  # Handle cases where the window is incomplete or contains NaNs.

# #         if reg_type.upper() == 'OLS':
# #             model = LinearRegression()
# #         elif reg_type.upper() == 'RIDGE':
# #             model = Ridge(alpha=alpha)
# #         elif reg_type.upper() == 'LASSO':
# #             model = Lasso(alpha=alpha)
# #         else:
# #             raise ValueError("Invalid reg_type. Choose 'OLS', 'Ridge', or 'Lasso'.")

# #         model.fit(X, y)
# #         coefs = model.coef_.flatten() # Flatten the coefficients to a 1D array
# #         intercept = model.intercept_

# #         return {'index': df.index[i], 'intercept': intercept, **dict(zip(independent_vars, coefs))} # Include index for proper merging


# #     results = Parallel(n_jobs=-1)(delayed(_regress_window)(i) for i in range(n_rows))

# #     # Filter out None results (from edge cases or NaN windows)
# #     valid_results = [r for r in results if r is not None]

# #     if not valid_results: # Check if all results are invalid
# #         return None

# #     results_df = pd.DataFrame(valid_results)#.set_index('index')
    
# #     # handle multiinex
# #     if(isinstance(results_df['index'].iloc[0],tuple)):
# #         results_df[list(df.index.names)] = results_df['index'].apply(pd.Series)
# #         results_df = results_df.drop(columns = 'index').set_index(list(df.index.names))
    
# #     return results_df

def rolling_regression(df, window_size, dependent_var, independent_vars, reg_type='OLS', alpha=1.0):
    """
    Performs rolling regression in parallel using joblib, with options for OLS, Ridge, and Lasso.
    Returns np.nan for coefficients when X or y contains NaN values.

    Args:
        df: Pandas DataFrame containing the data.
        window_size: Size of the rolling window.
        dependent_var: Name of the dependent variable column.
        independent_vars: List of names of independent variable columns.
        reg_type: Type of regression to perform ('OLS', 'Ridge', 'Lasso'). Default is 'OLS'.
        alpha: Regularization strength for Ridge and Lasso. Default is 1.0.

    Returns:
        Pandas DataFrame with the regression coefficients for each window, indexed by the original DataFrame's index.
        Returns None if there are issues.
    """

    n_rows = len(df)
    results = []

    def _regress_window(i):
        if i < window_size - 1:
            return {'index': df.index[i],
                   'intercept': np.nan,
                   **dict(zip(independent_vars, [np.nan] * len(independent_vars)))}

        window_data = df.iloc[i - window_size + 1:i + 1]

        X = window_data[independent_vars].values
        y = window_data[dependent_var].values.reshape(-1,1)

        # Return NaN coefficients if window contains NaN or is incomplete
        if len(window_data) < window_size or np.any(np.isnan(X)) or np.any(np.isnan(y)):
            return {'index': df.index[i],
                   'intercept': np.nan,
                   **dict(zip(independent_vars, [np.nan] * len(independent_vars)))}

        if reg_type.upper() == 'OLS':
            model = LinearRegression()
        elif reg_type.upper() == 'RIDGE':
            model = Ridge(alpha=alpha)
        elif reg_type.upper() == 'LASSO':
            model = Lasso(alpha=alpha)
        else:
            raise ValueError("Invalid reg_type. Choose 'OLS', 'Ridge', or 'Lasso'.")

        model.fit(X, y)
        coefs = model.coef_.flatten()
        intercept = model.intercept_

        return {'index': df.index[i], 'intercept': intercept, **dict(zip(independent_vars, coefs))}

    results = Parallel(n_jobs=-1)(delayed(_regress_window)(i) for i in range(n_rows))

    # All results should be valid now since we're returning NaN instead of None
    results_df = pd.DataFrame(results)

    # Handle multiindex
    if isinstance(results_df['index'].iloc[0], tuple):
        results_df[list(df.index.names)] = results_df['index'].apply(pd.Series)
        results_df = results_df.drop(columns='index').set_index(list(df.index.names))
    else:
        results_df = results_df.set_index('index')

    return results_df

# def rolling_regression(df, window_size, dependent_var, independent_vars):
#     """
#     Performs rolling regression in parallel using joblib.

#     Args:
#         df: Pandas DataFrame containing the data.
#         window_size: Size of the rolling window.
#         dependent_var: Name of the dependent variable column.
#         independent_vars: List of names of independent variable columns.

#     Returns:
#         Pandas DataFrame with the regression coefficients for each window.
#         Returns None if there are issues.
#     """

#     n_rows = len(df)
#     results = []

#     def _regress_window(i):
#         if i < window_size -1:
#             return None # Handle edge cases at beginning of dataframe
#         window_data = df.iloc[i - window_size + 1:i + 1]

#         X = window_data[independent_vars].values
#         y = window_data[dependent_var].values.reshape(-1,1) # Reshape y for sklearn

#         if len(window_data) < window_size or np.any(np.isnan(X)) or np.any(np.isnan(y)):
#             return None  # Handle cases where the window is incomplete or contains NaNs.
        
#         model = LinearRegression()
#         model.fit(X, y)
#         coefs = model.coef_.flatten() # Flatten the coefficients to a 1D array
#         intercept = model.intercept_

#         return {'index': df.index[i], 'intercept': intercept, **dict(zip(independent_vars, coefs))} # Include index for proper merging


#     results = Parallel(n_jobs=-1)(delayed(_regress_window)(i) for i in range(n_rows))

#     # Filter out None results (from edge cases or NaN windows)
#     valid_results = [r for r in results if r is not None]

#     if not valid_results: # Check if all results are invalid
#         return None

#     results_df = pd.DataFrame(valid_results).set_index('index')
#     return results_df




### calculate metrics 

In [53]:
mdf = calculate_returns(df).set_index(['ID','DATE'])

### technical factors

In [54]:
mdf['beta'] = mdf\
.groupby('ID', group_keys = False).apply(lambda x: rolling_regression(x, window_size = 24 , dependent_var = 'stock_return', independent_vars = ['spy_return']))\
.drop(columns = 'intercept')\
.rename(columns = {'spy_return':'beta'})

mdf['volatility'] = mdf.groupby('ID',group_keys = False)['stock_return'].rolling(24).std().mul(np.sqrt(12)).reset_index(level = 0, drop = True).rename('volatility')

mdf['avg_volm_to_cap'] = mdf.groupby('ID', group_keys = False).apply(lambda x: x['px_volume'].rolling(12).mean()/(x['cur_mkt_cap']/1000000)).rename('avg_volm_to_cap')


mdf['volume_trend'] = mdf.groupby('ID', group_keys = False).apply(lambda x: rolling_regression(x.assign(trend = lambda x:np.arange(len(x))), window_size = 24 , dependent_var = 'px_volume', independent_vars = ['trend'])).rename(columns = {'trend':'volume_trend'}).drop(columns = 'intercept')



mdf['residual_variance'] = mdf.groupby('ID', group_keys = False).apply(lambda x: rolling_residual_variance(x, window_size = 24 , dependent_var = 'stock_return', independent_vars = ['spy_return']))



In [55]:
def compound_returns(returns):
    return (1 + returns).prod() - 1

# List of rolling periods to calculate
periods = [1, 2, 3, 6, 12]

# Calculate rolling compounded returns for each period
for period in periods:
    # Stock returns
    mdf[f'stock_return_{period}m'] = mdf.groupby('ID')['stock_return'].rolling(
        window=period, min_periods=period
    ).apply(compound_returns).reset_index(level = 0, drop = True).values

    # SPY returns
    mdf[f'spy_return_{period}m'] = mdf.groupby('ID')['spy_return'].rolling(
        window=period, min_periods=period
    ).apply(compound_returns).reset_index(level = 0, drop = True).values

    # Calculate excess returns (stock - spy)
    mdf[f'rs_{period}m'] = mdf[f'stock_return_{period}m'] - mdf[f'spy_return_{period}m']

In [56]:
mdf['3mrs_3mago'] = mdf.groupby('ID')['rs_3m'].shift(3)
mdf['3mrs_6mago'] = mdf.groupby('ID')['rs_3m'].shift(6)
mdf['3mrs_9mago'] = mdf.groupby('ID')['rs_3m'].shift(9)

### value factors

In [57]:
# earnings to price

In [58]:
mdf['eps_to_price'] = mdf.groupby('ID').apply(lambda x: x['eps']/x['px_last_splits']).reset_index(0, drop = True)


mdf['eps_to_price_trend']= mdf.groupby('ID', group_keys = False)\
.apply(lambda x: rolling_regression(x.assign(trend = lambda x:np.arange(len(x))), window_size = 24 , dependent_var = 'eps_to_price', independent_vars = ['trend']))\
.rename(columns = {'trend':'eps_to_price_trend'}).drop(columns = 'intercept')


In [59]:
# sales to price 

In [60]:
mdf['sales_to_price'] = mdf.groupby('ID').apply(lambda x: x['sales']/x['px_last_splits']).reset_index(0, drop = True)


mdf['sales_to_price_trend']= mdf.groupby('ID', group_keys = False)\
.apply(lambda x: rolling_regression(x.assign(trend = lambda x:np.arange(len(x))), window_size = 24 , dependent_var = 'sales_to_price', independent_vars = ['trend']))\
.rename(columns = {'trend':'sales_to_price_trend'}).drop(columns = 'intercept')


In [61]:
# cash to price

In [62]:
mdf['fcf_calc'] = mdf['cfo_ltm_a'] + mdf['capex'] + mdf['dvd'] 

mdf['cash_to_price'] = mdf['fcf_calc'] / mdf['px_last_splits']

mdf['cash_to_price_trend']= mdf.groupby('ID', group_keys = False)\
.apply(lambda x: rolling_regression(x.assign(trend = lambda x:np.arange(len(x))), window_size = 24 , dependent_var = 'cash_to_price', independent_vars = ['trend']))\
.rename(columns = {'trend':'cash_to_price_trend'}).drop(columns = 'intercept')


In [63]:
# dividend to price 

In [64]:
mdf['div_to_price'] = np.abs(mdf['dvd']) / mdf['px_last_splits']

mdf['div_to_price_trend']= mdf.groupby('ID', group_keys = False)\
.apply(lambda x: rolling_regression(x.assign(trend = lambda x:np.arange(len(x))), window_size = 24 , dependent_var = 'div_to_price', independent_vars = ['trend']))\
.rename(columns = {'trend':'div_to_price_trend'}).drop(columns = 'intercept')


In [65]:
# book to price


In [66]:
mdf['book_to_price'] = mdf['book_value'] / mdf['cur_mkt_cap']

mdf['book_to_price_trend']= mdf.groupby('ID', group_keys = False)\
.apply(lambda x: rolling_regression(x.assign(trend = lambda x:np.arange(len(x))), window_size = 24 , dependent_var = 'book_to_price', independent_vars = ['trend']))\
.rename(columns = {'trend':'book_to_price_trend'}).drop(columns = 'intercept')


In [74]:
mdf.columns.tolist()

['Unnamed: 0',
 'px_last_splits',
 'px_last_full',
 'cur_mkt_cap',
 'px_volume',
 'spy_splits',
 'spy_tr',
 'eps',
 'sales',
 'cfo_ltm_a',
 'fcf_ltm_a',
 'capex',
 'dvd',
 'opmargin1',
 'book_value',
 'shares_outstanding',
 'stock_return',
 'spy_return',
 'stock_excess_return',
 'beta',
 'volatility',
 'avg_volm_to_cap',
 'volume_trend',
 'residual_variance',
 'stock_return_1m',
 'spy_return_1m',
 'rs_1m',
 'stock_return_2m',
 'spy_return_2m',
 'rs_2m',
 'stock_return_3m',
 'spy_return_3m',
 'rs_3m',
 'stock_return_6m',
 'spy_return_6m',
 'rs_6m',
 'stock_return_12m',
 'spy_return_12m',
 'rs_12m',
 '3mrs_3mago',
 '3mrs_6mago',
 '3mrs_9mago',
 'eps_to_price',
 'eps_to_price_trend',
 'sales_to_price',
 'sales_to_price_trend',
 'fcf_calc',
 'cash_to_price',
 'cash_to_price_trend',
 'div_to_price',
 'div_to_price_trend',
 'book_to_price',
 'book_to_price_trend']

In [81]:
fdf = mdf[['stock_return',
           'cur_mkt_cap',
           'px_last_splits',
           'beta',
             'volatility',
             'avg_volm_to_cap',
             'volume_trend',
             'residual_variance',
             'stock_return_1m',
             'rs_1m',
             'stock_return_2m',

             'rs_2m',
             'stock_return_3m',

             'rs_3m',
             'stock_return_6m',

             'rs_6m',
             'stock_return_12m',

             'rs_12m',
             '3mrs_3mago',
             '3mrs_6mago',
             '3mrs_9mago',
             'eps_to_price',
             'eps_to_price_trend',
             'sales_to_price',
             'sales_to_price_trend',
             'fcf_calc',
             'cash_to_price',
             'cash_to_price_trend',
             'div_to_price',
             'div_to_price_trend',
             'book_to_price',
             'book_to_price_trend']]
           

In [91]:
fcols = fdf.columns.to_list()
fcols.remove('stock_return')

In [76]:
fdf = fdf.groupby('DATE').transform(lambda x: x.fillna(x.mean()))

In [95]:
from scipy.stats.mstats import winsorize
from scipy.stats import zscore

# df[numcols] = df[numcols].groupby('ITERATION_DATE').transform(lambda x: winsorize(x, limits = (0.01,0.01)))
fdf[fcols] = fdf[fcols].groupby('DATE').transform(lambda x: zscore(x).clip(-3,3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fdf[fcols] = fdf[fcols].groupby('DATE').transform(lambda x: zscore(x).clip(-3,3))


In [96]:
fdf[fcols] = fdf.groupby(level = 'ID')[fcols].shift(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fdf[fcols] = fdf.groupby(level = 'ID')[fcols].shift(1)


In [97]:
betas = fdf\
.groupby('DATE', group_keys = False).apply(lambda x: rolling_regression(x, window_size = 24 , dependent_var = 'stock_return', independent_vars = fcols))\
.drop(columns = 'intercept')

In [98]:
betas

Unnamed: 0_level_0,Unnamed: 1_level_0,cur_mkt_cap,px_last_splits,beta,volatility,avg_volm_to_cap,volume_trend,residual_variance,stock_return_1m,rs_1m,stock_return_2m,...,eps_to_price_trend,sales_to_price,sales_to_price_trend,fcf_calc,cash_to_price,cash_to_price_trend,div_to_price,div_to_price_trend,book_to_price,book_to_price_trend
ID,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
IBM US Equity,2018-12-31,,,,,,,,,,,...,,,,,,,,,,
IBM US Equity,2019-01-31,,,,,,,,,,,...,,,,,,,,,,
IBM US Equity,2019-02-28,,,,,,,,,,,...,,,,,,,,,,
IBM US Equity,2019-03-31,,,,,,,,,,,...,,,,,,,,,,
IBM US Equity,2019-04-30,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AAPL US Equity,2024-12-31,,,,,,,,,,,...,,,,,,,,,,
IBM US Equity,2025-01-31,,,,,,,,,,,...,,,,,,,,,,
IBM US Equity,2025-02-07,,,,,,,,,,,...,,,,,,,,,,
AAPL US Equity,2025-01-31,,,,,,,,,,,...,,,,,,,,,,


### testing dataframe 

In [None]:
# Example Usage with MultiIndex:
import pandas as pd
import numpy as np

# Create a sample DataFrame with MultiIndex
np.random.seed(0)
dates = pd.to_datetime(range(100), unit='D', origin=pd.Timestamp('2025-02-07'))
levels = ['A', 'B']
level_values = np.random.choice(levels, 100)
multi_index = pd.MultiIndex.from_arrays([level_values, dates], names=['level', 'date'])
data = np.random.randn(100, 3)
df_multi = pd.DataFrame(data, columns=['var1', 'var2', 'dep_var'], index=multi_index)

window_size = 20
dependent_var = 'dep_var'
independent_vars = ['var1', 'var2']

# Perform OLS Rolling Regression (default)
ols_results_multi = rolling_regression(df_multi, window_size, dependent_var, independent_vars)
print("OLS Results with MultiIndex:")
print(ols_results_multi.head())
print(ols_results_multi.index[:5]) # Print first 5 indices to verify MultiIndex is preserved

# Perform Ridge Rolling Regression
ridge_results_multi = rolling_regression(df_multi, window_size, dependent_var, independent_vars, reg_type='Ridge', alpha=0.5)
print("\nRidge Results with MultiIndex:")
print(ridge_results_multi.head())
print(ridge_results_multi.index[:5]) # Print first 5 indices to verify MultiIndex is preserved

# Perform Lasso Rolling Regression
lasso_results_multi = rolling_regression(df_multi, window_size, dependent_var, independent_vars, reg_type='Lasso', alpha=0.1)
print("\nLasso Results with MultiIndex:")
print(lasso_results_multi.head())
print(lasso_results_multi.index[:5]) # Print first 5 indices to verify MultiIndex is preserved