### read data and calc returns

In [94]:
import pandas as pd
import numpy as np

In [124]:
# Load the monthly universe panel; the DATE column is parsed to datetimes at read time.
df = pd.read_csv(
    'data_analytics_univ_rty_2014_m.csv',
    parse_dates=['DATE'],
)

In [126]:
# List the raw column names before deciding which ones to drop.
df.columns.tolist()

['ID',
 'DATE',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'market_cap',
 'bid',
 'ask',
 'baspd',
 'idx_close',
 'stock_return',
 'idx_return',
 'stock_excess_return',
 'beta',
 'volatility',
 'avg_volm_to_cap',
 'volume_trend',
 'residual_variance',
 'stock_return_1m',
 'idx_return_1m',
 'rs_1m',
 'stock_return_2m',
 'idx_return_2m',
 'rs_2m',
 'stock_return_3m',
 'idx_return_3m',
 'rs_3m',
 'stock_return_6m',
 'idx_return_6m',
 'rs_6m',
 'stock_return_12m',
 'idx_return_12m',
 'rs_12m',
 '3mrs_3mago',
 '3mrs_6mago',
 '3mrs_9mago',
 'forward_return',
 'idx_forward_return',
 'relative_return']

In [127]:
# Every column whose name contains 'idx_return' is dropped.
drop_cols = [name for name in df.columns if 'idx_return' in name]

# Explicit list of further columns to drop (price/quote fields, index close,
# the forward/relative return columns and the excess return).
drop_cols1 = [
    'open', 'high', 'low', 'close', 'volume',
    'bid', 'ask', 'baspd', 'idx_close',
    'forward_return', 'idx_forward_return', 'relative_return',
    'stock_excess_return',
]

# Keep only the names that actually exist in this file.
drop_cols1 = [name for name in drop_cols1 if name in df.columns]

In [128]:

# Drop both column lists in one pass. errors='ignore' keeps this cell
# re-runnable: on a second execution the columns are already gone and a plain
# drop() would raise KeyError.
df = df.drop(columns=drop_cols + drop_cols1, errors='ignore')

In [129]:
# stats and returns are both stamped as of the given date,
# so either shift returns up, or shift stats down, to regress returns on stats as of the beginning of the return period
# let's shift stats down so they align with the return end date

In [130]:
# Shift down everything except ID, DATE and stock_return; list the remaining columns first.
df.columns.tolist()

['ID',
 'DATE',
 'market_cap',
 'stock_return',
 'beta',
 'volatility',
 'avg_volm_to_cap',
 'volume_trend',
 'residual_variance',
 'stock_return_1m',
 'rs_1m',
 'stock_return_2m',
 'rs_2m',
 'stock_return_3m',
 'rs_3m',
 'stock_return_6m',
 'rs_6m',
 'stock_return_12m',
 'rs_12m',
 '3mrs_3mago',
 '3mrs_6mago',
 '3mrs_9mago']

In [131]:
# Columns to lag: everything except the two keys and the realized return.
shift_cols = [col for col in df.columns if col not in ('ID', 'DATE', 'stock_return')]

In [132]:
# Lag every feature by one row within each ID, so the row for a given DATE
# carries the stats from that ID's previous row.
# shift() works on row order, so sort by (ID, DATE) here instead of relying on
# the row order of the CSV. The assignment aligns on the index, so df's own row
# order is left untouched (assumes df's index is unique, as with the default
# RangeIndex produced by read_csv).
df[shift_cols] = (
    df.sort_values(['ID', 'DATE'])
    .groupby('ID')[shift_cols]
    .shift(1)
)

### functions - rolling reg / var

### calculate simple returns 

In [133]:
# Order rows chronologically, then by ID within each date.
df = df.sort_values(['DATE','ID'])

In [134]:
# Index label of the first non-null row for every column (ID and DATE included).
first_dates = (
    df.apply(lambda col: col.first_valid_index())
    .rename('first_idx')
    .to_frame()
)
value_columns = df.columns.difference(['ID', 'DATE'])

# Earliest and latest DATE on which each value column has a non-null entry.
# NOTE(review): dseries is created here but not used in the visible cells.
dseries = pd.DataFrame()
for column in value_columns:
    observed_dates = df.loc[df[column].notna(), 'DATE']
    first_date, last_date = observed_dates.min(), observed_dates.max()
    first_dates.loc[column, 'first_date'] = first_date
    first_dates.loc[column, 'last_date'] = last_date

In [135]:
# Show per-column coverage: first valid index label plus first/last populated DATE.
first_dates

Unnamed: 0,first_idx,first_date,last_date
ID,197,NaT,NaT
DATE,197,NaT,NaT
market_cap,198,2014-01-31,2025-02-28
stock_return,198,2014-01-31,2025-02-28
beta,222,2016-01-31,2025-02-28
volatility,222,2016-01-31,2025-02-28
avg_volm_to_cap,221,2015-12-31,2025-02-28
volume_trend,221,2015-12-31,2025-02-28
residual_variance,222,2016-01-31,2025-02-28
stock_return_1m,199,2014-02-28,2025-02-28


In [136]:
# Non-null counts and dtypes after the shift and sort.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194138 entries, 197 to 194137
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   ID                 194138 non-null  object        
 1   DATE               194138 non-null  datetime64[ns]
 2   market_cap         191383 non-null  float64       
 3   stock_return       192221 non-null  float64       
 4   beta               147090 non-null  float64       
 5   volatility         147090 non-null  float64       
 6   avg_volm_to_cap    148231 non-null  float64       
 7   volume_trend       148930 non-null  float64       
 8   residual_variance  147090 non-null  float64       
 9   stock_return_1m    190304 non-null  float64       
 10  rs_1m              190304 non-null  float64       
 11  stock_return_2m    188387 non-null  float64       
 12  rs_2m              188387 non-null  float64       
 13  stock_return_3m    186471 non-null  float64    

In [137]:
# Index the panel by (ID, DATE) so later cells can filter and group on the DATE level.
fdf = df.set_index(['ID','DATE'])

In [138]:
# Same summary for the (ID, DATE)-indexed frame.
fdf.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 194138 entries, ('AAOI UQ Equity', Timestamp('2013-12-31 00:00:00')) to ('ZYXI UW Equity', Timestamp('2025-02-28 00:00:00'))
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   market_cap         191383 non-null  float64
 1   stock_return       192221 non-null  float64
 2   beta               147090 non-null  float64
 3   volatility         147090 non-null  float64
 4   avg_volm_to_cap    148231 non-null  float64
 5   volume_trend       148930 non-null  float64
 6   residual_variance  147090 non-null  float64
 7   stock_return_1m    190304 non-null  float64
 8   rs_1m              190304 non-null  float64
 9   stock_return_2m    188387 non-null  float64
 10  rs_2m              188387 non-null  float64
 11  stock_return_3m    186471 non-null  float64
 12  rs_3m              186471 non-null  float64
 13  stock_return_6m    180742 non-null  float64
 14  rs_

In [139]:
idx = pd.IndexSlice
cutoff_date = '2015-12-31'
# Keep only rows dated strictly after the cutoff. Compare against a Timestamp
# rather than a bare string, and take an explicit copy: the following cells
# assign columns into fdf, and assigning into a boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
fdf = fdf.loc[fdf.index.get_level_values('DATE') > pd.Timestamp(cutoff_date)].copy()

In [140]:
# Fill gaps with the same-DATE cross-sectional mean, column by column.
# NOTE(review): value_columns includes stock_return, so missing realized
# returns are imputed as well — confirm this is intended for the regression
# target and for the realized returns exported later.
fdf[value_columns] = (
    fdf.groupby('DATE')[value_columns]
    .transform(lambda col: col.fillna(col.mean()))
)

In [141]:
# Every value column except the realized return. Despite the name, these are
# passed as the independent variables of the regression further down.
Ycols = value_columns.difference(['stock_return'])

In [142]:
from scipy.stats.mstats import winsorize
from scipy.stats import zscore

# Standardize each factor within its DATE cross-section and cap the z-scores
# at +/-3. (winsorize is imported above but not applied; outliers are handled
# by the clip instead.)
fdf[Ycols] = (
    fdf[Ycols]
    .groupby('DATE')
    .transform(lambda col: zscore(col).clip(-3, 3))
)

In [143]:
def cross_sectional_regression(df, dependent_var, independent_vars, reg_type='OLS', alpha=1.0):
    """
    Fit one cross-sectional regression per DATE and collect the coefficients.

    The frame is grouped by 'DATE'; within each group ``dependent_var`` is
    regressed on ``independent_vars``. Groups are fitted in parallel with
    joblib (``n_jobs=-1``). A group that has any missing value in X or y is
    not fitted: its intercept and coefficients are all NaN.

    Args:
        df: DataFrame holding ``dependent_var`` and ``independent_vars``.
            'DATE' may be a column or an index level (anything
            ``df.groupby('DATE')`` accepts).
        dependent_var: Name of the dependent variable column.
        independent_vars: List-like of independent variable column names
            (a list or a pandas Index both work).
        reg_type: 'OLS', 'Ridge' or 'Lasso' (case-insensitive). Default 'OLS'.
        alpha: Regularization strength for Ridge and Lasso. Default 1.0.

    Returns:
        DataFrame indexed by DATE (one row per date) with an 'intercept'
        column followed by one column per independent variable. An empty
        input yields an empty frame with those columns.

    Raises:
        ValueError: If ``reg_type`` is not one of the supported names. This is
            checked before any data is touched, so an invalid value is
            rejected even when every group would short-circuit to NaN.
    """
    import numpy as np
    import pandas as pd

    # Validate up front instead of inside the per-group worker.
    reg_key = str(reg_type).upper()
    if reg_key not in ('OLS', 'RIDGE', 'LASSO'):
        raise ValueError("Invalid reg_type. Choose 'OLS', 'Ridge', or 'Lasso'.")

    # Materialize once so an Index or tuple behaves like a plain list below.
    independent_vars = list(independent_vars)
    coef_columns = ['intercept'] + independent_vars

    # Nothing to fit: return a correctly shaped empty result rather than
    # failing on set_index('DATE') over a column-less frame.
    if len(df) == 0:
        return pd.DataFrame(
            columns=coef_columns,
            index=pd.Index([], name='DATE'),
            dtype=float,
        )

    # Third-party imports are deferred until there is something to fit.
    from sklearn.linear_model import LinearRegression, Ridge, Lasso
    from joblib import Parallel, delayed

    def _make_model():
        # One fresh estimator per group; reg_key was validated above.
        if reg_key == 'OLS':
            return LinearRegression()
        if reg_key == 'RIDGE':
            return Ridge(alpha=alpha)
        return Lasso(alpha=alpha)

    def _regress_group(date, group_data):
        # Returns a flat record: the group's DATE key plus intercept and coefficients.
        X = group_data[independent_vars].values
        y = group_data[dependent_var].values.reshape(-1, 1)

        if pd.isna(X).any() or pd.isna(y).any():
            # Any missing value invalidates the whole cross-section.
            values = [np.nan] * len(coef_columns)
        else:
            model = _make_model().fit(X, y)
            # intercept_ is a scalar or a length-1 array depending on the
            # estimator and on y's shape; ravel() normalizes both cases.
            values = [np.ravel(model.intercept_)[0], *np.ravel(model.coef_)]

        return {'DATE': date, **dict(zip(coef_columns, values))}

    # Use the groupby key as the date label, so DATE can be a column or an index level.
    records = Parallel(n_jobs=-1)(
        delayed(_regress_group)(date, group)
        for date, group in df.groupby('DATE')
    )

    # Building from plain records lets pandas infer float columns directly.
    results_df = pd.DataFrame(records, columns=['DATE'] + coef_columns)
    return results_df.set_index('DATE')

# Let's create an example to demonstrate the usage

In [144]:
# Flatten the (ID, DATE) index back into columns; the CSV dump below selects
# 'ID' and 'DATE' as ordinary columns.
fdf = fdf.reset_index()

In [145]:
# One row of coefficients per DATE: realized return regressed on the factor columns.
betas = cross_sectional_regression(
    fdf,
    dependent_var='stock_return',
    independent_vars=Ycols,
)

In [146]:
# Smooth the per-date coefficients with a 12-row rolling mean. With the default
# min_periods the first 11 rows are NaN.
# NOTE(review): this overwrites `betas`, so re-running the cell smooths twice.
betas = betas.rolling(window=12).mean()
# The index is already named DATE; just make sure it is a datetime index.
betas.index = pd.to_datetime(betas.index)

In [147]:
# Lag the smoothed betas by one row so each DATE carries the previous row's
# values, then append one extra row — stamped one month-end past the last
# date — that holds the most recent betas.
next_month_end = betas.index[-1] + pd.offsets.MonthEnd(1)
carry_forward_row = pd.DataFrame([betas.iloc[-1]], index=[next_month_end])

betas_shifted = pd.concat([betas.shift(1), carry_forward_row])
betas_shifted.index.name = 'DATE'

In [148]:
# Dump the lagged betas and the standardized factor panel for manual confirmation.
factor_cols = list(Ycols)
betas_shifted[factor_cols + ['intercept']].to_csv('betas.csv')
fdf[['ID', 'DATE'] + factor_cols].to_csv('fdf.csv')

In [149]:
# Restore the (ID, DATE) MultiIndex so per-DATE betas can be broadcast with level='DATE'.
fdf = fdf.set_index(['ID','DATE'])

In [153]:
# Expected return = intercept + sum(factor exposure * lagged beta), with the
# betas broadcast to every ID on the matching DATE.
# skipna=False matters: the default sum() skips NaN, so a date with no betas
# (rolling warm-up, or a date whose regression returned NaN) would be scored
# as exactly 0.0 instead of as missing. Rows that cannot be scored are dropped
# so they never reach the binning step.
yfit = (
    fdf[Ycols]
    .assign(intercept=1)
    .mul(betas_shifted, level='DATE')
    .sum(axis=1, skipna=False)
    .rename('exp_ret')
    .dropna()
    .to_frame()
)

In [155]:
# Assign each name to a decile of expected return within its DATE
# (0 = lowest exp_ret, 9 = highest).
# pd.cut(x, 10) builds ten equal-WIDTH bins over the range of exp_ret, so a few
# outliers leave most names in the middle bins. Deciles need equal-COUNT
# buckets, which are derived here from the within-date rank using exact
# integer arithmetic. Ties are broken by order of appearance (method='first').
by_date = yfit.groupby('DATE')['exp_ret']
zero_based_rank = by_date.rank(method='first') - 1   # NaN exp_ret stays NaN
names_per_date = by_date.transform('count')          # non-null names on that DATE
# Nullable Int64 keeps integer labels while allowing missing scores.
yfit['quantile'] = (zero_based_rank * 10 // names_per_date).astype('Int64')

In [None]:
# using stats as of date
# Dec 1st stats (shifted) with Dec end returns gives Dec beta.
# average of Jan to Dec betas gives Dec average beta
# Dec beta with Dec returns gives Dec ranks

In [156]:
# Attach the realized return to each (ID, DATE) score via an index-on-index merge.
# NOTE(review): fdf['stock_return'] was mean-filled together with the other
# value columns earlier, so some of these "realized" returns may be imputed.
realized = fdf[['stock_return']]
yfit = yfit.merge(realized, left_index=True, right_index=True)

In [None]:
# this is ignoring a lot of the features, but need this for now to proceed with just model fitting 
# goal is daily data, 


In [161]:
# Keep only rows dated strictly after 2016-12-31.
yfit_clean = yfit.loc[
    yfit.index.get_level_values('DATE') > pd.Timestamp('2016-12-31')
]

In [None]:
# Write the scored panel (exp_ret, quantile, stock_return) to CSV.
# NOTE(review): the path is relative to the notebook's working directory —
# confirm it resolves to the intended location.
yfit_clean.to_csv('../../bbgfactor/rty_2014_smallfeatures.csv')

In [None]:
# what output do we need?
# 1. betas 
# 2. tstats
# 3. ranks and realized returns for decile testing => goes to rank backtester 
# 4. current portfolio 
# 5. can redo ranks based on combining top betas differently 
