In [None]:
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Data directory: the "data" folder that sits next to the notebook's working directory.
data_dir = Path.cwd().parent.joinpath("data")

In [17]:
# Load the factor panel from data_dir, parsing the "date" column as datetimes.
df = pd.read_csv(data_dir.joinpath("factors.csv"), parse_dates=["date"])

In [18]:
# Print column names, non-null counts and dtypes to sanity-check the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45929 entries, 0 to 45928
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PERMNO         45929 non-null  int64         
 1   date           45929 non-null  datetime64[ns]
 2   tic            45929 non-null  object        
 3   conm           45929 non-null  object        
 4   cusip          45929 non-null  object        
 5   market_cap     45929 non-null  float64       
 6   n_shares       45929 non-null  float64       
 7   n_months       45929 non-null  int64         
 8   ret_arith      45928 non-null  float64       
 9   ret_geo        45928 non-null  float64       
 10  vol_36m        45275 non-null  float64       
 11  value          45802 non-null  float64       
 12  profitability  45683 non-null  float64       
 13  investment     45903 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(2), object(3)
memory usage: 4.

In [28]:
# Columns kept for the factor analysis: the date key followed by the factor series.
factor_cols = [
    "date",
    "ret_geo",
    "vol_36m",
    "value",
    "investment",
    "profitability",
]

# Turn +/-inf into NaN so infinite values are treated as missing downstream.
factor = df.loc[:, factor_cols].replace([np.inf, -np.inf], np.nan)

In [29]:
# Re-check non-null counts on the factor subset after replacing +/-inf with NaN.
factor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45929 entries, 0 to 45928
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           45929 non-null  datetime64[ns]
 1   ret_geo        45928 non-null  float64       
 2   vol_36m        45275 non-null  float64       
 3   value          44948 non-null  float64       
 4   investment     45903 non-null  float64       
 5   profitability  44833 non-null  float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 2.1 MB


In [30]:
def winsorize_cross_section(df, cols, lower=0.01, upper=0.99, date_col="date"):
    """Winsorize columns within each calendar year.

    For every column in ``cols``, values are clipped to the ``lower`` and
    ``upper`` quantiles computed over the rows that share the same calendar
    year of ``date_col``. NaN values are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a datetime-like ``date_col``. It is not
        modified.
    cols : iterable of str
        Columns to winsorize. If ``date_col`` itself is listed it is skipped:
        it defines the groups, and clipping it would rewrite timestamps to
        interpolated quantile dates.
    lower, upper : float
        Quantile levels in ``[0, 1]`` used as clipping bounds, with
        ``lower <= upper``.
    date_col : str, default "date"
        Name of the column whose year defines the groups.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the requested columns clipped per year.

    Raises
    ------
    ValueError
        If the quantile levels are not ordered within ``[0, 1]``.
    """
    if not 0.0 <= lower <= upper <= 1.0:
        raise ValueError(
            f"Quantile levels must satisfy 0 <= lower <= upper <= 1, "
            f"got lower={lower}, upper={upper}"
        )

    result = df.copy()
    # Build the grouping key once from the untouched input, so it cannot be
    # affected by any column rewritten inside the loop.
    year_key = df[date_col].dt.year

    for col in cols:
        if col == date_col:
            # Never clip the grouping column itself.
            continue
        result[col] = df.groupby(year_key)[col].transform(
            lambda s: s.clip(lower=s.quantile(lower), upper=s.quantile(upper))
        )
    return result


# Apply to the numeric factor columns only. "date" is the grouping key and
# must not be clipped, so it is excluded from the columns to winsorize.
winsorize_cols = [col for col in factor_cols if col != "date"]
factor_winsorized = winsorize_cross_section(factor, winsorize_cols)
