In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

warnings.simplefilter(action="ignore", category=FutureWarning)
sns.set_theme(color_codes=True, style="whitegrid")

In [2]:
data = pd.read_csv("characteristics_data_feb2017.csv")

In [3]:
data.drop(
    columns=[
        "q10",
        "q20",
        "q50",
        "prc",
        "a2me",
        "ato",
        "beme",
        "c",
        "cto",
        "d2a",
        "dpi2a",
        "e2p",
        "fc2y",
        "free_cf",
        "investment",
        "lturnover",
        "noa",
        "oa",
        "ol",
        "pcm",
        "pm",
        "prof",
        "q",
        "rna",
        "roa",
        "roe",
        "s2p",
        "sga2m",
        "at",
        "cum_return_12_2",
        "cum_return_12_7",
        "cum_return_1_0",
        "cum_return_36_13",
        "idio_vol",
        "spread_mean",
        "suv",
        "rel_to_high_price",
        "lev",
    ],
    inplace=True,
)

data.sort_values(by=["date"], ascending=[True], inplace=True)
data["date"] = pd.to_datetime(data["date"])
data.head()

Unnamed: 0.1,Unnamed: 0,yy,mm,date,permno,ret,lme,beta
214585,214586,1962,7,1962-07-31,19940,-0.010899,395763.625,0.599815
286044,286045,1962,7,1962-07-31,25160,-0.039216,39780.0,0.835357
290255,290256,1962,7,1962-07-31,25478,-0.056452,61984.5,-0.012614
214586,214587,1962,8,1962-08-31,19940,0.104683,389952.75,0.581311
286045,286046,1962,8,1962-08-31,25160,0.027211,38220.0,0.79409


In [46]:
def valid_entries(period, data):
    """Return ids of entries that exist throughout test and validation period"""

    if max(period) > data["date"].max():
        raise Exception(f"No data beyond 2014-05-31 available")

    elif min(period) < data["date"].min():
        raise Exception(f"No data before 1962-07-31 available")

    filtered_data = data.loc[data["date"].isin(period)]
    n_months = len(period)
    
    valid_ids = (
        filtered_data
        .groupby("permno")
        .filter(lambda x: x["date"].nunique() == n_months)
        ["permno"]
        .unique()
        .tolist()
    )

    return valid_ids

In [57]:
def get_top_N_stocks(start_year, end_year, data=data, N=500):
    """Get top N stocks by market cap in a given time period, only for stocks that have existed in the entire period (test data) and the year after given period (validation)"""

    period = pd.date_range(
        start=f"{start_year}-01-01", end=f"{end_year}-12-31", freq="M"
    )

    valid_ids = valid_entries(period, data)

    test_data = data.loc[
        data["date"].dt.year.between(start_year, end_year - 1)
        & data["permno"].isin(valid_ids)
    ]
    validation_data = data.loc[
        data["date"].dt.year.eq(end_year) & data["permno"].isin(valid_ids)
    ]

    top_N_test = (
        test_data.sort_values(["date", "lme"], ascending=[True, False])
        .groupby("date")
        .head(N)
        .reset_index(drop=True)
    )
    
    top_N_test_permnos = top_N_test["permno"].unique()

    top_N_validation = (
        validation_data
        .loc[validation_data["permno"].isin(top_N_test_permnos)]
        .sort_values(["date", "lme"], ascending=[True, False]).set_index("date")
    )


    return top_N_test, top_N_validation

In [58]:
def construct_index(t, v):

    constructed_index = pd.DataFrame(columns=["date", "index_ret"])
    t["weight"] = t.groupby("date")["lme"].transform(lambda x: x / x.sum())
    t["weighted_ret"] = t["ret"] * t["weight"]
    t["weighted_beta"] = t["beta"] * t["weight"]

    constructed_index = (
        t.groupby("date", as_index=False)
        .agg({"weighted_ret": "sum", "weighted_beta": "sum"})
        .rename(columns={"weighted_ret": "index_ret", "weighted_beta": "index_beta"})
    )
    constructed_index.set_index("date", inplace=True)

    actual_returns = pd.DataFrame(columns=["date", "actual_ret"])
    v["weight"] = v.groupby("date")["lme"].transform(lambda x: x / x.sum())
    v["weighted_ret"] = v["ret"] * v["weight"]

    actual_returns = v.groupby("date")["weighted_ret"].sum().reset_index()
    actual_returns.rename(columns={"weighted_ret": "actual_index_ret"}, inplace=True)
    actual_returns.set_index("date", inplace=True)

    return constructed_index, actual_returns

In [59]:
kenneth_french_df = pd.read_csv("F-F_Research_Data_Factors.CSV") # monthly factors

kenneth_french_df["Date"] = pd.to_datetime(kenneth_french_df["Date"], format="%Y%m")
kenneth_french_df = kenneth_french_df.set_index("Date")
kenneth_french_df /= 100

kenneth_french_df

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1926-07-01,0.0296,-0.0256,-0.0243,0.0022
1926-08-01,0.0264,-0.0117,0.0382,0.0025
1926-09-01,0.0036,-0.0140,0.0013,0.0023
1926-10-01,-0.0324,-0.0009,0.0070,0.0032
1926-11-01,0.0253,-0.0010,-0.0051,0.0031
...,...,...,...,...
2024-07-01,0.0124,0.0680,0.0574,0.0045
2024-08-01,0.0161,-0.0355,-0.0113,0.0048
2024-09-01,0.0174,-0.0017,-0.0259,0.0040
2024-10-01,-0.0097,-0.0101,0.0089,0.0039


In [60]:
start_year = 1978
end_year = 2013
window_t = 5
N = 500

In [76]:
# expanding rolling window
comparison_rows = []

for i in range(start_year + window_t, end_year):

    t, v = get_top_N_stocks(start_year, i, data, N)
    constructed_index, actual_returns = construct_index(t, v)

    validation_annual = (
        v
        .groupby(["permno", v.index.year])["ret"]
        .agg(lambda x: (1 + x).prod() - 1)
        .reset_index(name="annual_return")
    )

    validation_annual_dict = dict(
        zip(validation_annual["permno"], validation_annual["annual_return"])
    )

    start_date = f"{start_year}-01-01"
    end_date   = f"{i - 1}-12-31"
    
    constructed_index["index_ret"].add(-kenneth_french_df.loc[start_date: end_date]["RF"]) # index minus bezrizična stopa
    e = (1 + constructed_index["index_ret"].mean())**12 - 1

    for permno in t["permno"].unique():
        bi = t.loc[t["permno"] == permno]["beta"].mean() # average bete kao reprezentativna
        mu = e * bi
        real = validation_annual_dict.get(permno, None)

        comparison_rows.append({
            "test_period": f"{start_year} - {i-1}",
            "permno": permno,
            "predicted": mu,
            "real": real
        })

comparison = pd.DataFrame(comparison_rows)
comparison.head()

Unnamed: 0,test_period,permno,predicted,real
0,1978 - 1982,12490,0.178995,0.310232
1,1978 - 1982,11850,0.162408,0.373379
2,1978 - 1982,12079,0.163796,0.242247
3,1978 - 1982,12060,0.167449,0.279014
4,1978 - 1982,11754,0.216911,-0.071935
