In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

warnings.simplefilter(action='ignore', category=FutureWarning)
sns.set_theme(color_codes=True, style='whitegrid')

In [None]:
data = pd.read_csv('characteristics_data_feb2017.csv')

In [7]:
data.drop(columns=['q10','q20','q50','prc','a2me','ato','beme','c','cto','d2a','dpi2a','e2p','fc2y','free_cf','investment',
                   'lturnover','noa','oa','ol','pcm','pm','prof','q','rna','roa','roe','s2p','sga2m','at',
                   'cum_return_12_2','cum_return_12_7','cum_return_1_0','cum_return_36_13','idio_vol','spread_mean','suv','rel_to_high_price','lev']
                   , inplace=True)

data.sort_values(by=['date'], ascending=[True], inplace=True)
data['date'] = pd.to_datetime(data['date'])
data.head()

Unnamed: 0.1,Unnamed: 0,yy,mm,date,permno,ret,lme,beta
214585,214586,1962,7,1962-07-31,19940,-0.010899,395763.625,0.599815
286044,286045,1962,7,1962-07-31,25160,-0.039216,39780.0,0.835357
290255,290256,1962,7,1962-07-31,25478,-0.056452,61984.5,-0.012614
214586,214587,1962,8,1962-08-31,19940,0.104683,389952.75,0.581311
286045,286046,1962,8,1962-08-31,25160,0.027211,38220.0,0.79409


In [8]:
def valid_entries(period, data):
    '''Return ids of entries that exist throughout test and validation period'''

    if max(period) > data['date'].max():
        raise Exception(f'No data beyond 2014-05-31 available')

    elif min(period) < data['date'].min():
        raise Exception(f'No data before 1962-07-31 available')

    filtered_data = data.loc[data['date'].isin(period)]
    unique_date_count = filtered_data['date'].nunique()

    valid_ids = (
        filtered_data.groupby('permno')
        .filter(lambda x: x['date'].nunique() == unique_date_count)['permno']
        .unique()
        .tolist()
    )

    return valid_ids

In [9]:
def get_top_N_stocks(start_year, end_year, data=data, N=500):
    '''Get top N stocks by market cap in a given time period, only for stocks that have existed in the entire period (test data) and the year after given period (validation)'''

    period = pd.date_range(
        start=f'{start_year}-01-01', end=f'{end_year-1}-12-31', freq='M'
    )

    valid_ids = valid_entries(period, data)

    test_data = data.loc[
        data['date'].dt.year.between(start_year, end_year-1)
        & data['permno'].isin(valid_ids)
    ]
    validation_data = data.loc[
        data['date'].dt.year.eq(end_year) & data['permno'].isin(valid_ids)
    ]

    top_N_test = test_data.sort_values(['date', 'lme'], ascending=[True, False]).groupby('date').head(N).reset_index(drop=True)
    top_N_validation = validation_data.sort_values(['date', 'lme'], ascending=[True, False]).groupby('date').head(N).reset_index(drop=True)

    return top_N_test, top_N_validation

In [19]:
constructed_index = pd.DataFrame(columns=['date', 'index_ret'])
actual_returns = pd.DataFrame(columns=['date', 'actual_ret'])

N = 500
t,v  = get_top_N_stocks(2005, 2010, N=N)

t['weight'] = t.groupby('date')['lme'].transform(lambda x: x / x.sum())  
t['weighted_ret'] = t['ret'] * t['weight']

v['weight'] = v.groupby('date')['lme'].transform(lambda x: x / x.sum())
v['weighted_ret'] = v['ret'] * v['weight']

constructed_index = t.groupby('date')['weighted_ret'].sum().reset_index()
constructed_index.rename(columns={'weighted_ret': 'index_ret'}, inplace=True)
constructed_index.set_index('date', inplace=True)

constructed_index

Unnamed: 0_level_0,index_ret
date,Unnamed: 1_level_1
2005-01-31,-0.027269
2005-02-28,0.035838
2005-03-31,-0.01816
2005-04-30,-0.029299
2005-05-31,0.039692
2005-06-30,0.000556
2005-07-31,0.047788
2005-08-31,-0.010586
2005-09-30,0.007894
2005-10-31,-0.025011
