In [1]:
# Inject a CSS rule into the notebook page that sets the width of the
# `.container` element to 80% (overriding the default via `!important`).
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
# Load IPython's autoreload extension and enable mode 2, so that edits made to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## GET (SEMI-)ANNUAL AVERAGE RETURNS OVER TIME OF THE AVAILABLE FUNDS

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
# Connection to WRDS.
# The account name is read from the WRDS_USERNAME environment variable so that
# each collaborator can run the notebook without editing this cell. When the
# variable is not set, the previously hard-coded account is used, so existing
# setups keep working unchanged.
import wrds
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'etiennebruno'))

Loading library list...
Done


In [5]:
# Collect the distinct CIKs of all cleaned filings into one comma-separated
# string; it is interpolated into the `in (...)` clause of the SQL query below.
from utils import *
ciks = ', '.join(df_from_filings()['cik'].unique())

In [6]:
# Keep only the CIKs that also exist in CRSP's CIK mapping table
# (intersection between the filings dataframe and the CRSP data).
ciks_data = db.raw_sql(f'''
        select distinct comp_cik
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({ciks})
''')
# Comma-separated string of those CIKs, ready to be placed in the next query
cik_available = ', '.join(str(cik) for cik in ciks_data['comp_cik'])

In [7]:
# Using CRSP's CIK mapping table, fetch every mapping row (all columns) whose
# company CIK is in the list built above. The result links each CRSP fund
# number to its company / series / contract CIK.
crsp_fundno_available = db.raw_sql(f'''
        select *
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({cik_available})
''')
# Bare expression so the notebook renders the resulting dataframe
crsp_fundno_available

Unnamed: 0,crsp_fundno,comp_cik,series_cik,contract_cik
0,5.0,351895.0,S000004050,C000011336
1,357.0,883496.0,,
2,743.0,848012.0,S000006084,C000016714
3,3537.0,1311981.0,S000004764,C000012957
4,4456.0,1105877.0,S000005190,C000014168
...,...,...,...,...
181,98751.0,1848758.0,S000071907,
182,98870.0,1760588.0,S000066608,
183,99118.0,1508033.0,S000072822,C000229393
184,99243.0,1849998.0,S000074173,


In [8]:
# Sanity check: number of distinct fund numbers and of distinct company CIKs
# in the mapping (dropna=False so that a missing value, if any, is counted
# exactly as len(unique()) would count it).
print(crsp_fundno_available['crsp_fundno'].nunique(dropna=False))

print(crsp_fundno_available['comp_cik'].nunique(dropna=False))

186
186


In [9]:
# Number of company CIKs that appear in exactly two rows of the mapping
int(crsp_fundno_available['comp_cik'].value_counts().eq(2).sum())

0

In [10]:
# Turn the distinct fund numbers into a single comma-separated string that can
# be interpolated into the `in (...)` clause of later SQL queries.
list_float_fundnos = list(crsp_fundno_available['crsp_fundno'].unique())
crsp_fundno_available_list_str = ', '.join(map(str, list_float_fundnos))

In [11]:
# Download every monthly return CRSP holds for the selected funds, newest first
monthly_returns_all_funds = db.raw_sql(f'''
        select *
        from crsp_q_mutualfunds.monthly_returns
        where crsp_fundno in ({crsp_fundno_available_list_str})
        order by caldt desc
''')
# Attach the CIK mapping columns to each return row; the inner join keeps only
# fund numbers present in both frames.
monthly_returns_all_funds = monthly_returns_all_funds.merge(
    crsp_fundno_available,
    how='inner',
    on='crsp_fundno',
)
monthly_returns_all_funds

Unnamed: 0,crsp_fundno,caldt,mret,comp_cik,series_cik,contract_cik
0,38899.0,2022-03-31,-0.000767,1000249.0,S000011490,C000031720
1,38899.0,2022-02-28,-0.001149,1000249.0,S000011490,C000031720
2,38899.0,2022-01-31,-0.010614,1000249.0,S000011490,C000031720
3,38899.0,2021-12-31,0.037927,1000249.0,S000011490,C000031720
4,38899.0,2021-11-30,-0.032331,1000249.0,S000011490,C000031720
...,...,...,...,...,...,...
46417,34968.0,1977-12-30,-0.103429,80946.0,S000010007,
46418,34968.0,1976-12-31,0.235968,80946.0,S000010007,
46419,34968.0,1975-12-31,0.412130,80946.0,S000010007,
46420,34968.0,1974-12-31,-0.290340,80946.0,S000010007,


In [12]:
# For every (company CIK, date) pair, average the monthly return over all of
# the company's funds and record how many fund rows fed into that average.
monthly_returns_all_funds_avg = (
    monthly_returns_all_funds[['caldt', 'mret', 'comp_cik', 'crsp_fundno']]
    .groupby(by=['comp_cik', 'caldt'])
    .agg(
        count=('crsp_fundno', 'count'),
        mret=('mret', 'mean'),
    )
    .sort_values(by=['comp_cik', 'caldt'])
)

# Calendar year of each observation, derived from the `caldt` index level
monthly_returns_all_funds_avg['year'] = pd.to_datetime(
    monthly_returns_all_funds_avg.index.get_level_values('caldt')
).year

# Gross monthly return (1 + r): temporary column used to compound the returns
monthly_returns_all_funds_avg['mret_add_1'] = 1 + monthly_returns_all_funds_avg['mret']

# Compute the cumulative return of each CIK within each calendar year by
# compounding the gross returns and converting back to a net return
monthly_returns_all_funds_avg['cum_return'] = (
    monthly_returns_all_funds_avg
    .groupby(['comp_cik', 'year'])['mret_add_1']
    .cumprod()
    - 1
)

# Show the result and persist a flat (index reset) copy to disk
display(monthly_returns_all_funds_avg)
monthly_returns_all_funds_avg.reset_index().to_pickle('data/funds_returns.pkl')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mret,year,mret_add_1,cum_return
comp_cik,caldt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5100.0,1976-12-31,1,,1976,,
5100.0,1977-01-31,1,-0.041893,1977,0.958107,-0.041893
5100.0,1977-02-28,1,-0.026721,1977,0.973279,-0.067494
5100.0,1977-03-31,1,-0.016639,1977,0.983361,-0.083010
5100.0,1977-04-29,1,-0.007614,1977,0.992386,-0.089992
...,...,...,...,...,...,...
1849998.0,2022-02-28,1,-0.008032,2022,0.991968,-0.020258
1849998.0,2022-03-31,1,-0.017062,2022,0.982938,-0.036974
1873280.0,2022-01-31,1,,2022,,
1873280.0,2022-02-28,1,-0.016778,2022,0.983222,-0.016778


In [13]:
# Shape the aggregated returns so they can be merged with the dataframe
# containing the filings, then perform the merge.

# Load the filings once; used both for the CIK lookup and for the merge below.
filings_df = df_from_filings()

all_funds_cum_ret = monthly_returns_all_funds_avg.reset_index()
all_funds_cum_ret['caldt'] = pd.to_datetime(all_funds_cum_ret['caldt'])  # ensure datetime dtype

# Only keep the June and December rows: their within-year cumulative return is
# the semi-annual and the annual return respectively. `.copy()` makes the
# result an independent frame, so the column assignments below do not write to
# a slice of the original (which raises SettingWithCopyWarning).
all_funds_cum_ret = all_funds_cum_ret[all_funds_cum_ret['caldt'].dt.month.isin([6, 12])].copy()

# Label each row with the matching report type: December -> annual report
# (N-CSR), June -> semi-annual report (N-CSRS).
all_funds_cum_ret['report_type'] = all_funds_cum_ret['caldt'].dt.month.map({12: 'N-CSR', 6: 'N-CSRS'})

# Calendar year as int64 (same dtype the previous row-wise apply produced).
all_funds_cum_ret['year'] = all_funds_cum_ret['caldt'].dt.year.astype('int64')

# Reformat the CIK so it matches the `cik` strings of the filings dataframe.
# CRSP returns CIKs as floats of varying length (e.g. 5100.0, 351895.0,
# 1849998.0); prefixing a fixed '000' only reproduces the filings' string for
# one particular CIK length, so shorter CIKs silently dropped out of the inner
# merge. Instead, map each numeric CIK to the exact string used in the filings.
filing_cik_by_number = {int(cik): cik for cik in filings_df['cik'].unique()}
all_funds_cum_ret['comp_cik'] = (
    all_funds_cum_ret['comp_cik']
    .astype('int64')
    .map(filing_cik_by_number)
    .astype(object)  # keep an object key column even if nothing maps, so the merge dtypes agree
)

# Process the merge (rows whose CIK has no filing are dropped by the inner join)
new_df = pd.merge(
    all_funds_cum_ret,
    filings_df,
    how='inner',
    left_on=['comp_cik', 'report_type', 'year'],
    right_on=['cik', 'report_type', 'year'],
)

In [14]:
# Save the merged dataframe (`new_df`: fund returns joined with the filings
# data, i.e. texts, cumulative returns, date information and other metadata)
# to data/data_frame_complete.pkl.
# NOTE(review): `save_pkl` is not defined in this notebook; presumably it comes
# from the `from utils import *` above -- confirm its argument order there.
path_save = os.path.join('data', 'data_frame_complete.pkl')
save_pkl(new_df, path_save)

In [15]:
# Path of the pickle file data/sentiment_analysis/df_finbert_predictions.pkl.
# NOTE(review): judging by the name this holds FinBERT sentiment predictions,
# presumably loaded in a later cell -- confirm.
path_finbert_df = os.path.join('data', 'sentiment_analysis', 'df_finbert_predictions.pkl')