In [None]:
from IPython.display import display, HTML
# Widen the notebook container so that wide dataframes are easier to read
notebook_width_css = "<style>.container { width:80% !important; }</style>"
display(HTML(notebook_width_css))

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell execution, so edits to local .py files take effect without a kernel restart
%load_ext autoreload
%autoreload 2

## GET (SEMI-)ANNUAL AVERAGE RETURNS OVER TIME OF THE AVAILABLE FUNDS

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Connection to WRDS
# The username is taken from the WRDS_USERNAME environment variable so that every
# collaborator can run the notebook without editing this cell; when the variable
# is not set, the original default username is used.
import wrds
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'etiennebruno'))

In [None]:
# Load the project helpers (df_from_filings, save_pkl, ...) used throughout the notebook
from utils import *

# Distinct CIKs found in the cleaned filings, joined into a comma-separated string
# that is interpolated into the SQL `in (...)` clause of the next query
unique_filing_ciks = df_from_filings()['cik'].unique()
ciks = ', '.join(unique_filing_ciks)

In [None]:
# Keep only the CIKs that appear both in our filings and in the CRSP CIK mapping table
cik_lookup_query = f'''
        select distinct comp_cik
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({ciks})
'''
ciks_data = db.raw_sql(cik_lookup_query)

# Comma-separated string of the matching CIKs, reused in the next SQL query
cik_available = ', '.join(map(str, ciks_data.comp_cik))

In [None]:
# Fetch the full CRSP mapping rows (company CIK -> fund numbers) for the CIKs found above
fundno_mapping_query = f'''
        select *
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({cik_available})
'''
crsp_fundno_available = db.raw_sql(fundno_mapping_query)
crsp_fundno_available

In [None]:
# Number of distinct CRSP fund numbers in the mapping (missing values counted as one entry)
print(crsp_fundno_available['crsp_fundno'].nunique(dropna=False))

# Number of distinct company CIKs in the mapping (missing values counted as one entry)
print(crsp_fundno_available['comp_cik'].nunique(dropna=False))

In [None]:
# Number of company CIKs that map to exactly two CRSP funds
funds_per_company = crsp_fundno_available['comp_cik'].value_counts()
int(funds_per_company.eq(2).sum())

In [None]:
# Distinct fund numbers, then the same values rendered as one comma-separated
# string that can be interpolated into the SQL `in (...)` clause below
list_float_fundnos = [*crsp_fundno_available['crsp_fundno'].unique()]
crsp_fundno_available_list_str = ', '.join(map(str, list_float_fundnos))

In [None]:
# Download every monthly return available for the selected funds (most recent first)
# and attach the CIK mapping columns to each return row
monthly_returns_query = f'''
        select *
        from crsp_q_mutualfunds.monthly_returns
        where crsp_fundno in ({crsp_fundno_available_list_str})
        order by caldt desc
'''
monthly_returns_all_funds = db.raw_sql(monthly_returns_query).merge(
    crsp_fundno_available,
    how='inner',
    on='crsp_fundno',
)
monthly_returns_all_funds

In [None]:
# Per company (CIK) and month: number of funds and their equal-weighted mean monthly return
returns_subset = monthly_returns_all_funds[['caldt', 'mret', 'comp_cik', 'crsp_fundno']]
monthly_returns_all_funds_avg = returns_subset.groupby(by=['comp_cik', 'caldt']).agg(
    count=('crsp_fundno', 'count'),
    mret=('mret', 'mean'),
)
monthly_returns_all_funds_avg = monthly_returns_all_funds_avg.sort_values(by=['comp_cik', 'caldt'])

# Calendar year of each observation, derived from the `caldt` index level
caldt_level = monthly_returns_all_funds_avg.index.get_level_values('caldt')
monthly_returns_all_funds_avg['year'] = pd.to_datetime(caldt_level).year

# Gross monthly return (1 + r), kept as a helper column for the compounding below
monthly_returns_all_funds_avg['mret_add_1'] = monthly_returns_all_funds_avg['mret'] + 1

# Compound the gross returns within each (CIK, calendar year) to obtain the
# year-to-date cumulative return at every month
gross_returns_by_cik_year = monthly_returns_all_funds_avg.groupby(['comp_cik', 'year'])['mret_add_1']
monthly_returns_all_funds_avg['cum_return'] = gross_returns_by_cik_year.cumprod() - 1

# Show the resulting dataframe
display(monthly_returns_all_funds_avg)

In [None]:
# Prepare the aggregated returns so that they can be merged with the filings
# dataframe on (CIK, report type, year).
all_funds_cum_ret = monthly_returns_all_funds_avg.reset_index()
all_funds_cum_ret['caldt'] = pd.to_datetime(all_funds_cum_ret['caldt'])  # make sure the date column is datetime

# Only June (year-to-date = semi-annual return) and December (full-year return) rows are kept.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
is_report_month = all_funds_cum_ret['caldt'].dt.month.isin([6, 12])
all_funds_cum_ret = all_funds_cum_ret[is_report_month].copy()

# December rows are matched with annual reports (N-CSR), June rows with semi-annual ones (N-CSRS)
all_funds_cum_ret['report_type'] = all_funds_cum_ret['caldt'].dt.month.map({12: 'N-CSR', 6: 'N-CSRS'})
all_funds_cum_ret['year'] = all_funds_cum_ret['caldt'].dt.year.astype('int64')  # same integer dtype as before

# Reformat the numeric CRSP CIK as a zero-padded 10-character string.
# Prefixing a fixed '000' was only correct for 7-digit CIKs; zfill(10) gives the
# same result for those and also pads shorter CIKs correctly.
# NOTE(review): assumes `cik` in df_from_filings() uses the 10-digit SEC format — TODO confirm.
all_funds_cum_ret['comp_cik'] = all_funds_cum_ret['comp_cik'].map(
    lambda comp_cik: str(int(comp_cik)).zfill(10)
)

# Merge returns and filings; the inner join keeps only (CIK, report type, year) present in both
new_df = pd.merge(
    all_funds_cum_ret,
    df_from_filings(),
    how='inner',
    left_on=['comp_cik', 'report_type', 'year'],
    right_on=['cik', 'report_type', 'year'],
)

In [None]:
# Persist the merged dataframe (texts, cumulative returns, dates and other metadata) as a pickle
output_path_parts = ('data', 'data_frame_complete.pkl')
path_save = os.path.join(*output_path_parts)
save_pkl(new_df, path_save)

In [None]:
# Path of the pickle file named after the FinBERT predictions dataframe
finbert_path_parts = ('data', 'sentiment_analysis', 'df_finbert_predictions.pkl')
path_finbert_df = os.path.join(*finbert_path_parts)