In [None]:
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that widens the notebook's `.container` element to 90%.
display(HTML("<style>.container { width:90% !important; }</style>"))

from sec_edgar_downloader import Downloader # https://pypi.org/project/sec-edgar-downloader/
import pandas as pd
import os

In [None]:
# Locations used by this notebook, both anchored on the current working directory:
#   <cwd>/data/row_data_index  -> folder for the index data
#   <cwd>/data                 -> root folder for inputs and downloaded filings
download_directory_index = os.path.join(os.getcwd(), 'data', 'row_data_index')
download_directory_root = os.path.dirname(download_directory_index)

In [None]:
# Create the EDGAR downloader and point it at the data directory, so filings
# are saved under `download_directory_root` instead of the package's default
# location (the current working directory when no argument is given).
dl = Downloader(download_directory_root)

## LOAD INDEX FILE

In [None]:
# Read the mutual-fund index table from <download_directory_root>/mutual_funds_data.csv.
# The cells below rely on it having a `CIK` column.
path = os.path.join(download_directory_root, 'mutual_funds_data.csv')
df = pd.read_csv(path)

In [None]:
# Preview the first rows of the index table.
df.head()

In [None]:
# Number of distinct CIK values present in the index table.
len(df["CIK"].unique())

<div class="alert alert-block alert-info">
<b>IMPROVEMENT COMPARED TO PREVIOUS VERSION:</b> We now check that the CIKs of the files we are going to download are also available in the CRSP database, so we only download reports for which we can find performance / returns online. In addition, we filter out the companies that have more than one mutual fund. This ensures that the return we retrieve belongs to the fund on which we performed the sentiment analysis.
</div>

In [None]:
# Connection to WRDS
import wrds

# The account name is read from the WRDS_USERNAME environment variable when it
# is set, so other users can run the notebook without editing this cell.
# When the variable is absent, the original account name is used as a fallback.
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'etiennebruno'))

from utils import save_pkl

In [None]:
# Distinct CIKs from the index table, ordered from largest to smallest, and the
# same values rendered as one comma-separated string for use in a SQL `in (...)`.
cik_list = df["CIK"].unique()
cik_list_sorted = sorted(cik_list, reverse=True)
cik_list_sorted_str = ', '.join(map(str, cik_list_sorted))

In [None]:
# Ask CRSP which of our CIKs it knows: the result is the intersection between
# the CIKs of the index table and those listed in the CRSP CIK mapping table.
# NOTE(review): the CIK list is interpolated directly into the SQL text.
ciks_data = db.raw_sql(f'''
        select distinct comp_cik
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({cik_list_sorted_str})
''')

# Keep the matching CIKs as a plain list and persist them for later use.
cik_available = list(ciks_data['comp_cik'])
save_pkl(cik_available, 'data/cik_available_crsp.pkl')

# Comma-separated form, ready to be embedded in the next query.
cik_available_str = ', '.join(map(str, cik_available))

print(f'Number of CIK available to download : {len(cik_available)}')

In [None]:
# Fetch every row of the CRSP CIK mapping table for the CIKs found above.
# Later cells read the `comp_cik` and `crsp_fundno` columns of this result.
# The last line displays the resulting frame.
crsp_fundno_available = db.raw_sql(f'''
        select *
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({cik_available_str})
''')
crsp_fundno_available

In [None]:
# Summary of the mapping: distinct fund numbers, then distinct company CIKs.
for column in ('crsp_fundno', 'comp_cik'):
    print(len(crsp_fundno_available[column].unique()))

# How many company CIKs appear on exactly one row of the mapping table.
rows_per_cik = crsp_fundno_available['comp_cik'].value_counts()
ret = int((rows_per_cik == 1).sum())
print(f'Number of companies (CIK) having exactly 1 fund ({ret})')

In [None]:
# Keep only the CIKs that appear on exactly one row of the mapping table,
# i.e. companies with exactly one fund.
s = crsp_fundno_available['comp_cik'].value_counts()  # rows per company CIK
crsp_fundno_available = crsp_fundno_available[crsp_fundno_available['comp_cik'].map(s) == 1]
display(crsp_fundno_available)

cik_list_sorted_to_download = sorted(list(crsp_fundno_available.comp_cik))
# Render each CIK as a zero-padded 10-digit string. A fixed '000' prefix is only
# correct for 7-digit CIKs: shorter ones end up under-padded and longer ones
# exceed 10 characters.
cik_list_sorted_to_download_str = [str(int(cik)).zfill(10) for cik in cik_list_sorted_to_download]

In [None]:
%%time

DS_STORE = '.DS_Store'

try:
    path = os.path.join('data', 'sec-edgar-filings')
    fillings_cik = os.listdir(path)
    if DS_STORE in fillings_cik : fillings_cik.remove(DS_STORE)
    nb_elem = len(fillings_cik)
except:
    nb_elem = 0
    
print(nb_elem)


from tqdm import tqdm
for cik in tqdm(cik_list_sorted_to_download_str):
    dl.get("N-CSR", cik, amount=15, download_details=False)
    dl.get("N-CSRS", cik, amount=15, download_details=False)