In [1]:
import warnings
# Silence every Python warning for the rest of the session; done before the
# remaining imports so that warnings raised while importing are hidden too.
warnings.filterwarnings('ignore')

# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS so that the notebook container uses 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

from sec_edgar_downloader import Downloader # https://pypi.org/project/sec-edgar-downloader/
import pandas as pd
import os

In [2]:
# Folder layout used by this notebook: everything lives under <cwd>/data,
# with a 'row_data_index' sub-folder directly below that root.
download_directory_root = os.path.join(os.getcwd(), 'data')
download_directory_index = os.path.join(os.getcwd(), 'data', 'row_data_index')

In [3]:
# Initialize a downloader instance that is handed `download_directory_root`
# as its target folder. If no argument is passed to the constructor, the
# package will download filings to the current working directory (statement
# taken from the package documentation - TODO confirm for the installed
# sec-edgar-downloader version).
dl = Downloader(download_directory_root)

## LOAD INDEX FILE

In [4]:
# Read the index CSV that sits directly under the data root into a DataFrame.
index_csv_name = 'mutual_funds_data.csv'
path = os.path.join(download_directory_root, index_csv_name)
df = pd.read_csv(path)

In [5]:
# Preview the first five rows of the loaded index.
df.head(n=5)

Unnamed: 0,CIK,CNAME,FORM_TYPE,FILLING_DATE,TXT_URL,HTML_URL
0,1000351,LEUTHOLD FUNDS INC,N-CSRS,2022-06-02,edgar/data/1000351/0000894189-22-004153.txt,edgar/data/1000351/0000894189-22-004153-index....
1,1003239,SEASONS SERIES TRUST,N-CSR,2022-06-08,edgar/data/1003239/0001104659-22-069195.txt,edgar/data/1003239/0001104659-22-069195-index....
2,1003632,SEI ASSET ALLOCATION TRUST,N-CSR,2022-06-08,edgar/data/1003632/0001398344-22-011683.txt,edgar/data/1003632/0001398344-22-011683-index....
3,1005020,VIRTUS OPPORTUNITIES TRUST,N-CSRS,2022-06-03,edgar/data/1005020/0001193125-22-166941.txt,edgar/data/1005020/0001193125-22-166941-index....
4,1005942,PUTNAM FUNDS TRUST,N-CSRS,2022-04-28,edgar/data/1005942/0000928816-22-000487.txt,edgar/data/1005942/0000928816-22-000487-index....


In [6]:
# Number of distinct values in the CIK column of the index.
len(df['CIK'].unique())

4350

<div class="alert alert-block alert-info">
<b>IMPROVEMENT COMPARED TO PREVIOUS VERSION:</b> Here we check that the CIKs of the filings we are going to download are also available in the CRSP database, so that we only download reports for which we can find performance / return data. In addition, we filter out the companies that have more than one mutual fund. This ensures that the return we retrieve belongs to the fund on which we have performed the sentiment analysis.
</div>

In [7]:
# Connection to WRDS.
# The account name is read from the WRDS_USERNAME environment variable so that
# the notebook can be run by another user without editing this cell; when the
# variable is not set, the previously hard-coded account name is used.
import wrds
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'etiennebruno'))

# Project-local helper; it is called further down to save the list of CIKs
# that are available on CRSP.
from utils import save_pkl

Loading library list...
Done


In [8]:
# Distinct CIKs from the index, ordered from largest to smallest, then
# rendered as one comma-separated string for use inside a SQL `in (...)` list.
cik_list = df["CIK"].unique()
cik_list_sorted = sorted(cik_list, reverse=True)
cik_list_sorted_str = ', '.join(map(str, cik_list_sorted))

In [9]:
# Intersection between our data and CRSP: ask the CRSP CIK mapping table
# which of the CIKs found in the index file it also contains.
ciks_data = db.raw_sql(
    f"""
        select distinct comp_cik
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({cik_list_sorted_str})
"""
)
cik_available = list(ciks_data['comp_cik'])
# Save the matched CIKs to disk through the project helper.
save_pkl(cik_available, 'data/cik_available_crsp.pkl')
# Comma-separated form of the matched CIKs, reused by the next query.
cik_available_str = ', '.join(map(str, cik_available))

print(f'Number of CIK available to download : {len(cik_available)}')

Number of CIK available to download : 1610


In [10]:
# Pull every column of the CRSP mapping rows that belong to the matched CIKs;
# each returned row pairs one crsp_fundno with its comp_cik.
crsp_fundno_available = db.raw_sql(
    f"""
        select *
        from crsp_q_mutualfunds.crsp_cik_map
        where comp_cik in ({cik_available_str})
"""
)
crsp_fundno_available

Unnamed: 0,crsp_fundno,comp_cik,series_cik,contract_cik
0,3.0,912036.0,,
1,5.0,351895.0,S000004050,C000011336
2,7.0,1113914.0,,
3,8.0,812015.0,,
4,9.0,812015.0,,
...,...,...,...,...
41532,99720.0,1139819.0,S000073854,C000231080
41533,99721.0,1139819.0,S000073854,C000231079
41534,99722.0,1139819.0,S000073854,C000231074
41535,99726.0,908186.0,S000005778,C000236300


In [11]:
# Distinct fund numbers, then distinct company CIKs, in the mapping result.
print(crsp_fundno_available['crsp_fundno'].unique().size)
print(crsp_fundno_available['comp_cik'].unique().size)

# Count the CIKs that appear in exactly one mapping row, i.e. have one fund.
ret = int((crsp_fundno_available['comp_cik'].value_counts() == 1).sum())
print(f'Number of companies (CIK) having exactly 1 fund ({ret})')

41537
1610
Number of companies (CIK) having exactly 1 fund (186)


In [12]:
# Filter by keeping only CIK which have exactly one fund: count the mapping
# rows per CIK, then keep the rows whose CIK occurs exactly once.
s = crsp_fundno_available['comp_cik'].value_counts()
crsp_fundno_available = crsp_fundno_available[crsp_fundno_available['comp_cik'].map(s) == 1]
display(crsp_fundno_available)

cik_list_sorted_to_download = sorted(list(crsp_fundno_available.comp_cik))
# EDGAR identifies a filer by a 10-digit, zero-padded CIK. comp_cik comes back
# from CRSP as a float, so convert through int() and left-pad to 10 digits.
# A fixed '000' prefix is only correct for 7-digit CIKs (351895 would become
# the 9-character '000351895').
cik_list_sorted_to_download_str = [str(int(i)).zfill(10) for i in cik_list_sorted_to_download]

Unnamed: 0,crsp_fundno,comp_cik,series_cik,contract_cik
1,5.0,351895.0,S000004050,C000011336
28,357.0,883496.0,,
33,743.0,848012.0,S000006084,C000016714
444,3537.0,1311981.0,S000004764,C000012957
905,4456.0,1105877.0,S000005190,C000014168
...,...,...,...,...
40930,98751.0,1848758.0,S000071907,
41000,98870.0,1760588.0,S000066608,
41220,99118.0,1508033.0,S000072822,C000229393
41321,99243.0,1849998.0,S000074173,


In [None]:
%%time

from tqdm import tqdm

DS_STORE = '.DS_Store'

# Report how many CIK folders already exist from a previous run. The count is
# informational only: it is printed and not used to skip any download.
# The folder is resolved from the same root that was passed to the Downloader,
# so the check does not depend on the working directory at this point.
path = os.path.join(download_directory_root, 'sec-edgar-filings')
try:
    # Skip the macOS '.DS_Store' metadata entry so it is not counted as a CIK.
    fillings_cik = [entry for entry in os.listdir(path) if entry != DS_STORE]
    nb_elem = len(fillings_cik)
except OSError:
    # Folder missing or unreadable (e.g. nothing downloaded yet): report 0.
    # Only filesystem errors are absorbed; anything else, including a
    # keyboard interrupt, propagates.
    nb_elem = 0

print(nb_elem)

# For every selected CIK, request both form types (annual N-CSR and
# semi-annual N-CSRS) with amount=15 and without the filing detail pages.
for cik in tqdm(cik_list_sorted_to_download_str):
    dl.get("N-CSR", cik, amount=15, download_details=False)
    dl.get("N-CSRS", cik, amount=15, download_details=False)