In [1]:
import edgar # https://github.com/edgarminers/python-edgar
from sec_edgar_downloader import Downloader # https://pypi.org/project/sec-edgar-downloader/
import pandas as pd
import os

In [2]:
# Root folder for all data of this project. The trailing "/" matters because
# paths are derived from it by plain string concatenation.
# NOTE(review): this absolute path is machine-specific.
download_directory_root = "/Users/etiennebruno/Library/CloudStorage/OneDrive-epfl.ch/ETIENNE/EPFL/EPFL_Master/MA2_EPFL/fin_407_financial_econometrics/FINE_project/mutual_funds_deviations/data/"
# Sub-folder that receives the quarterly EDGAR index files.
download_directory_index = f"{download_directory_root}row_data_index/"

# First year for which the quarterly indexes of SEC reports are requested.
since_year = 2010

# Download the quarterly indexes into the index sub-folder.
edgar.download_index(
    download_directory_index,
    since_year,
    user_agent='Etienne BRUNO etienne.bruno@epfl.ch',
    skip_all_present_except_last=False,
)

In [3]:
def read_tsv(file_name):
    """Load one pipe-delimited, header-less index file into a DataFrame.

    Parameters
    ----------
    file_name : str, path or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns CIK, CNAME, FORM_TYPE,
        FILLING_DATE, TXT_URL and HTML_URL assigned in file order.
    """
    column_names = ['CIK', 'CNAME', 'FORM_TYPE', 'FILLING_DATE', 'TXT_URL', 'HTML_URL']
    # The file carries no header row, so every line is data.
    frame = pd.read_csv(file_name, sep="|", header=None, names=column_names)
    return frame

# Collect the downloaded index files. Paths are built from the same directory
# that is listed, so the cell works whatever the current working directory is
# (a relative 'data/row_data_index/' prefix only resolves when the notebook is
# launched from the project root). Only '.tsv' files are kept so that stray
# entries such as macOS '.DS_Store' are not passed to the parser.
# NOTE(review): assumes the index files written by the download step end in '.tsv'.
filepaths = sorted(
    (
        os.path.join(download_directory_index, file_name)
        for file_name in os.listdir(download_directory_index)
        if file_name.endswith(".tsv")
    ),
    reverse=True,  # reverse lexicographic order of the file names
)

# Stack every index file, then keep only the N-CSR / N-CSRS filings.
df = pd.concat(map(read_tsv, filepaths), axis=0)
form_to_keep = ['N-CSR', 'N-CSRS']
df = df[df['FORM_TYPE'].isin(form_to_keep)]

# Save to CSV the final dataframe on which we will work
# (os.path.join avoids the doubled "/" that plain concatenation produced).
df.to_csv(os.path.join(download_directory_root, "mutual_funds_data.csv"), index=False)

In [20]:
# Preview the first rows of the filtered N-CSR / N-CSRS filing index.
df.head()

Unnamed: 0,CIK,CNAME,FORM_TYPE,FILLING_DATE,TXT_URL,HTML_URL
1130,1021882,VANGUARD SCOTTSDALE FUNDS,N-CSRS,2022-04-27,edgar/data/1021882/0001104659-22-051082.txt,edgar/data/1021882/0001104659-22-051082-index....
1316,1027596,ADVISORS SERIES TRUST,N-CSRS,2022-04-08,edgar/data/1027596/0000898531-22-000154.txt,edgar/data/1027596/0000898531-22-000154-index....
1340,102816,EATON VANCE GROWTH TRUST,N-CSRS,2022-04-26,edgar/data/102816/0001193125-22-119853.txt,edgar/data/102816/0001193125-22-119853-index.html
1341,102816,EATON VANCE GROWTH TRUST,N-CSR,2022-04-26,edgar/data/102816/0001193125-22-119859.txt,edgar/data/102816/0001193125-22-119859-index.html
1774,1038490,"T. Rowe Price Tax-Efficient Funds, Inc.",N-CSR,2022-04-21,edgar/data/1038490/0001206774-22-001167.txt,edgar/data/1038490/0001206774-22-001167-index....


In [21]:
# Create the filing downloader, rooted at the project data directory.
# Without an argument the package would download filings to the current
# working directory instead.
dl = Downloader(download_directory_root)

In [22]:
%%time
from tqdm import tqdm

cik_list = df["CIK"].unique()
n = len(cik_list)
for i, cik in tqdm(enumerate(sorted(cik_list, reverse=True)[:100])):
    dl.get("N-CSR", cik, amount=5, query="benchmark", download_details=False)
    dl.get("N-CSRS", cik, amount=5, query="benchmark", download_details=False)

100it [04:05,  2.45s/it]

CPU times: user 9.17 s, sys: 1.53 s, total: 10.7 s
Wall time: 4min 5s





In [None]:
import wrds

In [None]:
# Open the WRDS connection used by all the cells below.
# NOTE(review): the username is hard-coded; presumably the password comes from
# an interactive prompt or a local credentials file — confirm before sharing.
db = wrds.Connection(wrds_username='etiennebruno')

In [None]:
# Display the available WRDS libraries in alphabetical order.
# `sorted(...)` returns a new list, so a single call yields the displayed value
# and the list handed out by the connection is not reordered in place
# (`list.sort()` sorts in place and returns None, so it cannot be the cell output).
sorted(db.list_libraries())

In [None]:
# List the tables of a given library (here: 'crsp')
db.list_tables(library='crsp')

In [None]:
# Get data: a 10-observation preview (obs=10) of crsp.fund_summary2.
# NOTE(review): the variable name `daily` suggests daily data — confirm the
# table's actual frequency.
daily = db.get_table(library='crsp', table='fund_summary2', obs=10)
daily

In [None]:
# 10-observation preview (obs=10) of crsp.monthly_tna_ret_nav, kept in `temp`
# for inspection only.
temp = db.get_table(library='crsp', table='monthly_tna_ret_nav', obs=10)
temp

In [None]:
# Scan every CRSP table for column names that look benchmark-related and print
# the matches per table. `keys` accumulates every matching column name.
keys = set()
# Substrings searched for in the column names ('ben' already covers 'bench').
patterns = ('bench', 'bnc', 'ben', 'bch')
skipped = []  # tables whose one-row preview could not be fetched

for table in db.list_tables(library='crsp'):
    try:
        # One observation is enough: only the column names are needed.
        columns = db.get_table(library='crsp', table=table, obs=1).keys()
    except Exception:
        # Best effort: skip tables that cannot be read, instead of falling
        # through with an undefined or stale match list from a previous table.
        skipped.append(table)
        continue

    matches = [name for name in columns if any(p in name for p in patterns)]
    if matches:
        keys.update(matches)
        print(table, "---", matches)

if skipped:
    print("skipped tables:", skipped)

In [None]:
# 4-observation preview (obs=4) of crsp.crsp_cik_map.
# NOTE(review): judging by its name this table links CRSP fund identifiers to
# SEC CIKs — confirm against the displayed columns.
db.get_table(library='crsp', table='crsp_cik_map', obs=4)

In [None]:
# Pull 100 rows of crsp.portnomap joined, on crsp_fundno, with
# crsp.monthly_returns and crsp.crsp_cik_map.
# NOTE(review): `select *` over three tables that all carry crsp_fundno returns
# repeated column names, and `limit` without `order by` does not pin which rows
# come back. The `m_fund` filter is currently disabled inside the query.
db.raw_sql('''
        select *
        from crsp.portnomap p
        inner join crsp.monthly_returns mr on p.crsp_fundno = mr.crsp_fundno
        inner join crsp.crsp_cik_map ccm on ccm.crsp_fundno = p.crsp_fundno
        --where p.m_fund = 'Y'
        limit 100
''')