#### Notes
* quarterly
* institutional investment managers with holdings over $100M
* Form 13F is required to be filed within 45 days of the end of a calendar quarter (which should be considered as significant information latency)
* only reports long positions (not short)
** different investment managers pursue different strategies, which may bias results
** however, the vast majority of investment managers rely on long positions for a significant portion of fund performance
* 13F does not reveal international holdings (except for American depositary receipts).
* Section 13(f) securities generally include equity securities that trade on an exchange (including Nasdaq), certain equity options and warrants, shares of closed-end investment companies, and certain convertible debt securities.
* shares of open-end investment companies (i.e. mutual funds) are not Section 13(f) securities
* official list of qualifying securities: https://www.sec.gov/divisions/investment/13flists.htm
* excludes total portfolio value and percentage allocation of each stock listed
* Money managers allocate the most capital to their best ideas. Pay attention to "new positions" in their disclosures as these are their most recent ideas
* Keep in mind that a 13F is not their whole portfolio and that it is a past snapshot

In [82]:
# Core data libraries used throughout the notebook.
import pandas as pd
import numpy as np
# html5lib is never referenced by name in the visible cells; presumably it is
# imported to make sure the parser backend used by pd.read_html is installed
# -- TODO confirm.
import html5lib
# Show dataframes with their plain-text repr instead of HTML tables.
pd.set_option( 'display.notebook_repr_html', False )

# Rich-display helpers, kept at hand for embedding snippets in the notebook.
from IPython.display import HTML # useful for snippets
#  e.g. HTML('<iframe src=http://en.mobile.wikipedia.org/?useformat=mobile width=700 height=350></iframe>')
from IPython.display import Image 
#  e.g. Image(filename='holt-winters-equations.png', embed=True) # url= also works
from IPython.display import YouTubeVideo
#  e.g. YouTubeVideo('1j_HxD4iLn8', start='43', width=600, height=400)
from IPython.core import page
# Register page.display_page as the 'show_in_pager' hook (priority 0), so
# content that IPython would send to the pager goes through display_page.
get_ipython().set_hook('show_in_pager', page.as_hook(page.display_page), 0)

#  Generate PLOTS inside notebook, "inline" generates static png:
%matplotlib inline   
#          "notebook" argument allows interactive zoom and resize.


# note: https cannot be read by lxml
#       (original author's note, relevant to the URLs handed to pd.read_html;
#        TODO confirm it still holds for the installed pandas/lxml versions)


In [83]:
# Load the Q3 2018 13F filing index (one row per report, including its URL).
# Despite the .tsv extension, the file is parsed as comma-separated text.
Q3Y18_index_df = pd.read_csv(
    '13f_Q3Y18_index.tsv',
    sep=',',
    encoding='latin-1',
    index_col=False,
)


In [84]:
# Inspect the size of the filing index as a (rows, columns) tuple; the bare
# expression on the last line is what the cell displays.
Q3Y18_index_df.shape

(4819, 7)

In [85]:
# Take the first N% of the filing index as a small test set.
percentage_sample = 5 # percent of the index rows used for testing

# Floor division (//) keeps the row count an integer on both Python 2 and
# Python 3.  With plain `/`, Python 3 yields a float (e.g. 240.95), which
# DataFrame.head() cannot use as a row count.
test_df = Q3Y18_index_df.head(Q3Y18_index_df.shape[0] * percentage_sample // 100)
test_df

     Unnamed: 0      CIK                                  Company Name  \
0            21  1000097            KINGDON CAPITAL MANAGEMENT, L.L.C.   
1            71  1000275                          ROYAL BANK OF CANADA   
2           477  1000490                          GIRARD PARTNERS LTD.   
3           519  1000742                    SANDLER CAPITAL MANAGEMENT   
4           582  1001085              BROOKFIELD ASSET MANAGEMENT INC.   
5           811  1002078            JAYHAWK CAPITAL MANAGEMENT, L.L.C.   
6           826  1002152               COMPASS CAPITAL MANAGEMENT INC/   
7           877  1002672                                     Bell Bank   
8           894  1002784                    SHELTON CAPITAL MANAGEMENT   
9           897  1002912        SHINE INVESTMENT ADVISORY SERVICES INC   
10          980  1003279               FORTALEZA ASSET MANAGEMENT INC/   
11         1136  1004244              NEW ENGLAND ASSET MANAGEMENT INC   
12         1426  1005354             V

In [86]:
# Sanity-check the URL field of the first sampled filing.  The stored value is
# a path fragment; the scraping loop prepends 'https://www.sec.gov/Archives/'
# to it before fetching.
test_df['Filing URL .html'].iloc[0]

u'edgar/data/1000097/0001000097-18-000003-index.html'

In [87]:
# Build one normalized "top holdings" dataframe per investor in test_df and
# collect them in a list; a later cell concatenates the list.
# Re-initialising the list here keeps this cell safe to re-run.
appended_data = []

### SET TO RETURN TOP 20 STOCKS PER INVESTOR (BY SIZE OF INVESTMENT)
num_stocks_returned = 20

# URL pieces for the rendered information table.  They do not change between
# filings, so they are defined once instead of on every iteration.
# NOTE(review): the index page below is fetched over https while the table is
# fetched over http; both schemes are kept exactly as in the original.
stem = 'http://www.sec.gov/Archives/'
xml_suffix = '/xslForm13F_X01/'

# Loop through all reports, filter relevant data, and create one normalized
# dataframe per investor.
for index, row in test_df.iterrows():

    report_suffix = row['Filing URL .html']
    investor = row['Company Name']
    date = row['Filing Date']

    # The filing's index page has to be parsed first to find the name of the
    # file holding the investment data.
    url = 'https://www.sec.gov/Archives/' + report_suffix
    # Named index_tables (not `page`) so the IPython.core.page module imported
    # in the setup cell is not shadowed.
    index_tables = pd.read_html( url )
    df = index_tables[0]
    # Position-dependent lookup: column 2, row 4 of the first table is taken
    # as the information-table file name.
    # NOTE(review): confirm this layout holds for every filing.
    table_url_suffix = df[2].iloc[4]

    # Turn '<path>/<accession-with-dashes>-index.html' into
    # '<path>/<accession-without-dashes>'.
    report_suffix = report_suffix.replace('-index.html', '')
    report_suffix = report_suffix.replace('-', '')

    #  Build URL to the html file with investment data
    url = stem + report_suffix + xml_suffix + table_url_suffix
    print(url)

    #  Turn HTML file into dataframes; the last table on the page contains
    #  the relevant investment data.
    holdings_tables = pd.read_html( url )
    df = holdings_tables[-1]

    #  Rename columns (raises if the table does not have exactly 12 columns):
    df.columns = [ 'stock', 'class', 'cusip', 'usd', 'size', 'sh_prin', 'putcall', 'discret', 'manager', 'vote1', 'vote2', 'vote3']

    #  The first three rows are SEC labels, not data, so drop them and start
    #  a new index from 0 instead of 3.  reset_index returns a NEW frame, so
    #  its result must be assigned (the original call discarded it).
    df = df[3:].reset_index( drop=True )

    #  Delete irrelevant columns (keeps stock, cusip, usd, putcall):
    dflite = df.drop( df.columns[[1, 4, 5, 7, 8, 9, 10, 11]], axis=1 )

    #  usd needs float type since usd was read as string:
    dflite['usd'] = dflite['usd'].astype( float )
    #                  NOTE: int as type will fail for NaN

    #  Type change allows proper sort (largest position first):
    dfusd = dflite.sort_values( by=['usd'], ascending=[False] )

    #  Portfolio total in USD.  Series.sum() skips NaN, whereas the built-in
    #  sum() would return NaN and turn every percentage below into NaN.
    usdsum = dfusd['usd'].sum()

    #  New column for percentage of total portfolio:
    dfusd['pcent'] = np.round(( dfusd.usd / usdsum ) * 100, 2)

    #  New column for date of report filing
    dfusd.insert(0, 'date', date)

    #  New column for investor
    dfusd.insert(0, 'investor', investor)

    #  Dataframe per investor with top num_stocks_returned
    appended_data.append(dfusd.head( num_stocks_returned ))

# show list of dataframes    
#appended_data



http://www.sec.gov/Archives/edgar/data/1000097/000100009718000003/xslForm13F_X01/king2q2018.xml
http://www.sec.gov/Archives/edgar/data/1000275/000156761918000890/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1000490/000100049018000003/xslForm13F_X01/06302018Girard.xml
http://www.sec.gov/Archives/edgar/data/1000742/000156761918001000/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1001085/000095012318008668/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1002078/000156761918000498/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1002152/000108514618001727/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1002672/000106299318003349/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1002784/000139834418011549/xslForm13F_X01/fp0034916_13fhr-table.xml
http://www.sec.gov/Archives/edgar/data/1002912/000100291218000003/xslForm13F_X01/Shin

http://www.sec.gov/Archives/edgar/data/1018674/000101867418000004/xslForm13F_X01/ParsonsCM_Q2_2018_13F.xml
http://www.sec.gov/Archives/edgar/data/1018825/000101882518000004/xslForm13F_X01/IBM2Q201813F.xml
http://www.sec.gov/Archives/edgar/data/1019231/000101923118000008/xslForm13F_X01/r13f.xml
http://www.sec.gov/Archives/edgar/data/1019531/000106299318002959/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1019754/000101975418000003/xslForm13F_X01/2ndQ2018.xml
http://www.sec.gov/Archives/edgar/data/1020066/000156761918000933/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1020317/000108514618001835/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1020455/000142050618000689/xslForm13F_X01/Form13F_InfoTable.xml
http://www.sec.gov/Archives/edgar/data/1020580/000102058018000003/xslForm13F_X01/tab13f630.xml
http://www.sec.gov/Archives/edgar/data/1020585/000158064218003455/xslForm13F_X01/infotable.xml
http://www.s

http://www.sec.gov/Archives/edgar/data/1040190/000117266118001399/xslForm13F_X01/infotable.xml
http://www.sec.gov/Archives/edgar/data/1040197/000104019718000004/xslForm13F_X01/inftable.xml
http://www.sec.gov/Archives/edgar/data/1040198/000156761918000703/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1040210/000104021018000004/xslForm13F_X01/jun18.xml
http://www.sec.gov/Archives/edgar/data/1040273/000108514618001980/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1040508/000095012318008487/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1040592/000104059218000008/xslForm13F_X01/infotable.xml
http://www.sec.gov/Archives/edgar/data/1040762/000091957418005187/xslForm13F_X01/infotable.xml
http://www.sec.gov/Archives/edgar/data/1041241/000095012318007311/xslForm13F_X01/form13fInfoTable.xml
http://www.sec.gov/Archives/edgar/data/1041283/000114420418037823/xslForm13F_X01/infotable.xml
http://www.sec.gov/Archives

In [88]:
# Concat investor dataframes together.
# The name `appended_data` is rebound from a list of dataframes to the single
# combined dataframe.  The isinstance guard keeps this cell re-runnable: on a
# second run `appended_data` is already a DataFrame, and pd.concat rejects a
# bare DataFrame as its first argument.
if isinstance(appended_data, list):
    appended_data = pd.concat(appended_data, axis=0)

# Export as CSV file (the dataframe index is written as the first column)
appended_data.to_csv('test_results.csv')