Compute the Annual Search Fraction similarity as described by Lee et al. (2015)

    Lee, C.M.C., et al., Search-based peer firms: Aggregating investor perceptions through internet co-searches. Journal of Financial Economics (2015).
    
Because the calculation can be broken down to the daily level, we save memory by counting the unique combinations of `(ip, cik, next cik)` for each day and then summing the daily counts to the annual level.

In [61]:
import time
import requests
import os
import dask
import bs4
import copy

import pandas as pd

from dask.diagnostics import ProgressBar
from datetime import datetime
from zipfile import ZipFile
from io import StringIO, BytesIO

In [4]:
# store large data files in separated dir
from Utility.dir_LF import large_file_dir

In [58]:
# Alias for the large-file directory imported above.
# NOTE(review): the functions visible in this notebook read `large_file_dir`
# directly and use `save_fp` only as a parameter name, so this module-level
# alias looks unused here; confirm before removing.
save_fp = large_file_dir  # set up file path
#open(save_fp, 'wb')

# save log zip file

## get url list of one year

In [51]:
def get_soup(url):
    '''
    Fetch `url` and return its body parsed as a BeautifulSoup tree.

    The request is sent with a browser-like User-Agent header and a
    30 second timeout, and the response text is decoded as UTF-8.

    Parameters:
        url: address of the HTML page to download.

    Returns:
        bs4.BeautifulSoup built from the response text.
    '''
    hack = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    r = requests.get(url, timeout=30, headers=hack)
    r.encoding = 'utf-8'
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so the resulting tree can differ between machines.
    return bs4.BeautifulSoup(r.text, 'html.parser')

def get_year_log_url(year):
    '''
    Collect every link target found on the SEC listing page for `year`.

    The page address is built as
    'https://www.sec.gov/files/edgar<year>.html' and fetched with get_soup.

    Parameters:
        year: year used to build the listing-page address.

    Returns:
        list of the `href` values of the page's <a> tags, in document order.
    '''
    url_temp = 'https://www.sec.gov/files/edgar' + str(year) + '.html'
    soup = get_soup(url_temp)
    # href=True skips anchors that carry no href attribute; indexing such a
    # tag with ["href"] would raise KeyError and abort the whole listing.
    return [link["href"] for link in soup.find_all("a", href=True)]

## download each day (with dask decorator)

In [111]:
# Download one daily log zip file
@dask.delayed  # delayed: calls only build tasks, so many downloads can be run by one dask.compute
def get_day_log(log_url, save_fp):
    '''
    Download `log_url` and, on HTTP 200, write the response body to `save_fp`.

    Parameters:
        log_url: address of the file to download.
        save_fp: path of the file to create (opened in binary write mode).

    Returns:
        (log_url, status_code). status_code is None when the request itself
        failed (connection error, timeout, ...), so a single bad URL is
        reported to the caller instead of aborting the whole batch.
    '''
    sleep_time = 1  # pause after every request, successful or not

    hack = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    try:
        # Without a timeout a stalled connection would block this task forever.
        r = requests.get(log_url, headers=hack, timeout=30)
    except requests.RequestException as exc:
        print('download failed:', log_url, exc)
        status_code = None
    else:
        status_code = r.status_code
        if status_code == 200:
            with open(save_fp, 'wb') as f:
                f.write(r.content)

    time.sleep(sleep_time)

    return log_url, status_code

## download a year

In [117]:
# based on get_day_log

def get_year_logs(year):  # only 2003~2017
    '''
    Download all daily log files for one year from
    https://www.sec.gov/dera/data/edgar-log-file-data-set.html

    Files are stored as <large_file_dir>/log/<year>/<name>.zip, where <name>
    is the slice url[-15:-4] of each link returned by get_year_log_url.
    Links whose target file already exists are skipped, so the function can
    be re-run to pick up only the missing days.

    Parameters:
        year: year whose log files are downloaded.

    Returns:
        None. The (url, status_code) pairs of downloads that did not return
        HTTP 200 are printed under the heading 'failed urls'.
    '''
    # save path
    save_dir = f"{large_file_dir}/log/{year}"
    os.makedirs(save_dir, exist_ok=True)

    # links to the daily log files
    url_lst = get_year_log_url(year)

    # Delayed tasks: calling get_day_log only records the call, so all
    # downloads can be handed to dask in a single compute().
    task_lst = []
    for url in url_lst:
        save_f = save_dir + '/' + url[-15:-4] + '.zip'

        # skip if already exists
        if os.path.isfile(save_f):
            continue

        task_lst.append(get_day_log(url, save_f))

    with ProgressBar():    # actually run the downloads
        # dask.compute returns a tuple with one entry per argument; unpack it
        # to get the list of (url, status_code) results itself.
        (results,) = dask.compute(task_lst)

    task_error = [(url, code) for url, code in results if code != 200]
    print('failed urls', task_error, sep='\n')

    return

        
 

In [118]:
# Download the daily log files for 2003; files already on disk are skipped.
get_year_logs(2003)

[########################################] | 100% Completed | 22.9s


# get indices before a certain year

## download a qtr (with dask decorator)

In [96]:
@dask.delayed
def download_masterindex(year, qtr):
    '''
    Download the EDGAR master index of one quarter and save it as CSV.

    The zipped index is read from
    https://www.sec.gov/Archives/edgar/full-index/<year>/QTR<qtr>/master.zip,
    the first 9 lines of master.idx and the line after the column header are
    dropped, and the remaining '|'-separated table is written to
    <large_file_dir>/masterindex/mi_<year>_<qtr>.csv.
    Based on ND-SRAF / McDonald : 201606.

    Nothing is downloaded when the CSV already exists. Otherwise the download
    is attempted up to 10 times with a fixed 10 second pause between tries
    (temporary server/ISP issues); an error mentioning '404' stops the
    retries immediately.

    NOTE(review): the original author remarked that the requested date should
    not run past the latest available log; confirm the intended range with
    the caller.

    Parameters:
        year: year of the index.
        qtr: quarter of the index (used as 'QTR<qtr>' in the address).

    Returns:
        None when the CSV exists or was written, False when no index data
        could be obtained.
    '''
    from urllib.request import urlopen

    number_of_tries = 10
    sleep_time = 10  # seconds between tries (fixed, not increasing)

    PARM_ROOT_PATH = 'https://www.sec.gov/Archives/edgar/full-index/'

    # save location; checked first so an existing file costs no download
    save_dir = f'{large_file_dir}/masterindex/'
    os.makedirs(save_dir, exist_ok=True)
    save_f = f"{large_file_dir}/masterindex/mi_{year}_{qtr}.csv"
    if os.path.isfile(save_f):
        return

    #  using the zip file is a little more complicated but orders of magnitude faster
    append_path = str(year) + '/QTR' + str(qtr) + '/master.zip'  # /master.idx => nonzip version
    sec_url = PARM_ROOT_PATH + append_path

    # NOTE(review): unlike get_soup/get_day_log, this request sends no custom
    # User-Agent header; confirm the server accepts urllib's default.
    records = None  # stays None when every attempt fails
    for i in range(1, number_of_tries + 1):
        try:
            with urlopen(sec_url) as response:
                payload = response.read()
            with ZipFile(BytesIO(payload)) as zipfile:
                records = zipfile.read('master.idx').decode('utf-8', 'ignore').splitlines()[9:]
            break
        except Exception as exc:
            if i == 1:
                print('\nError in download_masterindex')
            print('  {0}. _url:  {1}'.format(i, sec_url))

            print('  Warning: {0}  [{1}]'.format(str(exc), time.strftime('%c')))
            # a missing file will not appear on retry; the last try has no retry
            if '404' in str(exc) or i == number_of_tries:
                break
            print('     Retry in {0} seconds'.format(sleep_time))
            time.sleep(sleep_time)

    if not records:
        print('error occurs on', year, qtr)
        return False

    # records[0] is the column header; records[1] is dropped before parsing
    index_str = '\n'.join(record for i, record in enumerate(records) if i != 1)

    index_pd = pd.read_csv(StringIO(index_str), sep='|')
    index_pd.to_csv(save_f, index=False)

    return




## download all indices up to a given year

In [105]:

def download_all_index(year):
    '''
    Download the quarterly master indexes for 1995 through `year` (inclusive).

    One download_masterindex task is created per (year, quarter) pair and all
    tasks are run together in a single dask.compute call with a progress bar.

    Parameters:
        year: last year to download.

    Returns:
        None. The visible download_masterindex writes each quarter to disk.
    '''
    dask_lst = []  # one delayed task per (year, quarter)

    # separate loop name so the `year` argument is not overwritten
    for index_year in range(1995, year + 1):
        for qtr in range(1, 5):
            dask_lst.append(download_masterindex(index_year, qtr))

    # run the tasks once, inside the progress bar so progress is displayed
    with ProgressBar():
        dask.compute(dask_lst)

    return
            


In [116]:
# Fetch the quarterly master indexes for 1995 through 2003.
download_all_index(2003)