In [2]:
import os

In [3]:
# Absolute locations on the local filesystem.
DATA_ROOT = '/data/D1A/'
IDX_ROOT = '/data/storage/edgar/indices/'
FEED_ROOT = '/data/storage/edgar/feeds/'

# Locations under the current user's home directory ('~' is expanded here,
# once, so the constants hold ready-to-use paths).
SHARED_DATA_ROOT = os.path.expanduser('~/Dropbox/Documents/School/_data/big/')
PROJECT_DATA_ROOT = os.path.expanduser('~/Dropbox/Documents/School/Dissertation/2_RiskFactors/data/')
FIG_OUTPUT_DIR = os.path.expanduser('~/Dropbox/Documents/School/Dissertation/2_RiskFactors/outputs/')

# Unicode check-mark character.
CHECK = '\u2713'
# Column-name pair: ['cik', 'accession'].
CIKACC = ['cik', 'accession']

def savefig(name, fig, *args, **kwargs):
    """Save ``fig`` as ``<FIG_OUTPUT_DIR>/<name>.png``.

    Parameters
    ----------
    name : str
        File name without extension; ``.png`` is appended.
    fig : object
        Anything exposing a ``savefig(path, *args, **kwargs)`` method.
    *args, **kwargs
        Forwarded unchanged to ``fig.savefig``.  ``bbox_inches='tight'``,
        ``pad_inches=0.1`` and ``transparent=True`` are applied as defaults
        and can be overridden by passing the same keyword explicitly.
    """
    # Use setdefault rather than hard-coded keywords so that a caller passing
    # e.g. transparent=False overrides the default instead of triggering a
    # "got multiple values for keyword argument" TypeError.
    kwargs.setdefault('bbox_inches', 'tight')
    kwargs.setdefault('pad_inches', 0.1)
    kwargs.setdefault('transparent', True)
    fig.savefig(os.path.join(FIG_OUTPUT_DIR, '{}.png'.format(name)), *args, **kwargs)
    
def linkhead(df, n=5, fields=None):
    """Render the first ``n`` rows of ``df`` as an HTML table with a ``links`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cik`` column that can be cast to ``int``; each row is
        also handed to ``edgarweb.edgar_links``.
    n : int, default 5
        Number of leading rows to show.
    fields : iterable of str, optional
        Columns to display (any iterable; it is copied into a list).
        Defaults to every column of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``cik`` column or a requested field is missing.

    Notes
    -----
    The table is emitted through ``display_html(..., raw=True)`` with
    ``escape=False``, so whatever ``edgarweb.edgar_links`` returns is inserted
    verbatim (presumably HTML anchors -- confirm against ``edgarweb``).
    ``display.max_colwidth`` is lifted only for the duration of the call and is
    restored even when rendering fails.
    """
    # Accept tuples / Index objects too: list concatenation below needs a list.
    fields = list(df.columns) if fields is None else list(fields)

    previous_width = pd.get_option('display.max_colwidth')
    try:
        # Disable cell truncation so long link markup is not cut off.
        # Recent pandas spells "unlimited" as None and rejects -1; very old
        # releases only accept integers, so fall back to the legacy -1.
        try:
            pd.set_option('display.max_colwidth', None)
        except ValueError:
            pd.set_option('display.max_colwidth', -1)

        dfn = df.head(n).copy()
        dfn['cik'] = dfn['cik'].astype(int)
        dfn['links'] = dfn.apply(lambda row: edgarweb.edgar_links(row), axis=1)
        html = dfn[fields + ['links']].to_html(escape=False, index=False, na_rep="")

        display_html(html, raw=True)
    finally:
        # Always put the user's display setting back, even on error.
        pd.set_option('display.max_colwidth', previous_width)

In [5]:
from io import BytesIO
from zipfile import ZipFile
import requests

def download_ffind_zip(ind_num, timeout=60):
    """Download ``Siccodes<ind_num>.zip`` and return the text of its ``.txt`` member.

    Parameters
    ----------
    ind_num : int
        Number substituted into the archive and member names
        (``Siccodes{ind_num}.zip`` / ``Siccodes{ind_num}.txt``).
    timeout : float, default 60
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    str
        Contents of ``Siccodes{ind_num}.txt`` decoded with the default codec.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    zip_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes{}.zip'
               .format(ind_num))

    response = requests.get(zip_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of letting an HTML error page
    # reach ZipFile and surface as a confusing BadZipFile.
    response.raise_for_status()

    # Context managers make sure the archive and member handles are closed.
    with ZipFile(BytesIO(response.content)) as archive:
        with archive.open('Siccodes{}.txt'.format(ind_num)) as member:
            return member.read().decode()

def get_ffind_df(ind_num):
    """Build a SIC-code lookup table from a downloaded ``Siccodes<ind_num>`` file.

    The text returned by ``download_ffind_zip(ind_num)`` is scanned line by
    line: an industry header line (number, name, detail) selects the current
    industry, and every following ``from-to`` SIC range line is assigned to it.
    Codes not covered by any range keep the industry whose name is 'other'
    (case-insensitive) or, if the file has none, a synthetic 'Other' entry
    numbered ``ind_num``.

    Parameters
    ----------
    ind_num : int
        One of 5, 10, 12, 17, 30, 38, 48, 49.

    Returns
    -------
    pandas.DataFrame
        One row per SIC code 0..9999 with columns ``sic``, ``ff<ind_num>``
        (int), ``ff<ind_num>_name`` and ``detail``.

    Raises
    ------
    ValueError
        If ``ind_num`` is not one of the supported numbers.
    """
    supported = [5, 10, 12, 17, 30, 38, 48, 49]
    if ind_num not in supported:
        raise ValueError('Industry number must be one of {} not {}.'
                         .format(supported, ind_num))

    code_col = 'ff{0}'.format(ind_num)
    name_col = 'ff{0}_name'.format(ind_num)

    header_re = re.compile(r'^\s*(?P<ff{0}>\d\d?)\s+(?P<ff{0}_name>[a-z]+)\s+(?P<detail>.+)\s*$'
                           .format(ind_num), re.I|re.M)
    range_re = re.compile(r'^\s*(?P<sicfrom>\d{3,4})-(?P<sicto>\d{3,4})(?P<notes>\s+.+)?\s*$', re.I|re.M)

    text = download_ffind_zip(ind_num)

    # Default industry: first header named 'other', else a synthetic record.
    fallback = next(
        (hit.groupdict() for hit in header_re.finditer(text)
         if hit.group(name_col).lower() == 'other'),
        {code_col: ind_num, name_col: 'Other', 'detail': ''},
    )

    # Every 4-digit code starts out mapped to the default industry.
    sic_to_ind = dict.fromkeys(range(10000), fallback)

    active = fallback
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()

        header = header_re.search(stripped)
        if header is not None:
            # New industry section begins; subsequent ranges belong to it.
            active = header.groupdict()
            continue

        span = range_re.search(stripped)
        if span is None:
            continue

        first, last = int(span.group('sicfrom')), int(span.group('sicto'))
        sic_to_ind.update(dict.fromkeys(range(first, last + 1), active))

    table = pd.DataFrame.from_dict(sic_to_ind, orient='index')
    table.index.name = 'sic'
    # Header matches carry the number as text; normalise the column to int.
    table[code_col] = table[code_col].astype(int)
    return table.reset_index()

In [5]:
def yrhist(sr, ylabN=4):
    """Draw a wide bar chart of how often each value (or year) occurs in ``sr``.

    Parameters
    ----------
    sr : pandas.Series
        Values to count.  If the series supports the ``.dt`` accessor its
        ``.dt.year`` is counted; otherwise the values are counted as they are.
    ylabN : int, default 4
        Number of y-axis tick marks to place.
    """
    # Applies the seaborn 'darkgrid' style and 'talk' context before plotting.
    sns.set_style('darkgrid')
    sns.set_context('talk')
    _d = sr.sort_values()
    # Reduce datetimes to their year; anything without `.dt` is left untouched.
    # NOTE(review): the bare `except` also hides unrelated errors --
    # `except AttributeError` is probably what is intended; confirm.
    try:
        _d = _d.dt.year
    except:
        pass
    # Occurrences per distinct value (index = value, data = count).
    _g = _d.value_counts()
    plt.figure(figsize=(13,2))
    ax = plt.gca()
    plt.bar(_g.index, _g, width=1)
    
    # Axis formatting.
    # `tene` is floor(log10(max count)) - 1, i.e. one order of magnitude below
    # the largest count; `topnum` is the largest count expressed in units of
    # 10**tene and rounded up, so topnum * 10**tene >= max count.
    tene = math.log10(_g.max())//1-1
    topnum = math.ceil(_g.max() / 10**tene)
    
    # x-axis spans every integer from the smallest to the largest value, with
    # one vertical label per integer placed at i + 0.5.
    # NOTE(review): the +0.5 offset centres labels only if bars are
    # left-aligned; confirm against the `align` default of the matplotlib
    # version in use.
    ax.set_xlim(left=_g.index.min(), right=_g.index.max()+1)
    _t = np.arange(_g.index.min(), _g.index.max()+1)
    plt.xticks([i+.5 for i in _t], list(map(str,_t)), rotation='vertical')
    # `ylabN` ticks at (topnum * i // ylabN) * 10**tene for i = ylabN .. 1,
    # so the top tick sits at topnum * 10**tene.
    ax.set_yticks([(topnum * i // ylabN)*10**tene for i in range(ylabN, 0, -1)])