In [None]:
# system/os/regex and basic math functions
import os
import re
import sys
import math
import json
import time
import string
import dateutil
import datetime as dt
from itertools import chain

# IPython display convenience stuff
from IPython.display import HTML, display, display_html, display_javascript
from IPython import __version__ as ipythonversion
import ipywidgets
print("IPython: {}".format(ipythonversion))

# Set logging level
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# print("Set logger to INFO, call logger.setLevel(logging.WARNING) to reduce the clutter.")

try:
    # numpy for matrix algebra
    import numpy as np
    print("Numpy (np): {}".format(np.version.full_version))
except ImportError:
    pass

try:
    # scipy for probability distributions and some statistical tests
    import scipy as sp
    import scipy.stats as stats
    print("Scipy (sp, stats): {}".format(sp.version.full_version))
except ImportError:
    pass

try:
    # pandas for data manipulation
    import pandas as pd
    print("Pandas (pd): {}".format(pd.__version__))

    def fmt_float(float_in):
        """
        Format a float for DataFrame display.

        Whole-number values are rendered without decimals (3.0 -> '3');
        every other value is rendered with three decimals (2.5 -> '2.500').
        NaN and infinities never compare equal to their floor, so they fall
        through to the second format, which renders them as 'nan' / 'inf'.
        """
        if float_in//1 == float_in:
            return '{:.0f}'.format(float_in)
        return '{:0.3f}'.format(float_in)

    # Use fully-qualified option names: abbreviated patterns such as
    # 'max_rows' are regex-matched against every registered option and raise
    # OptionError once more than one key matches (e.g. 'display.max_rows' and
    # 'styler.render.max_rows' in recent pandas releases).
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_columns', 50)
    pd.set_option('display.notebook_repr_html', True)
    pd.set_option('display.float_format', fmt_float)
except ImportError:
    pass

try:
    # SQLAlchemy for relational db management
    # import sqlalchemy as sa

    # matplotlib for plotting and pyplot for MATLAB-style API
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    plt.rcParams['figure.figsize'] = (15, 5) 
    print("MatPlotLib (mpl, plt): {}".format(mpl.__version__))
except ImportError:
    pass

try:
    # Seaborn for pretty plotting
    import seaborn as sns
    print("Seaborn (sns): {}".format(sns.__version__))
except ImportError:
    pass

try:
    # Scikit Learn for more regressions
    import sklearn as sk
    print("Scikit-Learn (sk): {}".format(sk.__version__))
except ImportError:
    pass

# statsmodels for econometrics
try:
    import statsmodels.api as sm
    print("Statsmodels (sm): {}".format(sm.__version__))
except ImportError:
    pass

try:
    # patsy for making formulas
    import patsy as pt
    print("Patsy (pt): {}".format(pt.__version__))
except ImportError:
    pass

try:
    # Gensim for textual analysis
    import gensim
    print("Gensim: {}".format(gensim.__version__))
except ImportError:
    pass

try:
    # TQDM for progress bar outputs.
    # Prefer the public `tqdm.notebook` module; `tqdm._tqdm_notebook` is a
    # private, deprecated alias that is only kept as a fallback for old
    # tqdm releases that predate the public module.
    try:
        from tqdm.notebook import tqdm
    except ImportError:
        from tqdm._tqdm_notebook import tqdm_notebook as tqdm
except ImportError:
    pass

try:
    # sas7bdat for reading SAS created databases
    from sas7bdat import SAS7BDAT as SASdb

    def sas_date_to_datetime(df_col):
        """
        Convert a column of day counts into datetimes.

        ``df_col`` is interpreted as a number of days and added to the
        module-level ``SAS_ZERO`` constant, which is looked up when the
        function is called (not when it is defined).
        """
        day_offsets = pd.to_timedelta(df_col, unit='d')
        return day_offsets + SAS_ZERO
except ImportError:
    pass

In [1]:
# Date constants
SAS_ZERO = dt.datetime(year=1960, month=1, day=1)  # base date that sas_date_to_datetime adds day counts to
MIN_DATE = dt.datetime(year=1900, month=1, day=1)
MAX_DATE = dt.datetime.today()  # evaluated once, when this cell is run

# Timedelta constants
TD_DAY = pd.Timedelta(days=1)
TD_YEAR = TD_DAY * 365

In [None]:
def C(df, cols=None):
    """
    Expand a whitespace-separated column spec into a list of column names.

    Parameters
    ----------
    df : DataFrame or str
        Frame whose ``.columns`` are used to expand wildcards. If a string is
        passed instead, it is treated as the column spec itself.
    cols : str, list or None
        A spec string such as ``'cik ret* p?ice'``. Tokens containing ``*``
        (any run of characters) or ``?`` (any single character) are turned
        into a case-insensitive regex anchored on word boundaries and replaced
        by every matching column of ``df``; other tokens are kept verbatim.
        Anything that is not a string is returned unchanged.

    Returns
    -------
    list of column labels when the spec is a string, otherwise ``cols`` as
    given (including ``None``).

    Also installed as ``DataFrame.C`` so it can be called as ``df.C('ret*')``.
    """
    spec_only = isinstance(df, str)
    if spec_only:
        cols = df
    if not isinstance(cols, str):
        return cols

    selected = []
    for token in cols.split():
        is_wildcard = '*' in token or '?' in token
        if spec_only or not is_wildcard:
            # Without a frame there is nothing to expand a wildcard against,
            # so the token is kept as written instead of failing on
            # ``str.columns``.
            selected.append(token)
            continue
        matcher = re.compile(r'\b'+token.replace('*', '.*').replace('?', '.')+r'\b', re.I)
        # str() lets non-string labels (ints, timestamps, ...) be matched
        # rather than raising TypeError inside the regex engine.
        selected.extend(c for c in df.columns if matcher.search(str(c)))
    return selected
pd.DataFrame.C = C

In [None]:
# print("linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession')")
def linkhead(df, n=5, title='', fields=None, cik='cik', accession='accession', return_df=False):
    """
    Display the first ``n`` rows of a dataframe as an HTML table.

    When both the ``cik`` and ``accession`` columns are present, an extra
    column holding ``edgarweb.edgar_links(row[cik], row[accession])`` is
    added. It is named 'links', or 'links0', 'links1', ... if that name is
    already taken.

    Parameters
    ----------
    df : DataFrame to preview.
    n : number of leading rows to show.
    title : optional heading rendered in an ``<h4>`` above the table.
    fields : column name or list of column names to show (default: all
        columns). The caller's list is never modified.
    cik, accession : names of the columns passed to ``edgar_links``.
    return_df : if True, return the displayed rows (all columns of
        ``df.head(n)`` plus the link column, when one was added).

    Returns
    -------
    The preview DataFrame when ``return_df`` is True and ``df`` is not empty,
    otherwise None.
    """
    if fields is None:
        fields = list(df.columns)
    elif isinstance(fields, str):
        fields = [fields]
    else:
        # Copy: the link column is appended below and must not leak into the
        # list object owned by the caller.
        fields = list(fields)

    if len(df) == 0:
        display_html(df[fields or list(df.columns)].assign(link='').to_html(), raw=True)
        return

    dfn = df.head(n).copy()

    # Links need both identifiers; skip them rather than raise KeyError when
    # only one of the two columns exists.
    if cik in dfn.columns and accession in dfn.columns:
        linkstr, i = 'links', 0
        while linkstr in dfn.columns:
            linkstr = 'links%d' % i
            i += 1
        dfn[linkstr] = dfn.apply(lambda row: edgarweb.edgar_links(row[cik], row[accession]), axis=1)
        fields.append(linkstr)

    html = f"<h4>{title}</h4>" if title else ''

    # Temporarily lift the column-width limit so the link markup is not
    # truncated; always restore the previous setting, even on error.
    w = pd.get_option('display.max_colwidth')
    try:
        try:
            pd.set_option('display.max_colwidth', None)
        except ValueError:
            # Old pandas releases only accept an int here and use -1 for
            # "no limit"; newer releases reject -1 and expect None.
            pd.set_option('display.max_colwidth', -1)
        html += dfn[fields].to_html(escape=False, index=False, na_rep="")
    finally:
        pd.set_option('display.max_colwidth', w)

    display_html(html, raw=True)

    if return_df:
        return dfn
    
try:
    # in case user doesn't have pyedgar installed
    from pyedgar.utilities import edgarweb
except ImportError:
    # ImportError (the parent of ModuleNotFoundError) matches the other
    # optional imports in this notebook and also covers an installed but
    # incomplete pyedgar that lacks `edgarweb`.
    print("Warning, pyedgar not found.")
    class _o_(object):
        """Stand-in for ``pyedgar.utilities.edgarweb`` when it cannot be imported."""
        @staticmethod
        def edgar_links(*args, **kwargs):
            """Accept any arguments and return an empty string."""
            return ''
    edgarweb = _o_()

In [None]:
# print("timehist(dtseries_or_df, time_variable='year', y_tic_number=4, x_tic_skip=0, *args, **kwargs)")
def timehist(dtseries_or_df, time_variable='year',
             y_tic_number=4, x_tic_skip=0,
             width=.9, ax=None, skip_retick=False,
             label=None,
             *args, **kwargs):
    """
    Histogram (bar chart) of observations per time period.

    The values to count are resolved in this order:
      1. ``dtseries_or_df.dt.<time_variable>`` (datetime-like Series),
      2. ``dtseries_or_df[time_variable]`` (e.g. a DataFrame column),
      3. ``dtseries_or_df`` itself.
    The result of ``value_counts().sort_index()`` is then drawn with ``ax.bar``.

    Parameters
    ----------
    dtseries_or_df : Series or DataFrame to count.
    time_variable : ``.dt`` attribute or key to count by.
    y_tic_number : number of y ticks to draw.
    x_tic_skip : number of x values skipped between labelled ticks.
    width, label : forwarded to ``ax.bar``.
    ax : axes to draw on. If None, a new figure is created with
        ``plt.figure(*args, **kwargs)`` (``figsize`` defaults to (13, 2)).
        If given, ``kwargs`` are forwarded to ``ax.bar`` instead and ``args``
        are unused.
    skip_retick : if True, leave axis limits and ticks untouched.

    Returns
    -------
    The axes drawn on, or None when there is nothing to plot or there are
    more than 1000 distinct levels.

    Note: sets the global seaborn style and context as a side effect.
    """
    x_tic_skip += 1
    sns.set_style('darkgrid')
    sns.set_context('talk', rc={'patch.linewidth': 0, 'patch.edgecolor': 'k', 'patch.facecolor': 'k'})
    _d = dtseries_or_df
    # Best-effort lookup; `except Exception` keeps the deliberate fallbacks but
    # no longer swallows KeyboardInterrupt / SystemExit like a bare except.
    try:
        _d = getattr(_d.dt, time_variable)
    except Exception:
        try:
            _d = _d[time_variable]
        except Exception:
            pass
    _g = _d.value_counts().sort_index()
    if len(_g) == 0:
        # min()/max() of an empty index are undefined, which would otherwise
        # fail later when setting axis limits and ticks.
        logger.warning("timehist: no observations to plot.")
        return
    if len(_g) > 1000:
        logger.error("ERROR: You are trying to plot something with too many levels. Don't do that.")
        return

    if ax is None:
        if 'figsize' not in kwargs:
            kwargs['figsize'] = (13,2)
        plt.figure(*args, **kwargs)
        ax = plt.gca()
        # If ax is none, assume kwargs are for figure generation.
        kwargs = {}

    ax.bar(_g.index, _g, width=width, label=label, **kwargs)

    if not skip_retick:
        # Format and label X axis
        ax.set_xlim(left=_g.index.min()-0.5, right=_g.index.max()+0.5)
        _t = _g.index[::x_tic_skip]
        ax.set_xticks(_t)
        # A concrete list (not a one-shot iterator) of tick labels
        ax.set_xticklabels([str(t) for t in _t], rotation='vertical')

        # Label Y axis: round the top tick up at one order of magnitude below
        # the largest count, then space y_tic_number ticks down from it.
        tene = math.log10(_g.max())//1-1
        topnum = math.ceil(_g.max() / 10**tene)
        ax.set_yticks([(topnum * i // y_tic_number)*10**tene for i in range(y_tic_number, 0, -1)])

    return ax