In [7]:
import os
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [8]:
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame, col_preserve:list) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set
    ------------------------------------------------------------------------------------
    Every group of line-item columns that shares a label is collapsed into a single
    column holding the row-wise sum of the group's numeric values.

    Input:
        :param: unstructured_df (type pandas.DataFrame)
            unstructured pandas dataframe with loose column construction; it must
            contain every column named in `col_preserve` and every column named in
            cluster_df['LineItems']
        :param: cluster_df (type pandas.DataFrame)
            a pandas dataframe with a 'Labels' column (the cluster / class of a line
            item) and a 'LineItems' column (the matching column name in unstructured_df)
        :param: col_preserve (type list)
            names of the columns copied unchanged to the front of the result
    Output:
        :return: (type pandas DataFrame)
            the preserved columns followed by one column per unique label, in the
            sorted order returned by numpy.unique
    """
    # explicit copy: the label columns are added below and must never touch the input frame
    structured_df = unstructured_df[col_preserve].copy()

    for label in np.unique(cluster_df['Labels'].values):
        # column names of every line item assigned to this label
        members = cluster_df.loc[cluster_df['Labels'] == label, 'LineItems']

        # select the member columns, then sum across each row using numeric figures only
        selection = unstructured_df[members.values]
        row_total = selection.sum(axis=1, numeric_only=True)

        # a row where every member column is missing carries no information, so it
        # is reported as NaN rather than as the 0 that an all-NaN sum produces
        row_total[selection.isnull().all(axis=1)] = np.nan

        # plain column assignment (rather than assign(**kwargs)) also accepts labels
        # that are not strings, e.g. integer cluster ids
        structured_df[label] = row_total

    return structured_df

In [9]:
def prediction_probabilites(line_items:np.array, clf_mdl, vec_mdl) -> pd.DataFrame:
    """
    Constructs a mapping convention for the machine learning predictions 
    ------------------------------------------------------------------------------------
    Input:
        :param: line_items (type numpy.array)
            list of all unstructured line item names
        :param: clf_mdl (type joblib.obj)
            a classification model exposing predict, predict_proba and classes_
        :param: vec_mdl (type joblib.obj)
            a feature extraction model for string/text data exposing transform
    Output:
        :return: (type pandas DataFrame)
            one row per line item with the columns 'Line Items', 'Predicted Class',
            one probability column per entry of clf_mdl.classes_, and
            'Total Prediction score'
    """
    # vectorise once and reuse the features for both the class and the probability
    # prediction; transform (not fit_transform) is used so that a trained feature
    # extractor is never re-fitted on the data it is asked to score
    features = vec_mdl.transform(line_items)

    # the actual line items that are used as predictors
    lines = pd.DataFrame(line_items, columns=['Line Items'])

    # predict the corresponding class for each line item
    prediction = pd.DataFrame(data=clf_mdl.predict(features), columns=['Predicted Class'])

    # compute the probability for each prediction to the accompanying classes
    prediction_probability = pd.DataFrame(data=clf_mdl.predict_proba(features),
                                          columns=clf_mdl.classes_)

    # row-wise sum of the per-class probabilities of each line item
    prediction_probability['Total Prediction score'] = prediction_probability.sum(axis=1)

    # join the line items to the prediction probabilities (all frames share the default index)
    return lines.join(prediction).join(prediction_probability)

In [10]:
def company_pdf(df:pd.DataFrame, mdl):
    """
    Debugging helper: tabulate a company's line items next to their predicted class and value.

    The line-item names are read from the index of `df` and the figures from its values.
    Returns a dataframe with the columns 'Original Lineitems', 'Predicted Lineitems'
    and 'Line values'; the sizes of the three underlying arrays are printed first.
    """
    # names come from the index, figures from the underlying value array
    line_names = df.index
    line_values = df.values

    # NOTE: the text is hashed into 1000 features before prediction; this
    # pre-processing is specific to the model that is passed in
    hasher = HashingVectorizer(strip_accents='unicode',
                               lowercase=True, analyzer='word',
                               n_features=1000, norm='l2')
    predicted_names = mdl.predict(hasher.fit_transform(line_names))

    # report the three lengths so that a mismatch is visible before the frame is built
    for part in (predicted_names, line_names, line_values):
        print(part.size)

    columns = {
        'Original Lineitems': line_names,                    # the original line items
        'Predicted Lineitems': predicted_names,              # the predicted line items
        'Line values': line_values.flatten().tolist(),       # the corresponding line values
    }
    return pd.DataFrame(columns)

In [11]:
def relative_indicator(pct):
    """
    Determines the level of matching accuracy for a particular firm/year

    :param: pct
        a single relative error (any real scalar) or a collection of relative errors
        (list, numpy array, pandas Series); for a collection the smallest non-missing
        value is the one that is classified
    :return:
        'NOT FOUND'      when there is no non-missing value
        'PERFECT MATCH'  when the value is exactly 0
        'BOUNDED MATCH'  when 0 < value < 0.01
        'GROSS MISMATCH' when value >= 0.01
        None             when the value is negative (it matches none of the bands)
    """
    # treat scalars and collections alike: a scalar becomes a one-element array
    errors = np.asarray(pct, dtype=float).ravel()

    # missing entries are dropped before taking the minimum, because the builtin
    # min() gives an order-dependent answer as soon as a NaN is involved
    errors = errors[~np.isnan(errors)]
    if errors.size == 0:
        return 'NOT FOUND'

    smallest = errors.min()

    if smallest == 0:
        return 'PERFECT MATCH'
    if 0 < smallest < 0.01:
        return 'BOUNDED MATCH'
    if smallest >= 0.01:
        return 'GROSS MISMATCH'

    # negative relative errors are left unclassified
    return None
    

In [12]:
def relative_finder(pct):
    """
    Finds the smallest relative error among the candidates for a particular firm/year

    :param: pct
        a collection of relative errors (list, numpy array, pandas Series) or a
        single real scalar
    :return:
        the smallest non-missing value, or NaN when every value is missing or the
        collection is empty
    """
    errors = np.asarray(pct, dtype=float).ravel()

    # drop missing entries first: the builtin min() returns an order-dependent
    # result when NaN is present among the values
    errors = errors[~np.isnan(errors)]
    if errors.size == 0:
        return np.nan

    return errors.min()

# Main Execution

In [21]:
if __name__ == "__main__":

    # Builds the structured asset and liability databases: downloads the unstructured
    # CSVs from S3, classifies every line-item column with the trained models, folds
    # the columns into their predicted classes, checks the reconstructed totals against
    # the reported totals and uploads every resulting table back to S3.

    # ==============================================================================
    # Configuration
    # ==============================================================================
    bucket = 'ran-s3-systemic-risk'
    output_folder = 'Output/'

    # NOTE(review): absolute paths tie this cell to one SageMaker instance layout
    local_output = '/home/ec2-user/SageMaker/SEC_X17A5/output/'
    model_folder = '/home/ec2-user/SageMaker/SEC_X17A5/code/notebook/ml-model/trained_models/'

    # identifier columns carried over unchanged into the structured tables
    id_columns = ['CIK', 'Name', 'Filing Date', 'Filing Year']
    total_le = "Total liabilities and shareholder's equity"

    # S3 client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()

    def _publish_csv(frame, name, keep_local=True):
        """Write `frame` to the local output folder, upload it to S3 under the same
        name, and delete the local copy afterwards unless `keep_local` is True."""
        filename = local_output + name
        frame.to_csv(filename, index=False)
        with open(filename, 'rb') as data:
            s3.put_object(Bucket=bucket, Key=output_folder + name, Body=data)
        if not keep_local:
            os.remove(filename)

    # ==============================================================================
    # check available files stored within desired output-folder
    # NOTE(review): the listing is never read below - presumably a leftover
    # connectivity check; confirm before removing
    s3_path = session.list_s3_files(bucket, output_folder)

    # retrieving the unstructured asset / liability files from the s3 bucket
    downloads = (('Output/unstructured_assets.csv', 'unstructAsset.csv'),
                 ('Output/unstructured_liable.csv', 'unstructLiable.csv'))
    try:
        for key, local_name in downloads:
            s3.download_file(bucket, key, local_name)

        # load in asset and liability dataframes
        assetDF = pd.read_csv('unstructAsset.csv')
        liableDF = pd.read_csv('unstructLiable.csv')
    finally:
        # the frames live in memory, so the local copies are removed even when a
        # download or a read fails part-way through
        for _, local_name in downloads:
            if os.path.exists(local_name):
                os.remove(local_name)
    # ==============================================================================

    # load in sklearn classification models
    # NOTE: joblib.load unpickles the file, so only trusted model files may be loaded
    assetMDL = load(model_folder + 'asset_log_reg_mdl_v2.joblib')
    liableMDL = load(model_folder + 'liability_log_reg_mdl_v2.joblib')

    str_mdl = HashingVectorizer(strip_accents='unicode', lowercase=True, analyzer='word', n_features=1000, norm='l2')

    # NOTE: the first 4 columns are skipped - presumably the identifier columns
    # listed in id_columns; verify against the layout of the unstructured CSVs
    a_columns = assetDF.columns[4:]
    l_columns = liableDF.columns[4:]

    # Use classification model to predict label names for each line item
    # (the hashing vectorizer is stateless, so a plain transform is sufficient)
    asset_label_predictions = assetMDL.predict(str_mdl.transform(a_columns))
    liable_label_predictions = liableMDL.predict(str_mdl.transform(l_columns))

    # line item -> predicted label lookup tables for asset and liability terms
    struct_asset_map = pd.DataFrame([a_columns, asset_label_predictions],
                                    index=['LineItems', 'Labels']).T
    struct_liable_map = pd.DataFrame([l_columns, liable_label_predictions],
                                     index=['LineItems', 'Labels']).T

    # construct the line-item prediction classes with corresponding probabilites
    a_proba_df = prediction_probabilites(a_columns, assetMDL, str_mdl)
    l_proba_df = prediction_probabilites(l_columns, liableMDL, str_mdl)

    # ==============================================================================
    # Auxillary Database Files
    # ==============================================================================

    # probability tables: uploaded, local copy removed
    _publish_csv(a_proba_df, 'asset_prediction_proba.csv', keep_local=False)
    _publish_csv(l_proba_df, 'liable_prediction_proba.csv', keep_local=False)

    # name maps: uploaded, local copy kept
    _publish_csv(struct_asset_map, 'asset_name_map.csv')
    _publish_csv(struct_liable_map, 'liability_name_map.csv')

    # ==============================================================================
    # Database construction
    # ==============================================================================

    # ---- assets ------------------------------------------------------------------
    struct_asset_df = structured_data(assetDF, struct_asset_map,
                                      col_preserve=list(id_columns))

    # we drop ammended releases, preserving unique CIKs with Filing Year
    struct_asset_df = struct_asset_df.drop_duplicates(subset=['CIK', 'Filing Year'], keep='first')

    # every column except the identifiers and the reported total is a line item
    asset_lines = struct_asset_df.columns[~np.isin(struct_asset_df.columns,
                                                   id_columns + ['Total assets'])]

    # rebuild the total from the line items and measure the distance to the reported total
    struct_asset_df['Reconstructed Total assets'] = struct_asset_df[asset_lines].sum(axis=1)
    struct_asset_df['Relative Error'] = (
        abs(struct_asset_df['Reconstructed Total assets'] - struct_asset_df['Total assets'])
        / struct_asset_df['Total assets']
    )
    struct_asset_df['Total asset check'] = struct_asset_df['Relative Error'].apply(relative_indicator)

    _publish_csv(struct_asset_df, 'structured_asset.csv')

    # ---- liabilities and equity --------------------------------------------------
    struct_liable_df = structured_data(liableDF, struct_liable_map,
                                       col_preserve=list(id_columns))

    struct_liable_df = struct_liable_df.drop_duplicates(subset=['CIK', 'Filing Year'], keep='first')

    # every column except the identifiers and the reported grand total is a line item
    liable_lines = struct_liable_df.columns[~np.isin(struct_liable_df.columns,
                                                     id_columns + [total_le])]

    # the line items still contain the sub-totals, so four candidate reconstructions
    # are built: the raw sum, and the sum less either or both of the sub-totals
    # (the row sum is computed once, before any column is added to the frame)
    line_sum = struct_liable_df[liable_lines].sum(axis=1)
    sub_liabilities = struct_liable_df['Total liabilities'].fillna(0)
    sub_equity = struct_liable_df["Total shareholder's equity"].fillna(0)

    reconstructions = {
        "Reconstructed Total liabilities and shareholder's equity": line_sum,
        "Reconstructed Total liabilities and shareholder's equity (less total liabilites)": line_sum - sub_liabilities,
        "Reconstructed Total liabilities and shareholder's equity (less total equity)": line_sum - sub_equity,
        "Reconstructed Total liabilities and shareholder's equity (less total L+E)": line_sum - sub_liabilities - sub_equity,
    }
    for recon_column, recon_values in reconstructions.items():
        struct_liable_df[recon_column] = recon_values

    # relative error of each reconstruction against the reported grand total
    error_columns = ['Relative Error1', 'Relative Error2', 'Relative Error3', 'Relative Error4']
    reported_total = struct_liable_df[total_le]
    for error_column, recon_column in zip(error_columns, reconstructions):
        struct_liable_df[error_column] = abs(struct_liable_df[recon_column] - reported_total) / reported_total

    # the best (smallest) of the four errors decides the check label and is reported
    struct_liable_df["Total liabilities & shareholder's equity check"] = struct_liable_df[error_columns].apply(relative_indicator, axis=1)
    struct_liable_df["Relative Error"] = struct_liable_df[error_columns].apply(relative_finder, axis=1)

    # the four intermediate error columns are not part of the exported table
    _publish_csv(struct_liable_df.drop(columns=error_columns), 'structured_liability.csv')
    # ==============================================================================

    print('The final structured dataframe has been created.')

The final structured dataframe has been created.


In [19]:
# # Amazon Textract client and Sagemaker session
# s3 = boto3.client('s3')
# session = Session()

# bucket = 'ran-s3-systemic-risk'
# output_folder = 'Output/'

# # ==============================================================================
# # check available pdfs stored within desired output-folder
# s3_path = session.list_s3_files(bucket, output_folder)

# # retrieving the unstructured asset values file from s3 bucket
# s3.download_file(bucket, 'Output/unstructured_assets.csv', 'unstructAsset.csv')
# s3.download_file(bucket, 'Output/unstructured_liable.csv', 'unstructLiable.csv')

# # load in asset and liability dataframes
# assetDF = pd.read_csv('unstructAsset.csv')
# liableDF = pd.read_csv('unstructLiable.csv')

# # remove local file after it has been created (variable is stored in memory)
# os.remove('unstructAsset.csv')
# os.remove('unstructLiable.csv')
# # ==============================================================================

In [18]:
# liableDF[(liableDF.CIK == 895502) & (liableDF['Filing Date'] == '2002-02-28')].T.dropna()