In [1]:
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Restore the persisted asset and liability models (both expose .predict, used further down).
# NOTE(review): joblib files are pickle-based -- only load artifacts from a trusted source.
assetMDL, liableMDL = (
    load(model_path)
    for model_path in ('Outs/asset_svc_mdl_v1.joblib', 'Outs/liability_svc_mdl_v1.joblib')
)

In [3]:
# Read the unstructured asset and liability tables from their CSV files
assetDF, liableDF = (
    pd.read_csv(csv_path)
    for csv_path in ('unstructAsset.csv', 'unstructLiable.csv')
)

In [11]:
# journal of physics A, Journal Stat. Physics
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set

    Every column of 'unstructured_df' that 'cluster_df' maps to the same label
    is summed row-wise into a single column named after that label.

    :param: unstructured_df (type pandas.DataFrame)
        unstructured pandas dataframe with loose column construction; it must
        contain the columns 'CIK', 'Name' and 'Year' as well as every column
        named in cluster_df['LineItems']
    :param: cluster_df (type pandas.DataFrame)
        a pandas dataframe with a 'LineItems' column (column names found in
        'unstructured_df') and a 'Labels' column (the label assigned to each
        line item); labels may be strings or any other hashable value such
        as integer cluster ids

    :return: (type pandas DataFrame)
        a new dataframe holding 'CIK', 'Name' and 'Year' followed by one
        column per unique label, in the sorted order produced by np.unique.
        A row is NaN for a label when all of that label's line items are
        missing in that row; otherwise it is the sum of the numeric values
    :raises KeyError:
        if 'CIK', 'Name', 'Year' or any listed line item is not a column of
        'unstructured_df'
    """

    # copy the identifier columns so new columns are never written onto a
    # slice of the caller's dataframe
    structured_df = unstructured_df[['CIK', 'Name', 'Year']].copy()

    for label in np.unique(cluster_df['Labels'].values):
        # line items (column names) that belong to the current label
        line_items = cluster_df.loc[cluster_df['Labels'] == label, 'LineItems']

        # select all matching columns, then sum across rows for numeric figures only
        selection = unstructured_df[line_items.values]
        sumV = selection.sum(axis=1, numeric_only=True)

        # sum() turns an all-NaN row into 0.0; restore NaN there so that a
        # Missing value (NaN) stays distinct from a genuine blank term (0.0)
        sumV[selection.isnull().all(axis=1)] = np.nan

        # direct column assignment (rather than assign(**kwargs)) keeps the
        # same column order but also accepts labels that are not strings
        structured_df[label] = sumV

    return structured_df

## Use the classification models to predict a label name for each line item

In [5]:
def _predict_line_item_labels(model, line_items, n_features=1000):
    """
    Predict a label for every line-item name and tabulate the result.

    :param: model
        fitted estimator exposing .predict on hashed text features
    :param: line_items
        sequence of line-item names (here: dataframe column names)
    :param: n_features (type int)
        width of the hashed feature space
        NOTE(review): presumably this must equal the width the models were
        trained with -- confirm before changing

    :return: (type pandas DataFrame)
        one row per line item with columns 'LineItems' and 'Labels'
    """
    # HashingVectorizer is stateless, so a fresh instance per call is equivalent
    features = HashingVectorizer(n_features=n_features).fit_transform(line_items)
    return pd.DataFrame([line_items, model.predict(features)],
                        index=['LineItems', 'Labels']).T


# The first three columns are skipped -- assumes they are the identifier columns
# ('CIK', 'Name', 'Year') that structured_data carries over unchanged; TODO confirm
asset_predictions = _predict_line_item_labels(assetMDL, assetDF.columns[3:])
liable_predictions = _predict_line_item_labels(liableMDL, liableDF.columns[3:])

## Examine robustness of structured databases

In [6]:
# Reference vocabularies of balance-sheet label names, grouped by statement section.
# (None of the other visible cells reference these lists.)
assetterms = [
    'Account receivable',
    'Accumulated depreciation and amortization',
    'Cash and cash equivalents',
    'Cash and securities segregated for benefit of customers',
    'Deferred tax asset',
    'Deposits with clearing organizations',
    'Due from broker dealer',
    'Due from customer and counterparties',
    'Due from employees',
    'Due from parent and affiliates',
    'Escrow account balances',
    'Exchange memberships',
    'Financial instruments owned, at fair value',
    'Fixed assets',
    'Goodwill, net amortization',
    'Other assets',
    'Other receivables',
    'Prepaid expenses and other assets',
    'Property, plant and equipment',
    'Receivable from broker-dealers',
    'Receivables from customers and counterparties',
    'Reverse Repurchase Agreements (reverse-repo)',
    'Securities received as collateral',
    'U.S. government and government agency',
]

liableterms = [
    'Accounts payable',
    'Accrued liabilities',
    'Current liabilities',
    'Deferred liability',
    'Due to customers and members',
    'Due to parent and affiliates',
    'Due to third party affiliates',
    'Government and agency securities obligations',
    'Income tax payable',
    'Lease Liability',
    'Long-term borrowing',
    'Other liabilities',
    'Payable to Broker/Dealers',
    'Payable to customers and counterparties',
    'Payable to parent and affiliates',
    'Repurchase Agreements (repo)',
    'Securities borrowed',
    'Securities sold short',
    'Short-term borrowing',
    'Subordinated liabilities',
    'Taxes payable',
]

equityterms = [
    'Common stock, par value',
    'Other equity',
    'Retained (Accumulated) earnings',
    'Treasury stock',
    'Additional Paid-in capital',
]

### Structured Asset Terms

In [7]:
# Build the structured asset data set from the predicted labels and save it as CSV
tempdf = structured_data(unstructured_df=assetDF, cluster_df=asset_predictions)
tempdf.to_csv(path_or_buf='structAsset.csv', index=False)

In [8]:
# Display the structured asset frame built in the previous cell
tempdf

Unnamed: 0,CIK,Name,Year,Accumulated depreciation and amortization,Cash and cash equivalents,Cash and securities segregated for benefit of customers,Deposits with clearing organizations,Exchange memberships,"Financial instruments owned, at fair value",Fixed assets,...,Other assets,Other receivables,"Property, plant and equipment",Receivable from broker-dealers,Receivables from customers and counterparties,Reverse Repurchase Agreements (reverse-repo),Securities received as collateral,Total assets,U.S. government and government agency,Check
20,42352,GOLDMAN SACHS & CO. LLC,2005,,34330737.0,,,,130905560.0,,...,3329362.0,,,8974752.0,12342912.0,224160434.0,,348590977.0,,False
21,42352,GOLDMAN SACHS & CO. LLC,2006,,43053739.0,,,,212709782.0,,...,5357602.0,,,9648246.0,14642116.0,296815545.0,,475872139.0,,False
22,42352,GOLDMAN SACHS & CO. LLC,2007,,2008168.0,53236610.0,,,211801140.0,,...,7274550.0,,,9551828.0,16139054.0,315139956.0,,509250736.0,,False
23,42352,GOLDMAN SACHS & CO. LLC,2008,,3815314.0,85274770.0,,,563457220.0,,...,5483735.0,,,20402378.0,22739706.0,50427432.0,,632858770.0,,False
24,42352,GOLDMAN SACHS & CO. LLC,2010,,3273.0,11459.0,,,477977.0,,...,4894.0,,,8193.0,20203.0,70499.0,,463755.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,91154,CITIGROUP GLOBAL MARKETS INC.,2016,,3641.0,,,,,,...,62439.0,,279.0,13239.0,48702.0,160671.0,8716.0,253416.0,24756.0,False
155,91154,CITIGROUP GLOBAL MARKETS INC.,2017,,5386.0,,,,,,...,66032.0,,71.0,13446.0,46941.0,160143.0,9307.0,261644.0,36821.0,False
156,91154,CITIGROUP GLOBAL MARKETS INC.,2018,,5146.0,,,,,,...,62383.0,,,11419.0,51904.0,162683.0,15443.0,272544.0,31492.0,False
157,91154,CITIGROUP GLOBAL MARKETS INC.,2019,,5875.0,,,,,,...,73741.0,,,11405.0,54181.0,173896.0,15877.0,298908.0,37088.0,False


### Structured Liability Terms

In [None]:
# Build the structured liability data set from the predicted labels and save it as CSV
tempdf = structured_data(unstructured_df=liableDF, cluster_df=liable_predictions)
tempdf.to_csv(path_or_buf='structLiable.csv', index=False)

In [None]:
# Display the structured liability frame built in the previous cell
tempdf