In [118]:
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [119]:
assetMDL = load('Outs/asset_svc_mdl_v1.joblib')
liableMDL = load('Outs/liability_svc_mdl_v1.joblib')

In [120]:
# load in asset and liability dataframes
assetDF = pd.read_csv('unstructAsset.csv')
liableDF = pd.read_csv('unstructLiable.csv')

In [163]:
# journal of physics A, Journal Stat. Physics
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set
    
    :param: unstructured_df (type pandas.DataFrame)
        unstuructured pandas dataframe with loose column construction 
    :param: cluster_df (type pandas.DataFrame)
        a pandas dataframe of clustered labels and corresponding line items
    :param: (type numpy array)
        all corresponding cluster labels cirresponding with 'cluster_df' parameter
        
    :return: (type pandas DataFrame)
    """
    
    structured_df = pd.DataFrame()
    label_names = np.unique(cluster_df.Labels.values)
    remap = {}
    
    # assume that the there exists columns 'CIK' and 'Year' for unstructured data
    structured_df = unstructured_df[['CIK', 'Name', 'Year']]
    
    for label in label_names:
        data = cluster_df[cluster_df['Labels'] == label]['LineItems']     # filter by corresponding cluster
        
        # we first select all predicted columns, sum across rows, replace 0 with NaN and report figures
        sumV = unstructured_df[data.values].sum(axis=1).replace({0:np.nan})
        
        # assign dictionary to have labels and matching vector
        remap[label] = sumV

    structured_df = structured_df.assign(**remap)   
    return structured_df

## Use Classificaiton model to predict label names for each line item

In [122]:
asset_predictions = pd.DataFrame([assetDF.columns[3:], 
                                  assetMDL.predict(HashingVectorizer(n_features=1000).fit_transform(assetDF.columns[3:]))], 
                                 index=['LineItems', 'Labels']).T

liable_predictions = pd.DataFrame([liableDF.columns[3:], 
                                   liableMDL.predict(HashingVectorizer(n_features=1000).fit_transform(liableDF.columns[3:]))], 
                                  index=['LineItems', 'Labels']).T

## Examine robustness of structured databases

In [175]:
assetterms = ['Account receivable', 'Accumulated depreciation and amortization', 'Cash and cash equivalents',
              'Cash and securities segregated for benefit of customers', 'Deferred tax asset', 
              'Deposits with clearing organizations', 'Due from broker dealer', 'Due from customer and counterparties',
              'Due from employees', 'Due from parent and affiliates', 'Escrow account balances', 'Exchange memberships',
              'Financial instruments owned, at fair value', 'Fixed assets','Goodwill, net amortization', 
              'Other assets', 'Other receivables', 'Prepaid expenses and other assets', 'Property, plant and equipment', 
              'Receivable from broker-dealers', 'Receivables from customers and counterparties',
              'Reverse Repurchase Agreements (reverse-repo)', 'Securities received as collateral', 
              'U.S. government and government agency']
liableterms = ['Accounts payable', 'Accrued liabilities', 'Current liabilities', 
               'Deferred liability', 'Due to customers and members', 'Due to parent and affiliates',
               'Due to third party affiliates', 'Government and agency securities obligations', 'Income tax payable', 
               'Lease Liability', 'Long-term borrowing', 'Other liabilities', 'Payable to Broker/Dealers',
               'Payable to customers and counterparties', 'Payable to parent and affiliates', 
               'Repurchase Agreements (repo)', 'Securities borrowed', 'Securities sold short', 'Short-term borrowing',
               'Subordinated liabilities', 'Taxes payable']
equityterms = ['Common stock, par value', 'Other equity', 'Retained (Accumulated) earnings', 'Treasury stock',
               'Additional Paid-in capital',]

In [171]:
# construct the strucutred data set 
tempdf = structured_data(assetDF, asset_predictions)
tempdf.to_csv('structAsset.csv', index=False)

# filter out the Total Assets total 
cols = tempdf.columns[3:]
cols = cols[~np.isin(cols, 'Total assets')]

# compute the total asset check for each of the firms (robustness for equality figure)
tempdf = tempdf.assign(Check=(tempdf['Total assets'] == tempdf[cols].sum(axis=1)))

In [172]:
tempdf[tempdf.Check == False]

Unnamed: 0,CIK,Name,Year,Accumulated depreciation and amortization,Cash and cash equivalents,Cash and securities segregated for benefit of customers,Deposits with clearing organizations,Exchange memberships,"Financial instruments owned, at fair value",Fixed assets,...,Other assets,Other receivables,"Property, plant and equipment",Receivable from broker-dealers,Receivables from customers and counterparties,Reverse Repurchase Agreements (reverse-repo),Securities received as collateral,Total assets,U.S. government and government agency,Check
17,42352,GOLDMAN SACHS & CO. LLC,2002,,24258610.0,,,,31630471.0,,...,1941781.0,,,4408164.0,13165513.0,114571529.0,,,,False
18,42352,GOLDMAN SACHS & CO. LLC,2003,,19384642.0,,,,81056494.0,,...,1716894.0,,,3964833.0,10265518.0,138122674.0,,,,False
19,42352,GOLDMAN SACHS & CO. LLC,2004,,26573659.0,,,,106642938.0,,...,1981583.0,,,7976880.0,10289117.0,167604597.0,,,,False
20,42352,GOLDMAN SACHS & CO. LLC,2005,,34330737.0,,,,130905560.0,,...,3329362.0,,,8974752.0,12342912.0,224160434.0,,348590977.0,,False
21,42352,GOLDMAN SACHS & CO. LLC,2006,,43053739.0,,,,212709782.0,,...,5357602.0,,,9648246.0,14642116.0,296815545.0,,475872139.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,91154,CITIGROUP GLOBAL MARKETS INC.,2016,,3641.0,,,,,,...,62439.0,,279.0,13239.0,48702.0,160671.0,8716.0,253416.0,24756.0,False
155,91154,CITIGROUP GLOBAL MARKETS INC.,2017,,5386.0,,,,,,...,66032.0,,71.0,13446.0,46941.0,160143.0,9307.0,261644.0,36821.0,False
156,91154,CITIGROUP GLOBAL MARKETS INC.,2018,,5146.0,,,,,,...,62383.0,,,11419.0,51904.0,162683.0,15443.0,272544.0,31492.0,False
157,91154,CITIGROUP GLOBAL MARKETS INC.,2019,,5875.0,,,,,,...,73741.0,,,11405.0,54181.0,173896.0,15877.0,298908.0,37088.0,False


In [176]:
# construct the strucutred data set 
tempdf = structured_data(liableDF, liable_predictions)
tempdf.to_csv('structLiable.csv', index=False)

# filter out the Total Liability, Equity and Combined totals  
cols = tempdf.columns[3:]
cols = cols[~np.isin(cols, ['Total liabilities', "Total liabilities and shareholder's equity", 
                            "Total shareholder's equity"])]

# compute the total asset check for each of the firms (robustness for equality figure)
tempdf = tempdf.assign(Check1=(tempdf['Total liabilities'] == tempdf[liableterms].sum(axis=1)))
tempdf = tempdf.assign(Check2=(tempdf["Total shareholder's equity"] == tempdf[equityterms].sum(axis=1)))
tempdf = tempdf.assign(Check2=(tempdf["Total liabilities and shareholder's equity"] == tempdf[cols].sum(axis=1)))

KeyError: "['Lease Liability', 'Deferred liability', 'Due to third party affiliates', 'Due to parent and affiliates', 'Taxes payable', 'Due to customers and members'] not in index"

In [177]:
tempdf 

Unnamed: 0,CIK,Name,Year,Accounts payable,Accrued liabilities,Additional Paid-in capital,"Common stock, par value",Current liabilities,Government and agency securities obligations,Income tax payable,...,Repurchase Agreements (repo),Retained (Accumulated) earnings,Securities borrowed,Securities sold short,Short-term borrowing,Subordinated liabilities,Total liabilities,Total liabilities and shareholder's equity,Total shareholder's equity,Treasury stock
0,1224385,"WELLS FARGO SECURITIES, LLC",2004,,27595.0,,,,,,...,13602472.0,39849.0,,3686980.0,,538500.0,19715370.0,21509787.0,1255917.0,
1,1224385,"WELLS FARGO SECURITIES, LLC",2005,,83799.0,,,,,,...,21430507.0,,,6775259.0,,1183500.0,32320448.0,35611655.0,2107707.0,
2,1224385,"WELLS FARGO SECURITIES, LLC",2006,,59321.0,,,,,,...,12026974.0,,,5059406.0,,1183500.0,20118057.0,24029490.0,2727933.0,
3,1224385,"WELLS FARGO SECURITIES, LLC",2007,2536.0,75497.0,,,,,,...,9294056.0,902450.0,,4241568.0,,1183500.0,16498871.0,20879923.0,3197552.0,
4,1224385,"WELLS FARGO SECURITIES, LLC",2008,,92477.0,,,,,,...,11870678.0,724933.0,,3605979.0,,1333500.0,17307282.0,21660817.0,3020035.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,91154,CITIGROUP GLOBAL MARKETS INC.,2016,,,9099.0,10.0,,24756.0,,...,129960.0,33.0,,,10606.0,9945.0,234329.0,253416.0,20415.0,
155,91154,CITIGROUP GLOBAL MARKETS INC.,2017,,,8921.0,10.0,,36821.0,,...,138098.0,263.0,,,5576.0,9945.0,252450.0,261644.0,21067.0,
156,91154,CITIGROUP GLOBAL MARKETS INC.,2018,,,8937.0,10.0,,31492.0,,...,146794.0,68.0,,,1251.0,9945.0,263529.0,272544.0,18984.0,
157,91154,CITIGROUP GLOBAL MARKETS INC.,2019,,,8802.0,10.0,,37088.0,,...,157690.0,48.0,,,508.0,9945.0,290048.0,298908.0,27197.0,


In [126]:
s3 = boto3.client('s3')
session = Session()

In [127]:
# retrieving downloaded files from s3 bucket
s3.download_file("ran-s3-systemic-risk", 'Output/X-17A-5-BS/1224385-2004.csv', 'temp.pdf')
df = pd.read_csv('temp.pdf')

In [128]:
pd.DataFrame([df.iloc[:11]['1'], df.iloc[:11]['2'],
             assetMDL.predict(HashingVectorizer(n_features=1000).fit_transform(df.iloc[:11]['1']))], 
             index=['LineItems', 'Values', 'Labels']).T

Unnamed: 0,LineItems,Values,Labels
0,Cash,$ 176,Cash and cash equivalents
1,Cash segregated pursuant to federal regulations,75000,Cash and cash equivalents
2,Securities purchased under agreements to resell,7944113,Reverse Repurchase Agreements (reverse-repo)
3,"Securities owned, at market value ($8,769,300 ...",9075170,Securities received as collateral
4,Receivable from broker-dealers and clearing or...,4049708,Receivable from broker-dealers
5,Receivable from customers,51246,Receivables from customers and counterparties
6,Accrued interest receivable,44457,Other receivables
7,"Property, equipment, and leasehold improvement...",11067,"Property, plant and equipment"
8,"Goodwill, net",6112,"Goodwill, net amortization"
9,Other assets,252738,Other assets


In [129]:
a.iloc[0]

CIK                                                                             1224385
Name                                                       WELLS FARGO SECURITIES, LLC 
Year                                                                               2004
Accumulated depreciation and amortization                                           NaN
Cash and cash equivalents                                                         75176
Cash and securities segregated for benefit of customers                             NaN
Deposits with clearing organizations                                                NaN
Exchange memberships                                                                NaN
Financial instruments owned, at fair value                                          NaN
Fixed assets                                                                        NaN
Goodwill, net amortization                                                         6112
Other assets                    

In [130]:
b = assetDF[assetDF.columns[3:]].iloc[0]

In [136]:
b[~np.isnan(b)]

Accrued interest receivable                                                44457.0
Cash                                                                         176.0
Cash segregated pursuant to federal regulations                            75000.0
Goodwill, net                                                               6112.0
Other assets                                                              252738.0
Property, equipment, and leasehold improvements, net                       11067.0
Receivable from broker-dealers and clearing organizations                4049708.0
Receivable from customers                                                  51246.0
Securities owned, at market value ($8,769,300 pledged as collateral)     9075170.0
Securities purchased under agreements to resell                          7944113.0
Total assets                                                            21509787.0
Name: 0, dtype: float64