In [90]:
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [91]:
# Load the persisted asset and liability models (asset first, then liability);
# both are later used through their .predict method on line-item names.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
assetMDL, liableMDL = map(
    load,
    ('Outs/asset_svc_mdl_v1.joblib', 'Outs/liability_svc_mdl_v1.joblib'),
)

In [92]:
# read the unstructured asset and liability tables from their CSV files
assetDF, liableDF = (
    pd.read_csv(csv_path)
    for csv_path in ('unstructAsset.csv', 'unstructLiable.csv')
)

In [93]:
# journal of physics A, Journal Stat. Physics
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the loose line-item columns of a dataframe into one column per cluster label

    :param: unstructured_df (type pandas.DataFrame)
        dataframe holding 'CIK', 'Name' and 'Year' columns plus one column for every
        line item named in 'cluster_df'
    :param: cluster_df (type pandas.DataFrame)
        dataframe with a 'LineItems' column (column names of 'unstructured_df') and a
        'Labels' column (the cluster label assigned to each line item)

    :return: (type pandas DataFrame)
        the 'CIK', 'Name' and 'Year' columns plus one column per distinct label (in sorted
        label order); each label column holds the row-wise sum of the numeric line items
        carrying that label, or NaN where every one of those line items is missing in the row
    """

    # distinct labels, in the sorted order np.unique returns them
    distinct_labels = np.unique(cluster_df.Labels.values)

    # identifying columns are carried over unchanged
    id_columns = unstructured_df[['CIK', 'Name', 'Year']]

    totals_by_label = {}
    for cluster_label in distinct_labels:
        # names of the line items assigned to this cluster
        in_cluster = cluster_df['Labels'] == cluster_label
        member_items = cluster_df.loc[in_cluster, 'LineItems'].values

        member_columns = unstructured_df[member_items]

        # only numeric columns take part in the sum; NaN entries are skipped
        row_totals = member_columns.sum(axis=1, numeric_only=True)

        # a row where every member column is null is marked missing (NaN), which keeps
        # it distinct from a genuine total of 0.0
        fully_missing = member_columns.isnull().all(axis=1)
        row_totals[fully_missing] = np.nan

        totals_by_label[cluster_label] = row_totals

    # assign() returns a new frame, so the caller's dataframe is left untouched
    return id_columns.assign(**totals_by_label)

In [94]:
def company_pdf(df:pd.Series, mdl, n_features:int=1000) -> pd.DataFrame:
    """
    Return a dataframe for a company showcasing its line item names, the predicted class and the original values

    :param: df (type pandas.Series)
        one company's record: the index holds the line item names and the values hold the
        figures; a one-dimensional Series is required because 'Line values' is built
        directly from 'df.values'
    :param: mdl
        model exposing a 'predict' method, applied to the hashed line item names
    :param: n_features (type int)
        width of the HashingVectorizer feature space (default 1000);
        presumably this has to match the width the model was fitted with -- confirm

    :return: (type pandas DataFrame)
        columns 'Original Lineitems', 'Predicted Lineitems' and 'Line values', one row per
        entry of 'df'; an empty frame with those columns when 'df' has no entries
    """

    out_columns = ['Original Lineitems', 'Predicted Lineitems', 'Line values']

    # split values for company record according to names and values
    colNames = df.index
    colValues = df.values

    # nothing to classify (e.g. every figure was removed by dropna()): return an empty
    # frame with the usual columns instead of handing the model zero samples
    if len(colNames) == 0:
        return pd.DataFrame(columns=out_columns)

    # predicting the column groups from the hashed line item names
    hashed_names = HashingVectorizer(n_features=n_features).fit_transform(colNames)
    predNames = mdl.predict(hashed_names)

    return pd.DataFrame({
        out_columns[0]: colNames,
        out_columns[1]: predNames,
        out_columns[2]: colValues,
    })

## Use the classification models to predict label names for each line item

In [95]:
def _predict_line_item_labels(source_df, mdl, n_id_cols=3, n_features=1000):
    """
    Predict a label for every line-item column of an unstructured dataframe

    :param: source_df (type pandas.DataFrame)
        unstructured dataframe; its first 'n_id_cols' columns are skipped
        (presumably the 'CIK', 'Name', 'Year' identifiers -- confirm against the CSV layout)
    :param: mdl
        model exposing a 'predict' method, applied to the hashed column names
    :param: n_id_cols (type int)
        number of leading columns that are not line items
    :param: n_features (type int)
        width of the HashingVectorizer feature space

    :return: (type pandas DataFrame)
        one row per line-item column, with columns 'LineItems' and 'Labels'
    """
    line_items = source_df.columns[n_id_cols:]
    labels = mdl.predict(HashingVectorizer(n_features=n_features).fit_transform(line_items))
    # stack names and labels as two rows, then transpose into two columns
    return pd.DataFrame([line_items, labels], index=['LineItems', 'Labels']).T


asset_predictions = _predict_line_item_labels(assetDF, assetMDL)
liable_predictions = _predict_line_item_labels(liableDF, liableMDL)

### Structured Asset Terms

In [96]:
# build the structured data set for asset terms and write it out without the index column
tempdf = structured_data(unstructured_df=assetDF, cluster_df=asset_predictions)
tempdf.to_csv(path_or_buf='structAsset.csv', index=False)

In [97]:
# Write a small, reproducible sample of the structured asset table for inspection.
np.random.seed(0)
# Draw 20 row *positions* (not index labels), because .iloc below is positional.
# For a default 0..n-1 index this picks the same rows as sampling the labels did.
# NOTE(review): np.random.choice samples with replacement by default, so a row can
# appear more than once in sample.csv -- confirm that is acceptable.
rngSample = np.random.choice(len(tempdf), 20)
tempdf.iloc[rngSample].to_csv('sample.csv', index=False)

In [98]:
# Show the raw line items of one filing (CIK 91154, year 2003) next to their predicted labels:
# take the first matching row, skip the three leading columns, drop the empty line items.
company_pdf(
    assetDF.loc[(assetDF['CIK'] == 91154) & (assetDF['Year'] == 2003)]
    .iloc[0]
    .iloc[3:]
    .dropna(),
    assetMDL,
)

Unnamed: 0,Original Lineitems,Predicted Lineitems,Line values
0,"Brokers, dealers and clearing organizations",Receivable from broker-dealers,6282
1,Cash and cash equivalents,Cash and cash equivalents,581
2,Cash and securities segregated and on deposit ...,Cash and cash equivalents,1998
3,Contractual commitments,Other assets,876
4,Corporate debt securities,Other assets,10530
5,Customers,Receivables from customers and counterparties,15205
6,Deposits paid for securities borrowed,Deposits with clearing organizations,42750
7,Equity securities,Other assets,4263
8,Goodwill,"Goodwill, net amortization",351
9,Intangibles,"Goodwill, net amortization",36


### Structured Liability Terms

In [99]:
# build the structured data set for liability terms and write it out without the index column
tempdf = structured_data(unstructured_df=liableDF, cluster_df=liable_predictions)
tempdf.to_csv(path_or_buf='structLiable.csv', index=False)

In [100]:
# Show one filing (CIK 58056, year 2006) from the structured liability table:
# take the first matching row, skip the three leading columns, drop the empty entries.
# NOTE(review): this passes the structured frame (tempdf), whose columns are already the
# predicted labels, whereas the asset example passes the raw assetDF -- confirm that
# classifying the label names themselves is what is intended here.
company_pdf(
    tempdf.loc[(tempdf['CIK'] == 58056) & (tempdf['Year'] == 2006)]
    .iloc[0]
    .iloc[3:]
    .dropna(),
    liableMDL,
)

Unnamed: 0,Original Lineitems,Predicted Lineitems,Line values
0,Accounts payable,Accounts payable,15529300.0
1,Additional Paid-in capital,Additional Paid-in capital,7544870.0
2,"Common stock, par value","Common stock, par value",2.0
3,Government and agency securities obligations,Government and agency securities obligations,20677900.0
4,Other liabilities,Other liabilities,11430600.0
5,Payable to Broker/Dealers,Payable to Broker/Dealers,2461420.0
6,Payable to customers and counterparties,Payable to customers and counterparties,12661500.0
7,Repurchase Agreements (repo),Repurchase Agreements (repo),189866000.0
8,Subordinated liabilities,Subordinated liabilities,4273400.0
