In [2]:
import os
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [3]:
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set
    ------------------------------------------------------------------------------------
    Input:
        :param: unstructured_df (type pandas.DataFrame)
            unstuructured pandas dataframe with loose column construction 
        :param: cluster_df (type pandas.DataFrame)
            a pandas dataframe of clustered labels and corresponding line items
    Output:
        :return: (type pandas DataFrame)
    """
    
    structured_df = pd.DataFrame()
    label_names = np.unique(cluster_df.Labels.values)
    remap = {}
    
    # assume that the there exists columns 'CIK' and 'Year' for unstructured data
    structured_df = unstructured_df[['CIK', 'Name', 'Filing Date', 'Filing Year']]
    
    for label in label_names:
        data = cluster_df[cluster_df['Labels'] == label]['LineItems']     # filter by corresponding cluster
        
        # we first select all predicted columns, then sum across rows for only numeric figures
        selection = unstructured_df[data.values]
        
        sumV = selection.sum(axis=1, numeric_only=True)
        
        # we then select rows from the original unstructured dataframe with 
        # only np.nan and convert sumV index to np.nan
        sumV[selection.isnull().all(axis=1)] = np.nan
        
        # assign dictionary to have labels and matching vector
        remap[label] = sumV

    structured_df = structured_df.assign(**remap)   
    return structured_df

In [36]:
def company_pdf(df:pd.DataFrame, mdl):
    """
    Return a dataframe for a company showcasing its column names, the predicted class and the original values.
    This function is used for error handling and de-bugging as it returns (Lineitems, Predictions, Linevalues) 
    """
    # split values for company dataframe according to columns and values
    colNames = df.index[4:]
    colValues = df.values
    
    # predicting the column groups with accompanying sklearn model
    # NOTE: We pre-process with a HashingVectorizer with 1000 features, this action is very model specific
    predNames = mdl.predict(HashingVectorizer(n_features=1000).fit_transform(colNames))
    print(colNames)
    print(predNames)
    print(colValues.flatten().tolist())
    
    retDF = pd.DataFrame({'Original Lineitems': colNames,                            # f
                          'Predicted Lineitems': predNames,                         # f
                          'Line values': colValues.flatten().tolist()})               # f
    
    return retDF

In [11]:
if __name__ == "__main__":
    
    # Amazon Textract client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()
    
    bucket = 'ran-s3-systemic-risk'
    output_folder = 'Output/'
    
    # ==============================================================================
    # check available pdfs stored within desired output-folder
    s3_path = session.list_s3_files(bucket, output_folder)
    
    # retrieving CIK-Dealers JSON file from s3 bucket
    s3.download_file(bucket, 'Output/unstructured_assets.csv', 'unstructAsset.csv')
    s3.download_file(bucket, 'Output/unstructured_liable.csv', 'unstructLiable.csv')

    # load in asset and liability dataframes
    assetDF = pd.read_csv('unstructAsset.csv')
    liableDF = pd.read_csv('unstructLiable.csv')

    # remove local file after it has been created (variable is stored in memory)
    os.remove('unstructAsset.csv')
    os.remove('unstructLiable.csv')
    # ==============================================================================
    
    # load in sklearn classification models
    assetMDL = load('asset_log_reg_mdl_v1.joblib')
    liableMDL = load('liability_log_reg_mdl_v1.joblib')
    
    # Use classification model to predict label names for each line item
    # (select the post-first 4 columns avoid the CIK, Name, Filing Date, Fiscal Year)
    asset_label_predictions = assetMDL.predict(HashingVectorizer(n_features=1000).fit_transform(assetDF.columns[4:]))
    liable_label_predictions = liableMDL.predict(HashingVectorizer(n_features=1000).fit_transform(liableDF.columns[4:]))
    
    # structured database for asset and liability terms 
    struct_asset_map = pd.DataFrame([assetDF.columns[4:], asset_label_predictions], 
                                   index=['LineItems', 'Labels']).T

    struct_liable_map = pd.DataFrame([liableDF.columns[4:], liable_label_predictions], 
                                    index=['LineItems', 'Labels']).T
    
    # ==============================================================================
    # Database construction 
    # ==============================================================================
    
    # structured database for asset and liability terms 
    tempdf = structured_data(assetDF, struct_asset_map)
    
    # write datafile to 
    filename1 = 'structured_asset.csv'
    tempdf.to_csv(filename1, index=False)
    with open(filename1, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=output_folder + filename1, Body=data)
        
        
    # structured database for asset and liability terms 
    tempdf = structured_data(liableDF, struct_liable_map)
    
    # write datafile to 
    filename2 = 'structured_liability.csv'
    tempdf.to_csv(filename2, index=False)
    with open(filename2, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=output_folder + filename2, Body=data)
    
    # remove local file after it has been created
    os.remove(filename1)
    os.remove(filename2)
    # ==============================================================================
    
    print('The final structured dataframe has been created.')

The final structured dataframe has been created.


In [32]:
# sample_df = df[(df['CIK'] == 58056) & (df['Filing Year'] == 2001)].T
# sample_df = sample_df.dropna()

In [33]:
# sample_df

In [34]:
# struct_liable_map[np.isin(struct_liable_map.LineItems, sample_df.index)]

In [5]:
# Amazon Textract client and Sagemaker session
s3 = boto3.client('s3')
session = Session()

bucket = 'ran-s3-systemic-risk'
output_folder = 'Output/'

# ==============================================================================
# check available pdfs stored within desired output-folder
s3_path = session.list_s3_files(bucket, output_folder)

# retrieving CIK-Dealers JSON file from s3 bucket
s3.download_file(bucket, 'Output/unstructured_assets.csv', 'unstructAsset.csv')
s3.download_file(bucket, 'Output/unstructured_liable.csv', 'unstructLiable.csv')

# load in asset and liability dataframes
assetDF = pd.read_csv('unstructAsset.csv')
liableDF = pd.read_csv('unstructLiable.csv')

# remove local file after it has been created (variable is stored in memory)
os.remove('unstructAsset.csv')
os.remove('unstructLiable.csv')
# ==============================================================================

# load in sklearn classification models
assetMDL = load('asset_log_reg_mdl_v1.joblib')
liableMDL = load('liability_log_reg_mdl_v1.joblib')

# Use classification model to predict label names for each line item
# (select the post-first 4 columns avoid the CIK, Name, Filing Date, Fiscal Year)
asset_label_predictions = assetMDL.predict(HashingVectorizer(n_features=1000).fit_transform(assetDF.columns[4:]))
liable_label_predictions = liableMDL.predict(HashingVectorizer(n_features=1000).fit_transform(liableDF.columns[4:]))

# structured database for asset and liability terms 
struct_asset_map = pd.DataFrame([assetDF.columns[4:], asset_label_predictions], 
                               index=['LineItems', 'Labels']).T

struct_liable_map = pd.DataFrame([liableDF.columns[4:], liable_label_predictions], 
                                index=['LineItems', 'Labels']).T

In [13]:
temp = liableDF[(liableDF.CIK == 42352) & (liableDF['Filing Date'] == '2003-01-28')].T.dropna()

In [14]:
temp

Unnamed: 0,19
CIK,42352
Name,GOLDMAN SACHS & CO. LLC
Filing Date,2003-01-28
Filing Year,2002
Securities sold under agreements to repurchase,4.88639e+10
Subordinated borrowings,6.52294e+09
Securities loaned,4.80185e+10
"Financial instruments sold, but not yet purchased, at fair value",1.51734e+10
Long-term borrowings,3.8638e+08
Other liabilities and accrued expenses,2.48323e+09


In [18]:
liableMDL.predict(HashingVectorizer(n_features=1000).fit_transform(temp.iloc[4:]))

array(['Repurchase Agreements (repo)', 'Subordinated liabilities',
       'Other liabilities', 'Securities sold short',
       'Long-term borrowing', 'Accrued liabilities', 'Other equity',
       'Payable to Broker/Dealers',
       'Payable to customers and counterparties', 'Short-term borrowing',
       'Income tax payable'], dtype=object)

In [38]:
temp

Unnamed: 0,19
CIK,42352
Name,GOLDMAN SACHS & CO. LLC
Filing Date,2003-01-28
Filing Year,2002
Securities sold under agreements to repurchase,4.88639e+10
Subordinated borrowings,6.52294e+09
Securities loaned,4.80185e+10
"Financial instruments sold, but not yet purchased, at fair value",1.51734e+10
Long-term borrowings,3.8638e+08
Other liabilities and accrued expenses,2.48323e+09


Unnamed: 0,LineItems,Labels
0,Accrued interest receivable,Other receivables
1,Cash,Cash and cash equivalents
2,Cash segregated pursuant to federal regulations,Cash and cash equivalents
3,"Goodwill, net","Goodwill, net amortization"
4,Other assets,Other assets
...,...,...
529,Securities borrowed or purchased under agreeme...,Reverse Repurchase Agreements (reverse-repo)
530,Other assets (including $660 at fair value),Other assets
531,Securities borrowed and purchased under agreem...,Reverse Repurchase Agreements (reverse-repo)
532,"Other assets (including $1,490 at fair value)",Other assets
