In [45]:
import os
import boto3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sagemaker.session import Session
from joblib import dump, load

from sklearn.feature_extraction.text import HashingVectorizer

In [46]:
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame, col_preserve:list) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set
    ------------------------------------------------------------------------------------
    Input:
        :param: unstructured_df (type pandas.DataFrame)
            unstructured pandas dataframe with loose column construction; it must contain
            every column named in `col_preserve` and in `cluster_df['LineItems']`
        :param: cluster_df (type pandas.DataFrame)
            a pandas dataframe with a 'Labels' column (the cluster label) and a 'LineItems'
            column (the name of the unstructured column assigned to that label)
        :param: col_preserve (type list)
            column names copied unchanged from `unstructured_df` into the output
    Output:
        :return: (type pandas DataFrame)
            the preserved columns plus one column per unique label (added in sorted label
            order); each label column is the row-wise sum of the numeric line items under
            that label, and is NaN on rows where every line item of the label is missing
    Raises:
        KeyError if a preserved column or a line item is absent from `unstructured_df`
    """
    # explicit copy, so adding the label columns below never writes into the input frame
    structured_df = unstructured_df[col_preserve].copy()

    # np.unique returns the labels sorted, which fixes the order of the new columns
    for label in np.unique(cluster_df['Labels'].values):
        # names of the unstructured columns that were clustered under this label
        line_items = cluster_df.loc[cluster_df['Labels'] == label, 'LineItems'].values
        selection = unstructured_df[line_items]

        # sum across rows, using only the numeric columns of the selection
        label_total = selection.sum(axis=1, numeric_only=True)

        # a plain sum reports 0 for a row with no data at all; mark those rows as missing
        # so that "nothing reported" stays distinguishable from a genuine zero
        label_total[selection.isnull().all(axis=1)] = np.nan

        # item assignment instead of assign(**kwargs): keyword arguments must be strings,
        # whereas cluster labels may be any hashable value (e.g. integer class ids)
        structured_df[label] = label_total

    return structured_df

In [47]:
def prediction_probabilites(line_items:np.array, clf_mdl, vec_mdl) -> pd.DataFrame:
    """
    Constructs a mapping convention for the machine learning predictions 
    ------------------------------------------------------------------------------------
    Input:
        :param: line_items (type numpy.array)
            list of all unstructured line item names
        :param: clf_mdl (type joblib.obj)
            a classification model exposing `predict`, `predict_proba` and `classes_`,
            used to assign each line item to a class
        :param: vec_mdl (type joblib.obj)
            a feature extraction model for string/text data exposing `fit_transform`
    Output:
        :return: (type pandas DataFrame)
            one row per line item with the columns 'Line Items', 'Predicted Class',
            one probability column per entry of `clf_mdl.classes_`, and
            'Total Prediction score' (row-wise sum of the probability columns)
    """
    # vectorize once and reuse the features for both the class and probability predictions
    # NOTE(review): fit_transform re-fits the vectorizer on the inputs; that is harmless for
    # a stateless vectorizer, but a stateful one should presumably use transform - confirm
    features = vec_mdl.fit_transform(line_items)

    # the actual line items that are used as predictors
    lines = pd.DataFrame(line_items, columns=['Line Items'])

    # predict the corresponding class for each line item
    prediction = pd.DataFrame(data=clf_mdl.predict(features), columns=['Predicted Class'])

    # compute the probability for each prediction to the accompanying classes
    prediction_probability = pd.DataFrame(data=clf_mdl.predict_proba(features),
                                          columns=clf_mdl.classes_)

    # sum across each row, giving the total class probability measure per line item
    # NOTE(review): this is 1.0 whenever predict_proba returns normalized rows - confirm
    # whether the model in use can actually produce totals other than 1.0
    prediction_probability['Total Prediction score'] = prediction_probability.sum(axis=1)

    # join the line items to the predictions and probabilities (aligned on the row index)
    return lines.join(prediction).join(prediction_probability)

In [48]:
def company_pdf(df:pd.DataFrame, mdl, vec_mdl=None):
    """
    Return a dataframe for a company showcasing its column names, the predicted class and the original values.
    This function is used for error handling and de-bugging as it returns (Lineitems, Predictions, Linevalues) 
    ------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            company data whose index holds the line item names and whose values hold the
            line values; the flattened values must have one entry per index entry
        :param: mdl (type joblib.obj)
            a classification model exposing `predict`
        :param: vec_mdl (type joblib.obj, optional)
            a feature extraction model exposing `fit_transform`; defaults to
            HashingVectorizer(n_features=1000), which must match how `mdl` was trained
    Output:
        :return: (type pandas DataFrame)
            columns 'Original Lineitems', 'Predicted Lineitems' and 'Line values'
    """
    # NOTE: the default pre-processing is a HashingVectorizer with 1000 features; this is
    # very model specific, so pass `vec_mdl` for a model trained on a different feature space
    if vec_mdl is None:
        vec_mdl = HashingVectorizer(n_features=1000)

    # split values for company dataframe according to line item names and values
    colNames = df.index
    colValues = df.values

    # predicting the column groups with accompanying sklearn model
    predNames = mdl.predict(vec_mdl.fit_transform(colNames))

    retDF = pd.DataFrame({'Original Lineitems': colNames,                       # the original line items
                          'Predicted Lineitems': predNames,                     # the predicted line items
                          'Line values': colValues.flatten().tolist()})         # the corresponding line values

    return retDF

In [49]:
def _read_csv_from_s3(s3_client, bucket_name:str, key:str, local_path:str) -> pd.DataFrame:
    """
    Downloads a csv object from an s3 bucket and loads it into a dataframe
    ------------------------------------------------------------------------------------
    Input:
        :param: s3_client
            a boto3 s3 client
        :param: bucket_name (type str)
            name of the s3 bucket holding the object
        :param: key (type str)
            key of the csv object within the bucket
        :param: local_path (type str)
            temporary local file used for the download; removed before returning
    Output:
        :return: (type pandas DataFrame)
    """
    s3_client.download_file(bucket_name, key, local_path)
    try:
        return pd.read_csv(local_path)
    finally:
        # the dataframe is held in memory, so the local copy is removed even if parsing fails
        os.remove(local_path)


def _upload_csv_to_s3(s3_client, df:pd.DataFrame, bucket_name:str, key:str) -> None:
    """
    Uploads a dataframe to an s3 bucket as a csv object (without the index column)
    ------------------------------------------------------------------------------------
    Input:
        :param: s3_client
            a boto3 s3 client
        :param: df (type pandas.DataFrame)
            the dataframe to serialise
        :param: bucket_name (type str)
            name of the destination s3 bucket
        :param: key (type str)
            key of the csv object to create/overwrite
    """
    # serialise in memory so no temporary local file is created or left behind on failure
    s3_client.put_object(Bucket=bucket_name, Key=key, Body=df.to_csv(index=False).encode('utf-8'))


if __name__ == "__main__":

    # Amazon S3 client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()

    bucket = 'ran-s3-systemic-risk'
    output_folder = 'Output/'

    # number of leading identifier columns that are not line items (see `col_preserve` below)
    n_meta_cols = 5
    # feature dimension of the HashingVectorizer; very model specific
    # NOTE(review): presumably this matches how the loaded models were trained - confirm
    n_hash_features = 1000

    # ==============================================================================
    # check available files stored within desired output-folder
    # NOTE(review): the listing is not used anywhere in this cell
    s3_path = session.list_s3_files(bucket, output_folder)

    # retrieve the unstructured asset and liability files from the s3 bucket
    assetDF = _read_csv_from_s3(s3, bucket, output_folder + 'unstructured_assets.csv', 'unstructAsset.csv')
    liableDF = _read_csv_from_s3(s3, bucket, output_folder + 'unstructured_liable.csv', 'unstructLiable.csv')
    # ==============================================================================

    # load in sklearn classification models
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source
    assetMDL = load('asset_log_reg_mdl_v1.joblib')
    liableMDL = load('liability_log_reg_mdl_v1.joblib')

    # every column after the leading identifier columns is treated as a line item
    asset_items = assetDF.columns[n_meta_cols:]
    liable_items = liableDF.columns[n_meta_cols:]

    # Use classification model to predict label names for each line item
    asset_label_predictions = assetMDL.predict(
        HashingVectorizer(n_features=n_hash_features).fit_transform(asset_items))
    liable_label_predictions = liableMDL.predict(
        HashingVectorizer(n_features=n_hash_features).fit_transform(liable_items))

    # line-item -> predicted label maps for asset and liability terms
    struct_asset_map = pd.DataFrame([asset_items, asset_label_predictions],
                                    index=['LineItems', 'Labels']).T

    struct_liable_map = pd.DataFrame([liable_items, liable_label_predictions],
                                     index=['LineItems', 'Labels']).T

    # construct the line-item prediction classes with corresponding probabilities
    a_proba_df = prediction_probabilites(asset_items,
                                         assetMDL,
                                         HashingVectorizer(n_features=n_hash_features))
    l_proba_df = prediction_probabilites(liable_items,
                                         liableMDL,
                                         HashingVectorizer(n_features=n_hash_features))

    # ==============================================================================
    # Database construction 
    # ==============================================================================

    # prediction probabilities and name maps are uploaded before the structured data is built
    _upload_csv_to_s3(s3, a_proba_df, bucket, output_folder + 'asset_prediction_proba.csv')
    _upload_csv_to_s3(s3, l_proba_df, bucket, output_folder + 'liable_prediction_proba.csv')

    # -------------------------------------------------------------------------------

    _upload_csv_to_s3(s3, struct_asset_map, bucket, output_folder + 'asset_name_map.csv')
    _upload_csv_to_s3(s3, struct_liable_map, bucket, output_folder + 'liability_name_map.csv')

    # -------------------------------------------------------------------------------

    # structured database for asset terms
    struct_asset_df = structured_data(assetDF, struct_asset_map,
                                      col_preserve=['CIK', 'Name', 'Filing Date', 'Filing Year',
                                                    'Total asset check'])
    _upload_csv_to_s3(s3, struct_asset_df, bucket, output_folder + 'structured_asset.csv')

    # structured database for liability terms
    struct_liable_df = structured_data(liableDF, struct_liable_map,
                                       col_preserve=['CIK', 'Name', 'Filing Date', 'Filing Year',
                                                     "Total liabilities & shareholder's equity check"])
    _upload_csv_to_s3(s3, struct_liable_df, bucket, output_folder + 'structured_liability.csv')
    # ==============================================================================

    print('The final structured dataframe has been created.')

The final structured dataframe has been created.


In [50]:
# other_columns = struct_asset_df.columns[~np.isin(struct_asset_df.columns, 
#                                                  ['CIK', 'Name', 'Filing Date', 'Filing Year', 'Total asset check', 'Total assets'])]
# (struct_asset_df['Total assets'] - struct_asset_df[other_columns].sum(axis=1)) / struct_asset_df['Total assets'] 

In [51]:
# # Amazon S3 client and Sagemaker session
# s3 = boto3.client('s3')
# session = Session()

# bucket = 'ran-s3-systemic-risk'
# output_folder = 'Output/'

# # ==============================================================================
# # check available pdfs stored within desired output-folder
# s3_path = session.list_s3_files(bucket, output_folder)

# # retrieving the unstructured asset and liability CSV files from s3 bucket
# s3.download_file(bucket, 'Output/unstructured_assets.csv', 'unstructAsset.csv')
# s3.download_file(bucket, 'Output/unstructured_liable.csv', 'unstructLiable.csv')

# # load in asset and liability dataframes
# assetDF = pd.read_csv('unstructAsset.csv')
# liableDF = pd.read_csv('unstructLiable.csv')

# # remove local file after it has been created (variable is stored in memory)
# os.remove('unstructAsset.csv')
# os.remove('unstructLiable.csv')
# # ==============================================================================

# # load in sklearn classification models
# assetMDL = load('asset_log_reg_mdl_v1.joblib')
# liableMDL = load('liability_log_reg_mdl_v1.joblib')

# # Use classification model to predict label names for each line item
# # (select the post-first 4 columns avoid the CIK, Name, Filing Date, Fiscal Year)
# asset_label_predictions = assetMDL.predict(HashingVectorizer(n_features=1000).fit_transform(assetDF.columns[4:]))
# liable_label_predictions = liableMDL.predict(HashingVectorizer(n_features=1000).fit_transform(liableDF.columns[4:]))

# # structured database for asset and liability terms 
# struct_asset_map = pd.DataFrame([assetDF.columns[4:], asset_label_predictions], 
#                                index=['LineItems', 'Labels']).T

# struct_liable_map = pd.DataFrame([liableDF.columns[4:], liable_label_predictions], 
#                                 index=['LineItems', 'Labels']).T

In [52]:
# temp = liableDF[(liableDF.CIK == 1224385) & (liableDF['Filing Date'] == '2004-03-01')].T.dropna()

In [53]:
# company_pdf(temp.iloc[5:], liableMDL)

In [54]:
# temp