In [1]:
import os
import re
import boto3
import itertools
import json

import pandas as pd
import numpy as np
from sagemaker.session import Session

In [2]:
# create a darker background template (better for my eyes)
from jupyterthemes.stylefx import set_nb_theme
# set_nb_theme('chesterish')

In [292]:
def cleanNumeric(value) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quanity  
    :return: floating point values
    
    Complexity -> O(n)
    
    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19255
        In[0]: $ 19,225.76  ->   Out[0]: 19255.76
        
    """
    
    assert type(value) is str or int or np.ndarray, 'Value must be of type string, integer, float or numpy array'
    
    # checks to see what type of value is being provided
    operator = type(value)
    size = len(value)
    
    def num_strip(number):
        """
        Nested function for extracting numerical quantities
        """
        try:
            # some accounting formats take () to be negative numbers
            if number[0] == '(':
                number = '-' + number

            # perform regex operation scanning for only numeric quantities/identifiers
            cleanValue = re.sub("[^0-9|.|-]", "", number)

            # last check against poor lagging formats e.g. "." or "-" to return nan or floating-point number
            try: 
                return float(cleanValue)
            except ValueError: 
                return np.nan
            
        except (TypeError, IndexError):
            return np.nan
    
    # if provided a string, perform regex operation 
    if (operator is str) and (size > 0):
        return num_strip(value)
    
    # if operator is integer then simply return the value, no need to modify 
    elif (operator is int):
        return value
    
    # if operator is numpy array then we perform a extraction per element in array
    elif (operator is np.ndarray):
        vFunc = np.vectorize(num_strip)      # vectorize function to apply to numpy array
        cleanValue = vFunc(value)            # apply vector function
        return cleanValue
    
    else:
        return np.nan

In [293]:
print(cleanNumeric(''))
print(cleanNumeric('$.'))
print(cleanNumeric('$ -'))
print(cleanNumeric('.3'))
print(cleanNumeric('128,123'))
print(cleanNumeric('-$24,613'))
print(cleanNumeric('(21,843)'))
print(cleanNumeric('$212,223'))
print(cleanNumeric(np.array(['$13,300', '-$2344', '$23,421'])))
print(cleanNumeric(np.array(['$13,300', np.nan, '$23,421'])))
print(cleanNumeric(np.array([np.nan, np.nan])))

nan
nan
nan
0.3
128123.0
-24613.0
-21843.0
212223.0
[13300. -2344. 23421.]
[13300.    nan 23421.]
[nan nan]


In [5]:
def regexCheck(string:str, searchTerm:str):
    """
    Checks a regex expression for a given string construct
    string (type str)
        A string to perform a regex search on e.g. "Cash and cash equiavalents"
    searchTerm (type str)
        A regex expression to execute a search e.g. "^Cash"
        
    return: the searched term in question
    """
    try:
        s = re.search(searchTerm, string, flags=re.I)
        return s.string
    except AttributeError:
        return None

In [6]:
# sample = "Total Liabilities stockholder's equity"
sample = "MEMBER'S EQUITY"

In [7]:
regexCheck(sample, '^(?!.*liabilities).*equity$')

"MEMBER'S EQUITY"

In [8]:
def tMatch(totalA:float, totalLE:float):
    # helps match the accounting equation assets = libabilities + equities 
    try:
        multiplier = totalA / totalLE
        
        # if multiplier is zero, then total asset column is zero, we return nothing (imperfect)
        if multiplier == 0: return (None, 'Asset table is missing')
        
        # if multiplier is non-negative we accept only 1 or multiples of 10 (perfect and semi-perfect)
        elif (multiplier == 1) or (multiplier%10 == 0): return (multiplier, 'Perfect/Semi-Perfect match')
        
        # all other multipliers, should return nothing (number mismatch) 
        else: return (None, 'Accounting Error')
        
    # if total liabilites doesn't exist, but total assets either exists or does not exist  
    except ZeroDivisionError:
        return (None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")

In [9]:
print(tMatch(totalA=0, totalLE=100))
print(tMatch(totalA=0, totalLE=0))
print(tMatch(totalA=100, totalLE=0))
print(tMatch(totalA=100, totalLE=100))
print(tMatch(totalA=10000, totalLE=100))
print(tMatch(totalA=102020, totalLE=11))

(None, 'Asset table is missing')
(None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")
(None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")
(1.0, 'Perfect/Semi-Perfect match')
(100.0, 'Perfect/Semi-Perfect match')
(None, 'Accounting Error')


## Interact with System Risk

In [10]:
# initiate s3 bucket and corresponding data folder
bucket = "ran-s3-systemic-risk"
data_folder ="Output/BalanceSheet/"

# Amazon Textract client and Sagemaker session
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# discover all of the pdfs that you want to parse
paths = np.array(session.list_s3_files(bucket, data_folder))

In [11]:
print('Total cleaned .csv files total {}'.format(paths.size-1))

Total cleaned .csv files total 12488


## Test Case Example

In [12]:
selections = np.random.choice(paths[1:], 100)

In [13]:
index = 40

In [14]:
# retrieving downloaded files from s3 bucket
s3.download_file(bucket, selections[index], 'temp.pdf')

df = pd.read_csv('temp.pdf')

In [15]:
# complete .csv file read from the 
df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,Cash,"$ 39,499"
1,1,Due from Clearing Broker,242638316
2,2,"Securities owned, at market value (Note 5)",38087111
3,3,JBO investment,50000
4,4,Equipment (net of accumulated depreciation of ...,151499
5,5,Leasehold improvements (net of accumulated amo...,31172
6,6,TOTAL ASSETS,"$ 280,997,597"
7,7,LIABILITIES AND MEMBERS' EQUITY,
8,8,Accounts payable and accrued expenses,"$ 128,392"
9,9,"Securities sold, not yet purchased, at market",243570454


In [16]:
# clean dataframes for only the items and their immediate values
cleanDf = df[df.columns[1:3]].dropna() 

In [17]:
cleanDf['1'] = cleanDf['1'].apply(cleanNumeric)

In [18]:
cleanDf

Unnamed: 0,0,1
0,Cash,39499.0
1,Due from Clearing Broker,242638316.0
2,"Securities owned, at market value (Note 5)",38087111.0
3,JBO investment,50000.0
4,Equipment (net of accumulated depreciation of ...,151499.0
5,Leasehold improvements (net of accumulated amo...,31172.0
6,TOTAL ASSETS,280997597.0
8,Accounts payable and accrued expenses,128392.0
9,"Securities sold, not yet purchased, at market",243570454.0
10,TOTAL LIABILITIES,243698846.0


In [19]:
print('Data for {}'.format(selections[index]))

Data for Output/BalanceSheet/1049854-03.csv


In [20]:
def extraction(df:pd.DataFrame, file:str) -> pd.DataFrame:
    """
    Extract specific line items from cleaned dataframes (balance sheets)
    :param: df (type pd.DataFrame)
        The dataframe from
    :param: file (type str)
        The file string stored in the s3 bucket
    
    :return: A dataframe subset with critical lines extracted
    """
    
    # regex expression for searching for line items 
    keySearch = ['^cash', 'deposits', 'receivable', '^total assets', 'payable', 'total liabilities$', 'common stock', 
                 'preferred stock', 'earnings', '^(?!.*liabilities).*equity$|^(?!.*liabilities).*equity:$', 
                 '^total liabilities.*equity$|^total liabilities.*equity:$']
    
    # key names reflecting the corresponding the regex names
    keyNames = ['Cash & Equivalents', 'Deposits', 'Receivables', 'Total Assets', 'Payables', 'Total Liabilities',
                'Common Stock', 'Preferred Stock', 'Earnings', 'Total Equity', 'Total Liabilities & Equity']
    
    # concat the list of dataframe for each extraction 
    series = {}
    
    # creating two rows to track the CIK and year information released
    cik, year = file.split('/')[-1].split('-')
    
    series['CIK'] = cik          # CIK number for firm 
    series['Year'] = year[:2]    # Year for firm filing  
    
    # select key for search names
    for i, key in enumerate(keySearch):
        # filter regex for corresponding string expression
        filterSet = df[df.columns[0]].str.contains(key, regex=True, flags=re.IGNORECASE)
        
        # matching names for search and constructing a row (dictionary form)
        # filter dataframe and sum corresponding data column
        series[keyNames[i]] = df[filterSet][df.columns[1]].sum()
  
    # match the T-table, asset/liabilities/equity for rows
    mul, msg = tMatch(series['Total Assets'], series['Total Liabilities & Equity'])
    
    if mul != None:
        series['Total Liabilities & Equity'] = series['Total Liabilities & Equity'] * mul
        
        # computing the series for other available line item
        series['Other Assets'] = series['Total Assets'] - series['Cash & Equivalents'] - series['Deposits'] - series['Receivables']
        series['Other Liabilities'] = series['Total Liabilities'] - series['Payables']
        series['Other Equity'] = series['Total Equity'] - series['Common Stock'] - series['Preferred Stock'] - series['Earnings']

        # convert the filtered dictionaries to a dataframe
        comboDF = pd.DataFrame.from_dict(series, orient='index')

        return comboDF.transpose()
    else:
        return msg

In [21]:
extraction(cleanDf, selections[index])

Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1049854,3,39499,0,0,280998000.0,128392,243699000.0,0,0,0,37298800.0,280998000.0,280958000.0,243570000.0,37298800.0


## Constructing an Unstructured Database
### Retrieving all Asset and Liabilities & Equity line items 

In [76]:
obsRange = paths[1:500]

In [77]:
def balanceSheetLines(fileNames:np.ndarray) -> tuple:
    """
    Retrieving balance sheet information line item names for s3 files
    :param: (type np.ndarray)
        An array of file names for .csv files from s3
        
    :return: (type tuple)
        Return a tuple of arrays; left is the asset items, and right is the liability & equity items 
    """
    
    Asset = []
    LiabilityandEquity = []

    # iterate through files from s3 bucket 
    for file in fileNames:
        
        # download temporary file from s3 bucket
        s3.download_file(bucket, file, 'temp.pdf')
        df = pd.read_csv('temp.pdf')
        
        n = df.columns.size   # the number of columns
        
        if n > 1: # if there are more than 1 column we continue
            arr = df[df.columns[1]].dropna().values     # all line item names for balance sheet info

            for i, item in enumerate(arr):
                # search for val check for liabilties, name
                val = re.search('liabilities|liability ', item, flags=re.I)
                
                # if val not None, we reached a liability line item 
                if val is not None:
                    # split the asset and liability portions of key line items
                    Asset.append(arr[:i])
                    LiabilityandEquity.append(arr[i:])
                    break
                    
        # remove local file after it has been created
        os.remove('temp.pdf')
    
    # flatten series for all values
    left = np.hstack(np.array(Asset))
    right = np.hstack(np.array(LiabilityandEquity))
    
    return (left, right)

In [78]:
leftSide, rightSide = balanceSheetLines(obsRange)

In [79]:
# storing unique list of asset times and liability line items
with open('assetLines.txt', 'w') as f: json.dump(list(set(leftSide)), f)
with open('liabilityLines.txt', 'w') as f: json.dump(list(set(rightSide)), f)

print('All line items have been stored')

All line items have been stored


### Iterate through file paths extracting all data for line items

In [80]:
with open('assetLines.txt', 'r') as f: assetSide = json.loads(f.read())
with open('liabilityLines.txt', 'r') as f: liableSide = json.loads(f.read())
    
print('Asset and Liabilities & Equity lines loaded')

Asset and Liabilities & Equity lines loaded


In [81]:
assetDict = dict([(i, np.nan) for i in assetSide])
liableDict = dict([(i, np.nan) for i in liableSide])

In [294]:
def unstructured_data(filepaths, lineDictionary, lineItems) -> tuple:
    """
    Forms unstructured data frame from .csv file(s) located in s3 bucket
    
    :param: filepaths
        filepaths from s3 that store .csv file(s) (Output/BalanceSheet/)
    :paran: lineDictionary
        dictionary of total unstructured line items and corresponding values
    :paran: lineItems
        list of line items (asset/liabilites) that will be searched for
        
    :return: tuple
        Returns a tuple, itters is a list of dataframes representing line items, error is a dictionary
        tracking all error terms that exist due to univariate dataframes 
    """
    itters = []
    error = {}

    # iterate through files from s3 
    for file in filepaths:
        
        # create temporary dictionary copy for storage of values
        tempDict = lineDictionary.copy()
          
        # creating two rows to track the CIK and year information released
        cik, year = file.split('/')[-1].split('-')

        tempDict['CIK'] = cik                 # CIK number for firm 
        tempDict['Year'] = '20' + year[:2]    # Year for firm filing  
        
        # retrieving downloaded files from s3 bucket
        s3.download_file(bucket, file, 'temp.pdf')
        df = pd.read_csv('temp.pdf')

        # clean dataframes for only the items and their immediate values
        cleanDF = df[df.columns[1:]] 

        # clean dataframe should be of size greater than 1
        if len(cleanDF.columns) > 1:
            
            # extract line items from each dataframe (balance sheet)
            lines = cleanDF[cleanDF.columns[0]]
            
            # filter dataframes according line items, and extract numerical values from dataframe 
            filterDF = cleanDF[np.isin(lines, lineItems)]
            filterDF = filterDF.set_index(filterDF.columns[0])             # set line items as index
            filterDF = filterDF.apply(lambda x: cleanNumeric(x.values))    # extract numerical figures

            # iterate through items from filterlist (asset or liability items)
            for item in filterDF.index:
                lineVal = filterDF.loc[item]                 # line item e.g. Cash $72,343 $71,231
                
                # check to see scope of line value, if multicolumns present
                # in the event we have repeating 'item' lines (e.g. 2 Prepaid expense) we sum columns  
                if type(lineVal) is not pd.Series:
                    lineVal = lineVal.sum()
                
                recentVal = lineVal.iloc[0]                  # first column value e.g. 72343
                
                # value of line items for the adjacent column (current year)
                if ~np.isnan(recentVal):
                    tempDict[item] = recentVal
                    
                else:
                    try:
                        # if the first column is blank we assume the second column is filled with totals
                        recentVal = lineVal.iloc[1]
                        
                        # if second column value is not-nan we attach those values
                        if ~np.isnan(recentVal):
                            tempDict[item] = recentVal
                    
                    # if no second column exists, we ignore and pass
                    except IndexError: pass
            
            # convert the dictionary values to dataframe for database construction 
            row = pd.DataFrame.from_dict(tempDict, orient='index')
            
            # append dataframe set to array transposing 
            itters.append(row.transpose())
            
        else:
            error[file] = 'Issue reading PDF'
        
        # remove local file after it has been created
        os.remove('temp.pdf')
    
    return itters, error

In [297]:
# the output directory for both asset and liability figures 
assetItters, errors = unstructured_data(obsRange, assetDict, list(assetDict.keys()))
assetDF = pd.concat(assetItters)

liableItters, errors = unstructured_data(obsRange, liableDict, list(liableDict.keys()))
liableDF = pd.concat(liableItters)

In [298]:
# re-order the CIK and Year columns to appear as the first two columns
remap = assetDF.columns[~np.isin(assetDF.columns, ['CIK', 'Year'])]
unstructADF = assetDF[np.insert(remap, [0, 0], ['CIK', 'Year'])]

In [299]:
# filter out columns with NaN values, return only values
filterNaN = unstructADF.isnull().all()
cleanCols = filterNaN[filterNaN == False].index

In [300]:
unstructADF[cleanCols].head()

Unnamed: 0,CIK,Year,"(Net of accumulated depreciation and amortization of $475,741)",Receivable from affiliate,"Furniture, equipment and leasehold improvements, at cost, less accumulated depreciation and amortization of $431 (Note 5)","Securities owned, at fair value",Investments,Deferred tax asset (Note 8),Receivable from affiliates (Note 6),Accounts receivable - trade,...,Interest and dividends receivable,Payable to Affiliate,Deposit at clearing broker,Clearing deposit with broker dealer,Receivable from related party,Total Assets,Receivable from Non-Customer,Equity securities,Accounts Receivable,"Securities, at fair value"
0,1000147,2002,,,,,,,,,...,,,,,,,,,,
0,1000147,2003,,,,,,,,,...,,,,,,,,,,
0,1000147,2004,,,,,,,,,...,,,,,,,,,,
0,1000147,2005,,,,,,,,,...,,,,,,,,,,
0,1000147,2006,,,,,,,,,...,,,,,,,,,,


In [301]:
unstructADF[cleanCols].to_csv('unstructAssetSample.csv', index=False)

In [302]:
# re-order the CIK and Year columns to appear as the first two columns
remap = liableDF.columns[~np.isin(liableDF.columns, ['CIK', 'Year'])]
unstructLDF = liableDF[np.insert(remap, [0, 0], ['CIK', 'Year'])]

In [303]:
# filter out columns with NaN values, return only values
filterNaN = unstructLDF.isnull().all()
cleanCols = filterNaN[filterNaN == False].index

In [304]:
unstructLDF[cleanCols].head()

Unnamed: 0,CIK,Year,10 shares issued and outstanding),Accrued commissions and clearance charges,Due to To Shareholder,"Common stock, no par value, 1,000 shares authorized and 100 shares issued and outstanding",Additional paid-in-capital,Payables to affiliates (Note 6),Total liabilities and members' equity,Income taxes payable to parent,...,Bank overdrafts,Payable to Affiliate,"Securites sold, not yet purchased",MEMBERS' EQUITY,Net Income,Paid in capital,Bank overdraft,Additional Paid in Capital,Total Liabilities and Shareholder's Equity,Liability and Shareholder's Equity Liability: Due to Parent Company
0,1000147,2002,,,,,,,,,...,,,,,,,,,,4200.0
0,1000147,2003,,,,,,,,,...,,,,,,,,,,
0,1000147,2004,,,,,,,,,...,,,,,,,,,,
0,1000147,2005,,,,,,,,,...,,,,,,,,,,
0,1000147,2006,,,,,,,,,...,,,,,,,,,,


In [305]:
unstructLDF[cleanCols].to_csv('unstructLiableSample.csv', index=False)

### Consolidating Columns by Text analysis
#### Using clustering machine-learning algorithms for determing word similitaries

In [28]:
# reading cleaned unstructured data sets for samples 
assets = pd.read_csv('unstructAssetSample.csv')
liability = pd.read_csv('unstructLiableSample.csv')

In [29]:
assets

Unnamed: 0,CIK,Year,"(Net of accumulated depreciation and amortization of $475,741)",TOTAL ASSETS,"Securities owned, at fair value",Investments,Due from clearing brokers,Clearing charges payable,Receivable from broker/dealers,"(Net of accumulated amortization and depreciation of $114,228)",...,Total Assets,Receivable from Non-Customer,Deposit with clearing organization,"Customer list, net",Accounts Receivable,Due from an affiliate,Security deposit,"Investments, at cost",Cash and cash equivalents [Note 1],Security deposits
0,1000147,2002,,,,,,,,,...,,,,,,,,,,
1,1000147,2003,,,,,,,,,...,,,,,,,,,,
2,1000147,2004,,,,,,,,,...,,,,,,,,,,
3,1000147,2005,,,,,,,,,...,,,,,,,,,,
4,1000147,2006,,,,,,,,,...,,,25000.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1000320,2009,,,,,,,,,...,,,,,,,,,,
95,1000320,2010,,,,,,,,,...,,,,,,,,,,
96,1000320,2011,,,,,,,,,...,,,,,,,,,,
97,1000320,2012,,,,,,,,,...,,,,,,,,,,


In [36]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.cluster import AffinityPropagation, KMeans, AgglomerativeClustering

In [37]:
def word_matching(array: np.array, vectorizer_class, cluster_class) -> tuple:
    
    """
    Use machine learning clustering to map similar words and figures together
    :param: type np.array
        A numpy array of strings 
    """
    
    # Convert a collection of text documents to a matrix of token counts
    vectorizer = vectorizer_class
    print('Using the {} text vectorizer'.format(type(vectorizer_class).__name__))
    
    # produces a sparse representation of the counts using scipy.sparse.csr_matrix.
    X = vectorizer.fit_transform(array)
    
    # initialize the clustering model on words
    model = cluster_class
    
    model.fit(X)  # fit clustering model to array of strings
    
    # returns dataframe, mapping cluster labels to line items
    outDF = pd.DataFrame({'Labels': model.labels_, 'LineItems': array})
    
    print('{} clusters were found using {} algorithm'.format(model.labels_.max()+1, type(cluster_class).__name__))
    
    return model.labels_, outDF 

Liability line item clustering

In [38]:
# # Works
# Akk Kmeans clustering algorithms are successful with asset line items
# word_matching(liability.columns[2:], HashingVectorizer(), AffinityPropagation(verbose=True))
# word_matching(liability.columns[2:], CountVectorizer(), AffinityPropagation(verbose=True))
# word_matching(liability.columns[2:], TfidfVectorizer(), AffinityPropagation(verbose=True))

# # Does not work
# word_matching(liability.columns[2:], TfidfTransformer(), AffinityPropagation(verbose=True))
# None of the AffinityPropagation clustering algorithms for Assets lines

In [39]:
label, df = word_matching(assets.columns[2:], HashingVectorizer(), KMeans(n_clusters=25, verbose=1))

Using the HashingVectorizer text vectorizer
Initialization complete
Iteration  0, inertia 70.532
Iteration  1, inertia 42.573
Iteration  2, inertia 41.366
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 70.205
Iteration  1, inertia 41.995
Converged at iteration 1: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 69.989
Iteration  1, inertia 41.759
Converged at iteration 1: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 71.613
Iteration  1, inertia 42.648
Iteration  2, inertia 41.786
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 70.444
Iteration  1, inertia 43.122
Iteration  2, inertia 42.971
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 67.

In [73]:
df, label

(     Labels                                          LineItems
 0         1  (Net of accumulated depreciation and amortizat...
 1         4                                       TOTAL ASSETS
 2        17                    Securities owned, at fair value
 3        22                                        Investments
 4         2                          Due from clearing brokers
 ..      ...                                                ...
 125       2                              Due from an affiliate
 126      14                                   Security deposit
 127      22                               Investments, at cost
 128       8                 Cash and cash equivalents [Note 1]
 129      20                                  Security deposits
 
 [130 rows x 2 columns],
 array([ 1,  4, 17, 22,  2,  5, 16,  1,  5, 18,  1, 15,  9,  4,  2, 16,  1,
         3,  3,  2,  3,  4, 23, 14, 24,  1,  1,  8,  2, 11,  6,  0,  0,  9,
         1,  1,  7, 19, 24, 10,  1, 16,  5,  1, 20,  

In [40]:
for i in range(df.Labels.max()+1):
    print('Label', i)
    print(df[df.Labels == i]['LineItems'].values)
    print()

Label 0
['Non - marketable securities owned at cost' 'Marketable Securities @ FMV'
 'Marketable securities owned, at market value (identified cost - $79,991)'
 'Marketable securities owned, at cost (identified cost - $33,100)'
 'Marketable securities owned, at cost (identified cost - $79,991)'
 'Non-marketable securities owned, at cost']

Label 1
['(Net of accumulated depreciation and amortization of $475,741)'
 '(Net of accumulated amortization and depreciation of $114,228)'
 '(net of accumulated amortization and depreciation of $226,187)'
 '(net of accumulated amortization and depreciation of $197,907)'
 '(Net of accumulated depreciation and amortization of $507,226)'
 '(net of accumulated amortization and depreciation of $161,488)'
 '(Net of accumulated depreciation and amortization of $381,286)'
 '(net of accumulated amortization and depreciation of $91,226)'
 '(net of accumulated amortization and depreciation of $252,304)'
 '(net of accumulated amortization and depreciation of $22

## Latent Dirichlet Allocation with online variational Bayes algorithm (Topic Model) 

In [41]:
from sklearn.decomposition import LatentDirichletAllocation

In [42]:
sample = ['Cash and cash equivalents' 'Cash and cash equivalents [Note 1]', 'Cash and equivalents' 'Cash', 
         '(Net of accumulated depreciation and amortization of $145,744)',
          '(Net of accumulated depreciation and amortization of $90,697)',
          '(net of accumulated amortization and depreciation of $186,917)',
          '(Net of accumulated depreciation and amortization of $475,741)',
          '(net of accumulated amortization and depreciation of $88,007)',
          '(net of accumulated amortization and depreciation of $36,773)',
          '(net of accumulated amortization and depreciation of $197,907)',
          '(net of accumulated amortization and depreciation of $139,214)',
          '(net of accumulated amortization and depreciation of $91,226)',
          '(Net of accumulated depreciation and amortization of $58,205)',
          '(Net of accumulated amortization and depreciation of $114,228)',
          '(net of accumulated amortization and depreciation of $252,304)',
          '(Net of accumulated depreciation and amortization of $66,997)',
          '(net of accumulated amortization and depreciation of $161,488)',
          '(net of accumulated amortization and depreciation of $226,187)',
          '(net of accumulated amortization and depreciation of $229,702)',
          '(Net of accumulated depreciation and amortization of $381,286)',
          '(Net of accumulated depreciation and amortization of $507,226)', 
          'Cash In Bank', 'Cash in Bank', '$15,817 in 2001 and $9,217 in 2000']

In [57]:
text_mdl = TfidfVectorizer()
vec_text = text_mdl.fit_transform(sample)

In [64]:
feature_names = np.array(text_mdl.get_feature_names())

In [74]:
feature_names

array(['007', '114', '139', '145', '15', '161', '186', '187', '197',
       '2000', '2001', '205', '214', '217', '226', '228', '229', '252',
       '286', '304', '36', '381', '475', '488', '507', '58', '66', '697',
       '702', '741', '744', '773', '817', '88', '90', '907', '91', '917',
       '997', 'accumulated', 'amortization', 'and', 'bank', 'cash',
       'depreciation', 'equivalents', 'equivalentscash', 'in', 'net',
       'note', 'of'], dtype='<U15')

In [58]:
vec_text

<23x51 sparse matrix of type '<class 'numpy.float64'>'
	with 165 stored elements in Compressed Sparse Row format>

In [59]:
mdl = LatentDirichletAllocation(n_components=2, verbose=1, random_state=1)

In [60]:
mdl.fit(vec_text)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=2, n_jobs=None,
                          perp_tol=0.1, random_state=1, topic_word_prior=None,
                          total_samples=1000000.0, verbose=1)

In [67]:
for topic_idx, topic in enumerate(mdl.components_):
    message = "Topic #%d: " % topic_idx
    message += " ".join([feature_names[i]
                         for i in topic.argsort()[:-2 - 1:-1]])
    print(message)
print()

Topic #0: of amortization
Topic #1: cash in



In [75]:
mdl.components_

array([[1.04846518, 1.04846224, 1.04846344, 1.04844641, 0.50408327,
        1.04846561, 1.04845242, 1.08598032, 1.04846687, 0.50408327,
        0.50408327, 1.04844952, 1.04846344, 0.50408327, 1.93204638,
        1.04846224, 1.04846241, 1.04844914, 1.04845326, 1.04844914,
        1.04840179, 1.04845326, 1.04839225, 1.04846561, 1.08597983,
        1.04844951, 1.04846354, 1.04844953, 1.04846241, 1.04839219,
        1.04844641, 1.04840254, 0.50408327, 1.04846518, 1.04844953,
        1.04846687, 1.08598136, 1.04845242, 1.04846354, 4.13587191,
        4.13587193, 3.77215358, 0.50590756, 0.50703759, 4.13587193,
        0.50556672, 0.50820264, 0.50560668, 4.13587193, 0.50556672,
        7.79308918],
       [0.51951924, 0.51952218, 0.51952097, 0.519538  , 0.8571291 ,
        0.51951881, 0.519532  , 0.51797481, 0.51951754, 0.8571291 ,
        0.8571291 , 0.5195349 , 0.51952098, 0.8571291 , 0.51943939,
        0.51952218, 0.51952201, 0.51953527, 0.51953115, 0.51953527,
        0.51958262, 0.51953

In [70]:
mdl.components_[0]

array([1.04846518, 1.04846224, 1.04846344, 1.04844641, 0.50408327,
       1.04846561, 1.04845242, 1.08598032, 1.04846687, 0.50408327,
       0.50408327, 1.04844952, 1.04846344, 0.50408327, 1.93204638,
       1.04846224, 1.04846241, 1.04844914, 1.04845326, 1.04844914,
       1.04840179, 1.04845326, 1.04839225, 1.04846561, 1.08597983,
       1.04844951, 1.04846354, 1.04844953, 1.04846241, 1.04839219,
       1.04844641, 1.04840254, 0.50408327, 1.04846518, 1.04844953,
       1.04846687, 1.08598136, 1.04845242, 1.04846354, 4.13587191,
       4.13587193, 3.77215358, 0.50590756, 0.50703759, 4.13587193,
       0.50556672, 0.50820264, 0.50560668, 4.13587193, 0.50556672,
       7.79308918])

### Consolidating clusters to structured database

In [133]:
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame, label_names:np.array) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set
    
    :param: unstructured_df (type pandas.DataFrame)
        unstuructured pandas dataframe with loose column construction 
    :param: cluster_df (type pandas.DataFrame)
        a pandas dataframe of clustered labels and corresponding line items
    :param: (type numpy array)
        all corresponding cluster labels cirresponding with 'cluster_df' parameter
        
    :return: (type pandas DataFrame)
    """
    
    structured_df = pd.DataFrame()
    
    # assume that the there exists columns 'CIK' and 'Year' for unstructured data
    structured_df = unstructured_df[['CIK', 'Year']]
    
    for label in label_names:
        data = cluster_df[cluster_df['Labels'] == label]['LineItems']     # filter by corresponding cluster
        
        # sum all columns, across row and map to structured dataframe
        structured_df[label] = unstructured_df[data.values].sum(axis=1)
        
    return structured_df

In [134]:
structured_data(assets, df, df['Labels'].unique())

Unnamed: 0,CIK,Year,3,2,4,6,11,14,10,0,9,13,7,5,1,12,8
0,1000147,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74600.0,0.0,0.0
1,1000147,2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75800.0,0.0,0.0
2,1000147,2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76500.0,0.0,0.0
3,1000147,2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000147,2006,36301.0,0.0,332085.0,0.0,0.0,0.0,15788.0,0.0,2005.0,0.0,0.0,42132.0,253652.0,0.0,395.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1000320,2009,0.0,0.0,0.0,0.0,13000.0,0.0,0.0,653311.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,1000320,2010,0.0,0.0,0.0,0.0,15000.0,0.0,0.0,88052.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,1000320,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,1000320,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
