In [1]:
import os
import re
import boto3
import itertools
import json

import pandas as pd
import numpy as np
from sagemaker.session import Session

In [143]:
# create a darker background template (better for my eyes)
from jupyterthemes.stylefx import set_nb_theme
# set_nb_theme('chesterish')

In [3]:
def cleanNumeric(value) -> float:
    """
    Convert a formatted numeric string (currency symbols, thousands separators,
    accounting-style parentheses) into a float.

    :param: value, string value with hidden numeric quantity
    :return: the parsed float; np.nan when the string holds no parsable number;
             the input itself, unchanged, when it is not a string

    Complexity -> O(n) in the length of the string

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: (19,225)     ->   Out[0]: -19225.0
    """
    # non-string inputs (floats, NaN, None, numpy scalars) pass through untouched;
    # indexing them would raise TypeError or IndexError depending on the type
    if not isinstance(value, str):
        return value

    # some accounting formats take () to be negative numbers
    if value.lstrip().startswith('('):
        value = '-' + value

    # keep only digits, the decimal point and the minus sign
    cleanValue = re.sub(r"[^0-9.\-]", "", value)
    try:
        return float(cleanValue)
    except ValueError:
        # nothing numeric left (empty string, text only, malformed number)
        return np.nan

In [4]:
# quick sanity check: one parsed value per line
print(
    *map(cleanNumeric, ['-$19,223', '128,123', '-$21,843', '$212,223']),
    sep='\n',
)

-19223.0
128123.0
-21843.0
212223.0


In [5]:
def regexCheck(string:str, searchTerm:str):
    """
    Run a case-insensitive regex search against a string.

    string (type str)
        The text to search, e.g. "Cash and cash equivalents"
    searchTerm (type str)
        The regex pattern to look for, e.g. "^Cash"

    return: the full input string when the pattern is found, otherwise None
    """
    found = re.search(searchTerm, string, flags=re.I)

    # no match object means the pattern does not occur in the string
    return None if found is None else found.string

In [6]:
# sample = "Total Liabilities stockholder's equity"
sample = "MEMBER'S EQUITY"

In [7]:
regexCheck(sample, '^(?!.*liabilities).*equity$')

"MEMBER'S EQUITY"

In [8]:
def tMatch(totalA:float, totalLE:float):
    """
    Helps match the accounting equation assets = liabilities + equities.

    :param: totalA, the total-assets figure
    :param: totalLE, the total liabilities & equity figure
    :return: a tuple (multiplier, message); multiplier is totalA / totalLE when
             the ratio is exactly 1 or a multiple of 10, otherwise None
    """
    # Explicit zero check instead of catching ZeroDivisionError: numpy scalars
    # (e.g. the result of Series.sum()) return inf/nan rather than raising, which
    # previously fell through to the 'Accounting Error' branch.
    if totalLE == 0:
        return (None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")

    multiplier = totalA / totalLE

    # if multiplier is zero, then total asset column is zero, we return nothing (imperfect)
    if multiplier == 0:
        return (None, 'Asset table is missing')

    # we accept only 1 or multiples of 10 (perfect and semi-perfect)
    # NOTE(review): this also accepts 20, 30, ... and negative multiples; if only
    # scale mismatches (10, 100, 1000) are intended, tighten to powers of 10
    if (multiplier == 1) or (multiplier % 10 == 0):
        return (multiplier, 'Perfect/Semi-Perfect match')

    # all other multipliers, should return nothing (number mismatch)
    return (None, 'Accounting Error')

In [9]:
# exercise every branch of tMatch: (total assets, total liabilities & equity)
print(
    *(
        tMatch(totalA=assets_total, totalLE=le_total)
        for assets_total, le_total in [
            (0, 100),
            (0, 0),
            (100, 0),
            (100, 100),
            (10000, 100),
            (102020, 11),
        ]
    ),
    sep='\n',
)

(None, 'Asset table is missing')
(None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")
(None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")
(1.0, 'Perfect/Semi-Perfect match')
(100.0, 'Perfect/Semi-Perfect match')
(None, 'Accounting Error')


## Interact with System Risk

In [10]:
# s3 bucket name and the key prefix (folder) that holds the balance-sheet files
bucket = "ran-s3-systemic-risk"
data_folder ="Output/BalanceSheet/"

# Amazon Textract client, s3 client and Sagemaker session
# (the s3 client is used below for download_file; the session for listing keys)
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# list every object key under the data folder; these are the cleaned .csv files
# NOTE(review): later cells skip paths[0], presumably the folder key itself - confirm
paths = np.array(session.list_s3_files(bucket, data_folder))

In [11]:
print('Total cleaned .csv files total {}'.format(paths.size-1))

Total cleaned .csv files total 11942


## Test Case Example

In [12]:
# seeded generator so the same sample files are drawn on every Restart & Run All
rng = np.random.default_rng(42)
selections = rng.choice(paths[1:], 100)

In [13]:
index = 40

In [14]:
# download the selected object from the s3 bucket into a local temporary file
# NOTE: the local name ends in .pdf, but the content is parsed as .csv below
s3.download_file(bucket, selections[index], 'temp.pdf')

# parse the downloaded file as a .csv balance sheet
df = pd.read_csv('temp.pdf')

In [15]:
# complete .csv file as read from the downloaded s3 object
df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,Cash,"$ 39,499"
1,1,Due from Clearing Broker,242638316
2,2,"Securities owned, at market value (Note 5)",38087111
3,3,JBO investment,50000
4,4,Equipment (net of accumulated depreciation of ...,151499
5,5,Leasehold improvements (net of accumulated amo...,31172
6,6,TOTAL ASSETS,"$ 280,997,597"
7,7,LIABILITIES AND MEMBERS' EQUITY,
8,8,Accounts payable and accrued expenses,"$ 128,392"
9,9,"Securities sold, not yet purchased, at market",243570454


In [16]:
# clean dataframes for only the items and their immediate values
# .copy() detaches the result from `df`, so assigning to its columns later
# neither raises SettingWithCopyWarning nor risks writing through to `df`
cleanDf = df[df.columns[1:3]].dropna().copy()

In [17]:
cleanDf['1'] = cleanDf['1'].apply(cleanNumeric)

In [18]:
cleanDf

Unnamed: 0,0,1
0,Cash,39499.0
1,Due from Clearing Broker,242638316.0
2,"Securities owned, at market value (Note 5)",38087111.0
3,JBO investment,50000.0
4,Equipment (net of accumulated depreciation of ...,151499.0
5,Leasehold improvements (net of accumulated amo...,31172.0
6,TOTAL ASSETS,280997597.0
8,Accounts payable and accrued expenses,128392.0
9,"Securities sold, not yet purchased, at market",243570454.0
10,TOTAL LIABILITIES,243698846.0


In [19]:
print('Data for {}'.format(selections[index]))

Data for Output/BalanceSheet/1049854-03.csv


In [20]:
def extraction(df:pd.DataFrame, file:str) -> pd.DataFrame:
    """
    Extract specific line items from cleaned dataframes (balance sheets)
    :param: df (type pd.DataFrame)
        The cleaned balance sheet; column 0 holds the line-item names and
        column 1 the numeric values
    :param: file (type str)
        The file string stored in the s3 bucket; the basename is split on '-'
        into the CIK and a year part

    :return: A one-row dataframe with the critical lines extracted when the
        totals reconcile (see tMatch); otherwise the message string from tMatch
    """

    # regex expression for searching for line items 
    keySearch = ['^cash', 'deposits', 'receivable', '^total assets', 'payable', 'total liabilities$', 'common stock', 
                 'preferred stock', 'earnings', '^(?!.*liabilities).*equity$|^(?!.*liabilities).*equity:$', 
                 '^total liabilities.*equity$|^total liabilities.*equity:$']

    # key names reflecting the corresponding regex names
    keyNames = ['Cash & Equivalents', 'Deposits', 'Receivables', 'Total Assets', 'Payables', 'Total Liabilities',
                'Common Stock', 'Preferred Stock', 'Earnings', 'Total Equity', 'Total Liabilities & Equity']

    # one dictionary entry per output column
    series = {}

    # creating two entries to track the CIK and year information released
    cik, year = file.split('/')[-1].split('-')

    series['CIK'] = cik          # CIK number for firm 
    series['Year'] = year[:2]    # Year for firm filing  

    labels = df[df.columns[0]]
    values = df[df.columns[1]]

    # pair each regex with its output name and sum every matching row
    for pattern, name in zip(keySearch, keyNames):
        # na=False: a missing or non-string label counts as "no match" instead
        # of putting NaN into the boolean mask (which cannot be used to index)
        matches = labels.str.contains(pattern, regex=True, flags=re.IGNORECASE, na=False)
        series[name] = values[matches].sum()

    # match the T-table, asset/liabilities/equity for rows
    mul, msg = tMatch(series['Total Assets'], series['Total Liabilities & Equity'])

    if mul is None:
        return msg

    # rescale liabilities & equity so both sides are in the same units
    series['Total Liabilities & Equity'] = series['Total Liabilities & Equity'] * mul

    # computing the residual ("other") figure for each side
    series['Other Assets'] = series['Total Assets'] - series['Cash & Equivalents'] - series['Deposits'] - series['Receivables']
    series['Other Liabilities'] = series['Total Liabilities'] - series['Payables']
    series['Other Equity'] = series['Total Equity'] - series['Common Stock'] - series['Preferred Stock'] - series['Earnings']

    # convert the filtered dictionary to a single-row dataframe
    comboDF = pd.DataFrame.from_dict(series, orient='index')

    return comboDF.transpose()

In [21]:
extraction(cleanDf, selections[index])

Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1049854,3,39499,0,0,280998000.0,128392,243699000.0,0,0,0,37298800.0,280998000.0,280958000.0,243570000.0,37298800.0


## Constructing an Unstructured Database
### Retrieving all Asset and Liabilities & Equity line items 

In [12]:
obsRange = paths[1:100]

In [13]:
def balanceSheetLines(fileNames:np.ndarray) -> tuple:
    """
    Retrieving balance sheet information line item names for s3 files
    :param: (type np.ndarray)
        An array of file names for .csv files from s3

    :return: (type tuple)
        Return a tuple of arrays; left is the asset items, and right is the liability & equity items.
        Both arrays are empty when no file contains a liability line.
    """

    Asset = []
    LiabilityandEquity = []

    # iterate through files from s3 bucket 
    for file in fileNames:

        # download temporary file from s3 bucket
        s3.download_file(bucket, file, 'temp.pdf')

        try:
            df = pd.read_csv('temp.pdf')

            if df.columns.size > 1: # if there are more than 1 column we continue
                arr = df[df.columns[1]].dropna().values     # all line item names for balance sheet info

                for i, item in enumerate(arr):
                    # str() keeps re.search from raising TypeError on a numeric cell
                    val = re.search('liabilities|liability ', str(item), flags=re.I)

                    # if val not None, we reached a liability line item 
                    if val is not None:
                        # split the asset and liability portions of key line items
                        Asset.append(arr[:i])
                        LiabilityandEquity.append(arr[i:])
                        break
        finally:
            # remove local file even when parsing fails
            os.remove('temp.pdf')

    # flatten the per-file arrays; np.array() on ragged lists is rejected by
    # current NumPy, and concatenating an empty list raises, so guard both
    emptyLines = np.array([], dtype=object)
    left = np.concatenate(Asset) if Asset else emptyLines
    right = np.concatenate(LiabilityandEquity) if LiabilityandEquity else emptyLines

    return (left, right)

In [16]:
leftSide, rightSide = balanceSheetLines(obsRange)

In [17]:
# persist the de-duplicated asset and liability & equity line-item names as JSON lists
with open('assetLines.txt', 'w') as f:
    json.dump(list(set(leftSide)), f)

with open('liabilityLines.txt', 'w') as f:
    json.dump(list(set(rightSide)), f)

print('All line items have been stored')

All line items have been stored


### Iterate through file paths extracting all data for line items

In [18]:
# read the stored line-item name lists back from disk
with open('assetLines.txt', 'r') as f:
    assetSide = json.load(f)

with open('liabilityLines.txt', 'r') as f:
    liableSide = json.load(f)

print('Asset and Liabilities & Equity lines loaded')

Asset and Liabilities & Equity lines loaded


In [149]:
# template rows: every known line item mapped to NaN until a file supplies a value
assetDict = dict.fromkeys(assetSide, np.nan)
liableDict = dict.fromkeys(liableSide, np.nan)

In [175]:
def unstructured_data(filepaths, lineDictionary, lineItems):
    """
    Forms unstructured data frame from s3 bucket
    :param: filepaths
        filepaths from s3
    :param: lineDictionary
        dictionary of total unstructured line items (copied per file, used as
        the template row)
    :param: lineItems
        list of line items to keep from each balance sheet

    :return: list of single-row DataFrames, one per file that had more than one
        usable column; concatenate them to build the full table
    """
    itters = []
    # files skipped because they had too few columns
    # NOTE(review): collected but not returned; kept so the return type is unchanged
    error = {}

    # iterate through files from s3 
    for file in filepaths:

        # create temporary dictionary copy for storage of values
        tempDict = lineDictionary.copy()

        # creating two entries to track the CIK and year information released
        cik, year = file.split('/')[-1].split('-')

        tempDict['CIK'] = cik                 # CIK number for firm 
        tempDict['Year'] = '20' + year[:2]    # Year for firm filing  

        # retrieving downloaded files from s3 bucket
        s3.download_file(bucket, file, 'temp.pdf')
        try:
            df = pd.read_csv('temp.pdf')
        finally:
            # remove local file even when parsing fails
            os.remove('temp.pdf')

        # clean dataframes for only the items and their immediate values
        cleanDF = df[df.columns[1:]] 

        # clean dataframe should be of size greater than 1
        if len(cleanDF.columns) > 1:

            # extract line items from each dataframe (balance sheet)
            lines = cleanDF[cleanDF.columns[0]]

            # filter dataframes according line items 
            filterDF = cleanDF[np.isin(lines, lineItems)]
            filterDF = filterDF.set_index(filterDF.columns[0])

            # walk the rows directly so a repeated line-item name yields one
            # Series per row (label-based .loc would return a DataFrame)
            for item, lineVal in filterDF.iterrows():
                # try the first value column, then fall back to the second one
                # (a blank first column is assumed to mean totals sit in the second)
                for rawVal in lineVal.iloc[:2]:
                    # only strings need parsing; numeric cells are used as-is
                    recentVal = cleanNumeric(rawVal) if isinstance(rawVal, str) else rawVal

                    # NaN never compares equal to anything, so test with pd.isna
                    if not pd.isna(recentVal):
                        tempDict[item] = recentVal
                        break

            # convert the dictionary values to dataframe for database construction 
            row = pd.DataFrame.from_dict(tempDict, orient='index')

            # append dataframe set to array transposing 
            itters.append(row.transpose())

        else:
            error[file] = 'Issue reading PDF'

    return itters

In [177]:
# the output directory for both asset and liability figures 
assetDF = pd.concat(unstructured_data(obsRange, assetDict, list(assetDict.keys())))
liableDF = pd.concat(unstructured_data(obsRange, liableDict, list(liableDict.keys())))

In [192]:
# re-order the CIK and Year columns to appear as the first two columns
# remap: every column label except the two identifier columns
remap = assetDF.columns[~np.isin(assetDF.columns, ['CIK', 'Year'])]
# both identifiers are inserted before position 0, giving CIK, Year, <line items>
unstructADF = assetDF[np.insert(remap, [0, 0], ['CIK', 'Year'])]

In [193]:
# flag columns that are entirely NaN, then keep the labels of all the others
filterNaN = unstructADF.isnull().all()
cleanCols = filterNaN[~filterNaN].index

In [194]:
unstructADF[cleanCols].head()

Unnamed: 0,CIK,Year,Due from customer,"Furniture, equipment and leasehold improvements (Net of accumulated depreciation and amortization of $223,956)","(Net of accumulated depreciation and amortization of $145,744)",Due from clearing broker,TOTAL CURRENT ASSETS,"Non-marketable securities owned, at cost",Due from registered representatives,Prepaid Expense,...,"(net of accumulated amortization and depreciation of $229,702)",Deposit with clearing organization,Assets Receivable from Non-Customer,Security Deposit,"$15,817 in 2001 and $9,217 in 2000","Due from Parent, net","(Net of accumulated depreciation and amortization of $381,286)",Due from an affiliate,Prepaid Expenses,"(Net of accumulated depreciation and amortization of $507,226)"
0,1000147,2002,,,,,,,,,...,,,,,,,,,,
0,1000147,2003,,,,,,,,,...,,,,,,,,,,
0,1000147,2004,,,,,,,,,...,,,,,,,,,,
0,1000147,2005,,,,,,,,,...,,,,,,,,,,
0,1000147,2006,,,,,332085.0,,,,...,,25000.0,,,,36301.0,,,,


In [195]:
unstructADF[cleanCols].to_csv('unstructAssetSample.csv', index=False)

In [196]:
# re-order the CIK and Year columns to appear as the first two columns
# remap: every column label except the two identifier columns
remap = liableDF.columns[~np.isin(liableDF.columns, ['CIK', 'Year'])]
# both identifiers are inserted before position 0, giving CIK, Year, <line items>
unstructLDF = liableDF[np.insert(remap, [0, 0], ['CIK', 'Year'])]

In [197]:
# flag columns that are entirely NaN, then keep the labels of all the others
filterNaN = unstructLDF.isnull().all()
cleanCols = filterNaN[~filterNaN].index

In [198]:
unstructLDF[cleanCols].head()

Unnamed: 0,CIK,Year,Deferred affilation fee,Illinois personal property replacement tax payable - current,"1,000 Shares, Issued and Outstanding",Note payable,Total Liabilities and Members' Equity,Total Members' Equity,Fails to Receive,Paid in capital,...,Replacement tax accrual 2016,Liability and Shareholder's Equity Liability: Due to Parent Company,"issued and outstanding, at stated value",Income taxes payable,TOTAL LIABILITIES AND MEMBERS' EQUITY,Additional Paid-in Capital,"Common Stock $.01 par value, 3,000 shares authorized, 1,000 shares issued and outstanding",Accrued Expenses,"Common stock, at stated value, (1,000 shares authorized, 100 shares issued and outstanding)","authorized, 3,000 shares issued and outstanding"
0,1000147,2002,,,,,,,,,...,,4200.0,,,,,,,,
0,1000147,2003,,,,,,,,,...,,,25000.0,,,,,,,
0,1000147,2004,,,,,,,,,...,,,,,,,,,,
0,1000147,2005,,,,,,,,,...,,,,,,,,,,
0,1000147,2006,,,,,,,,,...,,,25000.0,,,,,,,


In [199]:
unstructLDF[cleanCols].to_csv('unstructLiableSample.csv', index=False)

### Consolidating Columns by Text analysis
#### Using clustering machine-learning algorithms for determining word similarities

In [12]:
# reading cleaned unstructured data sets for samples 
assets = pd.read_csv('unstructAssetSample.csv')
liability = pd.read_csv('unstructLiableSample.csv')

In [13]:
assets

Unnamed: 0,CIK,Year,Due from customer,"Furniture, equipment and leasehold improvements (Net of accumulated depreciation and amortization of $223,956)","(Net of accumulated depreciation and amortization of $145,744)",Due from clearing broker,TOTAL CURRENT ASSETS,"Non-marketable securities owned, at cost",Due from registered representatives,Prepaid Expense,...,"(net of accumulated amortization and depreciation of $229,702)",Deposit with clearing organization,Assets Receivable from Non-Customer,Security Deposit,"$15,817 in 2001 and $9,217 in 2000","Due from Parent, net","(Net of accumulated depreciation and amortization of $381,286)",Due from an affiliate,Prepaid Expenses,"(Net of accumulated depreciation and amortization of $507,226)"
0,1000147,2002,,,,,,,,,...,,,,,,,,,,
1,1000147,2003,,,,,,,,,...,,,,,,,,,,
2,1000147,2004,,,,,,,,,...,,,,,,,,,,
3,1000147,2005,,,,,,,,,...,,,,,,,,,,
4,1000147,2006,,,,,332085.0,,,,...,,25000.0,,,,36301.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1000320,2009,,,,,,,,,...,,,,,,,,,,
95,1000320,2010,,,,,,,,,...,,,,,,,,,,
96,1000320,2011,,,,,,,,,...,,,,,,,,,,
97,1000320,2012,,,,,,,,,...,,,,,,,,,,


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.cluster import AffinityPropagation, KMeans, AgglomerativeClustering

In [136]:
def word_matching(array: np.array, vectorizer_class, cluster_class) -> tuple:
    """
    Cluster a collection of strings so that similar line items share a label.

    :param: array (type np.array)
        A numpy array of strings 
    :param: vectorizer_class
        An object exposing fit_transform(array); despite the name it is used
        as an instance, not instantiated here
    :param: cluster_class
        An object exposing fit(X) and, afterwards, a labels_ attribute; also
        used as an instance

    :return: (type tuple)
        The fitted labels and a dataframe mapping 'Labels' to 'LineItems'
    """
    print('Using the {} text vectorizer'.format(type(vectorizer_class).__name__))

    # turn the strings into the feature matrix produced by the vectorizer
    features = vectorizer_class.fit_transform(array)

    # fit the clustering model on that matrix and read back one label per string
    cluster_class.fit(features)
    labels = cluster_class.labels_

    # dataframe mapping cluster labels to line items
    outDF = pd.DataFrame({'Labels': labels, 'LineItems': array})

    print('{} clusters were found using {} algorithm'.format(labels.max()+1, type(cluster_class).__name__))

    return labels, outDF

Liability line item clustering

In [137]:
# # Works
# word_matching(liability.columns[2:], HashingVectorizer(), AffinityPropagation(verbose=True))
# word_matching(liability.columns[2:], CountVectorizer(), AffinityPropagation(verbose=True))
# word_matching(liability.columns[2:], TfidfVectorizer(), AffinityPropagation(verbose=True))

# # Does not work
# word_matching(liability.columns[2:], TfidfTransformer(), AffinityPropagation(verbose=True))
# None of the AffinityPropagation clustering algorithms for Assets lines

In [138]:
# random_state pins the KMeans initialisation so cluster labels are the same on every run
label, df = word_matching(assets.columns[2:], HashingVectorizer(), KMeans(n_clusters=25, random_state=42, verbose=1))

Using the HashingVectorizer text vectorizer
Initialization complete
Iteration  0, inertia 71.825
Iteration  1, inertia 42.281
Iteration  2, inertia 41.976
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 68.688
Iteration  1, inertia 41.567
Iteration  2, inertia 40.894
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 71.700
Iteration  1, inertia 42.955
Iteration  2, inertia 42.411
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 70.710
Iteration  1, inertia 41.367
Iteration  2, inertia 41.062
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.731362e-11
Initialization complete
Iteration  0, inertia 73.413
Iteration  1, inertia 41.136
Iteration  2, inertia 40.766
Converged at iteration 2: center shift 0.000000e+00 within tolerance 8.73

In [124]:
df

Unnamed: 0,Labels,LineItems
0,3,Due from customer
1,2,"Furniture, equipment and leasehold improvement..."
2,2,(Net of accumulated depreciation and amortizat...
3,3,Due from clearing broker
4,4,TOTAL CURRENT ASSETS
...,...,...
125,3,"Due from Parent, net"
126,2,(Net of accumulated depreciation and amortizat...
127,3,Due from an affiliate
128,11,Prepaid Expenses


In [142]:
# list the line items assigned to each cluster label, from 0 up to the largest label
for i in range(df.Labels.max()+1):
    print('Label', i)
    print(df[df.Labels == i]['LineItems'].values)
    print()

Label 0
['Accrued expenses' 'Accrued Expense'
 'Accounts payable and accrued expenses' 'Accrued Payroll' 'Investments'
 'Finantia USA Ltd. Statement of Financial Condition December 31, 2002'
 'Prepaids' 'Other receivables']

Label 1
['Furniture, equipment and leasehold improvements (Net of accumulated depreciation and amortization of $223,956)'
 'Leasehold improvements, furniture and equipment (net of accumulated amortization and depreciation of $257,383)'
 'Leasehold improvements, furniture and equipment (net of accumulated amortization and depreciation of $255,603)'
 'Leasehold improvements, furniture and equipment (net of accumulated amortization and depreciation of $275,585)'
 'Furniture and equipment, net of accumulated depreciation of $157,148 (Note']

Label 2
['TOTAL CURRENT ASSETS' 'TOTAL ASSETS' 'Total Other Assets'
 'Total Current Assets' 'Total Assets' 'Total assets'
 'Total current assets']

Label 3
['Due from clearing broker' 'Due from clearing brokers'
 'Due from clearing

### Consolidating clusters to structured database

In [133]:
def structured_data(unstructured_df:pd.DataFrame, cluster_df:pd.DataFrame, label_names:np.array) -> pd.DataFrame:
    """
    Constructs a structured dataset from an unstructured column set

    :param: unstructured_df (type pandas.DataFrame)
        unstructured pandas dataframe with loose column construction; must
        contain the columns 'CIK' and 'Year' plus one column per line item
    :param: cluster_df (type pandas.DataFrame)
        a pandas dataframe with columns 'Labels' and 'LineItems' mapping each
        line item to its cluster
    :param: label_names (type numpy array)
        the cluster labels to build columns for; each becomes a column name

    :return: (type pandas DataFrame)
        'CIK', 'Year' and one column per label holding the row-wise sum of the
        line items in that cluster (rows with no values sum to 0.0)
    """

    # .copy() gives an independent frame: adding columns to a plain slice of
    # unstructured_df triggers SettingWithCopyWarning
    structured_df = unstructured_df[['CIK', 'Year']].copy()

    for label in label_names:
        # line items that belong to this cluster
        data = cluster_df.loc[cluster_df['Labels'] == label, 'LineItems']

        # sum all columns, across row and map to structured dataframe
        structured_df[label] = unstructured_df[data.values].sum(axis=1)

    return structured_df

In [134]:
structured_data(assets, df, df['Labels'].unique())

Unnamed: 0,CIK,Year,3,2,4,6,11,14,10,0,9,13,7,5,1,12,8
0,1000147,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74600.0,0.0,0.0
1,1000147,2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75800.0,0.0,0.0
2,1000147,2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76500.0,0.0,0.0
3,1000147,2005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000147,2006,36301.0,0.0,332085.0,0.0,0.0,0.0,15788.0,0.0,2005.0,0.0,0.0,42132.0,253652.0,0.0,395.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1000320,2009,0.0,0.0,0.0,0.0,13000.0,0.0,0.0,653311.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,1000320,2010,0.0,0.0,0.0,0.0,15000.0,0.0,0.0,88052.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,1000320,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,1000320,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
assets[df[df.Labels == 1]['LineItems'].values]

Unnamed: 0,Cash and cash equivalents,Cash and cash equivalents [Note 1],Cash and equivalents,Cash In Bank,Cash,Cash in Bank,"$15,817 in 2001 and $9,217 in 2000"
0,,,74600.0,,,,
1,,,75800.0,,,,
2,,,76500.0,,,,
3,,,,,,,
4,253652.0,,,,,,
...,...,...,...,...,...,...,...
94,,,,,,,
95,,,,,,,
96,,,,,,,
97,,,,,,,
