In [33]:
import sys
import os
import re
sys.path.append("..")

import spacy
nlp = spacy.load("en_core_web_lg")

from utils.case import Case, DocketEntry, Document, Party
DATA_LOCATION = "../data/results_nysb_all_chap_11"

In [68]:
def find_repeat(s):
    tokens = s.split()
    if len(tokens) <= 1:
        return s
    if len(tokens) == 4:
        if tokens[0] == tokens[2] and tokens[1] == tokens[3]:
            return tokens[0]+" "+tokens[1]
    if len(tokens) == 6:
        if tokens[0] == tokens[3] and tokens[1] == tokens[4] and tokens[2] == tokens[5]:
            return tokens[0]+" "+tokens[1]+" "+tokens[2]
    return s

In [89]:
#aggregate orgs mentioned in file
remove_words = ['po box', 'el', 'signature'] #words to remove from an entity
def clean_org(orig):
    cleaned = orig.lower().strip()
    cleaned = re.sub('  ', ' ', cleaned)
    cleaned = re.sub('[^A-Za-z0-9 ]+', '', cleaned)
    cleaned = find_repeat(cleaned)
    
    no_spaces = re.sub(' +', '', cleaned)
    if (len(no_spaces) == 0):
        return ""
    #all the same letter (i.e. 'iii iii iii')
    if (no_spaces == len(no_spaces) * no_spaces[0]):
        return ""
    
    # remove remove_words from entity
    for word in remove_words:
        cleaned = re.sub(word, '', cleaned)
    
    tokens = cleaned.split()
    if len(tokens) <= 1:
        return ""
    if len(max(tokens, key=len)) <= 2: #no token longer than 2 characters
        return ""
    if tokens[0] == "the":
        return ""
    for token in tokens: #get rid of zip codes -> presume address
        if token.isdigit() and len(token) == 5:
            return ""
    
    #get rid of trailing whitespace
    cleaned = cleaned.rstrip()
    if cleaned.isdigit():
        return ""
    return cleaned

blacklist = ["court", "united states", 'debtors', 'social security', 'motion', "employer", "debtor",
             'form', 'docket', 'state of', 'agreement', 'floor', 'street', 'taxpayer', 'petition', "fee", "number"]

def most_freq_orgs(ents):
    orgs = {}
    for ent in ents:
        if ent.label_ == "ORG":
            org = clean_org(ent.text)
            #blacklist certain org words:
            for word in blacklist:
                if word in org:
                    org = ""
                    break
            if org == "":
                continue
            if org in orgs:
                orgs[org] += 1
            else:
                orgs[org] = 1
    return orgs

In [90]:
#aggregate people mentioned in file
remove_words_ppl = ["signature"]

def clean_name(orig):
    cleaned = orig.lower().strip()
    cleaned = re.sub('  ', ' ', cleaned)
    cleaned = re.sub('[^A-Za-z ]+', '', cleaned)
    
    tokens = cleaned.split()
    if (len(tokens) <= 1): 
        return "" 
    if len(max(tokens, key=len)) <= 2: #no token longer than 2 characters
        return ""
    
    # remove remove_words from entity
    for word in remove_words_ppl:
        cleaned = re.sub(word, '', cleaned)
    
    cleaned = find_repeat(cleaned)
        
    #all the same letter (i.e. 'iii iii iii')
    no_spaces = re.sub(' +', '', cleaned)
    if (no_spaces == len(no_spaces) * no_spaces[0]):
        return ""
    
    #get rid of trailing whitespace
    cleaned = cleaned.rstrip()
    return cleaned

blacklist_ppl = ["court", "clerk", "name of", "debtor", "united states"]

def most_freq_persons(ents):
    people = {}
    for ent in ents:
        if ent.label_ == "PERSON":
            person = clean_name(ent.text)
            #blacklist certain org words:
            for word in blacklist_ppl:
                if word in person:
                    person = ""
                    break
            if person == "":
                continue
            if person in people:
                people[person] += 1
            else:
                people[person] = 1
    return people

In [51]:
# move potential non-people in ppl to org 
company_indicator = ["inc", "llc", "llp", "corp"]

#adds name, count to dict
def add_to_dict(d, name, count):
    if name in d:
        d[name] = d[name] + count
    else:
        d[name] = count
    return d

#move things that are actually orgs from ppl to orgs
def move_ppl_to_orgs(orgs,ppl):
    to_delete = []
    for person, count in ppl.items():
        tokens = person.split()
        if tokens[-1] in company_indicator:
            orgs = add_to_dict(orgs,person,count)
            to_delete.append(person)
    for delete in to_delete:
        del ppl[delete]
    return orgs, ppl

In [65]:
def print_top_3(A):
    top_keys = sorted(A, key=A.get, reverse=True)[:3]
    for key in top_keys:
        print (A[key],key)

In [127]:
def levenshteinDistance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2+1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

#returns if it's close (bool), string before, string after (one will become the other)
def close_enough(s1,s2):
    dist = levenshteinDistance(s1,s2)
    #one contained in another
    if s1 in s2 and dist < 8:
        return True, s2, s1
    if s2 in s1 and dist < 8:
        return True, s1, s2
    #edit distance is small
    if dist < 3:
        return True, s1, s2 #this is an arbitrary choice
    return False, None, None
    
#combines similar entities
def agg_dict(A):
    print(A)
    combined = {} # before:after
    
    for s1 in A.keys():
        for s2 in A.keys():
            if s1 == s2: 
                continue
            is_close, before, after = close_enough(s1,s2)
            if s1 == after:
                continue
            if is_close:
                # make sure not a loop
                if after not in combined or combined[after] != before:
                    combined[before] = after
                    print(before,"->", after)
#                 if combined[after] == before:
#                     continue
#                 combined[before] = after
#                 print(before,"->", after)
                
    to_delete = []
    for before, after in combined.items():
        item = after
        count = A[before]
        A = add_to_dict(A, item, count)
        to_delete.append(before)
    for delete in to_delete:
        del A[delete]
    print(A)
    return A

In [105]:
# EXTRACT ORGS AND PPL INTO MAPS
file_to_orgs = {} # maps file name to a map of orgs + counts
file_to_ppl = {} # maps file name to a map of people + counts

for file in os.listdir(DATA_LOCATION):
    doc = nlp(open(os.path.join(DATA_LOCATION, file)).read())
    orgs = most_freq_orgs(doc.ents)
    ppl = most_freq_persons(doc.ents)
    orgs, ppl = move_ppl_to_orgs(orgs, ppl)
    print(file)
    print("ORGS:", orgs)
    print("PPL:", ppl)
    print("\n")
    file_to_ppl[file] = ppl
    file_to_orgs[file] = orgs

6938010
ORGS: {'3 clearing bank': 1, 'tm garden inc': 1, 'paolo secondo': 1, 'tenant of residential property': 1, 'garden inc': 1}
PPL: {'allan l gropper': 1, 'lawrence morrison': 1, 'paolo secondo': 1}


16798867
ORGS: {'ybrant media acquisition': 1, 'ybrant digital limited': 2, 's sanford p rosen': 1, 'rosen  associates': 1, 'pc 747': 1}
PPL: {'sanford p rosen': 1}


6949417
ORGS: {'life extension realty llc': 3, 'limited liability company': 1, 'limited liability partnership': 1, ' health care business': 1, ' commodity broker': 1, ' clearing bank': 1, 'north american industry classification system': 1, 'shenwick  associates': 2, 'state fill': 1, 'sl green realty corp': 1, 'pas owner llc': 1, 'green realty corp': 1, 'lexington avenue': 1, 'litigant couns for life extension realty llc': 1}
PPL: {'eric braverman': 2, 'james h shenwick': 2, 'matthew leonardi': 1, 'stempel bennett': 1}


11079394
ORGS: {'mf global holdings ltd': 2, 'plan administrator': 1, 'sapere cta fund': 1, 'global ho

6864620
ORGS: {'henry inc': 2, 'chief executive': 1, 'morgan stanley senior funding inc': 1, 'administrative agent and syndication agent': 1, 'morgan stanley': 1, 'financial distress and efforts to raise capital': 1, 'dorman  fawcett': 5, 'dorman  fawcetts': 1, 'washington national airport 1 aviation circle': 1, 'indianapolis airport authority': 1, 'international air terminal llc': 1, 'mineta san jose international airport': 1, 'customer data': 2, 'widepoint corporation': 1, 'henry incorporated': 1, 'andll identity solutions': 1, 'difrancesco bateman coley': 1, 'kunzman davis  lehrer': 1, 'bain capital valhalla partners': 1, 'boston office 8000': 1, 'hig ventures': 1, 'rho capital partners': 1, 'suite 1350': 1, 'oak investment partners': 1, 'bertram capital management llc operational research consultants inc': 1, 'waples mill san mateo': 1, 'west fid road san francisco': 1}
PPL: {'james e moroney': 4, 'steven brill': 1, 'ciistnmer data': 1, 'john f kennedy': 1, 'ronald regan': 1, 'h we

6919902
ORGS: {' commodity broker  clearing bank': 1, ' other taxexempt entity': 1, 'exhibit c': 1, 'tenant of residential property': 1, 'x s ste': 1, 'starr  starr': 2, 'pllc firm name': 1, 'authorized individual': 1, 'minoa blvd mineola': 1}
PPL: {'filing fee': 2, 'michael gibson': 4, 'r bankr': 2, 'john doe': 1, 'david m namm': 1, 'starr date stephen': 1}


25113438
ORGS: {'kasen  kasen': 2, 'buddy warren': 1, 'warren inc': 1, 'new york state department of labor': 2, 'bowling green': 1, 'kasen  kasen rc': 1}
PPL: {'michael j kasen': 2, 'sean h lane': 1, 'michael kasen': 1, 'j kasen': 1}


6869599
ORGS: {'fox llp': 1, 'official committee of unsecured creditors': 1, 'k  republic enterprises inc': 1}
PPL: {'robert m hirsh': 1, 'jordana l renert': 1, 'arthur j gonzalez': 1}


6937853
ORGS: {}
PPL: {'e ortiz': 1, 'norrna e ortiz': 1}


6886399
ORGS: {'olshan grundman frome rosenzweig  wolosky llp park avenue tower': 1, 'marriott international inc': 1, 'rosenberg calica  birney llp': 1, '

6869472
ORGS: {'marvin traub associates inc': 5, 'merchandising consultant': 10, 'rock  republic enterprises inc': 6, 'triple r': 4, 'marvin traub associates': 10, 'spizz  johns': 3, 'pc proposed couns': 1, 'triple r inc': 2, 'associates inc': 1, 't0 rock  republic': 1, 'rock  republic enterprises': 7, 'fox llp': 1, 'official committee of unsecured creditors': 1, 'bowling green': 3, 'via email': 3, 'spizz  johns rc': 2, 'rook  republic enterprises': 1, 'marvin trade associates inc': 1, 'marvin traub retail review': 1, 'rnr coo meeting': 1, 'bowling green new york': 1, 'marvin traub rock  republic': 1, 'retail review': 1, 'marvin trade assdbiates': 1, 'ny 100195820': 1, 'triple rinc': 1}
PPL: {'marvin traub': 10, 'mortimer singer': 3, 'kelsey scroggins': 5, 'arthur j gonzalez': 2, 'i introduction a': 1, 'alex spizz': 4, 'arthur goldsteln': 1, 'jill makower': 1, 'richarci morrissev': 1, 'schuyler g carroll': 4, 'richard morrissey': 4, 'james m sullivan': 4, 'alex splzz': 1}


11071228
OR

6870142
ORGS: {'southern district of new york': 1, 'mount vernon monetary management corp all other names': 1, 'sole corporate': 6, 'fti consulting': 1, 'llp partnership': 1, 'health care business': 1, 'railroad stockbroker': 1, 'commodity broker': 1, 'clearing bank': 1, 'single asset real estate': 1, 'ultqulcu ul eve': 1, 'exhibit c': 1, 'tenant of residential property': 1, 'mount vernon monetary management corp': 6, 'firm name': 1, 'n saw mill': 1, 'american armored car ltd': 2, 'annex corp': 3, '7 armored money services llc': 1, 'atm management': 2, 'barron atm corp': 3, 'crystal public communications inc': 3, 'district central station alarm corp': 3, 'district central alarm service corp 13': 1, 'district security services': 3, 'ezki realty corp': 3, 'ezri realty corp': 4, 'gnc payroll plus inc': 3, 'gt public services': 3, 'public access networks services inc': 3, 'manhattan money branchcom inc': 2, 'michle corp': 3, 'money kiosk corp': 3, 'montgomery check cashing corp': 3, 'mount

11073769
ORGS: {'morrison  foerster llp': 3, 'pepper hamilton llp': 1, 'freeh group international solutions': 1, 'mesa air group inc': 1, 'keene corp': 3, 'ames dept': 1, 'stores inc': 1, 'see keene corp': 1, 'houlihan lokey howard  zukin capital v': 1, '205 br': 1, 'mesa air group': 1, 'cct comm': 2, 'mf global inc': 4, 'mf global uk ltd': 1, 'report of investigation': 1, 'mf global': 1, 'smith v edwards  hale': 1, 'worldwide direct inc': 1, 'wd mich 2009': 1, 'private placement lp': 1, 'professional services': 1, 'of town couns': 1, 'peppers application': 1, 'pepper and the trustee': 1, 'lj preparation for creditors': 1, 'fc attend creditors': 1, 'razzano committee': 1, 'committee scc': 1, '162012 trav expense  frank razzano  1034': 1, '10232012 trav': 1, 'pepper hamilton llp morrison  foerster llp': 1, 'ny 101040050': 1, 'special couns for the chapter': 1, 'lazard freres  co': 1, 'agenda continued': 2, 'lipton rosen  katz': 1, 'buttner hammock': 1, 'pa as litigation consultants': 1,

6869342
ORGS: {'fox llp': 1, 'official committee of unsecured creditors': 1, 'k  republic enterprises inc': 1, 'committee of unsecured creditors': 1, 'authorized to provide professional services': 1, 'official committeeof unsecured': 1, 'sale and disposition of assets': 2, 'overtime meals  seamless wee 1317': 1, 'overtime meals  cabs': 1, 'overtime meals  seamless': 1, 'overtime meals  seamless web 2491': 1, 'fox citystate': 5, 'morrissey esq': 1, 'la indicato': 6, 'sullivan carroll renert renert sullivan': 1, 'sullivan renert': 1, 'financial affairs': 1, 'review committee': 1, 'jr renert send committee': 1, 'jm sullivan provide': 1, 'jr renert review': 3, 'jm sullivan communicate': 1, 'sg carroll review': 1, 'jm sullivan review fti': 1, 'statement of financial affairs': 1, 'jma sullivan': 1, 'ujljuj ljal dc luyuu': 1, 'jordana renert': 3, 'jr renert draft': 2, 'renert attend': 1, 'renert draft': 1, '511 committee': 1, 'indicato update': 1, 'renert calendar': 1, 'renert organize': 1, '

6864634
ORGS: {'qualication and power': 2, 'customer data': 20, 'organization of the buyer': 2, 'operation of business': 2, 'veried identity pass inc': 2, 'assumed liabilities': 1, 'indemnied party': 1, 'indemnifying party': 1, 'permit of any governmental authority': 1, 'morgan stanley senior funding inc': 1, 'morgan stanley  co incorporated': 1, 'any governmental authority': 5, 'a governmental authority': 2, 'assumption of liabilities': 1, 'j p morgan  co': 1, 'closing by': 1, 'internal revenue service': 1, 'buyer at closing': 1, 'ada11 0aja': 1, 'security interests': 2, 'intlectual property': 2, 'customer usage data': 8, 'any representative of the sler shall constitute': 1, 'notices and consents': 1, 'governmental authority': 2, 'lockheed martin': 1, 'hanify  kings': 1, 'dorman  fawcetts': 1, 'each party': 2, 'acquired contracts': 1, 'afliate of sler': 2, 'customer data transfer and destruction': 1, 'registered traver': 1, 'transportation security administration': 1, 'press reases an

6883489
ORGS: {'ango  maxies': 5, 'nature of business': 1, 'clearing bank': 1, 'vgluntary pettwn ango  maxies': 1, 'tenant of residential property': 1, 'vilmntary petltlon ango  maxies': 1, 'firm name': 1, 'ango  maxies llc': 1, 'rattet pasternak llp': 1, '2225 fourth llc': 1, 'orda management corp': 3, 'buckhead beef company': 3, 'mslavin  sons': 2, 'food center drive unit': 3, 'food service': 1, 'district 55han30n place': 1, 'sysco food service': 1, 'nys dept of tax  finance': 1, 'hanson place': 2, 'sam tl  son inc': 1, 'bruces bakery': 1, 'llc case no': 1, 'american express': 2, 'borax paper products inc': 1, 'hper paper  supplies i halper paper  supplies': 1, 'borax paper products inc 10': 1, ' southern wine  spirits of southern wine  spirits of ny': 1, 'pd box': 1, 'richmond va 232182499': 1, 'militia foods mivila foods': 1, ' isl reed goidstein': 1, 'ango  maxiesz': 1, 'al cesspool service corp': 1, 'afi food svs distributors 1 ikea drive izabeth': 1, 'american express  1270': 1,

6946977
ORGS: {'ybrant media acquisition inc': 4, 'eagle location': 1, 'icici bank usa': 1, 'limited liability company': 1, 'limited liability partnership': 1, ' partnership  other': 1, ' health care business': 1, ' single asset real estate': 1, ' commodity broker': 1, ' clearing bank': 1, 'north american industry classification system': 1, 'rosen  associates': 3, 'pc firm': 1, 'ybrant media acquisition': 2, 'ybrant digital limited': 1, 'ybrant media acguisition inc': 1, 'ybrant digital limited road no': 1, 'litigant couns': 1, 'uisition2 inc': 1}
PPL: {'suresh k reddy': 1, 'sanford p rosen': 3, 's suibsh k reddy    suresh k reddy': 1, 's suresh k reddy': 1}


6886410
ORGS: {'olshan grundman frome rosenzweig  wolosky llp park avenue tower': 1, '92nd st hot associates llc': 1, 'hot      associates': 1, 'rosenberg calica  birney llp': 1, 'rivkin radler llp': 1, 'hot associates': 2, 'rosenzweig  wolosky llp': 1, 'j knys': 1, 'westport capital partners': 2, 'katten muchin rosenman llp': 1,

6869484
ORGS: {'atlas strategic advisors': 10, 'rock  republic enterprises inc': 5, 'triple r inc': 5, 'investment banker and financial': 1, 'rock  republic enterprises': 2, 'triple r': 3, 'a limited liability corporation': 1, 'banking services': 1, 'official committee of unsecured creditors': 2, 'lord  taylor': 1, 'saks fifth avenue': 1, 'hof alloo': 1, 'frnm dszd': 1, 'banking advisors for rock  republic': 1, 'enterprises inc': 1, 'spizz  johns': 1, 'pc proposed couns': 1, 'investment banker and financial advisor': 1, 'rock et republic enterprises': 1, 'atlas strategic advisers': 1, 'term loan': 1, 'indemnified party': 1, 'board of directors': 1, 'palden namgyal': 1, 'investment banking services': 1, '3ma 10': 1, 'cit n js ne': 1, '20ma 1o': 1, 'j s ne outreach': 1, 'nda 25ma 10': 1, 'continental airlines': 1, 'hot marriott': 1, 'hot marriott  apr 1415': 1, 'e9 999': 1, 'js ne working': 1, 'disbursements for professionals in southern district': 1, 'reviewind annlications for compensa

6886416
ORGS: {'suozzi english  klein rc': 1, 'employ rsr consulting': 1, 'rsr consulting': 1, 'spectrum of': 1, 'crc parent corporation': 1, 'a chem rx corporation': 1, 'olshan grundman frome rosenzweig  wolosky llp': 1, 'mmemgoa b mag': 1, 'o2 e 8802': 1, 'e23 mano e': 1, 'engaz wwonzcms': 1, 'st hot associates': 1, 'llc code llc': 1, 'corporation service co': 1, 'environmental control board faber': 1, 'spalding llp': 1, 'lodging advisors': 1, 'new york hot': 1, 'mot trade council': 1, 'nys dept': 1, 'taxation  finance': 1, 'raich ende maiter  co': 1, 'rosenberg calica  bimey seiden  sehein': 1, 'louis tener consulting services': 1, 'young conaway stargatt  taylor': 1}
PPL: {'thomas r slome': 1, 'robert rosenfeld': 3, 'i am the managing': 1, 'staci b': 1, 'eocmcm u': 1, 'adam b': 1, 'jerome gillman consulting': 1, 'j kny': 1, 'mcdaniel pllc': 1, 'margolin winer evens': 1, 'grundman f': 1, 'rivkin radler': 1, 'korngold weiss': 1}


23249295
ORGS: {'warren inc': 1, 'chrystie tenant lp'

6868978
ORGS: {'berger  fischoff': 2, 'steinberg fineo berger  f': 1, 'brian r schechter notary pubtie': 1, 'bu yao pa llc': 1}
PPL: {'gary c fischoff': 3, 'tsu yue wang': 1, 'tsu yue wangs': 1, 'the tsu yue wang': 1, 'heath s berger': 1, 'brian p schechter': 1, 'patricia reill': 1, 'janine pfersching': 1}


6932446
ORGS: {'conversion consulting llc': 1, 'flip services': 2, 'bowling green': 1, 'rach s blumenfd conversion consulting llc': 1, 'rblmnf aolcom': 1, 'federal office building': 1, 'varick st': 1}
PPL: {'warren s dank': 1, 'martin glenn': 2, 'warren s dank warren dank': 1, 'serene k nakano': 1}


12290332
ORGS: {'silas metro holdings corp': 2, 'limited liability company': 1, 'limited liability partnership': 1, ' single asset real estate': 1, ' commodity broker': 1, ' clearing bank': 1, 'north american industry classification system': 1, 'state fill': 1, 'easco boiler corp': 1, 'renaissance drive las vegas': 1}
PPL: {'oneill dominica': 1, 'oneill': 1, 'david carlebach': 3}


110

6869498
ORGS: {'fox llp': 1, 'official committee of unsecured creditors': 1, 'ollck  republic enterprises inc': 1, 'rock  republic enterprises': 1, 'triple r inc': 1, 'simms sigal': 1, 'official committee': 1}
PPL: {'schuyler g carroll': 1, 'james m sullivan': 2, 'jordana l renert': 2, 's schuyler g carroll schuyler g carroll': 1}


6884843
ORGS: {'attorneys at law braybar building': 1, 'lexington avenue': 1, 'bowling green new york': 1, 'pavarini mcgovern': 2, 'waterscape resort': 1, 'loar corporate servicing': 1, 'aa maintenance': 1, 'midland avenue': 1, 'miron  sons linen service': 1, 'holland  knight attn': 1, 'frederick r rohn': 1, 'koni corp': 1, 'bank national association': 1, 'usb capital resources': 1, 'broadway concrete corp': 1, 'john civetta  sons inc 1123': 1, ' walker llp': 1, 'internal revenue service': 1, 'lee w stremba troutman sanders llp': 1}
PPL: {'eric w sleeper': 1, 'm bernstein': 1, 'armando rodriguez': 1, 'caneld madden': 1, 'john madden': 1, 'mark golden': 1, '

6944571
ORGS: {'va presto': 1, 'health care business single asset real estate': 1, 'llp partnership': 1, 'clearing bank other': 1, 'tenant of residential property': 1, 'firm name': 1, 'hvattii hq': 1, 'pasta bar': 2, 'ross riedman ference llp': 1, 'sichenzia ross friedman ference llp': 2}
PPL: {'pasta bar': 1, 'filing fee': 3, 'official form': 1, 'nonmain proceeding': 1, 'ie iii iii iii iii': 1, 'ralph preite': 2, 'ralph e preite': 1, 'john scotto': 5, 'printed name': 1, 'scotto ii': 4}


6934898
ORGS: {'proposed couns': 2, 'binder  binder': 26, 'ssdi holdings inc': 1, 'national veterans disability advocates llc': 2, 'us bank national association': 1, 'stlus capital investment corporation': 1, 'moore  van allen pllc': 1, 'lowenstein sandler llp': 1, 'ssdi holdings': 1, '8 binder  binder': 1, 'katten muchin rosenman llp': 1}
PPL: {'lowenstein sandler llp kenneth a rosen': 1, 'nicholas b vislocky': 2, 'joseph lubertazzi': 1, 'kenneth j ottaviano': 1, 'stephen e gruendel': 1, 'kenneth a r

In [133]:
print_top_3(file_to_orgs["6871731"])

9 boston generating
6 general couns
6 ebg holdings


In [132]:
agg_dict(file_to_orgs["6871731"])

{'boston generating': 7, 'sithe boston generating': 1, 'exon boston generating': 1, 'fifth avenue': 2, ' single asset real estate': 1, 'llp  railroad': 1, ' clearing bank': 1, 'tenant of residential preperty': 1, 'boston generatin llc': 1, 'latham  watkins llp d': 1, 'states code': 1, 'ebg holdings llc': 5, 'fore river devopment': 1, 'mystic devopment': 1, 'bg new england power services': 1, 'bg boston services': 1, 'llc action by': 1, 'a daware limited liability company': 1, 'constlation holdings inc': 2, 'constlation energy group inc': 1, 'retention of professionals': 1, 'latham  watkins llp': 3, 'jp morgan securities': 1, 'perla weinberg partners': 1, 'brown rudnick llp': 1, 'fti consulting': 1, 'anderson kill  olick': 1, 'authorized representative of the company': 1, 'joint administration requested': 2, 'department of creditor': 5, 'agency loan operations': 1, 'credit suisse': 1, 'administrative agent for the lenders': 1, 'lipton rosen  katz': 1, 'harold olsen stroock  stroock  lav

{'boston generating': 9,
 'fifth avenue': 2,
 ' single asset real estate': 1,
 'llp  railroad': 1,
 ' clearing bank': 1,
 'tenant of residential preperty': 1,
 'boston generatin llc': 1,
 'states code': 1,
 'fore river devopment': 1,
 'mystic devopment': 1,
 'bg new england power services': 1,
 'bg boston services': 1,
 'llc action by': 1,
 'a daware limited liability company': 1,
 'constlation holdings inc': 2,
 'constlation energy group inc': 1,
 'retention of professionals': 1,
 'latham  watkins llp': 4,
 'jp morgan securities': 1,
 'perla weinberg partners': 1,
 'brown rudnick llp': 1,
 'fti consulting': 1,
 'anderson kill  olick': 1,
 'authorized representative of the company': 1,
 'joint administration requested': 2,
 'department of creditor': 5,
 'agency loan operations': 1,
 'credit suisse': 1,
 'administrative agent for the lenders': 1,
 'lipton rosen  katz': 1,
 'harold olsen stroock  stroock  lavan llp': 1,
 'distrigas of massachusetts': 1,
 'stroock  stroock  lavan llp': 1,

In [32]:
# returns a map of people+counts for multiple files
# parameter: array of strings of file names
def mult_files_ppl(files):
    if len(files) == 0 or files[0] not in file_to_ppl: 
        return None
    starting_set = file_to_ppl[files[0]]
    for index in range(1,len(files)):
        curr_ppl = file_to_ppl[files[index]]
        # add this files' items to starting_set
        for name, count in curr_ppl.items():
            if name in starting_set:
                starting_set[name] = starting_set[name] + count
            else:
                starting_set[name] = count
    return starting_set