In [1]:
import sys
import os
import re
sys.path.append("..")

import spacy
nlp = spacy.load("en_core_web_lg")

from utils.case import Case, DocketEntry, Document, Party
DATA_LOCATION = "../data/results_nysb_all_chap_11"

In [34]:
def find_repeat(s):
    i = (s+" "+s).find(s, 1, -1)
    return s if i == -1 else s[:i] #returns whole string if no repeat found

In [109]:
#aggregate orgs mentioned in file
single_blacklist = ['llc', 'company', 'llp', 'fed', 'po', 'box', 'br', 'office', 'firm',
                    'corporation', 'name', 'smb', 'committee', 'trustee', 'restaurant'] #words that aren't entities by themselves
remove_words = ['po box'] #words to remove from an entity
def clean_org(orig):
    cleaned = orig.lower()
    cleaned = re.sub('[^A-Za-z0-9 ]+', '', cleaned)
    cleaned = find_repeat(cleaned)
    
    no_spaces = re.sub(' +', '', cleaned)
    if (len(no_spaces) == 0):
        return ""
    #all the same letter (i.e. 'iii iii iii')
    if (no_spaces == len(no_spaces) * no_spaces[0]):
        return ""
    
    # remove remove_words from entity
    for word in remove_words:
        cleaned = re.sub(word, '', cleaned)
    
    tokens = cleaned.split()
    if len(tokens) == 0:
        return ""
    if tokens[0] == "the":
        return ""
    if len(tokens) == 1 and tokens[0] in single_blacklist:
        return ""
    for token in tokens: #get rid of zip codes -> presume address
        if token.isdigit() and len(token) == 5:
            return ""
    
    #get rid of trailing whitespace
    cleaned = cleaned.rstrip()
    if cleaned.isdigit():
        return ""
    return cleaned

blacklist = ["court", "united states", 'debtors', 'social security', 'motion', 
             'form', 'docket', 'state of', 'agreement', 'floor', 'street']

def most_freq_orgs(ents):
#     print(ents)
    orgs = {}
    for ent in ents:
        if ent.label_ == "ORG":
            org = clean_org(ent.text)
            #blacklist certain org words:
            for word in blacklist:
                if word in org:
                    org = ""
                    break
            if org == "":
                continue
            if org in orgs:
                orgs[org] += 1
            else:
                orgs[org] = 1
    print(orgs)
    print("\n")

In [110]:
#aggregate people mentioned in file
def clean_name(orig):
    cleaned = orig.lower()
    cleaned = re.sub('[^A-Za-z ]+', '', cleaned)
    
    tokens = cleaned.split()
    if (len(tokens) <= 1): 
        return "" 
    
    cleaned = find_repeat(cleaned)
        
    #all the same letter (i.e. 'iii iii iii')
    no_spaces = re.sub(' +', '', cleaned)
    if (no_spaces == len(no_spaces) * no_spaces[0]):
        return ""
    
    #get rid of trailing whitespace
    cleaned = cleaned.rstrip()
    return cleaned

def most_freq_persons(ents):
#     print(ents)
    people = {}
    for ent in ents:
        if ent.label_ == "PERSON":
            person = clean_name(ent.text)
            if person == "":
                continue
            if person in people:
                people[person] += 1
            else:
                people[person] = 1
    print(people)
    print("\n")

In [111]:
case_files = []
for file in os.listdir(DATA_LOCATION):
    # reads in file and loads it w spacy model
    print(file) #print file name
    doc = nlp(open(os.path.join(DATA_LOCATION, file)).read())

    # most_freq_persons(doc.ents)
    most_freq_orgs(doc.ents)

6938010
{'ein': 1, 'state': 2, 'location of principal assets of business debtor': 1, '3 clearing bank': 1, 'el partnership': 1, 'el debtor': 3, 'el filing fee': 1, 'd filing fee': 1, 'el e1': 1, 'el 121': 1, 'el e1 e1': 1, 'tm garden inc': 1, 'spouse': 1, 'paolo secondo': 1, 'tenant of residential property': 1, 'garden inc': 1, 'signatureof n': 1, 'socialsecurity': 1, 'bankruptcy petition preparer': 1, 'socralsecurity': 1, 'united': 1}


16798867
{'ybrant media acquisition': 1, 'bankr': 1, 'ybrant digital limited': 2, 's sanford p rosen': 1, 'rosen  associates': 1, 'pc 747': 1}


6949417
{'el check': 2, 'life extension realty llc': 3, 'employer identification': 1, 'ein': 1, 'limited liability company': 1, 'limited liability partnership': 1, 'el partnership': 1, 'el other': 1, 'el health care business': 1, 'el railroad': 1, 'el commodity broker': 1, 'el clearing bank': 1, 'el investment': 2, 'north american industry classification system': 1, 'usc11161b': 1, 'el': 1, 'el 0': 1, 'mm  dd'

{'veried': 8, 'henry inc': 2, 'chief executive': 1, 'tsa': 3, 'customer': 1, 'morgan stanley senior funding inc': 1, 'administrative agent and syndication agent': 1, 'morgan stanley': 1, 'loan': 1, 'lockheed': 4, 'financial distress and efforts to raise capital': 1, 'dorman  fawcett': 5, 'dorman  fawcetts': 1, 'lenders': 2, 'specically': 1, 'washington national airport 1 aviation circle': 1, 'indianapolis airport authority': 1, 'iaa': 1, 'international air terminal llc': 1, 'mineta san jose international airport': 1, 'customer data': 2, 'widepoint corporation': 1, 'henry incorporated': 1, 'andll identity solutions': 1, 'lender': 1, 'difrancesco bateman coley': 1, 'kunzman davis  lehrer': 1, 'ombudsman': 3, 'verified': 1, 'bain capital valhalla partners': 1, 'boston office 8000': 1, 'hig ventures': 1, 'attn': 4, 'rho capital partners': 1, 'suite 1350': 1, 'oak investment partners': 1, 'ca': 1, 'bertram capital management llc operational research consultants inc': 1, 'waples mill san mat

{'olshan grundman frome rosenzweig  wolosky llp park avenue tower': 1, 'examiner': 1, 'gladstone': 7, 'usc': 2, 'hotel': 1, 'marriott international inc': 1, 'rosenberg calica  birney llp': 1, 'comanaging': 1, 'movants': 5, 'opposition': 14, 'gladstone as comanaging': 3, 'hotel associates': 4, 'percentage interests': 1, 'rosenberg calica  bimey llp': 1, 'gladstone and': 1, 'hotel association': 1, 'hotel  associates': 1, 'jkny': 19, 'jkn ys': 1, 'gecc': 3, 'taic': 5, 'marriott': 1, 'new york supreme': 1, 'ge': 2, 'savills llc': 1, 'westport': 1, 'olshan grundman frome rosenzweig  wolosky llp': 1}


6881148
{'mosdos chofetz chaim inc': 11, 'schedules': 1, 'c  t n': 1, 'bank of america': 1, 'chestnut ridge rd': 1, 'montvale': 1, 'state': 4, 'usc': 1, 'customer': 1, 'schedule': 3, 'd  creditors holding secured': 1, 'el check': 2, 'mayser': 1, 'ci contributions': 1, 'ci certain': 1, 'taxes': 1, 'fdic': 1, 'rtc': 1, 'comptroller of the currency': 1, 'board of governors': 1, 'internal revenue 

{'suozzi english  klein rc': 2, 'rc': 1, 'employ rsr consulting': 1, 'llc as': 1, 'examiner': 17, 'rsr consulting': 1, 'rsr': 22, 'application': 1, 'support of examiners': 1, 'llc bankruptcy meyer': 1}


11088746
{'ortiz  ortiz': 3, 'llp hearing': 1, 'nyctl 2015a trust': 3, 'nyc': 1, 'water board': 1, 'nyctl 2015a': 1, 'plaintiff': 1, 'bankr': 1, 'ameribuild const': 1, 'bh s  b holdings': 1, 'lexis 1024': 1, 'saint anns ave tax class': 1, 'ex av 0 0': 1, 'trans ex av 0 0': 1, 'taxablebillable assessed value': 1, 'nycprogerty proposed order': 1}


6884918
{'chrysler building': 1, 'waterscape resort llc': 1, 'bank national association': 1, 'usb capital resources': 1, 'consolidated': 3, 'usb capital': 2, 'acquisition loan mortgage': 1, 'mezzanine loan mortgage': 1, 'bank assignment': 1, 'asset': 4, 'affidavit of service': 4, 'an affidavit of service': 2, 'internal': 1, 'revenue service': 1, 'eastern time': 4, 'pavarini mcgovern llc': 1, 'bank': 1, 'honorable stuart m bernstein': 1, 'attn'

{'el check': 1, 'debtor2': 1, 'individual taxpayer': 1, 'employer identification': 1, 'ein': 1, 'el over': 1, 'individuals filing for bankruptcy': 1, 'hale': 1, 'el': 1, 'health care business': 1, 'single asset real estate': 1, 'el disability': 2, 'cl': 1, 'schedule c': 1, 'warnin': 1, 'voluntary petition for individuals filing for bankruptcy': 1, 'bankruptcyresourcesapprovedcredit anddethounselorsaspx': 1, 'matrix': 1, 'grand pacific finance': 1, 'heinz': 1, 'nissan motor acceptance  371447 pittsburgh': 1, 'seterus  1077': 1}


6886406
{'united': 1, 'states trustee': 1, 'examiner': 8, 'order': 1, 's stuart m bernstein stuart m bernstein': 1}


12072149
{'suozzi english  klein rc': 1, 'proposed counsel': 1, 'espresso management holding inc': 2, 'et': 1, 'espresso stores inc': 1, 'afl llc': 1, 'f6 chelsea inc': 1, 'espresso dreams llc': 3, 'nondebtor': 1, 'levis schedules': 1, 'statement of financial affairs': 1, 'filicori': 3, 'zecchini usa corps': 1, 'gruppo industriale filicori': 1, 

{'supplemental background': 1, 'supplement': 1, 'first supplement': 1, 'merrick d holdings inc': 1, 'ny co': 1, 'nakano': 2, 'application': 4, 'wl': 2, 'matco electronics': 1, 'group inc': 1, 'bankr': 1, 'granite partners': 1, 'lp': 1, 'wiley brown  assoc': 1, 'big mac marine inc': 1, 'boltonemerson inc': 1, 'us federal office': 1, 'varick st': 1}


22258584
{'green tree servicing llc': 1}


6938837
{'southern district of': 1, 'ein': 4, 'state': 8, 'location of principal assets of business debtor': 1, 'el debts': 1, 'izi debts': 1, 'el debtor': 4, 'number of creditors': 1, 'izi el el': 1, 'el el iii el el': 1, 'spouse': 1, 'afliate of this debtor': 1, 'tenant of residential property': 1, 'uc': 1, 'uhujllell': 1, 'pick  zabicki llp': 3, 'firm name': 1, 'socialsecurity': 4, 'bky louya corp': 1, 'louya corp': 11, 'nature': 1, 'chase manhattan': 1, 'bank': 1, 'scandia seafood 260a': 3, 'woleo foods': 1, 'wholesalers inc': 1, 'hunts point': 1, 'pauls meat company': 2, 'n woodmere': 1, 'tom 

{'qualication and power': 2, 'customer data': 20, 'organization of the buyer': 2, 'operation of business': 2, 'condentiality': 3, 'severability': 1, 'veried identity pass inc': 2, 'buyer': 121, 'seller': 101, 'parties': 23, 'closing': 3, 'customer': 19, 'reg': 1, 'assumed liabilities': 1, 'indemnied party': 1, 'indemnifying party': 1, 'permit of any governmental authority': 1, 'morgan stanley senior funding inc': 1, 'morgan stanley  co incorporated': 1, 'any governmental authority': 5, 'a governmental authority': 2, 'assumption of liabilities': 1, 'auction': 1, 'j p morgan  co': 1, 'closing by': 1, 'action': 2, 'party': 42, 'internal revenue service': 1, 'irs': 3, 'organization': 1, 'buyer at closing': 1, 'ada11 0aja': 1, 'security interests': 2, 'goaa': 4, 'intellectual property': 2, 'customer usage data': 8, 'express': 1, 'faults': 1, 'any representative of the seller shall constitute': 1, 'brokers fees': 1, 'notices and consents': 1, 'permits': 2, 'governmental authority': 2, 'lockh

{'angelo  maxies': 5, 'voluntary petition': 1, 'individualtaxpayer 1': 1, 'individualtaxpayer id': 1, 'state': 2, 'location of principal assets of business debtor': 1, 'nature of business': 1, 'clearing bank': 1, 'el partnership': 1, 'el debtor': 1, 'number of creditors': 1, 'el e1 el cl c': 1, 'vgluntary pettwn angelo  maxies': 1, 'spouse': 1, 'tobe': 1, 'tenant of residential property': 1, 'vilmntary petltlon angelo  maxies': 1, 'usc': 1, 'firm name': 1, 'socialsecurity': 1, 'united': 1, 'angelo  maxies llc': 1, 'preservation': 1, 'ofcers': 1, 'rattet pasternak llp': 1, 'childs': 1, '2225 fourth llc': 1, 'orda management corp': 3, 'buckhead beef company': 3, 'mslavin  sons': 2, 'food center drive unit': 3, 'food service': 1, 'district 55han30n place': 1, ' son': 1, 'sysco food service': 1, 'nys dept of tax  finance': 1, 'hanson place': 2, 'sam tell  son inc': 1, 'bruces bakery': 1, 'llc case no': 1, 'american express': 2, 'borax paper products inc': 1, 'helper paper  supplies i halpe

{'todtman': 1, 'nachamie': 3, 'spizz  johns': 2, 'pc counsel': 1, 'rock  republic enterprises inc': 1, 'triple r inc': 1, 'rock  republic enterprises': 1, 'triple r': 2, 'atlas strategic advisors': 2, 'investment banker and financial': 1, 'special litigation counsel for rr': 1, 'manderson': 1, 'schafer  mckinlay': 1, 'special corporate counsel': 2, 'attorneys for the official committee of unsecured creditors': 1, 'fti consulting inc': 2, 'financial': 1, 'applications': 4, 'spizz  august 1': 1, 'special litigation counsel': 1, 'debtor rock  republic': 1, 'enterprises inc': 1, 'schafer  august 1': 1, 'official committee': 1, 'unsecured creditors': 1, 'financial advisors': 1, 'committee of unsecured': 1}


6946977
{'el check': 1, 'ybrant media acquisition inc': 4, 'ein': 1, 'eagle location': 1, 'icici bank usa': 1, 'limited liability company': 1, 'limited liability partnership': 1, 'el partnership el other': 1, 'el health care business': 1, 'el single asset real estate': 1, 'el commodity 

{'atlas strategic advisors': 10, 'rock  republic enterprises inc': 5, 'triple r inc': 5, 'usc': 1, 'monthly fees': 1, 'application': 9, 'atlas': 27, 'investment banker and financial': 1, 'rock  republic enterprises': 2, 'triple r': 3, 'success fee': 6, 'a limited liability corporation': 1, 'associates': 1, 'banking services': 1, 'official committee of unsecured creditors': 2, 'nordstrom': 1, 'bloomingdales': 1, 'lord  taylor': 1, 'saks fifth avenue': 1, 'nda': 4, 'hof alloo': 1, 'frnm dszd': 1, 'banking advisors for rock  republic': 1, 'enterprises inc': 1, 'nachamie': 1, 'spizz  johns': 1, 'pc proposed counsel': 1, 'investment banker and financial advisor': 1, 'fox': 1, 'order': 1, 'source': 1, 'facility': 2, 'monthly fee': 1, 'rock et republic enterprises': 1, 'atlas strategic advisers': 1, 'engagement': 1, 'shareholders': 3, 'cit': 3, 'clt': 1, 'rkf': 4, 'term loan': 1, 'sale': 2, 'ch': 1, 'campany': 1, 'provide': 1, 'oui': 1, 'peisqil': 1, 'cdmpany': 1, 'adviilii': 1, 'indemnified 

{'suozzi english  klein rc': 1, 'employ rsr consulting': 1, 'rsr consulting': 1, 'rsr': 20, 'examiner': 5, 'spectrum of': 1, 'crc parent corporation': 1, 'a chem rx corporation': 1, 'olshan grundman frome rosenzweig  wolosky llp': 1, 'newwmoovmaw': 1, 'emymmwzow': 1, 'mcoe': 1, 'mmemgoa b mag': 1, 'mmemao': 1, 'pe': 1, 'es': 3, 'e3': 1, 'emaokm': 1, 'namoo': 1, 'o2': 1, 'o2 e 8802': 1, 'eu': 1, 'e23 mano e': 1, 'mmeesq': 1, 'eoo': 1, 'engaz wwonzcms': 1, 'memz': 1, 'st hotel associates': 1, 'llc code llc': 1, 'corporation service co': 1, 'environmental control board faber': 1, 'gladstone': 1, 'kc': 1, 'spalding llp': 1, 'lodging advisors': 1, 'new york hotel': 1, 'motel trade council': 1, 'nys dept': 1, 'taxation  finance': 1, 'raich ende maiter  co': 1, 'rosenberg calica  bimey seiden  sehein': 1, 'louis  tener consulting services': 1, 'lippman': 1, 'genes': 1, 'young conaway stargatt  taylor': 1}


23249295
{'warren inc': 1, 'mmnmmmnmm': 1, 'usc': 1, 'chrystie tenant lp': 3, 'beh cor

{'steinberg': 1, 'berger  fischoff': 2, 'steinberg fineo berger  f': 1, 'sf bf': 11, 'lo': 1, 'sfbf': 1, 'brian r schechter notary pubtie': 1}


6932446
{'conversion consulting llc': 1, 'flip services': 2, 'bowling green': 1, 'landlord': 1, 'rachel s blumenfeld conversion consulting llc': 1, 'rblmnf aolcom': 1, 'federal office building': 1, 'varick st': 1}


12290332
{'el check': 2, 'silas metro holdings corp': 2, 'ein': 1, 'limited liability company': 1, 'limited liability partnership': 1, 'el partnership': 1, 'el other': 1, 'el single asset real estate': 1, 'el commodity broker': 1, 'el clearing bank': 1, 'el investment': 2, 'north american industry classification system': 1, 'el': 2, 'usc11161b': 1, 'el 200': 1, 'el 0': 1, 'state fill': 1, 'deduction': 1, 'casa': 1, 'easco boiler corp': 1, 'renaissance drive las vegas': 1}


11078947
{'new y direc': 1, 'via hand delivery and electronic filing': 1, 'mf global holdings ltd': 1, 'plan administrator': 1, 'mf global inc': 1, 'mfgi': 2, '

{'llf': 1, 'attorneys at law braybar building': 1, 'lexington avenue': 1, ' fax': 1, ' bartdnesejlicim': 1, 'bowling green new york': 1, 'pavarini mcgovern': 2, 'waterscape resort': 1, 'subcontractors': 1, 'hohoiadle': 1, 'loar corporate servicing': 1, 'attn': 1, 'aa maintenance': 1, 'midland avenue': 1, 'miron  sons linen service': 1, 'holland  knight attn': 1, 'frederick r rohn': 1, 'koni corp': 1, 'bank national association': 1, 'usb capital resources': 1, 'broadway concrete corp': 1, 'john civetta  sons inc 1123': 1, ' walker llp': 1, 'internal revenue service': 1}


6886380
{'qcplciiiucl lj': 1, 'dla piper llp us': 2, 'usc': 1, 'statement': 1, 'hotel': 7}


6886342
{'childs': 1, 'state': 2, 'ezfilirtg': 1, 'el': 2, 'crmiplere': 1, 'ezfiling': 3, 'ie': 1, 'uuiuucib vucpiiii': 1, 'fee': 1, 'cooperatives': 1, 'nw list': 1, 'setoffs': 1, 'ifthe': 1, 'raich ende malter  co': 1, 'attn': 1, 'raich ende matter': 1, 'current partners': 1, 'nob': 1, 'tax consolidation group': 1, 'print name

{'mf global holdings ltd': 3, 'plan administrator': 2, 'sapere cta fund': 3, 'lp': 1, 'global holdings ltd': 1, 'mf': 1, 'global finance usa inc': 1, 'mf global capital llc': 2, 'global market services llc': 2, 'mf global holdings usa inc': 2, 'mfgh': 10, 'mf global': 1, 'assigned assets llc': 1, 'mf global finance usa inc': 1, 'mf global fx clear llc': 1, 'di 1382': 1, 'individual defendants': 1, 'coo of mfgh': 1, 'coo of holdings': 1, 'board of directors': 1, 'sipa': 3, 'mf global inc': 2, 'sipa trustee': 1, 'sale': 1, 'mfgi': 3, 'graber': 1, 'trustee of the mf global litigation trust': 1, 'mdl': 3, 'plaintiffs': 2, 'term sheet': 3, 'litigation trustee': 2, 'eo': 4, 'do': 2, 'settlement': 9, 'cftc corzine reserve': 1, 'cftc obrien reserve': 1, 'identied dissenters policies': 1, 'supplement': 4, 'adnnlnxniro': 1, 'cftc': 1, 'customer class counsel': 1, 'mfg': 1, 'defendants and settling plaintiffs': 1, 'insurers and defendants': 1, 'mfgaa': 2, 'defendants': 3, 'mfg plaintiffs': 1, 'li