In [1]:
import numpy as np
import pandas as pd
import os, errno

def silentremove(filename):
    """Delete `filename`, treating "file does not exist" as success.

    Any other OSError raised by os.remove (for example a permissions
    problem) is propagated unchanged to the caller.
    """
    try:
        os.remove(filename)
    except OSError as err:
        # ENOENT means there was nothing to delete, which is the desired end state
        if err.errno == errno.ENOENT:
            return
        # Anything else is a genuine failure: re-raise it
        raise

# Setup
# Root of the project tree, relative to the notebook's working directory
base = '../../../'

###################################
## Collapsed Parent Company File ##
###################################
# Path of the HDF5 file where we will store the data rolled up to the parent level
collapse_filename = base + 'Data/ReferenceUSA/parent_collapse_2015.h5'
# Delete any file already at this location so that re-running the notebook
# does not append the same data twice
silentremove(collapse_filename)
# Open (and thereby create) the output store
collapse = pd.HDFStore(collapse_filename)


########################
## Get Reference Data ##
########################
# Path of the HDF5 file that holds the clean ReferenceUSA records
parent_filename = base + 'Data/ReferenceUSA/parent.h5'
# Open the input store read-only. The default mode ('a') silently creates an
# empty file when the path is wrong and leaves the source data open for writing;
# with mode='r' a missing file fails immediately and the input cannot be modified.
store = pd.HDFStore(parent_filename, mode='r')
# Get the geo data that will help to separate companies with the same name in different places
geo = store.select('geo')
# Drop the geo columns that we don't need, since memory is tight
del geo['County_Code']
del geo['CBSA_Code']
del geo['ZipCode']
del geo['City']

# Get the main data to identify companies
main = store.select('main')

# Attach the remaining geo columns to main (index-aligned join) for processing
# main['State'] = geo['State']
main = main.join(geo)
# Delete the geo dataframe to free up memory
del geo

# Cast State to object dtype before its values are zipped into tuples below
# (presumably it is stored as a categorical -- TODO confirm)
main['State'] = main.State.astype('object')

# Build one (Company, State, archive_version_year) tuple per row and factorize the
# tuples, so every distinct combination gets an integer group_id. The resulting
# array is positional: later cells attach it to other tables row-for-row.
# NOTE(review): pd.lib is a private pandas module that was deprecated and later
# removed; this line needs replacing before upgrading pandas.
group_id, levels = pd.factorize(pd.lib.fast_zip([main.Company.values,
                                               main.State.values,
                                               main.archive_version_year.values]))

main['group_id'] = group_id

# NOTE(review): the output file name says 2015, but this year filter is disabled,
# so every archive_version_year present in main is collapsed -- confirm intent.
# main = main[main.archive_version_year == 2015]

# Group the rows by group_id, i.e. by company name within a state and archive year
main_group = main.groupby(['group_id'],sort=True)
# Drop the notebook-level name for the main data; only main_group is used from here on
del main

In [2]:
#######################
## Main Parent Table ##
#######################
# Get the identifying information from the first observation in each firm group.
# These three fields are exactly the ones group_id was built from, so they are the
# same for all observations in a group and it doesn't matter which one they come from.
# Selecting the columns on the GroupBy *before* calling first() avoids aggregating
# every other column of main only to throw the result away.
collapse_main = main_group[['Company', 'State', 'archive_version_year']].first()
del main_group
# Simplify the datatypes to save space
collapse_main['State'] = collapse_main.State.astype('category')
# Save the main data
collapse.append('main', collapse_main, index=False, dropna=False, expectedrows = 30196810)
# Delete the main parent data to save memory
del collapse_main

In [3]:
####################
## Descript Table ##
####################
# Load the descript table and tag each row with its group.
# group_id is a positional array built from `main` in the first cell, so this
# assumes descript has the same rows in the same order -- TODO confirm.
descript = store.select('descript')
descript['group_id'] = group_id
parent_group = descript.groupby(['group_id'], sort=True)
del descript
# Roll the columns up to the parent level by summing within each group
parent_desc = parent_group.sum()
# year_established must not be summed: replace it with the earliest year in the
# group. Only this one column is aggregated with min(), instead of computing
# min() over the whole frame just to read a single column from the result.
parent_desc['year_established'] = parent_group['year_established'].min()
del parent_group
# Save the collapsed table, then release it
collapse.append('descript', parent_desc, index=False, dropna=False, expectedrows = 30196810)
del parent_desc

In [4]:
# ------------------------------------------------- #
#  Industry table, collapsed to one row per group   #
# ------------------------------------------------- #
# Load the industry table and tag each row with its group.
# group_id is a positional array from the first cell, so this assumes the table
# has the same rows in the same order as `main` -- TODO confirm.
industry_rows = store.select('industry')
industry_rows['group_id'] = group_id
# Keep the first value of every column within each group
industry_by_parent = industry_rows.groupby(['group_id'], sort=True).first()
del industry_rows
# Write the collapsed table to the output store, then release it
collapse.append(
    'industry',
    industry_by_parent,
    index=False,
    dropna=False,
    expectedrows=30196810,
)
del industry_by_parent

In [38]:
# -------------------------------------------- #
#  Geo table, collapsed to one row per group   #
# -------------------------------------------- #
# Reload the full geo table (the first cell deleted its trimmed copy)
geo_rows = store.select('geo')
# Cast the two text columns to object dtype before aggregating
geo_rows['State'] = geo_rows['State'].astype('object')
geo_rows['City'] = geo_rows['City'].astype('object')
# Tag each row with its group. group_id is a positional array from the first
# cell, so this assumes geo has the same rows in the same order as `main` -- TODO confirm.
geo_rows['group_id'] = group_id
# Keep the first value of every column within each group
geo_by_parent = geo_rows.groupby(['group_id'], sort=True).first()
del geo_rows
# Write the collapsed table to the output store, then release it
collapse.append(
    'geo',
    geo_by_parent,
    index=False,
    dropna=False,
    expectedrows=30196810,
)
del geo_by_parent

In [47]:
# -------------------------------------------------- #
#  Structure table, collapsed to one row per group   #
# -------------------------------------------------- #
# Load the structure table and tag each row with its group.
# group_id is a positional array from the first cell, so this assumes the table
# has the same rows in the same order as `main` -- TODO confirm.
structure_rows = store.select('structure')
structure_rows['group_id'] = group_id
# Keep the first value of every column within each group
structure_by_parent = structure_rows.groupby(['group_id'], sort=True).first()
del structure_rows
# Write the collapsed table to the output store, then release it
collapse.append(
    'structure',
    structure_by_parent,
    index=False,
    dropna=False,
    expectedrows=30196810,
)
del structure_by_parent