In [1]:
import numpy as np
import pandas as pd
import os, errno

def silentremove(filename):
    """Delete *filename*, treating an already-missing file as success.

    Any ``OSError`` other than "no such file or directory" is re-raised
    to the caller unchanged.
    """
    try:
        os.remove(filename)
    except OSError as err:
        # errno.ENOENT = no such file or directory: nothing to delete.
        if err.errno == errno.ENOENT:
            return
        # A different error occurred; let it propagate.
        raise

# Relative path from this notebook to the project's base directory
base = '../'

# Column names grouped under the key each group is appended to in the
# HDF5 store (one table per key).
group_map = {
    'main': ['Company', 'ABI', 'archive_version_year'],
    'geo': [
        'City',
        'State',
        'ZipCode',
        'County_Code',
        'CBSA_Code',
    ],
    'industry': [
        'Primary_SIC_Code',
        'Primary_NAICS_Code',
        'SIC_Code',
        'SIC_Code_1',
        'SIC_Code_2',
    ],
    'structure': [
        'Business_Status_Code',
        'Company_Holding_Status',
        'Subsidiary_Number',
        'Parent_Number',
        'IDCode',
    ],
    'descript': [
        'year_established',
        'employee_size_location',
        'sales_volume_location',
    ],
}

# Explicit dtype for every column read from the CSV files, passed to
# pd.read_csv(dtype=...) so each file is parsed with the same types.
# NOTE(review): the code columns are float64 rather than int, presumably
# because they can contain missing values -- confirm against the data.
col_types = {
    'Company': object,
    'City': object,
    'State': object,
    'ZipCode': np.float64,
    'County_Code': np.float64,
    'IDCode': np.float64,
    'Primary_SIC_Code': np.float64,
    'SIC_Code': np.float64,
    'SIC_Code_1': np.float64,
    'SIC_Code_2': np.float64,
    'Primary_NAICS_Code': np.float64,
    'Business_Status_Code': np.float64,
    'Company_Holding_Status': np.float64,
    'ABI': np.int32,
    'Parent_Number': np.float64,
    'Subsidiary_Number': np.float64,
    'CBSA_Code': np.float64,
    'archive_version_year': np.int16,
    'year_established': np.float64,
    'employee_size_location': np.float64,
    # Was misspelled 'sales_colume_location', which matched no column and
    # left the dtype of this field to per-file inference.
    'sales_volume_location': np.float64,
}

# Path of the HDF5 file in which all records are stored
filename = base + 'Data/ReferenceUSA/sample.h5'
# Clear the HDF5 file so we're not doubling up the data
silentremove(filename)

# Rows appended so far. The file starts empty, so this running count is
# the offset at which the next chunk's index must begin.
rows_written = 0

# The context manager flushes and closes the store even if a chunk fails
# to load; reopen the file with pd.HDFStore(filename) to read it back.
with pd.HDFStore(filename) as store:
    for obs_num in range(1, 260000001, 10000000):
        print(obs_num)
        # Set the file path to read in the CSV data
        raw_data = base + 'Data/ReferenceUSA/sample_cond_' + str(obs_num) + '.csv'
        # Read in the ReferenceUSA data from CSV
        data = pd.read_csv(raw_data, dtype=col_types)
        # Only keep data after 2012, since stripe data only starts in 2013
        data = data[data.archive_version_year >= 2013]

        # Give the surviving rows a contiguous index that continues where
        # the previous chunk stopped. Offsetting the post-filter labels
        # instead would leave gaps and let chunks overlap each other.
        n_rows = len(data)
        data.index = np.arange(rows_written, rows_written + n_rows)

        # Append each column group to its own table in the HDF5 file
        for name, fields in group_map.items():
            store.append(name, data[fields], dropna=False,
                         expectedrows=47500177)

        rows_written += n_rows
                      

1
10000001
20000001
30000001
40000001
50000001
60000001
70000001
80000001
90000001
100000001
110000001
120000001
130000001
140000001
150000001
160000001
170000001
180000001
190000001
200000001
210000001
220000001
230000001
240000001
250000001
