In [1]:
import os

import pandas as pd

In [2]:
# Root folder that holds the `assessor/` and `sales/` CSV extracts.
# Defaults to the original workstation path; set the HOUSING_DATA_FOLDER
# environment variable to run the notebook on another machine without
# editing this cell. An unset or empty variable falls back to the default.
data_folder = os.environ.get('HOUSING_DATA_FOLDER') or 'G:/Geoff/Data/housing-production'

# Join keys: the property identifier column in the sales table and in the
# assessor table, respectively.
sale_property_id = 'sr_property_id'
assr_property_id = 'SA_PROPERTY_ID'

## Load the data and inspect its shape

In [3]:
# Build the per-county input paths.
# The nine counties covered by the extracts; file names use the upper-cased form.
counties = [
    'alameda',
    'contra_costa',
    'marin',
    'napa',
    'san_francisco',
    'san_mateo',
    'santa_clara',
    'solano',
    'sonoma',
]

# Path patterns: first slot is the data folder, second is the upper-cased county.
assessor_file_template = '{}/assessor/ARB_ASSR_{}.csv'
sales_file_template = '{}/sales/ARB_HIST_{}_v3.csv'

# One assessor path and one sales path per county, in the order listed above.
assessor_files = [
    assessor_file_template.format(data_folder, tag)
    for tag in map(str.upper, counties)
]
sales_files = [
    sales_file_template.format(data_folder, tag)
    for tag in map(str.upper, counties)
]

In [4]:
# Read every county's assessor extract and stack them into a single frame.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df_from_each_file = (
    pd.read_csv(path, low_memory=False)
    for path in assessor_files
)

# ignore_index=True discards the per-file row labels and renumbers from 0.
df_assr = pd.concat(objs=df_from_each_file, ignore_index=True)

# Display (rows, columns) of the combined assessor table.
df_assr.shape

(2206509, 190)

In [None]:
# Read every county's sales extract and stack them into a single frame.
# The tract id and the transfer date are forced to text so pandas keeps them
# exactly as written in the file (presumably to preserve leading zeros and the
# raw date format -- confirm against the data dictionary).
dtypes = dict(ucb_geo_id=str, sr_date_transfer=str)

df_from_each_file = (
    pd.read_csv(path, low_memory=False, dtype=dtypes)
    for path in sales_files
)

# ignore_index=True discards the per-file row labels and renumbers from 0.
df_sale = pd.concat(objs=df_from_each_file, ignore_index=True)

# Display (rows, columns) of the combined sales table.
df_sale.shape

In [None]:
# First five values of the sales-side join key in ascending order.
df_sale[sale_property_id].sort_values().iloc[:5]

In [None]:
# First five values of the assessor-side join key in ascending order.
df_assr[assr_property_id].sort_values().iloc[:5]

## Merge property sales data with assessor data

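This left join attaches the assessor's unit characteristics to each sales record; sales with no matching assessor record are kept, with missing values in the assessor columns.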

In [None]:
# Left join: every sales record is kept, and assessor columns are filled in
# where the property ids match (missing values otherwise).
# NOTE(review): if SA_PROPERTY_ID is not unique in df_assr, matching sales rows
# will be duplicated -- consider validate='many_to_one' once that is confirmed.
df = df_sale.merge(
    df_assr,
    how='left',
    left_on=sale_property_id,
    right_on=assr_property_id,
)

In [None]:
# (rows, columns) of the merged frame, stored and then displayed.
shape = (len(df.index), len(df.columns))
shape

## Filter the dataset to retain only useful columns

In [None]:
# Columns kept from the merged frame. The description above each name is
# carried over from the original notebook's field notes.
retain = [
    # Unique DataQuick primary key identifier assigned to a property.
    'sr_property_id',
    # Joined to Assessor data to merge Assessor and Recorder data. Internal
    # identification number assigned to every property.
    'SA_PROPERTY_ID',
    # The full name of the jurisdiction.
    'MM_MUNI_NAME',
    # Federal Information Processing Standards (FIPS) code for the county.
    # NOTE(review): the column name suggests a municipality-level code -- confirm.
    'MM_FIPS_MUNI_CODE',
    # Indicates if the site and mail address are the same, Owner Occupied.
    'SA_SITE_MAIL_SAME',
    # The DataQuick property use type code mapped to the jurisdictional use code.
    'USE_CODE_STD',
    # The zoning code assigned to a property by a county/city/other government
    # bureau which defines the allowed size, type, structure, nature, and use
    # of property and/or buildings. This code is not standardized and is
    # subjective to the specific local government regulation.
    'SA_ZONING',
    # Indicates the architectural style of the structure.
    'SA_ARCHITECTURE_CODE',
    # The square footage of the building/structure on the property.
    'SA_BLDG_SQFT',
    # Indicates the material used in the construction of the framework for the
    # structure on the property.
    'SA_CONSTRUCTION_CODE',
    # Indicates the number of baths in real estate terms. For example, a
    # property containing a one quarter bath, half bath and full bath would
    # have an SA_NBR_BATH value of 1.75.
    'SA_NBR_BATH_DQ',
    # Indicates the number of bedrooms for all structures on the property.
    'SA_NBR_BEDRMS',
    # Indicates the total number of units for all structures on the property.
    # This field will include the number of apartment or commercial units.
    'SA_NBR_UNITS',
    # Year in which the primary structure was built on the property.
    'SA_YR_BLT',
    # Year in which "permitted" major improvements were made to the property.
    'SA_YR_BLT_EFFECT',
    # Contains the official filing date for the transaction that is normally
    # stamped or printed on the document.
    'sr_date_transfer',
    # 11-character concatenation of the state, county, and census tract FIPS codes.
    'ucb_geo_id',
    # Price per square foot in nominal dollars.
    'ucb_price_sqft',
    # Price per square foot adjusted to 2010 dollars using national headline CPI.
    'ucb_price_sqft_adj',
    # Marks a single record saved from a multi-property transaction.
    'ucb_condo_subdiv_flag',
    # Combined square footage from a multi-property transaction.
    'ucb_condo_subdiv_sqft',
]

# Keep only the listed columns, in the order given above, then display the
# resulting (rows, columns).
df = df[retain]
df.shape

In [None]:
# First five rows of the filtered frame.
df.iloc[:5]

In [None]:
# First record shown vertically, one retained column per line.
df.iloc[0, :]

In [None]:
# Write the filtered, merged table to <data_folder>/merged.csv as UTF-8,
# without the row index column.
df.to_csv(
    '{}/merged.csv'.format(data_folder),
    index=False,
    encoding='utf-8',
)