In [65]:
import gc

from IPython.core.interactiveshell import InteractiveShell  
import geopandas as gpd
from ordered_set import OrderedSet
import pandas as pd

# Show all columns
pd.options.display.max_columns = None
# Enable multiple outputs per cell
InteractiveShell.ast_node_interactivity = "all"

In [66]:
# Directory the source files are read from, and directory the parquet outputs are written to.
input_data_dir, output_data_dir = (
    '/data/census_of_agriculture/input/2016',
    '/data/census_of_agriculture/output/2016/tabular',
)

# 1.0 Process Excel sheet with column names and descriptions
The combined set of columns across all of the file geodatabase datasets should match the variables listed in this sheet.

In [67]:
print("Reading Excel sheet with variables")

# Read only the variable-name and long-description columns (the first three rows are skipped)
# and give them short snake_case names.
data_description = pd.read_excel(
    f'{input_data_dir}/CEAG16_VariablesDescriptions_REAG16_EN_FR.xlsx',
    skiprows=3,
    usecols=['Variables', 'Long description of the variables'],
).rename(columns={'Variables': 'variables', 'Long description of the variables': 'description_en'})

# Variable names are compared against lower-cased geodatabase column names later on.
data_description = data_description.assign(variables=data_description['variables'].str.lower())

# There are duplicate variables that are identical. For example, opermore_n
# Grouping on both columns leaves one row per (variable, description) pair.
data_description = (
    data_description
    .groupby(['variables', 'description_en'])
    .last()
    .reset_index()
)

# 2.0 Process Provinces and Territories
## 2.1 Process Agricultural Operations
**TODO:** 
- Mistakes:
    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns

In [68]:
# Geodatabase archive with the Agricultural Operations tables.
dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
ao_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')
ao_pr.columns = [column.lower() for column in ao_pr.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0002' prefix.
ao_pr = ao_pr.rename(columns={'geo_pruid': 'pr_dguid'})
ao_pr['pr_dguid'] = '2016A0002' + ao_pr['pr_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['pr_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_pr.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(ao_pr.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_pr = ao_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'farms_n1',
 'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_length'}

## 2.2 Process Crop Cultures

In [30]:
# Geodatabase archive with the Crops tables.
dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'

print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
cc_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')
cc_pr.columns = [column.lower() for column in cc_pr.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0002' prefix.
cc_pr = cc_pr.rename(columns={'geo_pruid': 'pr_dguid'})
cc_pr['pr_dguid'] = '2016A0002' + cc_pr['pr_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['pr_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_pr.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(cc_pr.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_pr = cc_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 2.3 Process Farm Operators
**TODO:** 
- Mistakes:
    - Column `more_avg_a` should be called `more_avg_age`
    - Column `one_avg_ag` should be called `one_avg_age`
    - On the Excel sheet, there are four `OPER_N`, with the same definition
        - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`
    - No idea what `opermore_1` is supposed to be
    - Column `operone_n1` is duplicate of `operone_n`

In [31]:
# Geodatabase archive with the Farm Operators tables.
dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
fo_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')
fo_pr.columns = [column.lower() for column in fo_pr.columns]

# One rename covers both the DGUID key and the two truncated average-age column names.
fo_pr = fo_pr.rename(columns={
    'geo_pruid': 'pr_dguid',
    'more_avg_a': 'more_avg_age',
    'one_avg_ag': 'one_avg_age',
})
# Prepend the '2016A0002' prefix to complete the DGUID.
fo_pr['pr_dguid'] = '2016A0002' + fo_pr['pr_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['pr_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_pr.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(fo_pr.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_pr = fo_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'oper_n1',
 'oper_n2',
 'oper_n3',
 'oper_n4',
 'opermore_1',
 'operone_n1',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 2.4 Process Livestock Poultry Bees

In [32]:
# Geodatabase archive with the Livestock, Poultry and Bees tables.
dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'

print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
lpb_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')
lpb_pr.columns = [column.lower() for column in lpb_pr.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0002' prefix.
lpb_pr = lpb_pr.rename(columns={'geo_pruid': 'pr_dguid'})
lpb_pr['pr_dguid'] = '2016A0002' + lpb_pr['pr_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['pr_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_pr.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(lpb_pr.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_pr = lpb_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 2.5 Process Use Tenure Practices

In [33]:
# Geodatabase archive with the Use, Tenure and Practices tables.
dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'

print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
utp_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')
utp_pr.columns = [column.lower() for column in utp_pr.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0002' prefix.
utp_pr = utp_pr.rename(columns={'geo_pruid': 'pr_dguid'})
utp_pr['pr_dguid'] = '2016A0002' + utp_pr['pr_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['pr_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_pr.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(utp_pr.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_pr = utp_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 2.6 Join the DataFrames and Export

In [34]:
print("Merging all Province and Territories dataframes into one")
# Inner-join the five thematic tables on the shared DGUID key; rows whose key is
# missing from any one table are dropped by the inner join.
pr_merge = (
    ao_pr
    .merge(cc_pr, how='inner', on='pr_dguid')
    .merge(fo_pr, how='inner', on='pr_dguid')
    .merge(lpb_pr, how='inner', on='pr_dguid')
    .merge(utp_pr, how='inner', on='pr_dguid')
)

# Compare the merged columns with the Excel variable list, in both directions.
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
set(data_description['variables']).difference(pr_merge.columns)
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid")
set(pr_merge.columns).difference(data_description['variables'])

# Export
print("Exporting pr_2016.parquet")
pr_merge.to_parquet(f'{output_data_dir}/pr_2016.parquet', index=False, compression='zstd')

# Create country as well: reload the exported table, give every row the same
# country DGUID and collapse the rows into one by summing.
# TODO: check if -1 values subtracted from the sum
# NOTE(review): sum() is applied to every column, including any averages
# (e.g. more_avg_age) - confirm that a sum is meaningful for those.
country = pd.read_parquet(f'{output_data_dir}/pr_2016.parquet')
country = country.rename(columns={'pr_dguid': 'country_dguid'})
country['country_dguid'] = '2016A000011124'
country = country.groupby(['country_dguid']).sum().reset_index()

# Infer dtypes, leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
country = country.convert_dtypes(**params)
print("Exporting country_2016.parquet")
country.to_parquet(f'{output_data_dir}/country_2016.parquet', index=False, compression='zstd')

# Drop the references to the large frames and trigger a garbage collection pass.
del ao_pr, cc_pr, fo_pr, lpb_pr, utp_pr, pr_merge, country
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid


{'pr_dguid'}

0

# 3.0 Process Census Agricultural Regions

## 3.1 Process Agricultural Operations
**TODO:** 
- Mistakes:
    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns

In [70]:
# Geodatabase archive with the Agricultural Operations tables.
dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
ao_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')
ao_car.columns = [column.lower() for column in ao_car.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0501' prefix.
ao_car = ao_car.rename(columns={'geo_caruid': 'car_dguid'})
ao_car['car_dguid'] = '2016S0501' + ao_car['car_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['car_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_car.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(ao_car.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_car = ao_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'farms_n1',
 'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_length'}

## 3.2 Process Crop Cultures

In [36]:
# Geodatabase archive with the Crops tables.
dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'

print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
cc_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')
cc_car.columns = [column.lower() for column in cc_car.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0501' prefix.
cc_car = cc_car.rename(columns={'geo_caruid': 'car_dguid'})
cc_car['car_dguid'] = '2016S0501' + cc_car['car_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['car_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_car.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(cc_car.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_car = cc_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 3.3 Process Farm Operators
**TODO:** 
- Mistakes:
    - Column `more_avg_a` should be called `more_avg_age`
    - Column `one_avg_ag` should be called `one_avg_age`
    - On the Excel sheet, there are four `OPER_N`, with the same definition
        - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`
    - No idea what `opermore_1` is supposed to be
    - Column `operone_n1` is duplicate of `operone_n`

In [37]:
# Geodatabase archive with the Farm Operators tables.
dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
fo_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')
fo_car.columns = [column.lower() for column in fo_car.columns]

# One rename covers both the DGUID key and the two truncated average-age column names.
fo_car = fo_car.rename(columns={
    'geo_caruid': 'car_dguid',
    'more_avg_a': 'more_avg_age',
    'one_avg_ag': 'one_avg_age',
})
# Prepend the '2016S0501' prefix to complete the DGUID.
fo_car['car_dguid'] = '2016S0501' + fo_car['car_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['car_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_car.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(fo_car.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_car = fo_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'oper_n1',
 'oper_n2',
 'oper_n3',
 'oper_n4',
 'opermore_1',
 'operone_n1',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 3.4 Process Livestock Poultry Bees

In [38]:
# Geodatabase archive with the Livestock, Poultry and Bees tables.
dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'

print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
lpb_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')
lpb_car.columns = [column.lower() for column in lpb_car.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0501' prefix.
lpb_car = lpb_car.rename(columns={'geo_caruid': 'car_dguid'})
lpb_car['car_dguid'] = '2016S0501' + lpb_car['car_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['car_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_car.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(lpb_car.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_car = lpb_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 3.5 Process Use Tenure Practices

In [39]:
# Geodatabase archive with the Use, Tenure and Practices tables.
dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'

print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
utp_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')
utp_car.columns = [column.lower() for column in utp_car.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0501' prefix.
utp_car = utp_car.rename(columns={'geo_caruid': 'car_dguid'})
utp_car['car_dguid'] = '2016S0501' + utp_car['car_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['car_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_car.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(utp_car.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_car = utp_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 3.6 Join the DataFrames and Export

In [40]:
print("Merging all Census Agricultural Regions dataframes into one")
# Inner-join the five thematic tables on the shared DGUID key; rows whose key is
# missing from any one table are dropped by the inner join.
car_merge = (
    ao_car
    .merge(cc_car, how='inner', on='car_dguid')
    .merge(fo_car, how='inner', on='car_dguid')
    .merge(lpb_car, how='inner', on='car_dguid')
    .merge(utp_car, how='inner', on='car_dguid')
)

# Compare the merged columns with the Excel variable list, in both directions.
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
set(data_description['variables']).difference(car_merge.columns)
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid")
set(car_merge.columns).difference(data_description['variables'])

# Export
print("Exporting car_2016.parquet")
car_merge.to_parquet(f'{output_data_dir}/car_2016.parquet', index=False, compression='zstd')

# Drop the references to the large frames and trigger a garbage collection pass.
del ao_car, cc_car, fo_car, lpb_car, utp_car, car_merge
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid


{'car_dguid'}

0

# 4.0 Process Census Divisions
## 4.1 Process Agricultural Operations
**TODO:** 
- Mistakes:
    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns

In [75]:
# Geodatabase archive with the Agricultural Operations tables.
dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
ao_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')
ao_cd.columns = [column.lower() for column in ao_cd.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0003' prefix.
ao_cd = ao_cd.rename(columns={'geo_cduid': 'cd_dguid'})
ao_cd['cd_dguid'] = '2016A0003' + ao_cd['cd_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['cd_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_cd.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(ao_cd.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_cd = ao_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'farms_n1',
 'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_length'}

## 4.2 Process Crop Cultures

In [42]:
# Geodatabase archive with the Crops tables.
dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'

print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
cc_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')
cc_cd.columns = [column.lower() for column in cc_cd.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0003' prefix.
cc_cd = cc_cd.rename(columns={'geo_cduid': 'cd_dguid'})
cc_cd['cd_dguid'] = '2016A0003' + cc_cd['cd_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['cd_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_cd.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(cc_cd.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_cd = cc_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 4.3 Process Farm Operators
**TODO:** 
- Mistakes:
    - Column `more_avg_a` should be called `more_avg_age`
    - Column `one_avg_ag` should be called `one_avg_age`
    - On the Excel sheet, there are four `OPER_N`, with the same definition
        - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`
    - No idea what `opermore_1` is supposed to be
    - Column `operone_n1` is duplicate of `operone_n`

In [43]:
# Geodatabase archive with the Farm Operators tables.
dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
fo_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')
fo_cd.columns = [column.lower() for column in fo_cd.columns]

# One rename covers both the DGUID key and the two truncated average-age column names.
fo_cd = fo_cd.rename(columns={
    'geo_cduid': 'cd_dguid',
    'more_avg_a': 'more_avg_age',
    'one_avg_ag': 'one_avg_age',
})
# Prepend the '2016A0003' prefix to complete the DGUID.
fo_cd['cd_dguid'] = '2016A0003' + fo_cd['cd_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['cd_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_cd.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(fo_cd.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_cd = fo_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'oper_n1',
 'oper_n2',
 'oper_n3',
 'oper_n4',
 'opermore_1',
 'operone_n1',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 4.4 Process Livestock Poultry Bees

In [44]:
# Geodatabase archive with the Livestock, Poultry and Bees tables.
dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'

print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
lpb_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')
lpb_cd.columns = [column.lower() for column in lpb_cd.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0003' prefix.
lpb_cd = lpb_cd.rename(columns={'geo_cduid': 'cd_dguid'})
lpb_cd['cd_dguid'] = '2016A0003' + lpb_cd['cd_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['cd_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_cd.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(lpb_cd.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_cd = lpb_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 4.5 Process Use Tenure Practices

In [45]:
# Geodatabase archive with the Use, Tenure and Practices tables.
dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'

print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
utp_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')
utp_cd.columns = [column.lower() for column in utp_cd.columns]

# Build the DGUID key: rename the id column, then prepend the '2016A0003' prefix.
utp_cd = utp_cd.rename(columns={'geo_cduid': 'cd_dguid'})
utp_cd['cd_dguid'] = '2016A0003' + utp_cd['cd_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['cd_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_cd.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(utp_cd.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_cd = utp_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 4.6 Join the DataFrames and Export

In [46]:
print("Merging all Census Divisions dataframes into one")
# Inner-join the five thematic tables on the shared DGUID key; rows whose key is
# missing from any one table are dropped by the inner join.
cd_merge = (
    ao_cd
    .merge(cc_cd, how='inner', on='cd_dguid')
    .merge(fo_cd, how='inner', on='cd_dguid')
    .merge(lpb_cd, how='inner', on='cd_dguid')
    .merge(utp_cd, how='inner', on='cd_dguid')
)

# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
set(data_description['variables']) - set(cd_merge.columns)
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid")
set(cd_merge.columns) - set(data_description['variables'])

# Export (progress message added so this cell logs the export like the other export cells)
print("Exporting cd_2016.parquet")
cd_merge.to_parquet(f'{output_data_dir}/cd_2016.parquet', index=False, compression='zstd')

# Drop the references to the large frames and trigger a garbage collection pass.
del ao_cd, cc_cd, fo_cd, lpb_cd, utp_cd, cd_merge
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid


{'cd_dguid'}

0

# 5.0 Process Census Consolidated Subdivisions
## 5.1 Process Agricultural Operations
**TODO:** 
- Mistakes:
    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns

In [77]:
# Geodatabase archive with the Agricultural Operations tables.
dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
ao_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')
ao_ccs.columns = [column.lower() for column in ao_ccs.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0502' prefix.
ao_ccs = ao_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
ao_ccs['ccs_dguid'] = '2016S0502' + ao_ccs['ccs_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['ccs_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_ccs.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(ao_ccs.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_ccs = ao_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'farms_n1',
 'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_length'}

## 5.2 Process Crop Cultures

In [48]:
# Geodatabase archive with the Crops tables.
dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'

print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
cc_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')
cc_ccs.columns = [column.lower() for column in cc_ccs.columns]

# Build the DGUID key: rename the id column (named geo_ccsuid in this layer),
# then prepend the '2016S0502' prefix.
cc_ccs = cc_ccs.rename(columns={'geo_ccsuid': 'ccs_dguid'})
cc_ccs['ccs_dguid'] = '2016S0502' + cc_ccs['ccs_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['ccs_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_ccs.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(cc_ccs.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_ccs = cc_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 5.3 Process Farm Operators
**TODO:** 
- Mistakes:
    - Column `more_avg_a` should be called `more_avg_age`
    - Column `one_avg_ag` should be called `one_avg_age`
    - On the Excel sheet, there are four `OPER_N`, with the same definition
        - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`
    - No idea what `opermore_1` is supposed to be
    - Column `operone_n1` is duplicate of `operone_n`

In [49]:
# Geodatabase archive with the Farm Operators tables.
dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'

print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
fo_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')
fo_ccs.columns = [column.lower() for column in fo_ccs.columns]

# One rename covers both the DGUID key and the two truncated average-age column names.
fo_ccs = fo_ccs.rename(columns={
    'ccsuid': 'ccs_dguid',
    'more_avg_a': 'more_avg_age',
    'one_avg_ag': 'one_avg_age',
})
# Prepend the '2016S0502' prefix to complete the DGUID.
fo_ccs['ccs_dguid'] = '2016S0502' + fo_ccs['ccs_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['ccs_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_ccs.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(fo_ccs.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_ccs = fo_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'oper_n1',
 'oper_n2',
 'oper_n3',
 'oper_n4',
 'opermore_1',
 'operone_n1',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 5.4 Process Livestock Poultry Bees

In [50]:
# Geodatabase archive with the Livestock, Poultry and Bees tables.
dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'

print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
lpb_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')
lpb_ccs.columns = [column.lower() for column in lpb_ccs.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0502' prefix.
lpb_ccs = lpb_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
lpb_ccs['ccs_dguid'] = '2016S0502' + lpb_ccs['ccs_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['ccs_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_ccs.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(lpb_ccs.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_ccs = lpb_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 5.5 Process Use Tenure Practices

In [51]:
# Geodatabase archive with the Use, Tenure and Practices tables.
dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'

print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')

# Load the feature class and lower-case every column name.
utp_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')
utp_ccs.columns = [column.lower() for column in utp_ccs.columns]

# Build the DGUID key: rename the id column, then prepend the '2016S0502' prefix.
utp_ccs = utp_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
utp_ccs['ccs_dguid'] = '2016S0502' + utp_ccs['ccs_dguid']

# Candidate columns: the DGUID key followed by every variable from the Excel sheet.
variable_names = ['ccs_dguid', *data_description['variables']]

# Quick check
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_ccs.columns).difference(variable_names)

# Keep only the candidates that are also columns of this layer.
variable_names = list(
    OrderedSet(variable_names) & OrderedSet(utp_ccs.columns)
)

# Subsetting drops the geometry and shape area/length columns; convert_dtypes then
# infers dtypes while leaving string and boolean conversion switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_ccs = utp_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geo_descr1',
 'geo_descr_',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 5.6 Join the DataFrames and Export

In [52]:
print("Merging all Census Consolidated Subdivisions dataframes into one")
# Inner-join the five thematic tables on the shared DGUID key; rows whose key is
# missing from any one table are dropped by the inner join.
ccs_merge = (
    ao_ccs
    .merge(cc_ccs, how='inner', on='ccs_dguid')
    .merge(fo_ccs, how='inner', on='ccs_dguid')
    .merge(lpb_ccs, how='inner', on='ccs_dguid')
    .merge(utp_ccs, how='inner', on='ccs_dguid')
)

# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
set(data_description['variables']) - set(ccs_merge.columns)
# The key column of this table is ccs_dguid, so that is the name the message must report.
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be ccs_dguid")
set(ccs_merge.columns) - set(data_description['variables'])

# Export
print("Exporting ccs_2016.parquet")
ccs_merge.to_parquet(f'{output_data_dir}/ccs_2016.parquet', index=False, compression='zstd')

# Drop the references to the large frames and trigger a garbage collection pass.
del ao_ccs, cc_ccs, fo_ccs, lpb_ccs, utp_ccs, ccs_merge
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid


{'ccs_dguid'}

0