In [40]:
import gc

from IPython.core.interactiveshell import InteractiveShell  
import geopandas as gpd
import pandas as pd

# Display every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None
# Echo the value of every expression statement in a cell, not just the last one;
# the bare "quick check" set expressions further down rely on this to be shown
InteractiveShell.ast_node_interactivity = "all"

In [41]:
# Both directories live under the same Census of Agriculture root folder
census_root = '/data/census_of_agriculture'
# Folder holding the 2021 source files (Excel sheet and zipped file geodatabases)
input_data_dir = f'{census_root}/input/2021'
# Folder receiving the exported parquet tables
output_data_dir = f'{census_root}/output/2021/tabular'

# 1.0 Process Excel sheet with column names and descriptions
The combined set of columns across all of the file geodatabase datasets should match the variables listed in this sheet.

In [42]:
print("Reading Excel sheet with variables")

# Excel header -> short column name; the keys double as the list of columns to load
column_aliases = {
    '2021 Variables': 'variables',
    'Categories': 'categories',
    '2021 Long description of the variables (EN)': 'description_en',
}
# The first two rows of the sheet are skipped so the header row is read correctly
data_description = pd.read_excel(
    f'{input_data_dir}/CEAG21_VariablesDescriptions_REAG21_EN_FR.xlsx',
    skiprows=2,
    usecols=list(column_aliases),
).rename(columns=column_aliases)
# Variable names are lowercased to line up with the lowercased geodatabase columns
data_description['variables'] = data_description['variables'].str.lower()

# 2.0 Process Provinces and Territories
## 2.1 Process Agricultural Operations
**TODO:** 
- Figure out the -1 values
- Figure out why `valoeq` is float and not integer
    - It is float because Nova Scotia has a value of 82047377.0000004
- Figure out why the data types are integer64

In [43]:
# Agricultural Operations geodatabase, province/territory layer
dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')
ao_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
ao_pr.columns = list(map(str.lower, ao_pr.columns))

# DGUID = '2021A0002' prefix + province UID, stored under the name pr_dguid
ao_pr = ao_pr.rename(columns={'pruid': 'pr_dguid'})
ao_pr['pr_dguid'] = '2021A0002' + ao_pr['pr_dguid']

# Columns to keep: the key, then every Excel variable in the 'Agricultural operations' category
variable_names = ['pr_dguid'] + data_description.loc[
    data_description['categories'] == 'Agricultural operations', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_pr.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_pr = ao_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}

## 2.2 Process Crop Cultures

In [44]:
# Crops geodatabase, province/territory layer
dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'
print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')
cc_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
cc_pr.columns = list(map(str.lower, cc_pr.columns))

# DGUID = '2021A0002' prefix + province UID, stored under the name pr_dguid
cc_pr = cc_pr.rename(columns={'pruid': 'pr_dguid'})
cc_pr['pr_dguid'] = '2021A0002' + cc_pr['pr_dguid']

# Columns to keep: the key, then every Excel variable in the 'Crops' category
variable_names = ['pr_dguid'] + data_description.loc[
    data_description['categories'] == 'Crops', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_pr.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_pr = cc_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geometry',
 'prename',
 'prfname',
 'shape_area',
 'shape_area_1',
 'shape_length',
 'shape_length_1'}

## 2.3 Process Farm Operators

In [45]:
# Farm Operators geodatabase, province/territory layer
dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')
fo_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
fo_pr.columns = list(map(str.lower, fo_pr.columns))

# DGUID = '2021A0002' prefix + province UID, stored under the name pr_dguid
fo_pr = fo_pr.rename(columns={'pruid': 'pr_dguid'})
fo_pr['pr_dguid'] = '2021A0002' + fo_pr['pr_dguid']

# Columns to keep: the key, then every Excel variable in the 'Farm operators' category
variable_names = ['pr_dguid'] + data_description.loc[
    data_description['categories'] == 'Farm operators', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_pr.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_pr = fo_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}

## 2.4 Process Livestock Poultry Bees

In [46]:
# Livestock, Poultry and Bees geodatabase, province/territory layer
dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'
print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')
lpb_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
lpb_pr.columns = list(map(str.lower, lpb_pr.columns))

# DGUID = '2021A0002' prefix + province UID, stored under the name pr_dguid
lpb_pr = lpb_pr.rename(columns={'pruid': 'pr_dguid'})
lpb_pr['pr_dguid'] = '2021A0002' + lpb_pr['pr_dguid']

# Columns to keep: the key, then every Excel variable in the 'Livestock Poultry Bees' category
variable_names = ['pr_dguid'] + data_description.loc[
    data_description['categories'] == 'Livestock Poultry Bees', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_pr.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_pr = lpb_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}

## 2.5 Process Use Tenure Practices

In [47]:
# Use, Tenure and Practices geodatabase, province/territory layer
dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'
print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')
utp_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
utp_pr.columns = list(map(str.lower, utp_pr.columns))

# DGUID = '2021A0002' prefix + province UID, stored under the name pr_dguid
utp_pr = utp_pr.rename(columns={'pruid': 'pr_dguid'})
utp_pr['pr_dguid'] = '2021A0002' + utp_pr['pr_dguid']

# Columns to keep: the key, then every Excel variable in the 'Use Tenure Practices' category
variable_names = ['pr_dguid'] + data_description.loc[
    data_description['categories'] == 'Use Tenure Practices', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_pr.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_pr = utp_pr[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}

## 2.6 Join the DataFrames and Export

In [48]:
print("Merging all Province and Territories dataframes into one")
# Inner-join the five thematic tables on the province DGUID
pr_merge = (
    ao_pr
    .merge(cc_pr, on='pr_dguid', how='inner')
    .merge(fo_pr, on='pr_dguid', how='inner')
    .merge(lpb_pr, on='pr_dguid', how='inner')
    .merge(utp_pr, on='pr_dguid', how='inner')
)

# Compare the merged columns with the Excel variable list, in both directions
excel_variables = set(data_description['variables'])
merged_columns = set(pr_merge.columns)
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
excel_variables - merged_columns
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid")
merged_columns - excel_variables

# Write the province/territory table
print("Exporting pr_2021.parquet")
pr_merge.to_parquet(f'{output_data_dir}/pr_2021.parquet', index=False, compression='zstd')

# Country table: re-read the exported provinces, give every row the same country
# DGUID, and collapse them to a single row by summing each column.
# TODO: check if -1 values subtracted from the sum
# NOTE(review): sum() is applied to every column, which presumably includes the
# average/median variables (e.g. more_avg_age) - confirm whether those should be summed.
params = dict(convert_string=False, convert_boolean=False)
country = (
    pd.read_parquet(f'{output_data_dir}/pr_2021.parquet')
    .assign(pr_dguid='2021A000011124')
    .rename(columns={'pr_dguid': 'country_dguid'})
    .groupby(['country_dguid'])
    .sum()
    .reset_index()
    # Move to pandas' NA-aware dtypes (no downcasting; strings/booleans untouched)
    .convert_dtypes(**params)
)
print("Exporting country_2021.parquet")
country.to_parquet(f'{output_data_dir}/country_2021.parquet', index=False, compression='zstd')

# Release the province-level frames and reclaim their memory
del ao_pr, cc_pr, fo_pr, lpb_pr, utp_pr, pr_merge, country
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid


{'pr_dguid'}

308

# 3.0 Process Census Agricultural Regions

## 3.1 Process Agricultural Operations

In [49]:
# Agricultural Operations geodatabase, census agricultural region layer
dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')
ao_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
ao_car.columns = list(map(str.lower, ao_car.columns))

# DGUID = '2021S0501' prefix + census agricultural region UID, stored as car_dguid
ao_car = ao_car.rename(columns={'caruid': 'car_dguid'})
ao_car['car_dguid'] = '2021S0501' + ao_car['car_dguid']

# Columns to keep: the key, then every Excel variable in the 'Agricultural operations' category
variable_names = ['car_dguid'] + data_description.loc[
    data_description['categories'] == 'Agricultural operations', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_car.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_car = ao_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}

## 3.2 Process Crop Cultures

In [50]:
# Crops geodatabase, census agricultural region layer
dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'
print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')
cc_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
cc_car.columns = list(map(str.lower, cc_car.columns))

# DGUID = '2021S0501' prefix + census agricultural region UID, stored as car_dguid
cc_car = cc_car.rename(columns={'caruid': 'car_dguid'})
cc_car['car_dguid'] = '2021S0501' + cc_car['car_dguid']

# Columns to keep: the key, then every Excel variable in the 'Crops' category
variable_names = ['car_dguid'] + data_description.loc[
    data_description['categories'] == 'Crops', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_car.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_car = cc_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}

## 3.3 Process Farm Operators
The Census of Agriculture release truncated several column names to 10 characters in this file's layer; the code below renames them back:
- `more_avg_age` is now `more_avg_a` in the file
- `more_med_age` is now `more_med_a` in the file
- `one_avg_age` is now `one_avg_ag` in the file
- `one_med_age` is now `one_med_ag` in the file
- `plan_nodis_n` is now `plan_nodis` in the file

In [51]:
# Farm Operators geodatabase, census agricultural region layer
dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')
fo_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
fo_car.columns = list(map(str.lower, fo_car.columns))

# One rename covers both the key column and the column names that arrive
# shortened in this layer (restored to the names used in the Excel sheet)
fo_car = fo_car.rename(columns={
    'caruid': 'car_dguid',
    'more_avg_a': 'more_avg_age',
    'more_med_a': 'more_med_age',
    'one_avg_ag': 'one_avg_age',
    'one_med_ag': 'one_med_age',
    'plan_nodis': 'plan_nodis_n',
})
# DGUID = '2021S0501' prefix + census agricultural region UID
fo_car['car_dguid'] = '2021S0501' + fo_car['car_dguid']

# Columns to keep: the key, then every Excel variable in the 'Farm operators' category
variable_names = ['car_dguid'] + data_description.loc[
    data_description['categories'] == 'Farm operators', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_car.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_car = fo_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'carename',
 'carfname',
 'geometry',
 'shape_area',
 'shape_leng',
 'shape_length'}

## 3.4 Process Livestock Poultry Bees

In [52]:
# Livestock, Poultry and Bees geodatabase, census agricultural region layer
dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'
print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')
lpb_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
lpb_car.columns = list(map(str.lower, lpb_car.columns))

# DGUID = '2021S0501' prefix + census agricultural region UID, stored as car_dguid
lpb_car = lpb_car.rename(columns={'caruid': 'car_dguid'})
lpb_car['car_dguid'] = '2021S0501' + lpb_car['car_dguid']

# Columns to keep: the key, then every Excel variable in the 'Livestock Poultry Bees' category
variable_names = ['car_dguid'] + data_description.loc[
    data_description['categories'] == 'Livestock Poultry Bees', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_car.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_car = lpb_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}

## 3.5 Process Use Tenure Practices

In [53]:
# Use, Tenure and Practices geodatabase, census agricultural region layer
dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'
print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')
utp_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
utp_car.columns = list(map(str.lower, utp_car.columns))

# DGUID = '2021S0501' prefix + census agricultural region UID, stored as car_dguid
utp_car = utp_car.rename(columns={'caruid': 'car_dguid'})
utp_car['car_dguid'] = '2021S0501' + utp_car['car_dguid']

# Columns to keep: the key, then every Excel variable in the 'Use Tenure Practices' category
variable_names = ['car_dguid'] + data_description.loc[
    data_description['categories'] == 'Use Tenure Practices', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_car.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_car = utp_car[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}

## 3.6 Join the DataFrames and Export

In [54]:
print("Merging all Census Agricultural Regions dataframes into one")
# Inner-join the five thematic tables on the census agricultural region DGUID
car_merge = (
    ao_car
    .merge(cc_car, on='car_dguid', how='inner')
    .merge(fo_car, on='car_dguid', how='inner')
    .merge(lpb_car, on='car_dguid', how='inner')
    .merge(utp_car, on='car_dguid', how='inner')
)

# Compare the merged columns with the Excel variable list, in both directions
excel_variables = set(data_description['variables'])
merged_columns = set(car_merge.columns)
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
excel_variables - merged_columns
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid")
merged_columns - excel_variables

# Write the census agricultural region table
print("Exporting car_2021.parquet")
car_merge.to_parquet(f'{output_data_dir}/car_2021.parquet', index=False, compression='zstd')

# Release the region-level frames and reclaim their memory
del ao_car, cc_car, fo_car, lpb_car, utp_car, car_merge
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid


{'car_dguid'}

0

# 4.0 Process Census Divisions
## 4.1 Process Agricultural Operations

In [55]:
# Agricultural Operations geodatabase, census division layer
dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')
ao_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
ao_cd.columns = list(map(str.lower, ao_cd.columns))

# DGUID = '2021A0003' prefix + census division UID, stored under the name cd_dguid
ao_cd = ao_cd.rename(columns={'cduid': 'cd_dguid'})
ao_cd['cd_dguid'] = '2021A0003' + ao_cd['cd_dguid']

# Columns to keep: the key, then every Excel variable in the 'Agricultural operations' category
variable_names = ['cd_dguid'] + data_description.loc[
    data_description['categories'] == 'Agricultural operations', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_cd.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_cd = ao_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}

## 4.2 Process Crop Cultures

In [56]:
# Crops geodatabase, census division layer
dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'
print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')
cc_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
cc_cd.columns = list(map(str.lower, cc_cd.columns))

# DGUID = '2021A0003' prefix + census division UID, stored under the name cd_dguid
cc_cd = cc_cd.rename(columns={'cduid': 'cd_dguid'})
cc_cd['cd_dguid'] = '2021A0003' + cc_cd['cd_dguid']

# Columns to keep: the key, then every Excel variable in the 'Crops' category
variable_names = ['cd_dguid'] + data_description.loc[
    data_description['categories'] == 'Crops', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_cd.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_cd = cc_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}

## 4.3 Process Farm Operators
The Census of Agriculture release truncated several column names to 10 characters in this file's layer; the code below renames them back:
- `more_avg_age` is now `more_avg_a` in the file
- `more_med_age` is now `more_med_a` in the file
- `one_avg_age` is now `one_avg_ag` in the file
- `one_med_age` is now `one_med_ag` in the file
- `plan_nodis_n` is now `plan_nodis` in the file

In [57]:
# Farm Operators geodatabase, census division layer
dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')
fo_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
fo_cd.columns = list(map(str.lower, fo_cd.columns))

# One rename covers both the key column and the column names that arrive
# shortened in this layer (restored to the names used in the Excel sheet)
fo_cd = fo_cd.rename(columns={
    'cduid': 'cd_dguid',
    'more_avg_a': 'more_avg_age',
    'more_med_a': 'more_med_age',
    'one_avg_ag': 'one_avg_age',
    'one_med_ag': 'one_med_age',
    'plan_nodis': 'plan_nodis_n',
})
# DGUID = '2021A0003' prefix + census division UID
fo_cd['cd_dguid'] = '2021A0003' + fo_cd['cd_dguid']

# Columns to keep: the key, then every Excel variable in the 'Farm operators' category
variable_names = ['cd_dguid'] + data_description.loc[
    data_description['categories'] == 'Farm operators', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_cd.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_cd = fo_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_leng', 'shape_length'}

## 4.4 Process Livestock Poultry Bees

In [58]:
# Livestock, Poultry and Bees geodatabase, census division layer
dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'
print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')
lpb_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
lpb_cd.columns = list(map(str.lower, lpb_cd.columns))

# DGUID = '2021A0003' prefix + census division UID, stored under the name cd_dguid
lpb_cd = lpb_cd.rename(columns={'cduid': 'cd_dguid'})
lpb_cd['cd_dguid'] = '2021A0003' + lpb_cd['cd_dguid']

# Columns to keep: the key, then every Excel variable in the 'Livestock Poultry Bees' category
variable_names = ['cd_dguid'] + data_description.loc[
    data_description['categories'] == 'Livestock Poultry Bees', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_cd.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_cd = lpb_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}

## 4.5 Process Use Tenure Practices

In [59]:
# Use, Tenure and Practices geodatabase, census division layer
dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'
print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')
utp_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
utp_cd.columns = list(map(str.lower, utp_cd.columns))

# DGUID = '2021A0003' prefix + census division UID, stored under the name cd_dguid
utp_cd = utp_cd.rename(columns={'cduid': 'cd_dguid'})
utp_cd['cd_dguid'] = '2021A0003' + utp_cd['cd_dguid']

# Columns to keep: the key, then every Excel variable in the 'Use Tenure Practices' category
variable_names = ['cd_dguid'] + data_description.loc[
    data_description['categories'] == 'Use Tenure Practices', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_cd.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_cd = utp_cd[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}

## 4.6 Join the DataFrames and Export

In [60]:
print("Merging all Census Divisions dataframes into one")
# Inner-join the five thematic tables on the census division DGUID
cd_merge = (
    ao_cd
    .merge(cc_cd, on='cd_dguid', how='inner')
    .merge(fo_cd, on='cd_dguid', how='inner')
    .merge(lpb_cd, on='cd_dguid', how='inner')
    .merge(utp_cd, on='cd_dguid', how='inner')
)

# Compare the merged columns with the Excel variable list, in both directions
excel_variables = set(data_description['variables'])
merged_columns = set(cd_merge.columns)
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
excel_variables - merged_columns
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid")
merged_columns - excel_variables

# Write the census division table
print("Exporting cd_2021.parquet")
cd_merge.to_parquet(f'{output_data_dir}/cd_2021.parquet', index=False, compression='zstd')

# Release the division-level frames and reclaim their memory
del ao_cd, cc_cd, fo_cd, lpb_cd, utp_cd, cd_merge
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid


{'cd_dguid'}

0

# 5.0 Process Consolidated Subdivisions
## 5.1 Process Agricultural Operations

In [61]:
# Agricultural Operations geodatabase, census consolidated subdivision layer
dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')
ao_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
ao_ccs.columns = list(map(str.lower, ao_ccs.columns))

# DGUID = '2021S0502' prefix + consolidated subdivision UID, stored as ccs_dguid
ao_ccs = ao_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
ao_ccs['ccs_dguid'] = '2021S0502' + ao_ccs['ccs_dguid']

# Columns to keep: the key, then every Excel variable in the 'Agricultural operations' category
variable_names = ['ccs_dguid'] + data_description.loc[
    data_description['categories'] == 'Agricultural operations', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(ao_ccs.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
ao_ccs = ao_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}

## 5.2 Process Crop Cultures

In [62]:
# Crops geodatabase, census consolidated subdivision layer
dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'
print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')
cc_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
cc_ccs.columns = list(map(str.lower, cc_ccs.columns))

# DGUID = '2021S0502' prefix + consolidated subdivision UID, stored as ccs_dguid
cc_ccs = cc_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
cc_ccs['ccs_dguid'] = '2021S0502' + cc_ccs['ccs_dguid']

# Columns to keep: the key, then every Excel variable in the 'Crops' category
variable_names = ['ccs_dguid'] + data_description.loc[
    data_description['categories'] == 'Crops', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(cc_ccs.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
cc_ccs = cc_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}

## 5.3 Process Farm Operators

In [63]:
# Farm Operators geodatabase, census consolidated subdivision layer
dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'
print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')
fo_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
fo_ccs.columns = list(map(str.lower, fo_ccs.columns))

# DGUID = '2021S0502' prefix + consolidated subdivision UID, stored as ccs_dguid
fo_ccs = fo_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
fo_ccs['ccs_dguid'] = '2021S0502' + fo_ccs['ccs_dguid']

# Columns to keep: the key, then every Excel variable in the 'Farm operators' category
variable_names = ['ccs_dguid'] + data_description.loc[
    data_description['categories'] == 'Farm operators', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(fo_ccs.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
fo_ccs = fo_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}

## 5.4 Process Livestock Poultry Bees

In [64]:
# Livestock, Poultry and Bees geodatabase, census consolidated subdivision layer
dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'
print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')
lpb_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
lpb_ccs.columns = list(map(str.lower, lpb_ccs.columns))

# DGUID = '2021S0502' prefix + consolidated subdivision UID, stored as ccs_dguid
lpb_ccs = lpb_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
lpb_ccs['ccs_dguid'] = '2021S0502' + lpb_ccs['ccs_dguid']

# Columns to keep: the key, then every Excel variable in the 'Livestock Poultry Bees' category
variable_names = ['ccs_dguid'] + data_description.loc[
    data_description['categories'] == 'Livestock Poultry Bees', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(lpb_ccs.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
lpb_ccs = lpb_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}

## 5.5 Process Use Tenure Practices

In [65]:
# Use, Tenure and Practices geodatabase, census consolidated subdivision layer
dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'
print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')
utp_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')

# Column names are lowercased so they can be compared with the Excel variable names
utp_ccs.columns = list(map(str.lower, utp_ccs.columns))

# DGUID = '2021S0502' prefix + consolidated subdivision UID, stored as ccs_dguid
utp_ccs = utp_ccs.rename(columns={'ccsuid': 'ccs_dguid'})
utp_ccs['ccs_dguid'] = '2021S0502' + utp_ccs['ccs_dguid']

# Columns to keep: the key, then every Excel variable in the 'Use Tenure Practices' category
variable_names = ['ccs_dguid'] + data_description.loc[
    data_description['categories'] == 'Use Tenure Practices', 'variables'
].tolist()

# Show which columns are about to be discarded (on the geodataframe, not on the list)
print("Quick check on columns that are on the geodataframe but not on the variables list")
set(utp_ccs.columns).difference(variable_names)

# Keep only the listed columns (this drops geometry and the shape area/length fields),
# then move to pandas' NA-aware dtypes. convert_dtypes does not downcast to smaller
# bit widths; string and boolean conversion are switched off.
params = dict(convert_string=False, convert_boolean=False)
utp_ccs = utp_ccs[variable_names].convert_dtypes(**params)

Quick check on columns that are on the geodataframe but not on the variables list


{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}

## 5.6 Join the DataFrames and Export

In [66]:
print("Merging all Census Consolidated Subdivisions dataframes into one")
# Inner-join the five thematic tables on the census consolidated subdivision DGUID
ccs_merge = (
    ao_ccs
    .merge(cc_ccs, how='inner', on='ccs_dguid')
    .merge(fo_ccs, how='inner', on='ccs_dguid')
    .merge(lpb_ccs, how='inner', on='ccs_dguid')
    .merge(utp_ccs, how='inner', on='ccs_dguid')
)

# Excel variables missing from the merged frame: only geo_descr_en and geo_descr_fr are expected
print("Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr")
set(data_description['variables']) - set(ccs_merge.columns)
# Merged columns missing from the Excel sheet: only the key column is expected.
# The message names ccs_dguid, the key used at this level (it previously said cd_dguid).
print("Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be ccs_dguid")
set(ccs_merge.columns) - set(data_description['variables'])

# Write the census consolidated subdivision table
print("Exporting ccs_2021.parquet")
ccs_merge.to_parquet(f'{output_data_dir}/ccs_2021.parquet', index=False, compression='zstd')

# Release the subdivision-level frames and reclaim their memory
del ao_ccs, cc_ccs, fo_ccs, lpb_ccs, utp_ccs, ccs_merge
gc.collect()

Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr


{'geo_descr_en', 'geo_descr_fr'}

Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid


{'ccs_dguid'}

0