In [11]:
import requests
from pathlib import Path
import pandas as pd
import zipfile
import os
from google.cloud import bigquery
import shutil


In [5]:
# URL of the ZIP file
url = 'https://www12.statcan.gc.ca/census-recensement/2021/dp-pd/prof/details/download-telecharger/comp/GetFile.cfm?Lang=E&FILETYPE=CSV&GEONO=006_Ontario'

# Path where you want to save the ZIP file
output_path = Path('census_data.zip')

# Downloading the ZIP file
response = requests.get(url)
output_path.write_bytes(response.content)


757020277

In [6]:
# Unzipping the file
with zipfile.ZipFile(output_path, 'r') as zip_ref:
    zip_ref.extractall('census_data')


In [5]:
file_path = 'census_data/98-401-X2021006_English_CSV_data_Ontario.csv'

# read 100000 CSV file with ISO-8859-1 encoding
df = pd.read_csv(file_path,nrows = 100000, encoding='ISO-8859-1')



  df = pd.read_csv(file_path,nrows = 100000, encoding='ISO-8859-1')


In [6]:
df.shape

(100000, 23)

In [7]:
filtered_df = df[df['GEO_LEVEL'] == 'Dissemination area']

In [8]:
filtered_df.shape

(86845, 23)

In [8]:
print(os.getcwd())

/home/jupyter/row_community_database/census_data_2021


In [10]:
import pandas as pd
from google.cloud import bigquery

# Set your BigQuery and CSV settings
project_id = 'row-community-database'
dataset_name = 'unprocessed'
table_name = 'census2021'
csv_file_path = 'census_data/98-401-X2021006_English_CSV_data_Ontario.csv'
chunksize = 100000  # so that we don't have to read entire CSV in the memory

# Specify the columns you want to read from the CSV
selected_columns = ['DGUID','GEO_LEVEL','GEO_NAME', 'CHARACTERISTIC_NAME', 'C1_COUNT_TOTAL','C10_RATE_TOTAL']

# Construct a BigQuery client object
client = bigquery.Client(project=project_id)

# Define your table ID
table_id = f'{project_id}.{dataset_name}.{table_name}'

# Define the job configuration for BigQuery
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,  # Append to the table
    autodetect=True,  # Auto-detect schema
    source_format=bigquery.SourceFormat.CSV,  # The source format is CSV
)

# Encoding type
encoding = 'ISO-8859-1'  # Adjust this based on your CSV's encoding

# Process the CSV file in chunks
for chunk_index, chunk in enumerate(pd.read_csv(csv_file_path, chunksize=chunksize, encoding=encoding, usecols=selected_columns, low_memory=False)):
    
    # Filter the chunk to only include rows where GEO_LEVEL equals "Dissemination area"
    filtered_chunk = chunk[chunk['GEO_LEVEL'] == 'Dissemination area']
    
    # If the filtered chunk is empty, skip to the next chunk
    if filtered_chunk.empty:
        print(f'Chunk {chunk_index} is empty after filtering.')
        continue
    
    try:
        # Load the filtered chunk into BigQuery
        job = client.load_table_from_dataframe(filtered_chunk, table_id, job_config=job_config)
        job.result()  # Wait for the job to complete

        print(f'Loaded chunk {chunk_index} with {len(filtered_chunk)} rows.')
    except Exception as e:
        print(f'Error loading chunk {chunk_index}: {e}')


Loaded chunk 0 with 86845 rows.
Loaded chunk 1 with 97369 rows.
Loaded chunk 2 with 97369 rows.
Loaded chunk 3 with 100000 rows.
Loaded chunk 4 with 97369 rows.
Loaded chunk 5 with 94738 rows.
Loaded chunk 6 with 94584 rows.
Loaded chunk 7 with 92261 rows.
Loaded chunk 8 with 92107 rows.
Loaded chunk 9 with 97149 rows.
Loaded chunk 10 with 92327 rows.
Loaded chunk 11 with 100000 rows.
Loaded chunk 12 with 100000 rows.
Loaded chunk 13 with 100000 rows.
Loaded chunk 14 with 100000 rows.
Loaded chunk 15 with 100000 rows.
Loaded chunk 16 with 100000 rows.
Loaded chunk 17 with 100000 rows.
Loaded chunk 18 with 100000 rows.
Loaded chunk 19 with 100000 rows.
Loaded chunk 20 with 100000 rows.
Loaded chunk 21 with 100000 rows.
Loaded chunk 22 with 100000 rows.
Loaded chunk 23 with 100000 rows.
Loaded chunk 24 with 100000 rows.
Loaded chunk 25 with 100000 rows.
Loaded chunk 26 with 100000 rows.
Loaded chunk 27 with 100000 rows.
Loaded chunk 28 with 100000 rows.
Loaded chunk 29 with 100000 rows.


In [12]:
os.remove("census_data.zip")
shutil.rmtree("census_data")
