# voters - data engineering class project 2024
### cris crawford
This is a Python notebook that reads about 1 GB of voter data covering all registered voters in Massachusetts. The purpose is to convert the data to Parquet and write it into a Google Cloud Storage bucket.

In [2]:
import zipfile
import pandas as pd
from google.cloud import storage

In [3]:
import os
# Point the Google client libraries at the service-account key file. The path
# is resolved against the current user's home directory instead of a
# hard-coded '/home/<user>' so the notebook runs unchanged on other accounts.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.expanduser('~/.gc/keys.json')

### Here we're reading the file in from a Google cloud bucket and unzipping it.

In [3]:
# Download the zipped voter extract from the GCS bucket and unpack it locally.
# Any failure is reported and then re-raised, so a "Run All" stops here instead
# of continuing into cells that expect the extracted files to exist.

# Initialize GCS client
client = storage.Client()

# Define GCS bucket and zip file path
bucket_name = 'cris-voter-data'
zip_file_path = 'voter_extract_20220828.zip'
destination_path = 'temp2.zip'

try:
    # Get the bucket
    bucket = client.get_bucket(bucket_name)
    print("Bucket:", bucket)

    # Get the blob (file) from the bucket
    blob = bucket.blob(zip_file_path)
    print("Blob:", blob)

    # Download the blob's content and write it to a file
    with open(destination_path, 'wb') as file:
        blob.download_to_file(file)

    print("Downloaded zip file to:", destination_path)

    # No existence check is needed: open(..., 'wb') above always creates the
    # file, and a failed download raises before this point. A corrupt archive
    # raises from ZipFile and is handled below.
    with zipfile.ZipFile(destination_path, 'r') as zip_ref:
        zip_ref.extractall('extracted_voter')
    print("Successfully extracted zip file contents.")

except Exception as e:
    # Keep the short, readable message, but do not hide the failure.
    print("Error:", e)
    raise


Bucket: <Bucket: cris-voter-data>
Blob: <Blob: cris-voter-data, voter_extract_20220828.zip, None>
Downloaded zip file to: temp2.zip
Successfully extracted zip file contents.


### Here, I'm defining a schema. The 'tmp' columns are names and addresses and will not be written to Parquet.

In [4]:
# Column names for the extracted voter CSV files, in positional order, mapped
# to the pandas dtype used when parsing. The four numeric code/district columns
# are 'int32'; everything else is kept as 'object'. The 'tmp*' placeholders
# stand for the name and address fields, which are dropped before the Parquet
# files are written.
schema = {
    column: ('int32' if column in {
        'city_code',
        'congressional_district',
        'senatorial_district',
        'state_rep_district',
    } else 'object')
    for column in (
        'city_code', 'city_name', 'county_name', 'voter_id',
        'tmp1', 'tmp2', 'tmp3', 'tmp4', 'tmp5', 'tmp6', 'tmp7', 'tmp8',
        'zip',
        'tmp9', 'tmp10', 'tmp11', 'tmp12', 'tmp13',
        'party_affiliation',
        'tmp14', 'tmp15', 'tmp16',
        'ward', 'precinct',
        'congressional_district', 'senatorial_district', 'state_rep_district',
        'voter_status',
    )
}

### Now I'm reading in the CSV files, writing them out as Parquet, and uploading the Parquet files to the Google Cloud bucket.

In [8]:
# Open the destination bucket; the conversion function below uploads into it.
bucket_name = 'cris-voter-data'
client = storage.Client()
bucket = client.get_bucket(bucket_name)

# Columns kept in the Parquet output (every schema column except the 'tmp*' ones).
selected_columns = [
    'city_code',
    'city_name',
    'county_name',
    'voter_id',
    'zip',
    'party_affiliation',
    'ward',
    'precinct',
    'congressional_district',
    'senatorial_district',
    'state_rep_district',
    'voter_status',
]

def csv_to_parquet(filename):
    """Convert one extracted voter CSV to Parquet and upload it to the bucket.

    Reads ``extracted_voter/<filename>`` using the column names and dtypes from
    the module-level ``schema``, keeps only ``selected_columns``, writes
    ``<stem>.parquet`` to the current working directory, uploads that file to
    ``voter_output/<stem>.parquet`` in the module-level ``bucket``, and prints
    the filename together with the number of rows written.

    Args:
        filename: Name of a CSV file inside the ``extracted_voter`` directory.
    """
    # Strip the actual extension rather than assuming it is four characters long.
    parquet_name = os.path.splitext(filename)[0] + '.parquet'

    # usecols makes the parser skip the 'tmp*' name/address columns entirely
    # instead of loading them into memory only to throw them away.
    df = pd.read_csv(
        'extracted_voter/' + filename,
        delimiter=',',
        names=list(schema.keys()),
        usecols=selected_columns,
        dtype=schema,
    )

    # Index explicitly so the output column order always follows selected_columns.
    new_df = df[selected_columns]
    new_df.to_parquet(parquet_name, engine='pyarrow')

    blob = bucket.blob('voter_output/' + parquet_name)
    blob.upload_from_filename(parquet_name)
    print(filename, len(new_df))

In [9]:
# The extract consists of seven files named voter001.csv ... voter007.csv;
# convert and upload them in order.
for file_number in range(1, 8):
    csv_to_parquet('voter{:03d}.csv'.format(file_number))

voter001.csv 591286
voter002.csv 762552
voter003.csv 634527
voter004.csv 765429
voter005.csv 731690
voter006.csv 759911
voter007.csv 593821


### clean up

In [10]:
# Delete the local Parquet files from the working directory. Done in Python
# rather than with a shell `rm` so the cell also works where no POSIX shell is
# available. Like the shell glob, this skips dot-files and touches nothing
# outside the current directory.
for entry in os.listdir('.'):
    if entry.endswith('.parquet') and not entry.startswith('.') and os.path.isfile(entry):
        os.remove(entry)