# In [2]:
# Process the CSV file to aggregate data by state, handling different delimiters
import csv

input_file = 'DECENNIALDP2020.DP1-Data.csv'  # Assuming you have processed the headers already
output_file = 'census_aggregated_by_state.csv'

# Input column layout: 0 = 'Geography', 1 = 'Geographic Area Name',
# 2 and onward = the columns that are summed per state.
area_name_index = 1
data_start_index = 2

state_data = {}
headers = []


def _state_from_area_name(area_name):
    """Return the last delimited part of a 'Geographic Area Name' value.

    The name is split on ';' when it contains one, otherwise on ','. The
    final part (assumed to be the state) is returned with surrounding
    whitespace removed; a name without any delimiter is returned stripped.
    """
    delimiter = ';' if ';' in area_name else ','
    return area_name.split(delimiter)[-1].strip()


def _parse_count(cell):
    """Convert a cell to int, treating missing or malformed values as 0."""
    try:
        return int(cell.strip())
    except ValueError:
        # Deliberate best effort: blanks, '(X)' markers and non-integer
        # values such as '3.5' contribute nothing to the total.
        return 0


def aggregate_by_state(rows, name_index=area_name_index,
                       data_start=data_start_index):
    """Sum the data columns of *rows* grouped by state.

    Args:
        rows: Iterable of rows (lists of strings), e.g. a ``csv.reader``.
            The first row is taken as the header row.
        name_index: Index of the column holding the geographic area name.
        data_start: Index of the first column to aggregate.

    Returns:
        A ``(headers, totals)`` tuple where ``headers`` is the header row and
        ``totals`` maps each state name to a list of column sums, in order of
        first appearance.

    Raises:
        ValueError: If *rows* yields no header row at all.
    """
    row_iter = iter(rows)
    header_row = next(row_iter, None)
    if header_row is None:
        raise ValueError('input has no header row')

    column_count = len(header_row)
    data_width = max(column_count - data_start, 0)
    # A usable row needs every header column and must reach the name column.
    min_length = max(column_count, name_index + 1)

    totals_by_state = {}
    for row in row_iter:
        if len(row) < min_length:
            continue  # Skip incomplete rows

        state = _state_from_area_name(row[name_index])
        totals = totals_by_state.setdefault(state, [0] * data_width)

        # Cells beyond the header width are ignored.
        for offset, cell in enumerate(row[data_start:column_count]):
            totals[offset] += _parse_count(cell)

    return header_row, totals_by_state


def build_output_rows(headers, state_data, data_start=data_start_index):
    """Return the output table: a header row, then one row per state.

    The header is 'State' followed by the aggregated column names; each data
    row is the state name followed by its column sums.
    """
    table = [['State'] + list(headers[data_start:])]
    table.extend([state] + list(totals) for state, totals in state_data.items())
    return table


# Runs when executed as a script or as a notebook cell (where __name__ is
# '__main__'); importing this module only defines the helpers above.
if __name__ == '__main__':
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        headers, state_data = aggregate_by_state(csv.reader(infile))

    # Write the aggregated data to a new CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        csv.writer(outfile).writerows(build_output_rows(headers, state_data))