In [1]:
import pandas as pd
import csv

In [9]:
# Load the state-level cancer table from disk.
cancer_data = pd.read_csv("StateCancerDatawPop.csv")

# Relabel every column as '<year>_<measure>': a leading 'State' column followed
# by three measures for each year from 2010 through 2020 (34 labels in total).
# The assignment is positional, so the file must already have its columns in
# exactly this order.
new_columns = ['State'] + [
    f'{year}_{measure}'
    for year in range(2010, 2021)
    for measure in ('LungBronchus', 'AllSites', 'Population')
]
cancer_data.columns = new_columns

# List (not a mapping) of two-letter codes. It is attached by position, so the
# entries must appear in the same order as the rows of the file.
# NOTE(review): the trailing 'US' entry presumably labels a national-total row
# at the end of the file -- confirm against the CSV.
state_abbr = (
    "AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY "
    "LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR "
    "PA RI SC SD TN TX UT VT VA WA WV WI WY US"
).split()

# Attach the codes as a new column next to the full state names.
cancer_data['State Abbreviation'] = state_abbr

cancer_data

Unnamed: 0,State,2010_LungBronchus,2010_AllSites,2010_Population,2011_LungBronchus,2011_AllSites,2011_Population,2012_LungBronchus,2012_AllSites,2012_Population,...,2018_LungBronchus,2018_AllSites,2018_Population,2019_LungBronchus,2019_AllSites,2019_Population,2020_LungBronchus,2020_AllSites,2020_Population,State Abbreviation
0,Alabama,4160,23640,4779736,4240,25530,4799069,4440,26440,4815588,...,4190,27830,4887681,4150,28950,4903185,4230,28570,5031864,AL
1,Alaska,360,2860,710231,380,3090,722128,490,3640,730443,...,460,3550,735139,400,3090,731545,400,2960,732964,AK
2,Arizona,4030,29780,6392017,3820,31550,6472643,3970,31990,6554978,...,4460,34740,7158024,4290,37490,7278717,4200,36730,7186683,AZ
3,Arkansas,2620,15320,2915918,2660,16070,2940667,2760,16120,2952164,...,2720,16130,3009733,2690,16580,3017804,2760,17200,3014348,AR
4,California,18490,157320,37253956,17660,163480,37638369,18060,165810,37948800,...,18760,178130,39461588,18990,186920,39512223,18040,172040,39503200,CA
5,Colorado,2270,21340,5029196,2250,22390,5121108,2400,22820,5192647,...,2560,25570,5691287,2690,26800,5758736,2550,27290,5785219,CO
6,Connecticut,2640,20750,3574097,2680,21440,3588283,2720,21530,3594547,...,2700,21240,3571520,2580,21950,3565287,2650,20300,3577586,CT
7,Delaware,800,4890,897934,780,5130,907381,800,5340,915179,...,890,6110,965479,840,5870,973764,890,6660,991862,DE
8,District of Columbia,360,2760,601723,360,2830,619800,370,2980,634924,...,310,3260,701547,340,3190,705749,300,3600,670839,DC
9,Florida,18390,107000,18801310,17150,113400,19053237,17860,117580,19297822,...,18710,135170,21244317,18560,131470,21477737,18150,150500,21591299,FL


In [24]:
# Add per-capita cancer rates for every year, put the columns in a predictable
# order, and export the result to CSV.
years = range(2010, 2021)

# Coerce the population columns to numbers. Entries that cannot be parsed become
# NaN (errors='coerce'), and the rates derived from them are then NaN as well.
population_columns = [f'{year}_Population' for year in years]
cancer_data[population_columns] = cancer_data[population_columns].apply(pd.to_numeric, errors='coerce')

# Rate per capita = count / population, rounded to 6 decimal places.
for year in years:
    population = cancer_data[f'{year}_Population']
    lung_bronchus_per_capita = cancer_data[f'{year}_LungBronchus'] / population
    all_sites_per_capita = cancer_data[f'{year}_AllSites'] / population

    cancer_data[f'{year}_LungBronchus_perCapita'] = lung_bronchus_per_capita.round(6)
    cancer_data[f'{year}_AllSites_perCapita'] = all_sites_per_capita.round(6)

# Column names belonging to each year. The rate columns carry the '_perCapita'
# suffix created above; the previous '_Rate' names did not exist in the frame,
# so indexing with this mapping would have raised KeyError.
columns_by_year = {
    year: [f'{year}_LungBronchus', f'{year}_LungBronchus_perCapita',
           f'{year}_AllSites', f'{year}_AllSites_perCapita',
           f'{year}_Population']
    for year in years
}

# Guard against the mapping drifting out of sync with the frame again.
missing_columns = [
    name
    for names in columns_by_year.values()
    for name in names
    if name not in cancer_data.columns
]
assert not missing_columns, f"columns_by_year references unknown columns: {missing_columns}"

# Keep the two identifier columns first and sort everything else alphabetically,
# which groups each year's measures together ('2010_...', '2011_...', ...).
id_columns = ['State', 'State Abbreviation']
all_columns = [name for name in cancer_data.columns if name not in id_columns]
sorted_columns = sorted(all_columns)
cancer_data = cancer_data[id_columns + sorted_columns]

# Display the reordered DataFrame
print(cancer_data)

# Save cancer rates per capita (the row index is not written)
cancer_data.to_csv('stateCancerRates_perCapita.csv', index=False)

                   State State Abbreviation  2010_AllSites  \
0                Alabama                 AL          23640   
1                 Alaska                 AK           2860   
2                Arizona                 AZ          29780   
3               Arkansas                 AR          15320   
4             California                 CA         157320   
5               Colorado                 CO          21340   
6            Connecticut                 CT          20750   
7               Delaware                 DE           4890   
8   District of Columbia                 DC           2760   
9                Florida                 FL         107000   
10               Georgia                 GA          40480   
11                Hawaii                 HI           6670   
12                 Idaho                 ID           7220   
13              Illinois                 IL          63890   
14               Indiana                 IN          33020   
15      

In [25]:
# Load the per-state energy table and preview its first rows.
energy_csv_path = "all_energy_plus_all_renewables.csv"
energy = pd.read_csv(energy_csv_path)
energy.head()

Unnamed: 0.1,Unnamed: 0,State,2011(C),2012(C),2013(C),2014(C),2015(C),2016(C),2017(C),2018(C),...,2011(W),2012(W),2013(W),2014(W),2015(W),2016(W),2017(W),2018(W),2019(W),2020(W)
0,0,AK,15481,15521,14819,18225,19511,16613,16376,17325,...,121,356,1386,1445,1487,1562,1305,1410,1269,1127
1,1,AL,651032,547004,565051,575912,494311,410168,378890,377152,...,0,0,0,0,0,0,0,0,0,0
2,2,AR,306119,296732,327099,339214,226889,246437,267591,304145,...,0,0,0,0,0,0,0,0,0,0
3,3,AZ,459909,420570,454865,447849,385822,323878,334512,331492,...,2483,5059,4291,4451,4210,4998,5248,4824,4933,5647
4,4,CA,55264,43832,38151,39486,30967,32077,33663,33256,...,75317,92821,122335,123546,113906,124661,118074,127590,122228,119083


In [26]:
# Remove the 'Unnamed: 0' column (it looks like a row index saved into the CSV)
# and rename 'State' to 'State Abbreviation', the key name used by cancer_data.
#
# The frame is reassigned rather than mutated in place, and errors='ignore' is
# passed to drop, so this cell can be re-run safely: on a second run the column
# is already gone and the rename is a no-op, instead of drop raising KeyError.
energy = (
    energy
    .drop(columns='Unnamed: 0', errors='ignore')
    .rename(columns={'State': 'State Abbreviation'})
)

energy.head()

Unnamed: 0,State Abbreviation,2011(C),2012(C),2013(C),2014(C),2015(C),2016(C),2017(C),2018(C),2019(C),...,2011(W),2012(W),2013(W),2014(W),2015(W),2016(W),2017(W),2018(W),2019(W),2020(W)
0,AK,15481,15521,14819,18225,19511,16613,16376,17325,17620,...,121,356,1386,1445,1487,1562,1305,1410,1269,1127
1,AL,651032,547004,565051,575912,494311,410168,378890,377152,317180,...,0,0,0,0,0,0,0,0,0,0
2,AR,306119,296732,327099,339214,226889,246437,267591,304145,239798,...,0,0,0,0,0,0,0,0,0,0
3,AZ,459909,420570,454865,447849,385822,323878,334512,331492,257727,...,2483,5059,4291,4451,4210,4998,5248,4824,4933,5647
4,CA,55264,43832,38151,39486,30967,32077,33663,33256,30886,...,75317,92821,122335,123546,113906,124661,118074,127590,122228,119083


In [27]:
# Combine the energy and cancer tables on their shared state-code column.
# This is an inner join (the pandas default, spelled out here), so only codes
# present in both tables appear in the result.
merged_df = pd.merge(
    left=energy,
    right=cancer_data,
    how='inner',
    on='State Abbreviation',
)
merged_df

Unnamed: 0,State Abbreviation,2011(C),2012(C),2013(C),2014(C),2015(C),2016(C),2017(C),2018(C),2019(C),...,2019_AllSites,2019_AllSites_perCapita,2019_LungBronchus,2019_LungBronchus_perCapita,2019_Population,2020_AllSites,2020_AllSites_perCapita,2020_LungBronchus,2020_LungBronchus_perCapita,2020_Population
0,AK,15481,15521,14819,18225,19511,16613,16376,17325,17620,...,3090,0.004224,400,0.000547,731545,2960,0.004038,400,0.000546,732964.0
1,AL,651032,547004,565051,575912,494311,410168,378890,377152,317180,...,28950,0.005904,4150,0.000846,4903185,28570,0.005678,4230,0.000841,5031864.0
2,AR,306119,296732,327099,339214,226889,246437,267591,304145,239798,...,16580,0.005494,2690,0.000891,3017804,17200,0.005706,2760,0.000916,3014348.0
3,AZ,459909,420570,454865,447849,385822,323878,334512,331492,257727,...,37490,0.005151,4290,0.000589,7278717,36730,0.005111,4200,0.000584,7186683.0
4,CA,55264,43832,38151,39486,30967,32077,33663,33256,30886,...,186920,0.004731,18990,0.000481,39512223,172040,0.004355,18040,0.000457,39503200.0
5,CO,368871,370085,363532,350526,340107,321476,315835,284474,273338,...,26800,0.004654,2690,0.000467,5758736,27290,0.004717,2550,0.000441,5785219.0
6,CT,6081,9290,7679,9097,6536,2340,2507,4023,867,...,21950,0.006157,2580,0.000724,3565287,20300,0.005674,2650,0.000741,3577586.0
7,DC,48,77,4,48,44,34,30,37,5,...,3190,0.00452,340,0.000482,705749,3600,0.005366,300,0.000447,670839.0
8,DE,17881,17384,18254,10238,7149,8199,4804,4296,2179,...,5870,0.006028,840,0.000863,973764,6660,0.006715,890,0.000897,991862.0
9,FL,552730,482984,505155,557882,466530,426188,407538,327807,233542,...,131470,0.006121,18560,0.000864,21477737,150500,0.00697,18150,0.000841,21591299.0


In [28]:
# Write the merged energy/cancer table to disk; the row index is left out.
merged_output_path = 'energy_cancer_statePerYearwPop.csv'
merged_df.to_csv(merged_output_path, index=False)