### Vitoria Moreno-Costa's Notebook
DSI Group Project

Team 6: Emily Fuller, Karthik Nambiar, Vitoria Moreno-Costa, and David Wagenhurst

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests

### Read in datasets, reduce content to Texas only, and export to cleaned_datasets

#### US Petroleum Refineries

In [2]:
# df_refineries = pd.read_csv('datasets/us-petroleum-refineries.csv')

# df_refineries.columns = df_refineries.columns.astype(str).str.strip().str.lower().str.replace(' ','_')

# df_texas_refineries = df_refineries[df_refineries['state'] == 'Texas']

# df_texas_refineries.loc[:,'state'] = 'TX'

# df_texas_refineries.to_csv('cleaned_datasets/texas_petroleum_refineries.csv',index=False)

#### EPA 2017 AirToxScreen: Cancer Risk by Source Group

In [3]:
# df_epa_cancer_by_source = pd.read_excel('datasets/national_cancerrisk_by_tract_srcgrp.xlsx')

# df_epa_cancer_by_source.columns = df_epa_cancer_by_source.columns.astype(str).str.strip().str.lower().str.replace(' ','_')

# df_epa_cancer_by_source_TX = df_epa_cancer_by_source[df_epa_cancer_by_source['state'] == 'TX']

# df_epa_cancer_by_source_TX.to_csv('cleaned_datasets/texas_cancerrisk_by_srcgrp.csv',index=False)

#### EPA 2017 AirToxScreen: Cancer Risk by Pollutant

In [4]:
# df_epa_cancer_by_pollutant = pd.read_excel('datasets/national_cancerrisk_by_tract_poll.xlsx',)

# df_epa_cancer_by_pollutant.columns = df_epa_cancer_by_pollutant.columns.astype(str).str.strip().str.lower().str.replace(' ','_')

# df_epa_cancer_by_pollutant_TX = df_epa_cancer_by_pollutant[df_epa_cancer_by_pollutant['state'] == 'TX']

# df_epa_cancer_by_pollutant_TX.to_csv('cleaned_datasets/texas_cancerrisk_by_tract_poll.csv', index=False)

#### EPA 2017 AirToxScreen: Emissions by Source

In [5]:
# df_epa_emissions_by_source = pd.read_excel('datasets/point_fac_2017_emissions.xlsx')

# df_epa_emissions_by_source.columns = df_epa_emissions_by_source.columns.astype(str).str.strip().str.lower().str.replace(' ','_')

# df_epa_emissions_by_source_TX = df_epa_emissions_by_source[df_epa_emissions_by_source['state'] == 'Texas']

# df_epa_emissions_by_source_TX.loc[:,'state'] = 'TX'

# # Drop non-numerical values for facility-id
# df_epa_emissions_by_source_TX['facility_id'] = pd.to_numeric(df_epa_emissions_by_source_TX['facility_id'], errors='coerce')

# df_epa_emissions_by_source_TX.dropna(inplace=True)

# df_epa_emissions_by_source_TX.to_csv('cleaned_datasets/texas_point_fac_2017_emissions.csv', index=False)

#### EPA 2017 AirToxScreen: All Hazard Indices by Source

In [6]:
# df_epa_all_hazard_index = pd.read_excel('datasets/national_allhi_by_tract.xlsx')

# df_epa_all_hazard_index.columns = df_epa_all_hazard_index.columns.astype(str).str.strip().str.lower().str.replace(' ','_')

# df_epa_all_hazard_index_TX = df_epa_all_hazard_index[df_epa_all_hazard_index['state'] == 'TX']

# df_epa_all_hazard_index_TX.to_csv('cleaned_datasets/texas_allhi_by_tract.csv', index=False)

### Find census tract numbers for datasets that are missing them

In [7]:
# df_refineries = pd.read_csv('cleaned_datasets/texas_petroleum_refineries.csv')

# df_emissions = pd.read_csv('cleaned_datasets/texas_point_fac_2017_emissions.csv')

#### Use the FCC census block API to identify the tract FIPS code for each refinery

In [8]:
def coord_to_tract(lat, long, timeout=10):
    """Identify the 2010 census tract FIPS code for a latitude/longitude pair.

    Sends the coordinates to the FCC census block lookup endpoint and keeps the
    first 11 characters of the census block FIPS code it returns.

    args:
        lat (float): Latitude
        long (float): Longitude
        timeout (float): Seconds to wait for the API before giving up, so one
            stalled request cannot hang a loop over many coordinates.

    return:
        tract (str or None): first 11 characters of the census block FIPS, or
            None when the request fails or the response holds no block FIPS.
    """
    url = 'https://geo.fcc.gov/api/census/block/find'

    params = {
        'latitude': lat,
        'longitude': long,
        'censusYear': 2010,
        'format':'json'
    }
    try:
        res = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Network errors and timeouts follow the same failure path as a bad status code.
        print('request_failed')
        return None

    if res.status_code != 200:
        print('request_failed')
        return None

    # Guard against a 200 response whose 'Block' entry or 'FIPS' value is missing or null.
    block = res.json().get('Block') or {}
    fips = block.get('FIPS')
    if not fips:
        print('request_failed')
        return None
    return fips[0:11]

In [9]:
# refinery_tract = []
# for refinery in df_refineries.index:
#     coords = df_refineries.loc[refinery,['latitude','longitude']]
#     refinery_tract.append(coord_to_tract(coords[0],coords[1]))

# df_refineries['tract'] = refinery_tract

# # save to replace csv
# df_refineries.to_csv('cleaned_datasets/texas_petroleum_refineries.csv',index=False)

#### Now do the same for the emissions data

In [10]:
# emission_tract = []
# for source in df_emissions.index:
#     coords = df_emissions.loc[source,['latitude','longitude']]
#     emission_tract.append(coord_to_tract(coords[0],coords[1]))

# df_emissions['tract'] = emission_tract

# # save to replace csv
# df_emissions.to_csv('cleaned_datasets/texas_point_fac_2017_emissions.csv',index=False)

### Reduce the columns in the datasets

#### Refineries

In [40]:
df_refineries = pd.read_csv('cleaned_datasets/texas_petroleum_refineries.csv')

In [41]:
df_refineries = df_refineries[['company', 'corp', 'tract']]

In [42]:
df_refineries['tract'].value_counts() # there are duplicate refineries in tracts

48355006300    3
48167726200    3
48245011201    1
48297950100    1
48201233703    1
48201252600    1
48493000500    1
48199031000    1
48245011600    1
48201324200    1
48355000600    1
48201320500    1
48245006600    1
48201343600    1
48341950200    1
48227950100    1
48039662700    1
48233951000    1
48245000700    1
48201253700    1
48029192200    1
48201324100    1
48423000600    1
48141003501    1
48201252500    1
Name: tract, dtype: int64

In [43]:
# Give every refinery row a count of 1 so that rows sharing a tract can later be
# collapsed into a single row per tract carrying the number of refineries in it.

df_refineries['number_refineries'] = 1

In [44]:
# For tracts that hold more than one refinery, combine the company and corporation
# names into the first matching row and record how many refineries share the tract.
# The tracts are selected by their count instead of a hard-coded top-2 slice, so the
# cell works for any number of repeated tracts and is a no-op when it is re-run
# (the old slice would re-process two unique tracts and could reset their count to 1).

tract_counts = df_refineries['tract'].value_counts()
for tract in tract_counts[tract_counts > 1].index:
    matches = df_refineries[df_refineries['tract'] == tract]
    company_names = matches['company'].str.cat(sep=', ')
    corp_names = matches['corp'].str.cat(sep=', ')
    df_refineries.loc[matches.index[0],['company','corp','number_refineries']] = [company_names, corp_names, matches.shape[0]]

# Keep only the first (now combined) row for each tract and renumber the rows.
df_refineries = df_refineries.drop_duplicates(subset='tract').reset_index(drop=True)

In [45]:
df_refineries.set_index('tract',inplace=True)

#### Emissions by source

In [17]:
df_emissions = pd.read_csv('cleaned_datasets/texas_point_fac_2017_emissions.csv')

In [18]:
# Keep the tract identifier plus the 2017 emission totals for the pollutants of interest.
emission_columns = [
    'tract',
    'benzene_(year_2017_tons)',
    'toluene_(year_2017_tons)',
    'ethyl_benzene_(year_2017_tons)',
    'xylenes_(mixed_isomers)_(year_2017_tons)',
    'diesel_pm_(year_2017_tons)',
    '2,2,4-trimethylpentane_(year_2017_tons)',
    'coke_oven_emissions_(year_2017_tons)',
    '1,3-butadiene_(year_2017_tons)',
]
df_emissions = df_emissions[emission_columns]

In [19]:
df_emissions_by_tract = df_emissions.groupby('tract').sum()

In [20]:
df_emissions_by_tract.head(3)

Unnamed: 0_level_0,benzene_(year_2017_tons),toluene_(year_2017_tons),ethyl_benzene_(year_2017_tons),xylenes_(mixed_isomers)_(year_2017_tons),diesel_pm_(year_2017_tons),"2,2,4-trimethylpentane_(year_2017_tons)",coke_oven_emissions_(year_2017_tons),"1,3-butadiene_(year_2017_tons)"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
48001950100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48001950600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48001951000,3.2e-05,0.3284,2.712,14.7564,0.0,0.0,0.0,0.4454


In [75]:
df_emissions_by_tract.shape

(870, 8)

#### Emissions and Cancer Risk by tract

In [21]:
df_cancer_tract = pd.read_csv('cleaned_datasets/texas_cancerrisk_by_tract_poll.csv')

In [22]:
key_pollutants = [x.replace('_(year_2017_tons)','') for x in df_emissions.columns]

In [23]:
for x in key_pollutants:
    if x in df_cancer_tract.columns:
        print(f"'{x}',")

'tract',
'benzene',
'coke_oven_emissions',
'1,3-butadiene',


In [24]:
df_cancer_tract = df_cancer_tract[['tract', 'benzene',
'coke_oven_emissions',
'1,3-butadiene']]

In [25]:
df_cancer_tract.set_index('tract',inplace=True)

In [26]:
df_cancer_tract.columns = df_cancer_tract.columns + '_cancer_risk_(per_million)'

In [27]:
df_cancer_tract.head(3)

Unnamed: 0_level_0,benzene_cancer_risk_(per_million),coke_oven_emissions_cancer_risk_(per_million),"1,3-butadiene_cancer_risk_(per_million)"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48000000000,1.652095,0.0,0.480044
48001000000,1.164202,0.0,0.195928
48001950100,1.10716,0.0,0.163939


#### Emissions and Cancer Risk by source group

In [64]:
df_cancer_source = pd.read_csv('cleaned_datasets/texas_cancerrisk_by_srcgrp.csv')

In [65]:
df_cancer_source = df_cancer_source[['tract','total_cancer_risk_(per_million)','pt-stationarypoint_cancer_risk_(per_million)']]

In [66]:
df_cancer_source.set_index('tract',inplace=True)

In [67]:
df_cancer_source

Unnamed: 0_level_0,total_cancer_risk_(per_million),pt-stationarypoint_cancer_risk_(per_million)
tract,Unnamed: 1_level_1,Unnamed: 2_level_1
48000000000,30,4.206690
48001000000,30,0.260001
48001950100,30,0.380271
48001950401,30,0.132819
48001950402,30,0.412504
...,...,...
48507000000,30,0.023144
48507950100,20,0.035947
48507950200,20,0.007127
48507950301,30,0.023831


#### Other hazards

In [109]:
df_all_hi = pd.read_csv('cleaned_datasets/texas_allhi_by_tract.csv')

In [110]:
df_all_hi = df_all_hi[['tract','county','population','respiratory_hi', 'neurological_hi','developmental_hi',
       'reproductive_hi', 'kidney_hi','immunological_hi','whole_body_hi']]

In [111]:
df_all_hi.set_index('tract',inplace=True)

In [112]:
df_all_hi.sample(3)

Unnamed: 0_level_0,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,immunological_hi,whole_body_hi
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
48141004317,El Paso,4263,0.3,0.05,0.05,0.04,0.008,0.02,7e-05
48201232701,Harris,7235,0.6,0.09,0.04,0.07,0.02,0.03,0.0004
48121020506,Denton,1888,0.4,0.02,0.03,0.03,0.008,0.01,0.0002


### Combine Datasets

In [113]:
df_refineries.shape

(25, 3)

In [114]:
df_all_hi.shape

(5493, 9)

In [115]:
df_merged = pd.merge(df_refineries,df_all_hi,how = 'outer',left_index=True, right_index=True)

In [116]:
df_merged

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,immunological_hi,whole_body_hi
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
48000000000,,,,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,0.010,0.00020
48001000000,,,,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020
48001950100,,,,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020
48001950401,,,,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,0.006,0.00020
48001950402,,,,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,0.006,0.00020
...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,,,,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006
48507950100,,,,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,0.005,0.00007
48507950200,,,,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,0.005,0.00007
48507950301,,,,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006


In [117]:
# Tracts without a refinery have no count after the outer merge; record them as 0.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection, which is deprecated in pandas 2.x and has no effect under copy-on-write.
df_merged['number_refineries'] = df_merged['number_refineries'].fillna(0)

In [118]:
# Label tracts that have no refinery. Only the refinery name columns are filled:
# a frame-wide fillna would also write the string 'No refineries' into any numeric
# column left NaN by the outer merge, hiding the gap and turning the column to text.
refinery_name_cols = ['company', 'corp']
df_merged[refinery_name_cols] = df_merged[refinery_name_cols].fillna('No refineries')

In [119]:
df_merged

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,immunological_hi,whole_body_hi
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
48000000000,No refineries,No refineries,0.0,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,0.010,0.00020
48001000000,No refineries,No refineries,0.0,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020
48001950100,No refineries,No refineries,0.0,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020
48001950401,No refineries,No refineries,0.0,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,0.006,0.00020
48001950402,No refineries,No refineries,0.0,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,0.006,0.00020
...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,No refineries,No refineries,0.0,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006
48507950100,No refineries,No refineries,0.0,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,0.005,0.00007
48507950200,No refineries,No refineries,0.0,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,0.005,0.00007
48507950301,No refineries,No refineries,0.0,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006


In [120]:
df_merged = pd.merge(df_merged,df_cancer_source,how = 'outer',left_index=True, right_index=True)

In [121]:
df_merged

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,immunological_hi,whole_body_hi,total_cancer_risk_(per_million),pt-stationarypoint_cancer_risk_(per_million)
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
48000000000,No refineries,No refineries,0.0,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,0.010,0.00020,30,4.206690
48001000000,No refineries,No refineries,0.0,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020,30,0.260001
48001950100,No refineries,No refineries,0.0,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020,30,0.380271
48001950401,No refineries,No refineries,0.0,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,0.006,0.00020,30,0.132819
48001950402,No refineries,No refineries,0.0,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,0.006,0.00020,30,0.412504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,No refineries,No refineries,0.0,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006,30,0.023144
48507950100,No refineries,No refineries,0.0,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,0.005,0.00007,20,0.035947
48507950200,No refineries,No refineries,0.0,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,0.005,0.00007,20,0.007127
48507950301,No refineries,No refineries,0.0,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006,30,0.023831


In [122]:
df_merged = pd.merge(df_merged,df_cancer_tract,how = 'outer',left_index=True, right_index=True)

In [123]:
df_merged

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,immunological_hi,whole_body_hi,total_cancer_risk_(per_million),pt-stationarypoint_cancer_risk_(per_million),benzene_cancer_risk_(per_million),coke_oven_emissions_cancer_risk_(per_million),"1,3-butadiene_cancer_risk_(per_million)"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
48000000000,No refineries,No refineries,0.0,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,0.010,0.00020,30,4.206690,1.652095,0.0,0.480044
48001000000,No refineries,No refineries,0.0,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020,30,0.260001,1.164202,0.0,0.195928
48001950100,No refineries,No refineries,0.0,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020,30,0.380271,1.107160,0.0,0.163939
48001950401,No refineries,No refineries,0.0,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,0.006,0.00020,30,0.132819,0.969738,0.0,0.158329
48001950402,No refineries,No refineries,0.0,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,0.006,0.00020,30,0.412504,0.976017,0.0,0.169988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,No refineries,No refineries,0.0,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006,30,0.023144,1.093991,0.0,0.041224
48507950100,No refineries,No refineries,0.0,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,0.005,0.00007,20,0.035947,1.017388,0.0,0.033241
48507950200,No refineries,No refineries,0.0,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,0.005,0.00007,20,0.007127,1.064307,0.0,0.037423
48507950301,No refineries,No refineries,0.0,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006,30,0.023831,1.072941,0.0,0.040616


In [124]:
df_merged

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,immunological_hi,whole_body_hi,total_cancer_risk_(per_million),pt-stationarypoint_cancer_risk_(per_million),benzene_cancer_risk_(per_million),coke_oven_emissions_cancer_risk_(per_million),"1,3-butadiene_cancer_risk_(per_million)"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
48000000000,No refineries,No refineries,0.0,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,0.010,0.00020,30,4.206690,1.652095,0.0,0.480044
48001000000,No refineries,No refineries,0.0,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020,30,0.260001,1.164202,0.0,0.195928
48001950100,No refineries,No refineries,0.0,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,0.007,0.00020,30,0.380271,1.107160,0.0,0.163939
48001950401,No refineries,No refineries,0.0,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,0.006,0.00020,30,0.132819,0.969738,0.0,0.158329
48001950402,No refineries,No refineries,0.0,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,0.006,0.00020,30,0.412504,0.976017,0.0,0.169988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,No refineries,No refineries,0.0,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006,30,0.023144,1.093991,0.0,0.041224
48507950100,No refineries,No refineries,0.0,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,0.005,0.00007,20,0.035947,1.017388,0.0,0.033241
48507950200,No refineries,No refineries,0.0,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,0.005,0.00007,20,0.007127,1.064307,0.0,0.037423
48507950301,No refineries,No refineries,0.0,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,0.006,0.00006,30,0.023831,1.072941,0.0,0.040616


In [125]:
df_merged = pd.merge(df_merged,df_emissions_by_tract,how='left',left_index=True, right_index=True)

In [126]:
df_merged

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,...,coke_oven_emissions_cancer_risk_(per_million),"1,3-butadiene_cancer_risk_(per_million)",benzene_(year_2017_tons),toluene_(year_2017_tons),ethyl_benzene_(year_2017_tons),xylenes_(mixed_isomers)_(year_2017_tons),diesel_pm_(year_2017_tons),"2,2,4-trimethylpentane_(year_2017_tons)",coke_oven_emissions_(year_2017_tons),"1,3-butadiene_(year_2017_tons)"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48000000000,No refineries,No refineries,0.0,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,...,0.0,0.480044,,,,,,,,
48001000000,No refineries,No refineries,0.0,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,...,0.0,0.195928,,,,,,,,
48001950100,No refineries,No refineries,0.0,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,...,0.0,0.163939,0.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0
48001950401,No refineries,No refineries,0.0,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,...,0.0,0.158329,,,,,,,,
48001950402,No refineries,No refineries,0.0,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,...,0.0,0.169988,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,No refineries,No refineries,0.0,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,...,0.0,0.041224,,,,,,,,
48507950100,No refineries,No refineries,0.0,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,...,0.0,0.033241,,,,,,,,
48507950200,No refineries,No refineries,0.0,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,...,0.0,0.037423,,,,,,,,
48507950301,No refineries,No refineries,0.0,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,...,0.0,0.040616,0.385,0.136,0.025,0.036,0.0,0.0,0.0,0.0


In [127]:
df_merged.isna().sum()

company                                             0
corp                                                0
number_refineries                                   0
county                                              0
population                                          0
respiratory_hi                                      0
neurological_hi                                     0
developmental_hi                                    0
reproductive_hi                                     0
kidney_hi                                           0
immunological_hi                                    0
whole_body_hi                                       0
total_cancer_risk_(per_million)                     0
pt-stationarypoint_cancer_risk_(per_million)        0
benzene_cancer_risk_(per_million)                   0
coke_oven_emissions_cancer_risk_(per_million)       0
1,3-butadiene_cancer_risk_(per_million)             0
benzene_(year_2017_tons)                         4626
toluene_(year_2017_tons)    

In [128]:
# Preview only: fillna returns a new frame and the result is not assigned, so
# df_merged itself keeps NaN in the emissions columns for tracts with no emissions
# record, and those NaN values are what get written out when the frame is saved.
df_merged.fillna('Not available')

Unnamed: 0_level_0,company,corp,number_refineries,county,population,respiratory_hi,neurological_hi,developmental_hi,reproductive_hi,kidney_hi,...,coke_oven_emissions_cancer_risk_(per_million),"1,3-butadiene_cancer_risk_(per_million)",benzene_(year_2017_tons),toluene_(year_2017_tons),ethyl_benzene_(year_2017_tons),xylenes_(mixed_isomers)_(year_2017_tons),diesel_pm_(year_2017_tons),"2,2,4-trimethylpentane_(year_2017_tons)",coke_oven_emissions_(year_2017_tons),"1,3-butadiene_(year_2017_tons)"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48000000000,No refineries,No refineries,0.0,Entire State,25145250,0.4,0.03,0.030,0.030,0.0080,...,0.0,0.480044,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
48001000000,No refineries,No refineries,0.0,Anderson,58458,0.3,0.02,0.020,0.020,0.0020,...,0.0,0.195928,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
48001950100,No refineries,No refineries,0.0,Anderson,4685,0.3,0.02,0.020,0.020,0.0020,...,0.0,0.163939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48001950401,No refineries,No refineries,0.0,Anderson,5422,0.3,0.01,0.010,0.020,0.0010,...,0.0,0.158329,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
48001950402,No refineries,No refineries,0.0,Anderson,7535,0.3,0.01,0.020,0.020,0.0020,...,0.0,0.169988,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48507000000,No refineries,No refineries,0.0,Zavala,11677,0.3,0.01,0.005,0.005,0.0010,...,0.0,0.041224,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
48507950100,No refineries,No refineries,0.0,Zavala,1232,0.3,0.01,0.004,0.004,0.0009,...,0.0,0.033241,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
48507950200,No refineries,No refineries,0.0,Zavala,1880,0.3,0.01,0.005,0.005,0.0007,...,0.0,0.037423,Not available,Not available,Not available,Not available,Not available,Not available,Not available,Not available
48507950301,No refineries,No refineries,0.0,Zavala,2254,0.3,0.01,0.005,0.005,0.0010,...,0.0,0.040616,0.385,0.136,0.025,0.036,0.0,0.0,0.0,0.0


In [131]:
df_merged.columns

Index(['company', 'corp', 'number_refineries', 'county', 'population',
       'respiratory_hi', 'neurological_hi', 'developmental_hi',
       'reproductive_hi', 'kidney_hi', 'immunological_hi', 'whole_body_hi',
       'total_cancer_risk_(per_million)',
       'pt-stationarypoint_cancer_risk_(per_million)',
       'benzene_cancer_risk_(per_million)',
       'coke_oven_emissions_cancer_risk_(per_million)',
       '1,3-butadiene_cancer_risk_(per_million)', 'benzene_(year_2017_tons)',
       'toluene_(year_2017_tons)', 'ethyl_benzene_(year_2017_tons)',
       'xylenes_(mixed_isomers)_(year_2017_tons)',
       'diesel_pm_(year_2017_tons)', '2,2,4-trimethylpentane_(year_2017_tons)',
       'coke_oven_emissions_(year_2017_tons)',
       '1,3-butadiene_(year_2017_tons)'],
      dtype='object')

In [139]:
# Average total cancer risk and refinery count per company label, truncated to
# integers and ordered from highest to lowest average risk.
summary_cols = ['total_cancer_risk_(per_million)', 'number_refineries']
mean_by_company = df_merged.groupby('company')[summary_cols].mean()
mean_by_company.astype(int).sort_values(by='total_cancer_risk_(per_million)', ascending=False)

Unnamed: 0_level_0,total_cancer_risk_(per_million),number_refineries
company,Unnamed: 1_level_1,Unnamed: 2_level_1
EQUISTAR CHEMICALS LP,80,1
PETROMAX REFINING CO LLC,60,1
KINDER MORGAN CRUDE & CONDENSATE,50,1
VALERO REFINING CO TEXAS LP,50,1
TOTAL PETROCHEMICALS & REFINING USA,50,1
PASADENA REFINING SYSTEMS INC,50,1
PREMCOR REFINING GROUP INC,50,1
DEER PARK REFINING LTD PARTNERSHIP,50,1
WESTERN REFINING COMPANY LP,50,1
EXXONMOBIL REFINING & SUPPLY CO,40,1


In [141]:
# save to dataset
df_merged.to_csv('cleaned_datasets/combined_texas_refinery_risks_emissions.csv')