# The purpose of this notebook is to combine our various datasets into a single file

In [1]:
import pandas as pd
import geopandas as gpd

## Read in datasets

In [2]:
# Tabular inputs: tract-level cancer rates and refinery risk/emission figures.
cancer_df = pd.read_csv('./cleaned_datasets/cancer_rates.csv')
refinery_df = pd.read_csv('./cleaned_datasets/combined_texas_refinery_risks_emissions.csv')

# Spatial input: the tract shapefile, loaded as a GeoDataFrame.
shape_df = gpd.read_file('./cleaned_datasets/joined_txshp/tx_shp_mod.shp')

## Clean up column names

- column names are limited to 10 characters (the shapefile field-name limit)
- drop unnecessary columns from individual files

In [3]:
# Show the column names as read from the CSV, before any renaming or dropping.
cancer_df.columns

Index(['Unnamed: 0', 'Year', 'StateAbbr', 'CountyFIPS', 'LocationName',
       'DataSource', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit',
       'TotalPopulation', 'Geolocation', 'LocationID'],
      dtype='object')

In [4]:
# Source column -> short (<= 10 character) name for every column we keep.
# Renaming by name instead of assigning a positional list means a change in
# the CSV's column order cannot silently mislabel the data.
cancer_renames = {
    'CountyFIPS': 'county_num',
    'LocationName': 'tract_num',
    'Data_Value': 'cancer_%',
    'Low_Confidence_Limit': 'cancer_%_l',
    'High_Confidence_Limit': 'cancer_%_h',
    'TotalPopulation': 'total_pop',
    'Geolocation': 'geo_loc',
}

# rename() skips labels that are absent and the selection keeps only the
# renamed columns, so re-running this cell is a no-op rather than an error.
# A missing source column raises a KeyError here instead of passing unnoticed.
cancer_df = cancer_df.rename(columns=cancer_renames)[list(cancer_renames.values())]

cancer_df.head(1)

Unnamed: 0,county_num,tract_num,cancer_%,cancer_%_l,cancer_%_h,total_pop,geo_loc
0,48005,48005000102,6.5,6.3,6.7,4447,POINT (-94.87095395 31.32711676)


In [5]:
# Show the column names as loaded from the shapefile, before any renaming or dropping.
shape_df.columns

Index(['statefp', 'countyfp', 'tractce', 'geoid', 'name', 'namelsad', 'mtfcc',
       'funcstat', 'aland', 'awater', 'intptlat', 'intptlon', 'households',
       'below_pove', 'hh_snap', 'hh_nowork', 'tract_name', 'population',
       'below50pov', 'below125po', 'below150po', 'below185po', 'below200po',
       'below300po', 'below400po', 'below500po', 'geometry'],
      dtype='object')

In [6]:
# Columns to keep, in their original order. 'geoid' is renamed to 'tract_num'
# so it matches the join key used by the other tables; everything not listed
# here (statefp, countyfp, tractce, name, namelsad, funcstat, population) is
# dropped.
shape_keep_cols = [
    'tract_num', 'mtfcc', 'aland', 'awater', 'intptlat', 'intptlon',
    'households', 'below_pove', 'hh_snap', 'hh_nowork', 'tract_name',
    'below50pov', 'below125po', 'below150po', 'below185po', 'below200po',
    'below300po', 'below400po', 'below500po', 'geometry',
]

# Rename by name rather than by position so a changed column order in the
# shapefile cannot mislabel fields. rename() skips absent labels and the
# selection is explicit, so re-running this cell is a no-op; a missing source
# column raises a KeyError.
shape_df = shape_df.rename(columns={'geoid': 'tract_num'})[shape_keep_cols]

shape_df.head(1)

Unnamed: 0,tract_num,mtfcc,aland,awater,intptlat,intptlon,households,below_pove,hh_snap,hh_nowork,tract_name,below50pov,below125po,below150po,below185po,below200po,below300po,below400po,below500po,geometry
0,48201542301,G5020,5956733,15456,29.820499,-95.7316493,4081,152,181,2.5,"Census Tract 5423.01, Harris County, Texas",265,1299,1952,3181,3696,6695,9614,11731,"POLYGON ((-95.75224 29.83169, -95.75104 29.831..."


In [7]:
# Show the column names as read from the CSV, before any renaming or dropping.
refinery_df.columns

Index(['tract', 'company', 'corp', 'number_refineries', 'county', 'population',
       'respiratory_hi', 'neurological_hi', 'developmental_hi',
       'reproductive_hi', 'kidney_hi', 'immunological_hi', 'whole_body_hi',
       'total_cancer_risk_(per_million)',
       'pt-stationarypoint_cancer_risk_(per_million)',
       'benzene_cancer_risk_(per_million)',
       'coke_oven_emissions_cancer_risk_(per_million)',
       '1,3-butadiene_cancer_risk_(per_million)', 'benzene_(year_2017_tons)',
       'toluene_(year_2017_tons)', 'ethyl_benzene_(year_2017_tons)',
       'xylenes_(mixed_isomers)_(year_2017_tons)',
       'diesel_pm_(year_2017_tons)', '2,2,4-trimethylpentane_(year_2017_tons)',
       'coke_oven_emissions_(year_2017_tons)',
       '1,3-butadiene_(year_2017_tons)'],
      dtype='object')

In [8]:
# Source column -> short (<= 10 character) name for every column we keep, in
# the original column order. 'population' is intentionally absent, so it is
# dropped by the selection below.
# Renaming by name instead of assigning a positional list means a change in
# the CSV's column order cannot silently mislabel the data.
refinery_renames = {
    'tract': 'tract_num',
    'company': 'company',
    'corp': 'corp',
    'number_refineries': 'num_refine',
    'county': 'county',
    'respiratory_hi': 'respir_hi',
    'neurological_hi': 'neurol_hi',
    'developmental_hi': 'develop_hi',
    'reproductive_hi': 'reprodu_hi',
    'kidney_hi': 'kidney_hi',
    'immunological_hi': 'immuno_hi',
    'whole_body_hi': 'tot_bod_hi',
    'total_cancer_risk_(per_million)': 'total_r/m',
    'pt-stationarypoint_cancer_risk_(per_million)': 'ptstat_r/m',
    'benzene_cancer_risk_(per_million)': 'benzen_r/m',
    'coke_oven_emissions_cancer_risk_(per_million)': 'coke_r/m',
    '1,3-butadiene_cancer_risk_(per_million)': 'butadi_r/m',
    'benzene_(year_2017_tons)': 'benzene_t',
    'toluene_(year_2017_tons)': 'toluene_t',
    'ethyl_benzene_(year_2017_tons)': 'eth_benz_t',
    'xylenes_(mixed_isomers)_(year_2017_tons)': 'xylenes_t',
    'diesel_pm_(year_2017_tons)': 'diesel_t',
    '2,2,4-trimethylpentane_(year_2017_tons)': 'trimeth_t',
    'coke_oven_emissions_(year_2017_tons)': 'coke_t',
    '1,3-butadiene_(year_2017_tons)': 'butadien_t',
}

# rename() skips labels that are absent and the selection keeps only the
# mapped columns, so re-running this cell is a no-op rather than an error.
# A missing source column raises a KeyError here instead of passing unnoticed.
refinery_df = refinery_df.rename(columns=refinery_renames)[list(refinery_renames.values())]

refinery_df.head(1)

Unnamed: 0,tract_num,company,corp,num_refine,county,respir_hi,neurol_hi,develop_hi,reprodu_hi,kidney_hi,...,coke_r/m,butadi_r/m,benzene_t,toluene_t,eth_benz_t,xylenes_t,diesel_t,trimeth_t,coke_t,butadien_t
0,48000000000,No refineries,No refineries,0.0,Entire State,0.4,0.03,0.03,0.03,0.008,...,0.0,0.480044,,,,,,,,


## Check for duplicate column names

- the only column name shared between the files should be `tract_num`, which is used for joining

In [9]:
# cancer_df column names that also occur in shape_df or refinery_df.
cancer_df.columns[
    cancer_df.columns.isin(shape_df.columns) | cancer_df.columns.isin(refinery_df.columns)
].tolist()

['tract_num']

In [10]:
# refinery_df column names that also occur in shape_df or cancer_df.
refinery_df.columns[
    refinery_df.columns.isin(shape_df.columns) | refinery_df.columns.isin(cancer_df.columns)
].tolist()

['tract_num']

In [11]:
# shape_df column names that also occur in cancer_df or refinery_df.
shape_df.columns[
    shape_df.columns.isin(cancer_df.columns) | shape_df.columns.isin(refinery_df.columns)
].tolist()

['tract_num']

## Join dataframes into one and save to a shapefile

In [12]:
# (rows, columns) of each input frame, one per line, before joining.
print(shape_df.shape, refinery_df.shape, cancer_df.shape, sep='\n')

(5265, 20)
(5493, 25)
(5222, 7)


In [15]:
# `combined` has to exist before its columns can be listed. This cell sits
# above the cell that originally created it, so on a fresh kernel run from top
# to bottom it would raise a NameError. Build the merged frame here: left joins
# on tract_num keep every refinery_df row.
combined = (
    refinery_df
    .merge(cancer_df, on='tract_num', how='left')
    .merge(shape_df, on='tract_num', how='left')
)

combined.columns

Index(['tract_num', 'company', 'corp', 'num_refine', 'county', 'respir_hi',
       'neurol_hi', 'develop_hi', 'reprodu_hi', 'kidney_hi', 'immuno_hi',
       'tot_bod_hi', 'total_r/m', 'ptstat_r/m', 'benzen_r/m', 'coke_r/m',
       'butadi_r/m', 'benzene_t', 'toluene_t', 'eth_benz_t', 'xylenes_t',
       'diesel_t', 'trimeth_t', 'coke_t', 'butadien_t', 'county_num',
       'cancer_%', 'cancer_%_l', 'cancer_%_h', 'total_pop', 'geo_loc', 'mtfcc',
       'aland', 'awater', 'intptlat', 'intptlon', 'households', 'below_pove',
       'hh_snap', 'hh_nowork', 'tract_name', 'below50pov', 'below125po',
       'below150po', 'below185po', 'below200po', 'below300po', 'below400po',
       'below500po', 'geometry'],
      dtype='object')

In [13]:
# Left-join the cancer rates and then the tract geometry/demographic fields
# onto the refinery table, so every refinery_df row is kept.
combined = (
    refinery_df
    .merge(cancer_df, on='tract_num', how='left')
    .merge(shape_df, on='tract_num', how='left')
)

# A left join only adds rows when tract_num is duplicated on the right-hand
# side; fail loudly instead of writing silently duplicated rows.
assert len(combined) == len(refinery_df), (
    'merge changed the row count; check for duplicate tract_num values'
)

# Merging from a plain DataFrame yields a plain DataFrame, so wrap it again.
# Name the geometry column and carry over the shapefile's CRS explicitly
# rather than relying on inference. Rows with no matching tract in shape_df
# have no geometry.
combined_gpd = gpd.GeoDataFrame(combined, geometry='geometry', crs=shape_df.crs)

combined_gpd.columns

Index(['tract_num', 'company', 'corp', 'num_refine', 'county', 'respir_hi',
       'neurol_hi', 'develop_hi', 'reprodu_hi', 'kidney_hi', 'immuno_hi',
       'tot_bod_hi', 'total_r/m', 'ptstat_r/m', 'benzen_r/m', 'coke_r/m',
       'butadi_r/m', 'benzene_t', 'toluene_t', 'eth_benz_t', 'xylenes_t',
       'diesel_t', 'trimeth_t', 'coke_t', 'butadien_t', 'county_num',
       'cancer_%', 'cancer_%_l', 'cancer_%_h', 'total_pop', 'geo_loc', 'mtfcc',
       'aland', 'awater', 'intptlat', 'intptlon', 'households', 'below_pove',
       'hh_snap', 'hh_nowork', 'tract_name', 'below50pov', 'below125po',
       'below150po', 'below185po', 'below200po', 'below300po', 'below400po',
       'below500po', 'geometry'],
      dtype='object')

In [16]:
# Summarize the dtype and non-null count of every column in the merged GeoDataFrame.
combined_gpd.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 5493 entries, 0 to 5492
Data columns (total 50 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   tract_num   5493 non-null   int64   
 1   company     5493 non-null   object  
 2   corp        5493 non-null   object  
 3   num_refine  5493 non-null   float64 
 4   county      5493 non-null   object  
 5   respir_hi   5493 non-null   float64 
 6   neurol_hi   5493 non-null   float64 
 7   develop_hi  5493 non-null   float64 
 8   reprodu_hi  5493 non-null   float64 
 9   kidney_hi   5493 non-null   float64 
 10  immuno_hi   5493 non-null   float64 
 11  tot_bod_hi  5493 non-null   float64 
 12  total_r/m   5493 non-null   int64   
 13  ptstat_r/m  5493 non-null   float64 
 14  benzen_r/m  5493 non-null   float64 
 15  coke_r/m    5493 non-null   float64 
 16  butadi_r/m  5493 non-null   float64 
 17  benzene_t   867 non-null    float64 
 18  toluene_t   867 non-null    float64 
 19

In [14]:
# Check that the merged result is a GeoDataFrame rather than a plain pandas DataFrame.
type(combined_gpd)

geopandas.geodataframe.GeoDataFrame

In [15]:
# Write the merged GeoDataFrame to disk; index=False is passed so the row
# index is not written as an extra field.
combined_gpd.to_file(
    filename='./cleaned_datasets/all_data_txshp/all_data_shape.shp',
    index=False,
)