## Data Upload

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io

from google.colab import files
uploaded = files.upload()

Saving ghgp_data_2020.csv to ghgp_data_2020.csv


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
df = pd.read_csv(io.BytesIO(uploaded['ghgp_data_2020.csv']))

## Exploratory Data Analysis (EDA)

In [3]:
# Rows and columns
df.shape

(6515, 66)

In [4]:
# All headings
df.columns

Index(['Facility Id', 'FRS Id', 'Facility Name', 'City', 'State', 'Zip Code',
       'Address', 'County', 'Latitude', 'Longitude', 'Primary NAICS Code',
       'Industry Type (subparts)', 'Industry Type (sectors)',
       'Total reported direct emissions', 'CO2 emissions (non-biogenic) ',
       'Methane (CH4) emissions ', 'Nitrous Oxide (N2O) emissions ',
       'HFC emissions', 'PFC emissions', 'SF6 emissions ', 'NF3 emissions',
       'Other Fully Fluorinated GHG emissions', 'HFE emissions',
       'Very Short-lived Compounds emissions', 'Other GHGs (metric tons CO2e)',
       'Biogenic CO2 emissions (metric tons)', 'Stationary Combustion',
       'Electricity Generation', 'Adipic Acid Production',
       'Aluminum Production', 'Ammonia Manufacturing', 'Cement Production',
       'Electronics Manufacture', 'Ferroalloy Production',
       'Fluorinated GHG Production', 'Glass Production',
       'HCFC22 Production from HFC23 Destruction', 'Hydrogen Production',
       'Iron and Steel 

In [5]:
# Data preview
df.head()

Unnamed: 0,Facility Id,FRS Id,Facility Name,City,State,Zip Code,Address,County,Latitude,Longitude,...,Titanium Dioxide Production,Underground Coal Mines,Zinc Production,Municipal Landfills,Industrial Wastewater Treatment,Manufacture of Electric Transmission and Distribution Equipment,Industrial Waste Landfills,CO2 used for NH3 / Lime?,Supplier of CO2?,CEMS?
0,1004377,110000000000.0,121 REGIONAL DISPOSAL FACILITY,MELISSA,TX,75454,3820 SAM RAYBURN HIGHWAY,COLLIN COUNTY,33.3,-96.54,...,,,,504064.0,,,,N,N,N
1,1000112,110000000000.0,23rd and 3rd,BROOKLYN,NY,11232,730 3rd Avenue,Kings,40.66,-74.0,...,,,,,,,,N,N,N
2,1013621,110000000000.0,3Bear Libby Gas Plant,Hobbs,NM,88240,674 Marathon Rd,LEA COUNTY,32.54,-103.52,...,,,,,,,,N,N,N
3,1003742,110000000000.0,31st Street Landfill,WESTCHESTER,IL,60154,11700 W 31ST ST,COOK COUNTY,41.84,-87.92,...,,,,100165.5,,,,N,N,N
4,1003188,110000000000.0,3M BROWNWOOD,BROWNWOOD,TX,76801,4501 HIGHWAY 377 SOUTH,BROWN COUNTY,31.67,-99.0,...,,,,,,,,N,N,N


In [6]:
# Looking at individual columns
df.iloc[:,3]

0           MELISSA
1          BROOKLYN
2             Hobbs
3       WESTCHESTER
4         BROWNWOOD
           ...     
6510          Aiken
6511           ZION
6512         DENVER
6513       Portland
6514    waynesville
Name: City, Length: 6515, dtype: object

### Meta Data
- Checking for data type (int/float v. object)
- Checking for missing data
- Some numeric data is displaying as "object" (may need to convert dtype)
- Missing data 

In [7]:
# Meta Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6515 entries, 0 to 6514
Data columns (total 66 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Facility Id                                                      6515 non-null   int64  
 1   FRS Id                                                           6405 non-null   float64
 2   Facility Name                                                    6515 non-null   object 
 3   City                                                             6515 non-null   object 
 4   State                                                            6515 non-null   object 
 5   Zip Code                                                         6515 non-null   int64  
 6   Address                                                          6038 non-null   object 
 7   County                                    

In [8]:
# Missing data ("True")
df.isna()

Unnamed: 0,Facility Id,FRS Id,Facility Name,City,State,Zip Code,Address,County,Latitude,Longitude,...,Titanium Dioxide Production,Underground Coal Mines,Zinc Production,Municipal Landfills,Industrial Wastewater Treatment,Manufacture of Electric Transmission and Distribution Equipment,Industrial Waste Landfills,CO2 used for NH3 / Lime?,Supplier of CO2?,CEMS?
0,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,True,True,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,True,True,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6510,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False
6511,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False
6512,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False
6513,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,False,False,False


## Data Cleaning

- Removing columns not relevant to analysis
- Data to include: facility name/ID, state/city, latitude/longitude, regulation subparts, industry type, CO2e emissions, emissions by process, checks for utilization and injection of CO2

In [9]:
# Create new df for summary of site information

site_summary=df.iloc[:,[2,3,4,5,8,9,11,12,13,14,25]]
site_summary.index=df['Facility Id']
site_summary.head()

Unnamed: 0_level_0,Facility Name,City,State,Zip Code,Latitude,Longitude,Industry Type (subparts),Industry Type (sectors),Total reported direct emissions,CO2 emissions (non-biogenic),Biogenic CO2 emissions (metric tons)
Facility Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1004377,121 REGIONAL DISPOSAL FACILITY,MELISSA,TX,75454,33.3,-96.54,HH,Waste,504064.0,,
1000112,23rd and 3rd,BROOKLYN,NY,11232,40.66,-74.0,"C,D",Power Plants,60819.7,60751.2,
1013621,3Bear Libby Gas Plant,Hobbs,NM,88240,32.54,-103.52,"C,W-PROC",Petroleum and Natural Gas Systems,40521.81,39555.5,
1003742,31st Street Landfill,WESTCHESTER,IL,60154,41.84,-87.92,"C,HH",Waste,100500.95,334.9,
1003188,3M BROWNWOOD,BROWNWOOD,TX,76801,31.67,-99.0,"C,N",Minerals,29359.64,29329.5,


In [10]:
# Confirming same number of rows
site_summary.shape

(6515, 11)

In [11]:
# Create new df for industrial sector data
# How to select columns including 25 - 64?
industries=df.iloc[:,[11,12,13,14,25,64]]
industries.index=df['Facility Id']
industries.head()

Unnamed: 0_level_0,Industry Type (subparts),Industry Type (sectors),Total reported direct emissions,CO2 emissions (non-biogenic),Biogenic CO2 emissions (metric tons),Supplier of CO2?
Facility Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1004377,HH,Waste,504064.0,,,N
1000112,"C,D",Power Plants,60819.7,60751.2,,N
1013621,"C,W-PROC",Petroleum and Natural Gas Systems,40521.81,39555.5,,N
1003742,"C,HH",Waste,100500.95,334.9,,N
1003188,"C,N",Minerals,29359.64,29329.5,,N


In [12]:
# Basic statistics on quantitative data
site_summary.iloc[:,[-2]].describe()

Unnamed: 0,CO2 emissions (non-biogenic)
count,5858.0
mean,380080.8
std,1102021.0
min,0.0
25%,22828.57
50%,52692.1
75%,179787.5
max,17097800.0
