# Cleaning and Analysis of CO2 data
- Author: Bryan Flores
- Date: 8/31/2021
- Version: 1.1

In [1]:
''' Dependencies '''
import pandas as pd
import numpy as np

''' Suppress Warnings '''
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw CO2 dataset (owid-co2-data.csv) into a DataFrame.
# The path is relative to the notebook's working directory so the notebook
# runs on any machine, matching the './data/' location used by the export
# cell at the end of this notebook.
# NOTE(review): assumes the notebook is run from the project's "Phase 2"
# folder, where data/owid-co2-data.csv lives — confirm.
co2 = pd.read_csv('./data/owid-co2-data.csv')

# Preview the first 10 rows to sanity-check the load.
co2.head(10)

Unnamed: 0,iso_code,country,year,co2,co2_growth_prct,co2_growth_abs,consumption_co2,trade_co2,trade_co2_share,co2_per_capita,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp
0,AFG,Afghanistan,1949,0.015,,,,,,0.002,...,,,,,,,,,7663783.0,
1,AFG,Afghanistan,1950,0.084,475.0,0.07,,,,0.011,...,,,,,,,,,7752000.0,19494800000.0
2,AFG,Afghanistan,1951,0.092,8.696,0.007,,,,0.012,...,,,,,,,,,7840000.0,20063850000.0
3,AFG,Afghanistan,1952,0.092,,,,,,0.012,...,,,,,,,,,7936000.0,20742350000.0
4,AFG,Afghanistan,1953,0.106,16.0,0.015,,,,0.013,...,,,,,,,,,8040000.0,22015460000.0
5,AFG,Afghanistan,1954,0.106,,,,,,0.013,...,,,,,,,,,8151000.0,22483330000.0
6,AFG,Afghanistan,1955,0.154,44.828,0.048,,,,0.019,...,,,,,,,,,8271000.0,22929890000.0
7,AFG,Afghanistan,1956,0.183,19.048,0.029,,,,0.022,...,,,,,,,,,8399000.0,23959930000.0
8,AFG,Afghanistan,1957,0.293,60.0,0.11,,,,0.034,...,,,,,,,,,8535000.0,23961910000.0
9,AFG,Afghanistan,1958,0.33,12.5,0.037,,,,0.038,...,,,,,,,,,8680000.0,25307440000.0


#### Step 1: Create subset of data for only the United States.
- Inspect 'country' and 'iso_code' fields for missing values

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. The text fields 'country' and 'iso_code' do not appear here; their
# non-null counts are shown by co2.info() in the next cell.
co2.describe()

Unnamed: 0,year,co2,co2_growth_prct,co2_growth_abs,consumption_co2,trade_co2,trade_co2_share,co2_per_capita,consumption_co2_per_capita,share_global_co2,...,ghg_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp
count,23708.0,23170.0,21907.0,22014.0,3350.0,3318.0,3318.0,22380.0,3350.0,23100.0,...,5155.0,5211.0,5157.0,5211.0,5157.0,6044.0,6044.0,6044.0,21071.0,13002.0
mean,1951.670997,270.245818,17.707864,5.733241,222.596056,3.673305,27.047046,4.059419,6.556161,5.143504,...,8.099506,82.25337,2.046923,29.308135,0.60669,1638.281196,27401.710621,2.293554,59275390.0,439660500000.0
std,53.828319,1509.911932,290.770226,54.592216,773.26139,100.251878,52.599781,15.340912,7.303592,17.985974,...,9.387477,566.702756,3.534126,199.291118,0.844671,9666.490291,33293.462203,1.681657,373074300.0,3666682000000.0
min,1750.0,-1.165,-2835.714,-860.211,-1.344,-1487.155,-118.238,-2.675,-0.396,-0.03,...,-50.487,0.0,0.0,0.0,0.0,0.208,111.472,0.048,1000.0,63780000.0
25%,1922.0,0.546,-1.071,-0.011,8.942,0.0,0.0,0.234,1.1005,0.006,...,2.514,2.15,0.715,0.535,0.228,46.319,5829.803,1.247,1333018.0,8926728000.0
50%,1965.0,5.168,4.363,0.088,36.854,2.348,16.3195,1.2085,4.04,0.0715,...,5.474,9.03,1.111,3.59,0.384,148.7905,15359.9885,1.8235,4856304.0,29662170000.0
75%,1993.0,44.799,11.453,1.271,141.397,9.5455,40.842,4.633,9.82975,0.625,...,10.299,31.055,1.69,10.57,0.607,518.953,37103.69825,2.7005,15944070.0,121000000000.0
max,2019.0,36441.388,20100.0,1667.711,8959.962,533.928,1470.066,794.334,58.736,100.0,...,86.991,8660.01,39.812,3054.0,8.239,153848.433,317582.244,14.939,7713468000.0,106561000000000.0


In [4]:
# Per-column non-null counts and dtypes; comparing each count with the total
# number of entries shows how many values are missing in that column
# (including 'country' and 'iso_code').
co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23708 entries, 0 to 23707
Data columns (total 55 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   iso_code                             20930 non-null  object 
 1   country                              23708 non-null  object 
 2   year                                 23708 non-null  int64  
 3   co2                                  23170 non-null  float64
 4   co2_growth_prct                      21907 non-null  float64
 5   co2_growth_abs                       22014 non-null  float64
 6   consumption_co2                      3350 non-null   float64
 7   trade_co2                            3318 non-null   float64
 8   trade_co2_share                      3318 non-null   float64
 9   co2_per_capita                       22380 non-null  float64
 10  consumption_co2_per_capita           3350 non-null   float64
 11  share_global_co2            

##### Notes for CO2 information:
- No missing values for 'country'; 2,778 missing for 'iso_code' (23,708 total − 20,930 non-null, roughly 3e3) --> can be easily filled in
- Includes data for gases other than CO2 --> subset #1 will include all countries but only CO2-related fields
- Lots of missing values --> research each one's importance and make decisions from there

In [5]:
# Subset #1: keep every country/row, but remove the fields that describe
# other greenhouse gases (methane, nitrous oxide, aggregate GHG).
non_co2_gas_columns = [
    'methane',
    'methane_per_capita',
    'nitrous_oxide',
    'nitrous_oxide_per_capita',
    'total_ghg',
    'ghg_per_capita',
]
co2_sub_1 = co2.drop(columns=non_co2_gas_columns)

# Confirm the remaining columns and their non-null counts.
co2_sub_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23708 entries, 0 to 23707
Data columns (total 49 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   iso_code                             20930 non-null  object 
 1   country                              23708 non-null  object 
 2   year                                 23708 non-null  int64  
 3   co2                                  23170 non-null  float64
 4   co2_growth_prct                      21907 non-null  float64
 5   co2_growth_abs                       22014 non-null  float64
 6   consumption_co2                      3350 non-null   float64
 7   trade_co2                            3318 non-null   float64
 8   trade_co2_share                      3318 non-null   float64
 9   co2_per_capita                       22380 non-null  float64
 10  consumption_co2_per_capita           3350 non-null   float64
 11  share_global_co2            

In [7]:
# Write subset #1 to disk as CSV.
# NOTE(review): with default arguments, to_csv also writes the row index as an
# extra unnamed first column — confirm downstream readers expect that
# (otherwise pass index=False).
# NOTE(review): the relative path requires an existing ./data directory under
# the current working directory.
co2_sub_1.to_csv('./data/co2_subset_1.csv')