In [1]:
import warnings
from copy import deepcopy
import pandas as pd
import numpy as np

In [2]:
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [4]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [5]:
np.random.seed(42)

In [6]:
cait_df = pd.read_csv('data/CAIT Country CO2 Emissions.csv', encoding = 'latin-1', header = None)

In [7]:
cait_df_columns = list(cait_df.loc[1])

In [8]:
cait_df = deepcopy(cait_df[2:].reset_index())
cait_df.drop(['index'], axis = 1, inplace = True)
cait_df.columns = cait_df_columns
cait_df.rename({'Country': 'state_name', 'Year': 'year', 'Total CO2 Emissions Excluding Land-Use Change and Forestry (MtCO2)': 'mtco2'}, axis = 1, inplace = True)

In [9]:
cait_df.head()

Unnamed: 0,state_name,year,mtco2
0,Afghanistan,1850,
1,Albania,1850,
2,Algeria,1850,
3,Angola,1850,
4,Antigua & Barbuda,1850,


In [10]:
cait_df = deepcopy(cait_df[(cait_df['state_name'] != 'Cook Islands') & (cait_df['state_name'] != 'European Union (15)') & (cait_df['state_name'] != 'European Union (28)') & (cait_df['state_name'] != 'Niue') & (cait_df['state_name'] != 'World')])
cait_df = cait_df.reset_index()
cait_df.drop(['index'], axis = 1, inplace = True)

In [11]:
cait_df.loc[cait_df['state_name'] == 'Bahamas, The', 'state_name'] = 'Bahamas'
cait_df.loc[cait_df['state_name'] == 'Bosnia & Herzegovina', 'state_name'] = 'Bosnia and Herzegovina'
cait_df.loc[cait_df['state_name'] == 'Congo, Dem. Rep.', 'state_name'] = 'Democratic Republic of the Congo'
cait_df.loc[cait_df['state_name'] == 'Congo, Rep.', 'state_name'] = 'Congo'
cait_df.loc[cait_df['state_name'] == 'Gambia, The', 'state_name'] = 'Gambia'
cait_df.loc[cait_df['state_name'] == "Cote d'Ivoire", 'state_name'] = 'Ivory Coast'
cait_df.loc[cait_df['state_name'] == 'Korea, Dem. Rep. (North)', 'state_name'] = 'North Korea'
cait_df.loc[cait_df['state_name'] == 'Korea, Rep. (South)', 'state_name'] = 'South Korea'
cait_df.loc[cait_df['state_name'] == 'Macedonia, FYR', 'state_name'] = 'Macedonia'
cait_df.loc[cait_df['state_name'] == 'Russian Federation', 'state_name'] = 'Russia'
cait_df.loc[cait_df['state_name'] == 'Saint Kitts & Nevis', 'state_name'] = 'St. Kitts and Nevis'
cait_df.loc[cait_df['state_name'] == 'Saint Lucia', 'state_name'] = 'St. Lucia'
cait_df.loc[cait_df['state_name'] == 'Saint Vincent & Grenadines', 'state_name'] = 'St. Vincent and the Grenadines'
cait_df.loc[cait_df['state_name'] == 'Sao Tome & Principe', 'state_name'] = 'Sao Tome and Principe'
cait_df.loc[cait_df['state_name'] == 'Trinidad & Tobago', 'state_name'] = 'Trinidad and Tobago'
cait_df.loc[cait_df['state_name'] == 'United States', 'state_name'] = 'United States of America'

In [12]:
# Yugoslavia did not exist for all the years that is recorded for Serbia in the dataset.
# Since Serbia is missing from the master dataset and cait_df does not contain Yugoslavia,
# Serbia has been changed to Yugoslavia. Non-existant years will be removed after join.
cait_df.loc[cait_df['state_name'] == 'Serbia', 'state_name'] = 'Yugoslavia'

In [13]:
cait_df['year'] = cait_df['year'].astype(int)

In [14]:
# accounting for not having Czechoslovakia in this dataset
cait_df.loc[(cait_df['state_name'] == 'Czech Republic') & (cait_df['year'] < 1992), 'cow_code'] = 315
cait_df.loc[(cait_df['state_name'] == 'Czech Republic') & (cait_df['year'] < 1992), 'state_name'] = 'Czechoslovakia'

In [15]:
c_code_df = pd.read_pickle('pickle/c_code_df.pkl')

In [16]:
c_code_dic = {}

for i, state in enumerate(c_code_df['state_name']):
    c_code_dic[state] = c_code_df['c_code'][i]

In [17]:
missing_list = []
for i, country in enumerate(cait_df['state_name']):
    try:
        cait_df.loc[i, 'cow_code'] = c_code_dic[country]
    except:
        missing_list.append(country)

missing_list = sorted(list(set(missing_list)))
print(len(missing_list))

0


In [18]:
len(cait_df)

29829

In [19]:
yemen_df = cait_df[cait_df['state_name'] == 'Yemen']
yemen_df = yemen_df.reset_index()
yemen_df.drop(['index'], axis = 1, inplace = True)
yemen_df['year'] = yemen_df['year'].astype(int)

In [20]:
# regular "yemen" will still exist between 1966 and 1989, but it won't matter after joins
cait_df_index = len(cait_df) - 1
for i, carbon_amt in enumerate(yemen_df['mtco2']):
    if (yemen_df.loc[i, 'year'] >= 1966) and (yemen_df.loc[i, 'year'] <= 1989):
        cait_df.loc[cait_df_index, 'state_name'] = "Yemen People's Republic"
        cait_df.loc[cait_df_index, 'cow_code'] = 680
        cait_df.loc[cait_df_index, 'year'] = yemen_df.loc[i, 'year']
        cait_df.loc[cait_df_index, 'mtco2'] = carbon_amt
        cait_df_index+=1
    if (yemen_df.loc[i, 'year'] >= 1946) and (yemen_df.loc[i, 'year'] <= 1989):
        cait_df.loc[cait_df_index, 'state_name'] = 'Yemen Arab Republic'
        cait_df.loc[cait_df_index, 'cow_code'] = 678
        cait_df.loc[cait_df_index, 'year'] = yemen_df.loc[i, 'year']
        cait_df.loc[cait_df_index, 'mtco2'] = carbon_amt
        cait_df_index+=1

In [21]:
germany_df = cait_df[cait_df['state_name'] == 'Germany']
germany_df = germany_df.reset_index()
germany_df.drop(['index'], axis = 1, inplace = True)
germany_df['year'] = germany_df['year'].astype(int)

In [22]:
# regular "germany" will still exist between 1948 and 1989, but it won't matter after joins
cait_df_index = len(cait_df) - 1
for i, carbon_amt in enumerate(germany_df['mtco2']):
    if (germany_df.loc[i, 'year'] >= 1948) and (germany_df.loc[i, 'year'] <= 1989):
        cait_df.loc[cait_df_index, 'state_name'] = 'German Federal Republic'
        cait_df.loc[cait_df_index, 'cow_code'] = 260
        cait_df.loc[cait_df_index, 'year'] = germany_df.loc[i, 'year']
        cait_df.loc[cait_df_index, 'mtco2'] = carbon_amt
        cait_df_index+=1
    if (germany_df.loc[i, 'year'] >= 1948) and (germany_df.loc[i, 'year'] <= 1989):
        cait_df.loc[cait_df_index, 'state_name'] = 'German Democratic Republic'
        cait_df.loc[cait_df_index, 'cow_code'] = 265
        cait_df.loc[cait_df_index, 'year'] = germany_df.loc[i, 'year']
        cait_df.loc[cait_df_index, 'mtco2'] = carbon_amt
        cait_df_index+=1

In [23]:
cait_df[cait_df['cow_code'].isnull()]

Unnamed: 0,state_name,year,mtco2,cow_code


In [24]:
world_bank_df = pd.read_csv('data/worldbank_emissions.csv', sep = '\t', encoding = 'latin-1', header = None)
world_bank_columns = list(world_bank_df.loc[0])

In [25]:
world_bank_df = deepcopy(world_bank_df[1:].reset_index())
world_bank_df.drop(['index'], axis = 1, inplace = True)
world_bank_df.columns = world_bank_columns
world_bank_df.drop(['Country Code', 'Indicator Name', 'Indicator Code'], axis = 1, inplace = True)
world_bank_df.rename({'Country Name': 'state_name'}, axis = 1, inplace = True)

In [26]:
world_bank_df = deepcopy(world_bank_df.T.reset_index())
world_bank_columns = world_bank_df.loc[0]
world_bank_df = world_bank_df[1:]
world_bank_df.columns = world_bank_columns

In [27]:
# Keep only countries missing from the master dataset
world_bank_df = world_bank_df[['Liechtenstein', 'Tuvalu', 'Marshall Islands', 'Micronesia, Fed. Sts.', 'San Marino', 'Timor-Leste', 'Somalia']]

In [28]:
world_bank_df['year'] = list(np.arange(1960, 2018))

In [29]:
world_bank_df = world_bank_df[world_bank_df['year'] <= 2014]

In [30]:
world_bank_df.drop(['San Marino'], axis = 1, inplace = True)
world_bank_df.rename({'Micronesia, Fed. Sts.': 'Federated States of Micronesia', 'Timor-Leste': 'East Timor'}, axis = 1, inplace = True)

In [31]:
# need a row for every year each of these countries appears in the master dataset.
# will impute missing values later.
# years where a country is not present will be left out during join
full_years = list(reversed(np.arange(1945, 2018)))
for i, year in enumerate(full_years):
    if year not in list(world_bank_df['year']):
        world_bank_df.loc[i, 'year'] = year

In [32]:
world_bank_df.head()

Unnamed: 0,Liechtenstein,Tuvalu,Marshall Islands,Federated States of Micronesia,East Timor,Somalia,year
1,,,,,,0.031,2016.0
2,,,,,,0.031,2015.0
3,,,,,,0.037,1962.0
4,,,,,,0.035,1963.0
5,,,,,,0.045,1964.0


In [33]:
liechtenstein_df = deepcopy(world_bank_df[['Liechtenstein', 'year']])
liechtenstein_df.rename({'Liechtenstein': 'state_name'}, axis = 1, inplace = True)
liechtenstein_df['state_name'] = 'Liechtenstein'
liechtenstein_df['cow_code'] = 223
tuvalu_df = deepcopy(world_bank_df[['Tuvalu', 'year']])
tuvalu_df.rename({'Tuvalu': 'state_name'}, axis = 1, inplace = True)
tuvalu_df['state_name'] = 'Tuvalu'
tuvalu_df['cow_code'] = 947
marshall_islands_df = deepcopy(world_bank_df[['Marshall Islands', 'year']])
marshall_islands_df.rename({'Marshall Islands': 'state_name'}, axis = 1, inplace = True)
marshall_islands_df['state_name'] = 'Marshall Islands'
marshall_islands_df['cow_code'] = 983
micronesia_df = deepcopy(world_bank_df[['Federated States of Micronesia', 'year']])
micronesia_df.rename({'Federated States of Micronesia': 'state_name'}, axis = 1, inplace = True)
micronesia_df['state_name'] = 'Federated States of Micronesia'
micronesia_df['cow_code'] = 987
east_timor_df = deepcopy(world_bank_df[['East Timor', 'year']])
east_timor_df.rename({'East Timor': 'state_name'}, axis = 1, inplace = True)
east_timor_df['state_name'] = 'East Timor'
east_timor_df['cow_code'] = 860
somalia_df = deepcopy(world_bank_df[['Somalia', 'year']])
somalia_df.rename({'Somalia': 'state_name'}, axis = 1, inplace = True)
somalia_df['state_name'] = 'Somalia'
somalia_df['cow_code'] = 520

In [34]:
taiwan_df = pd.read_csv('data/taiwan_co2_emissions.csv', encoding = 'latin-1', header = None)
taiwan_df_columns = list(taiwan_df.loc[1])

In [35]:
taiwan_df = deepcopy(taiwan_df[2:].reset_index())
taiwan_df.drop(['index'], axis = 1, inplace = True)
taiwan_df.columns = taiwan_df_columns
taiwan_df.rename({'Country': 'state_name', 'Year': 'year', 'Total CO2 Emissions': 'mtco2'}, axis = 1, inplace = True)

In [36]:
taiwan_df = taiwan_df.sort_values(by = 'year', ascending = False)
taiwan_df = deepcopy(taiwan_df.reset_index())
taiwan_df.drop(['index'], axis = 1, inplace = True)

In [37]:
# need a row for every year Taiwan appears in the master dataset.
# will impute missing values later.
taiwan_years = list(reversed(np.arange(1948, 2018)))
for i, year in enumerate(taiwan_years):
    if year not in list(taiwan_df['year']):
        taiwan_df.loc[i, 'year'] = year
        taiwan_df.loc[i, 'state_name'] = 'Taiwan'
        taiwan_df.loc[i, 'cow_code'] = 713

In [38]:
len(cait_df)

29979

In [39]:
cait_df = deepcopy(pd.concat([cait_df, liechtenstein_df, tuvalu_df,
                              marshall_islands_df, micronesia_df,
                              east_timor_df, somalia_df,
                              taiwan_df], axis = 0))
cait_df['mtco2'] = cait_df['mtco2'].astype(float)

In [40]:
len(cait_df)

30487

In [41]:
cait_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30487 entries, 0 to 69
Data columns (total 4 columns):
cow_code      30487 non-null float64
mtco2         17048 non-null float64
state_name    30487 non-null object
year          30487 non-null float64
dtypes: float64(3), object(1)
memory usage: 1.2+ MB


In [42]:
cait_df.head()

Unnamed: 0,cow_code,mtco2,state_name,year
0,700.0,,Afghanistan,1850.0
1,339.0,,Albania,1850.0
2,615.0,,Algeria,1850.0
3,540.0,,Angola,1850.0
4,58.0,,Antigua & Barbuda,1850.0


In [44]:
cait_df[cait_df['mtco2'] < 0]

Unnamed: 0,cow_code,mtco2,state_name,year
190,900.0,-0.062,Australia,1851.0
373,900.0,-0.055,Australia,1852.0
556,900.0,-0.11,Australia,1853.0
739,900.0,-0.128,Australia,1854.0
922,900.0,-0.132,Australia,1855.0
1105,900.0,-0.183,Australia,1856.0
1288,900.0,-0.209,Australia,1857.0
1471,900.0,-0.245,Australia,1858.0
1654,900.0,-0.377,Australia,1859.0
14818,101.0,-1.165,Venezuela,1930.0


In [43]:
cait_df.to_pickle('pickle/cait_df.pkl')
cait_df.to_csv('dataframe_exports/cait_df.csv')