## Imports

In [1]:
import pandas as pd
import numpy as np

# Load the energy dataset and the life-expectancy table from the data directory.
main_df = pd.read_csv('../data/energy_data.csv')
life_exp = pd.read_excel('../data/life_expectancy.xlsx')

# Countries present in the energy data that have no entry in the life-expectancy
# table. The lookup set is built once here; placing set(...) inside the
# comprehension's condition would rebuild it for every country tested.
life_exp_countries = set(life_exp['country'])
not_in_countries = [country for country in set(main_df['country'])
                    if country not in life_exp_countries]

## Cleaning
---
- Reordering columns
- Replacing `'--'` with nulls
- Dropping:
    - countries not found in the World Bank life-expectancy data, and the years 2019 and 2020
    - years before 1992 and rows with no life-expectancy value
    - countries with null data in any column other than `production_nuclear`/`consumption_nuclear`
- Casting every column after `country` and `year` to `float64`

In [2]:
# Rearrange the columns by position: the first two stay in front, followed by
# position 27, everything from position 32 onward, positions 24-26,
# positions 2-23 and finally positions 28-31.
# NOTE(review): these slices are purely positional, so they assume the column
# layout of the CSV as loaded -- confirm if the source file ever changes.
cols = list(main_df.columns)
column_blocks = (cols[:2], cols[27:28], cols[32:], cols[24:27], cols[2:24], cols[28:32])
cols = [name for block in column_blocks for name in block]
main_df = main_df[cols]

In [3]:
# Swap every '--' placeholder cell for a real missing value (np.nan).
main_df = main_df.replace('--', np.nan)

In [4]:
# Stage 1: remove rows for countries missing from the life-expectancy table,
# plus every row for the years 2019 and 2020.
unmatched_country = main_df['country'].isin(not_in_countries)
excluded_year = main_df['year'].eq(2019) | main_df['year'].eq(2020)
drop_index = main_df.index[unmatched_country | excluded_year]

main_df = main_df.drop(index=drop_index)

# Stage 2: countries found, by iteratively dropping whichever ones still
# produced nulls, to need removing; they are listed here so they can all be
# dropped at once.
drop_countries = [
    'American Samoa', 'British Virgin Islands', 'Cyprus', 'Czech Republic', 'Eritrea',
    'Faroe Islands', 'French Polynesia', 'Gibraltar', 'Greenland', 'Guam', 'Iceland',
    'Kosovo', 'Micronesia', 'Moldova', 'Montenegro', 'Nauru', 'New Caledonia',
    'Northern Mariana Islands', 'Puerto Rico', 'Serbia', 'Slovakia', 'South Sudan',
    'Timor-Leste', 'Trinidad and Tobago', 'Turks and Caicos Islands', 'Tuvalu',
    'U.S. Virgin Islands',
]

# Also remove every year before 1992 and any row without a life-expectancy value.
listed_country = main_df['country'].isin(drop_countries)
before_1992 = main_df['year'].lt(1992)
no_life_expectancy = main_df['life_expectancy'].isna()
drop_index = main_df.index[listed_country | before_1992 | no_life_expectancy]

main_df = main_df.drop(index=drop_index)

In [5]:
# Cast every column from the third one onward to float64; the first two
# columns are left with their current dtypes.
float_cols = dict.fromkeys(main_df.columns[2:], 'float64')
main_df = main_df.astype(float_cols)

In [6]:
# Checkpoint: write the cleaned frame to disk, leaving the index out of the file.
cleaned_csv_path = '../data/cleaned_energy_data.csv'
main_df.to_csv(cleaned_csv_path, index=False)

# Show the cleaned frame as this cell's output.
main_df

Unnamed: 0,country,year,population,life_expectancy,consumption_per_capita,consumption_per_GDP,ppp_2015USD,production_total,production_coal,production_natural_gas,...,imports_coal,imports_electricity,exports_crude_oil,exports_natural_gas,exports_coal,exports_electricity,emissions_co2_emissions,emissions_coal_and_coke,emissions_consumed_natural_gas,emissions_petroleum_and_other_liquids
2772,World,1992,5.481774e+06,65.769659,64.426755,6.923158,51013.270526,353.718416,92.558725,76.984223,...,481679.269000,400.464479,30667.877,17854.85234,494096.467500,400.795117,22529.424264,8587.186368,4076.853622,9865.384274
2773,Afghanistan,1992,1.448550e+04,51.641000,2.159523,1.042253,30.013600,0.016143,0.000158,0.011092,...,0.000000,0.131000,0.000,0.00000,0.000000,0.000000,1.704078,0.015551,0.588528,1.100000
2774,Albania,1992,3.245890e+03,71.802000,25.937798,8.087302,10.410300,0.065519,0.003415,0.003760,...,104.719450,0.104000,0.000,0.00000,0.000000,0.560000,3.901890,0.502372,0.199519,3.200000
2775,Algeria,1992,2.702830e+04,67.575000,48.164981,5.592648,232.773000,5.031036,0.000380,2.325281,...,1139.243484,0.125000,279.400,1240.61590,0.000000,1.061000,74.390488,2.687475,45.703013,26.000000
2777,Angola,1992,1.265740e+04,45.230000,6.689229,1.423070,59.496900,1.152349,0.000000,0.021076,...,0.000000,0.000000,508.400,0.00000,0.000000,0.000000,5.218203,0.000000,1.118203,4.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9002,Venezuela,2018,2.888713e+04,72.128000,91.868677,9.243563,287.099500,4.961309,0.012681,1.028201,...,4.086969,0.000000,1002.677,0.00000,96.077120,1.023000,128.298044,0.127186,51.365908,76.804951
9003,Vietnam,2018,9.551330e+04,75.317000,37.603299,4.198402,855.472000,2.758733,1.115381,0.301613,...,25541.550070,3.124000,66.880,0.00000,1922.007365,1.509000,240.322623,158.668579,15.837385,65.816659
9006,Yemen,2018,2.849850e+04,66.096000,4.336396,2.376919,51.992000,0.103735,0.000000,0.003439,...,136.686440,0.000000,0.000,0.00000,0.000000,0.000000,8.368932,0.297799,0.182775,7.888358
9007,Zambia,2018,1.736372e+04,63.510000,12.207614,0.870219,243.581900,0.153048,0.028917,0.000000,...,97.105841,0.152000,0.000,0.00000,2.994003,1.225000,7.143270,2.894102,0.000000,4.249168


In [7]:
# Number of missing values remaining in each column of the cleaned frame.
main_df.isna().sum()

country                                        0
year                                           0
population                                     0
life_expectancy                                0
consumption_per_capita                         0
consumption_per_GDP                            0
ppp_2015USD                                    0
production_total                               0
production_coal                                0
production_natural_gas                         0
production_petrolium_and_other_liquids         0
production_nuclear                          3942
production_nuclear_renewables_and_other        0
production_renewables_and_other                0
consumption_total                              0
consumption_coal                               0
consumption_natural_gas                        0
consumption_petrolium_and_other_liquids        0
consumption_nuclear                         3942
consumption_nuclear_renewables_and_other       0
consumption_renewabl