## Imports and data loading

In [1]:
import pandas as pd
import numpy as np

# Load the energy dataset and the life-expectancy dataset from the data folder.
main_df = pd.read_csv('../data/energy_data.csv')
life_exp = pd.read_excel('../data/life_expectancy.xlsx')

# Countries that appear in the energy data but not in the life-expectancy data.
# The lookup set is built once here; building it inside the comprehension's
# condition would reconstruct it for every candidate country.
life_exp_countries = set(life_exp['country'])
not_in_countries = [country for country in set(main_df['country'])
                    if country not in life_exp_countries]

## Cleaning
---
- Reordering columns
- Replacing `'--'` with nulls
- Dropping countries not found in the World Bank data, and years 2019 and 2020
- Casting all data columns (everything after `country` and `year`) to floats

In [2]:
# Re-order the columns by position.
# The first two columns stay in front; the remaining positional ranges are
# stitched together in the order listed below.
# NOTE: the slices are relative to the *current* column order, so running this
# cell a second time on the already re-ordered frame permutes the columns again.
column_slices = [
    slice(None, 2),
    slice(27, 28),
    slice(32, None),
    slice(24, 27),
    slice(2, 24),
    slice(28, 32),
]
original_order = main_df.columns.to_list()
cols = [name for piece in column_slices for name in original_order[piece]]
main_df = main_df[cols]

In [3]:
# Replace the '--' placeholder with np.nan.
# The result is rebound to main_df rather than using inplace=True, so the
# statement can be chained and does not mutate a frame shown by an earlier cell.
main_df = main_df.replace(to_replace='--', value=np.nan)

In [4]:
# Select the rows to discard: countries listed in not_in_countries,
# plus every row whose year is 2019 or 2020.
rows_to_drop = (main_df['country'].isin(not_in_countries)
                | main_df['year'].isin([2019, 2020]))
drop_index = main_df[rows_to_drop].index

# Drop by index label and rebind (no inplace=True); the surviving rows keep
# their original index labels.
main_df = main_df.drop(index=drop_index)

In [5]:
# Cast every column from the third one onward to float64.
# float_cols maps each of those column names to the target dtype.
float_cols = dict.fromkeys(main_df.columns[2:], 'float64')
main_df = main_df.astype(float_cols)

In [6]:
# Checkpoint: write the cleaned frame to CSV without the index column.
# Temporary step -- to be deleted before submitting.
main_df.to_csv(path_or_buf='../data/cleaned_energy_data.csv', index=False)

# Display the cleaned frame as the cell output.
main_df

Unnamed: 0,country,year,population,life_expectancy,consumption_per_capita,consumption_per_GDP,ppp_2015USD,production_total,production_coal,production_natural_gas,...,imports_coal,imports_electricity,exports_crude_oil,exports_natural_gas,exports_coal,exports_electricity,emissions_co2_emissions,emissions_coal_and_coke,emissions_consumed_natural_gas,emissions_petroleum_and_other_liquids
0,World,1980,4.298127e+06,62.841745,68.155646,10.558174,27745.479547,296.214353,79.991943,54.761046,...,284893.825000,145.536984,30580.206,,297478.529700,150.131648,18671.570672,7455.939754,2843.422967,8372.207951
1,Afghanistan,1980,1.335650e+04,43.244000,1.990283,0.000000,,0.072561,0.002355,0.062820,...,0.000000,0.000000,0.000,,0.000000,0.000000,1.325965,0.231314,0.111101,0.983550
2,Albania,1980,2.682700e+03,70.208000,60.752906,0.000000,,0.155562,0.013229,0.010470,...,176.369600,0.000000,0.000,,0.000000,0.500000,9.618526,2.379753,0.555503,6.683270
3,Algeria,1980,1.922170e+04,58.198000,40.615303,0.000000,,2.803017,0.000076,0.484980,...,99.062575,0.070000,713.534,,0.000000,0.061000,45.663591,0.237672,28.799159,16.626760
4,American Samoa,1980,3.264600e+01,,180.515604,0.000000,,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000,,0.000000,0.000000,0.425071,0.000000,0.000000,0.425071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9002,Venezuela,2018,2.888713e+04,72.128000,91.868677,9.243563,287.099500,4.961309,0.012681,1.028201,...,4.086969,0.000000,1002.677,0.0,96.077120,1.023000,128.298044,0.127186,51.365908,76.804951
9003,Vietnam,2018,9.551330e+04,75.317000,37.603299,4.198402,855.472000,2.758733,1.115381,0.301613,...,25541.550070,3.124000,66.880,0.0,1922.007365,1.509000,240.322623,158.668579,15.837385,65.816659
9006,Yemen,2018,2.849850e+04,66.096000,4.336396,2.376919,51.992000,0.103735,0.000000,0.003439,...,136.686440,0.000000,0.000,0.0,0.000000,0.000000,8.368932,0.297799,0.182775,7.888358
9007,Zambia,2018,1.736372e+04,63.510000,12.207614,0.870219,243.581900,0.153048,0.028917,0.000000,...,97.105841,0.152000,0.000,0.0,2.994003,1.225000,7.143270,2.894102,0.000000,4.249168
