# Exploratory Data Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Raw energy dataset, loaded with all of its columns.
wec = pd.read_csv('World Energy Consumption.csv')

# Country coordinates lookup table; only these three columns are kept,
# in exactly this order.
lon_lat = pd.read_csv('world_country_latitude_and_longitude_values.csv')
lon_lat = lon_lat[['latitude', 'longitude', 'country']]

In [None]:
# Rename the two Congo entries of the coordinates table in a single pass.
# Presumably this aligns them with the country names used in `wec` so the
# later merge can match them -- TODO confirm against the data.
lon_lat = lon_lat.replace({
    'Congo [DRC]': 'Democratic Republic of Congo',
    'Congo [Republic]': 'Congo',
})

In [None]:
# Attach the coordinates to the energy rows. No key is passed, so pandas joins
# on the columns both frames share (expected to be just `country`). The join is
# an inner one, so rows without a match in the other frame are dropped.
wec = pd.merge(wec, lon_lat, how='inner')

In [None]:
# Column groups used throughout the analysis.
column_info = [
    'iso_code', 'country', 'year',
    'latitude', 'longitude',
    'gdp', 'population',
]
columns_electricity = [
    'hydro_electricity',
    'solar_electricity',
    'wind_electricity',
    'renewables_electricity',
]
columns_renewables = [
    'renewables_consumption',
    'hydro_consumption',
    'solar_consumption',
    'wind_consumption',
]

# Working frame: info columns first, then electricity, then consumption.
wec_all = wec[[*column_info, *columns_electricity, *columns_renewables]]

In [None]:
# Display the frame holding the info, electricity and consumption columns.
wec_all

In [None]:
# Number of distinct values in the `country` column. `dropna=False` keeps the
# count identical to the length of `unique()`, which also counts a missing value.
wec_all['country'].nunique(dropna=False)

In [None]:
# Missing values per column.
wec_all.isnull().sum()

Naturally, data may not be available for all of the 204 countries present in the dataset, so let's see which countries have data available. We will split the analysis in two: data related to electricity and data related to consumption.

In [None]:
# Consumption view: info columns plus the renewables consumption columns,
# keeping only the rows where none of the consumption columns is NaN.
wec_consumption = (
    wec[column_info + columns_renewables]
    .dropna(subset=columns_renewables)
)

# Number of countries that still have at least one row left.
wec_consumption['country'].nunique(dropna=False)

Let's see which countries these are.

In [None]:
# Visualize the countries that have consumption data.
# `wec_consumption` can hold many rows per country (one per available year),
# and the coordinates come from a per-country lookup, so plotting the frame
# as-is stacks many identical markers on the same spot. Keeping one row per
# country yields the same map with a much lighter figure.
fig = px.scatter_mapbox(
    wec_consumption.drop_duplicates(subset='country'),
    lat="latitude",
    lon="longitude",
    hover_name="country",
    zoom=0,
)
# Use the OpenStreetMap base layer.
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [None]:
# Electricity view: info columns plus the electricity columns, keeping only
# the rows where none of the electricity columns is NaN.
wec_electricity = (
    wec[column_info + columns_electricity]
    .dropna(subset=columns_electricity)
)

# Number of countries that still have at least one row left.
wec_electricity['country'].nunique(dropna=False)

Electricity-related data is available for most of the countries, but the time span is probably short (data may only be available for a few years). Let's verify this.

In [None]:
# For every country, count the non-null `year` entries in the electricity
# subset, i.e. how many yearly records that country has.
country_years_count = (
    wec_electricity
    .groupby('country')['year']
    .count()
    .rename('years_count')
    .reset_index()
)
country_years_count

In [None]:
# Coverage distribution: for each number of available years, how many
# countries have exactly that many.
country_years_count['years_count'].value_counts()

The hypothesis is therefore confirmed: most of the countries (115 of 202) have only 20 years of electricity data available. This should nevertheless suffice for our analysis.