## Cleaning Eurostat data


### Let's start with electricity import and export per country

In [309]:
import pandas as pd
import numpy as np

df= pd.read_csv('Electricity import export transformation per country.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'


#We will keep just the columns with import, export and electricity available to market
df.rename(columns=lambda col: col.replace('AIM,', 'Electricity available to market GWh'), inplace=True)
df.rename(columns=lambda col: col.replace('IMP,', 'Electricity imported GWh'), inplace=True)
df.rename(columns=lambda col: col.replace('EXP,', 'Electricity exported GWh'), inplace=True)
df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,df.columns.str.contains('GWh')]

#Giving better names
df.rename(columns=lambda col: col.replace('M,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('E7000,GWH,', ''), inplace=True)

# Multiindexing for date and country

## Mapping dictionary for European countries and adjacent countries
country_mapping = {
    'AL': 'Albania',
    'AT': 'Austria',
    'BA': 'Bosnia and Herzegovina',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CH': 'Switzerland',
    'CY': 'Cyprus',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EL': 'Greece',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'IE': 'Ireland',
    'IS': 'Iceland',
    'IT': 'Italy',
    'LI': 'Liechtenstein',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'ME': 'Montenegro',
    'MK': 'North Macedonia',
    'MT': 'Malta',
    'NL': 'Netherlands',
    'NO': 'Norway',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RO': 'Romania',
    'RU': 'Russia',
    'RS': 'Serbia',
    'SE': 'Sweden',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'TR': 'Turkey',
    'UK': 'United Kingdom',
    'XK': 'Kosovo',
    'UA': 'Ukraine',
    'GE': 'Georgia',
    'MD': 'Moldova'
}

mi = pd.MultiIndex.from_tuples(tuples, names=('country', 'date'))

#New dataframe in which I add the data
s = pd.DataFrame(np.random.randn(len(mi),3), index=mi, columns=['Electricity import (GWh)','Electricity export (GWh)','Electricity available to market (GWh)'])

#Putting data in the new dataframe

kt = []
for k in df.columns:
    kt.append(k[:-2])
kt = list(set(kt))


ci = 0
for k in kt:
    c = s.columns[ci]
    for i in country_mapping:
        st = k + i
        for j in df.index:
            if st in df.columns:
                s.loc[(i,j),c] = df.loc[j,st]
            else:
                s.loc[(i,j),c] = ':'

    ci = ci + 1

display(s)




Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available to market (GWh)
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AL,2008-01,:,:,:
AL,2008-02,:,:,:
AL,2008-03,:,:,:
AL,2008-04,:,:,:
AL,2008-05,:,:,:
...,...,...,...,...
MD,2023-10,341.081,56.098,303.988
MD,2023-11,311.614,34.553,304.493
MD,2023-12,316.417,31.656,347.600
MD,2024-01,303.787,32.177,392.548


### Electricity mix per country

In [312]:
df= pd.read_csv('Electricity mix per country.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#Giving better names
df.rename(columns=lambda col: col.replace('M,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('GWH,', ''), inplace=True)

#We only keep the information on gas
df.rename(columns=lambda col: col.replace('G3000', 'Electricity generated from natural gas GWh'), inplace=True)

#We generate the following information: Electricity generated in total, reliance on gas for production
summingtitles = ['CF','RA100','RA200','RA300','RA400','RA500_5160','N9000']

'''
r = np.sum(df[])
pd.Series()

df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,df.columns.str.contains('GWh')]

#We generate 
'''

display(df)




"freq,siec,unit,geo\TIME_PERIOD","C0000,AL","C0000,AT","C0000,BA","C0000,BE","C0000,BG","C0000,CY","C0000,CZ","C0000,DE","C0000,DK","C0000,EE",...,"X9900,PT","X9900,RO","X9900,RS","X9900,SE","X9900,SI","X9900,SK","X9900,TR","X9900,UA","X9900,UK","X9900,XK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-02,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-03,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-04,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-05,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10,0.000,147.864,796.937,3.602,781.035,0.000,2610.974,9674.588 e,44.275,0.000,...,0.000,0.000,0.000,0.000,0.000,1.000,85.657,:,:,0.000
2023-11,0.000,146.045,692.416,1.176,871.225,0.000,2753.006,10947.152 e,226.176,0.000,...,0.000,0.000,0.000,0.000,0.000,1.000,79.285,:,:,:
2023-12,0.000,158.347,685.594,1.002,1043.759,0.000,2762.875,10597.592 e,257.861,0.000,...,0.000,0.000,0.000,0.000,0.000,1.000,76.049,:,:,:
2024-01,0.000,162.180,836.717,152.345,905.171,0.000,2590.198,10536.739 e,309.229,0.000,...,0.000,0.000,0.000,0.000,:,5.000,77.926,:,:,:


In [314]:
totalgeneration = np.zeros(len(country_mapping))
for i in country_mapping:
    sum = np.zeros(len(df.index))
    for k in summingtitles:
        st = k + ',' + i
        sum = sum + df[k]




KeyError: "None of [Index(['CF', 'RA100', 'RA200', 'RA300', 'RA400', 'RA500_5160', 'N9000'], dtype='object', name='freq,siec,unit,geo\\TIME_PERIOD')] are in the [columns]"

In [321]:
totalenergy = np.zeros(len(country_mapping))
len(totalenergy)


43