## Cleaning Eurostat data
### Let's start with electricity import and export per country

In [224]:
import pandas as pd
import numpy as np

df= pd.read_csv('Electricity import export transformation per country.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#display(df)

#We will keep just the columns with import, export and electricity available to market
df.rename(columns=lambda col: col.replace('AIM,', 'Electricity available to market GWh'), inplace=True)
df.rename(columns=lambda col: col.replace('IMP,', 'Electricity imported GWh'), inplace=True)
df.rename(columns=lambda col: col.replace('EXP,', 'Electricity exported GWh'), inplace=True)
df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,df.columns.str.contains('GWh')]

#Giving better names
df.rename(columns=lambda col: col.replace('M,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('E7000,GWH,', ''), inplace=True)

# Multiindexing for date and country

## Mapping dictionary for European countries and adjacent countries
country_mapping = {
    'AL': 'Albania',
    'AT': 'Austria',
    'BA': 'Bosnia and Herzegovina',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CH': 'Switzerland',
    'CY': 'Cyprus',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EL': 'Greece',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'IE': 'Ireland',
    'IS': 'Iceland',
    'IT': 'Italy',
    'LI': 'Liechtenstein',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'ME': 'Montenegro',
    'MK': 'North Macedonia',
    'MT': 'Malta',
    'NL': 'Netherlands',
    'NO': 'Norway',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RO': 'Romania',
    'RU': 'Russia',
    'RS': 'Serbia',
    'SE': 'Sweden',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'TR': 'Turkey',
    'UK': 'United Kingdom',
    'XK': 'Kosovo',
    'UA': 'Ukraine',
    'GE': 'Georgia',
    'MD': 'Moldova'
}

mi = pd.MultiIndex.from_tuples(tuples, names=('country', 'date'))
s = pd.Series(np.random.randn(len(mi)), index=mi)

for i in country_mapping:
    for j in df.index:
        st = 'Electricity imported GWh' + i
        if st in df.columns:
            s.loc[i,j] = df.loc[j,st]
        else:
            s.loc[i,j] = ':'

display(s)

'''
ref = [col.split(',')[-1][-2:] for col in df.columns]
for i in range(0,len(ref)):
    df.rename(columns=lambda col: col.replace(ref[i], country_mapping.get(ref[i])), inplace=True)
display(df)



#Creating a multiindex
tuples = []

for i in country_mapping:
    for j in df.index:
        tuples.append((i,j))

pd.MultiIndex.from_tuples(tuples, names=('country', 'date'))


display(df)
'''

country  date    
AL       2008-01    -0.601378
         2008-02    -0.161508
         2008-03     1.703187
         2008-04    -0.370356
         2008-05     1.059189
                       ...   
MD       2023-10    -0.126917
         2023-11    -0.162352
         2023-12     0.030223
         2024-01    -1.723030
         2024-02    -0.734133
Length: 8342, dtype: float64

country  date    
AL       2008-01           : 
         2008-02           : 
         2008-03           : 
         2008-04           : 
         2008-05           : 
                       ...   
MD       2023-10     341.081 
         2023-11     311.614 
         2023-12     316.417 
         2024-01     303.787 
         2024-02           : 
Length: 8342, dtype: object

"\nref = [col.split(',')[-1][-2:] for col in df.columns]\nfor i in range(0,len(ref)):\n    df.rename(columns=lambda col: col.replace(ref[i], country_mapping.get(ref[i])), inplace=True)\ndisplay(df)\n\n\n\n#Creating a multiindex\ntuples = []\n\nfor i in country_mapping:\n    for j in df.index:\n        tuples.append((i,j))\n\npd.MultiIndex.from_tuples(tuples, names=('country', 'date'))\n\n\ndisplay(df)\n"