## Cleaning Eurostat data


### Electricity import, export and consumption per country

In [65]:
import pandas as pd
import numpy as np

df= pd.read_csv('Electricity import export transformation per country.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#We will keep just the columns with import, export and electricity available to market (=consumed)
df.rename(columns=lambda col: col.replace('AIM,', 'Electricity available to market GWh'), inplace=True)
df.rename(columns=lambda col: col.replace('IMP,', 'Electricity imported GWh'), inplace=True)
df.rename(columns=lambda col: col.replace('EXP,', 'Electricity exported GWh'), inplace=True)
df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,df.columns.str.contains('GWh')]

#Giving better names
df.rename(columns=lambda col: col.replace('M,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('E7000,GWH,', ''), inplace=True)

# Multiindexing for date and country

## Mapping dictionary for European countries and adjacent countries
country_mapping = {
    'AL': 'Albania',
    'AT': 'Austria',
    'BA': 'Bosnia and Herzegovina',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CH': 'Switzerland',
    'CY': 'Cyprus',
    'CZ': 'Czech Republic',
    'DE': 'Germany',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'EL': 'Greece',
    'ES': 'Spain',
    'FI': 'Finland',
    'FR': 'France',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'IE': 'Ireland',
    'IS': 'Iceland',
    'IT': 'Italy',
    'LI': 'Liechtenstein',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'LV': 'Latvia',
    'ME': 'Montenegro',
    'MK': 'North Macedonia',
    'MT': 'Malta',
    'NL': 'Netherlands',
    'NO': 'Norway',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RO': 'Romania',
    'RU': 'Russia',
    'RS': 'Serbia',
    'SE': 'Sweden',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'TR': 'Turkey',
    'UK': 'United Kingdom',
    'XK': 'Kosovo',
    'UA': 'Ukraine',
    'GE': 'Georgia',
    'MD': 'Moldova'
}

#Creating a multiindex
tuples = []

for i in country_mapping:
    for j in df.index:
        tuples.append((i,j))
        
mi = pd.MultiIndex.from_tuples(tuples, names=('country', 'date'))

#New dataframe in which I add the data
s = pd.DataFrame(np.random.randn(len(mi),3), index=mi, columns=['Electricity import (GWh)','Electricity export (GWh)','Electricity available (GWh)'])

#Putting data in the new dataframe
kt = []
for k in df.columns:
    kt.append(k[:-2])
kt = list(set(kt))

ci = 0
for k in kt:
    c = s.columns[ci]
    for i in country_mapping:
        st = k + i
        if st in df.columns:
            for j in df.index:
                df.loc[j,st] = df.loc[j,st].replace(' ', '')
                df.loc[j,st] = df.loc[j,st].replace('e', '')
                df.loc[j,st] = df.loc[j,st].replace('p', '')
                df.loc[j,st] = df.loc[j,st].replace('n', '')
                df.loc[j,st] = df.loc[j,st].replace('z', '')
                df.loc[j,st] = df.loc[j,st].replace('u', '')
                df.loc[j,st] = df.loc[j,st].replace('d', '')
                df.loc[j,st] = df.loc[j,st].replace('c', '')
                df.loc[j,st] = df.loc[j,st].replace('b', '')
                if st in df.columns:
                    s.loc[(i,j),c] = df.loc[j,st]
                else:
                    s.loc[(i,j),c] = ':'

    ci = ci + 1

display(s)


  s.loc[(i,j),c] = df.loc[j,st]


Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh)
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AL,2008-01,:,:,:
AL,2008-02,:,:,:
AL,2008-03,:,:,:
AL,2008-04,:,:,:
AL,2008-05,:,:,:
...,...,...,...,...
MD,2023-10,56.098,341.081,303.988
MD,2023-11,34.553,311.614,304.493
MD,2023-12,31.656,316.417,347.600
MD,2024-01,32.177,303.787,392.548


### Electricity mix per country

In [66]:
df= pd.read_csv('Electricity mix per country.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#Getting rid of predicted and estimated labels
df = df.replace('e', '')
df = df.replace('p', '')
df[:] = df[:].replace(' ', '')

#Giving better names
df.rename(columns=lambda col: col.replace('M,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('GWH,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('G3000', 'Electricity generated from natural gas GWh'), inplace=True)

#We generate the following information: Electricity generated in total, reliance on gas for production
summingtitles = ['CF','RA100','RA200','RA300','RA400','RA500_5160','N9000']

display(df)


"freq,siec,unit,geo\TIME_PERIOD","C0000,AL","C0000,AT","C0000,BA","C0000,BE","C0000,BG","C0000,CY","C0000,CZ","C0000,DE","C0000,DK","C0000,EE",...,"X9900,PT","X9900,RO","X9900,RS","X9900,SE","X9900,SI","X9900,SK","X9900,TR","X9900,UA","X9900,UK","X9900,XK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-02,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-03,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-04,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2016-05,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10,0.000,147.864,796.937,3.602,781.035,0.000,2610.974,9674.588 e,44.275,0.000,...,0.000,0.000,0.000,0.000,0.000,1.000,85.657,:,:,0.000
2023-11,0.000,146.045,692.416,1.176,871.225,0.000,2753.006,10947.152 e,226.176,0.000,...,0.000,0.000,0.000,0.000,0.000,1.000,79.285,:,:,:
2023-12,0.000,158.347,685.594,1.002,1043.759,0.000,2762.875,10597.592 e,257.861,0.000,...,0.000,0.000,0.000,0.000,0.000,1.000,76.049,:,:,:
2024-01,0.000,162.180,836.717,152.345,905.171,0.000,2590.198,10536.739 e,309.229,0.000,...,0.000,0.000,0.000,0.000,:,5.000,77.926,:,:,:


In [67]:
totalgeneration = pd.DataFrame(0, index=mi, columns=['Total electricity generated GWh']) 
gasgenerationfraction = pd.DataFrame(np.nan, index=mi, columns=['Fraction of electricity generated by gas']) 

for i in country_mapping:
    for k in summingtitles:
        st = k + ',' + i
        if st in df.columns:
                for j in df.index:
                        #this cleans p, n e labels
                        df.loc[j,st] = df.loc[j,st].replace(' ', '')
                        df.loc[j,st] = df.loc[j,st].replace('e', '')
                        df.loc[j,st] = df.loc[j,st].replace('p', '')
                        df.loc[j,st] = df.loc[j,st].replace('n', '')
                        df.loc[j,st] = df.loc[j,st].replace('z', '')
                        df.loc[j,st] = df.loc[j,st].replace('u', '')
                        df.loc[j,st] = df.loc[j,st].replace('d', '')
                        #Making the new feature
                        if (df.loc[j,st] != ':') and (df.loc[j,st] != ': ') and (totalgeneration.loc[i,j] != [':']).all():    
                                totalgeneration.loc[(i,j),'Total electricity generated GWh'] = totalgeneration.loc[(i,j),'Total electricity generated GWh'] + float(df.loc[j,st])
                        else:
                                totalgeneration.loc[i,j] = ':'

        else:
                totalgeneration.loc[i] = [':']

totalgeneration.replace(0, np.nan, inplace=True)
                
for i in country_mapping:
        st1 = 'Electricity generated from natural gas GWh,' + i
        if st1 in df.columns:
                for j in df.index:
                        #this cleans p, n e labels
                        df.loc[j,st1] = df.loc[j,st1].replace(' ', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('e', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('p', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('n', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('z', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('u', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('d', '')
                        df.loc[j,st1] = df.loc[j,st1].replace('c', '')
                        #Putting ele-gas data in s
                        if st1 in df.columns:
                                s.loc[(i,j),'Electricity generated from natural gas GWh'] = df.loc[j,st1]
                        else:
                                s.loc[(i,j),'Electricity generated from natural gas GWh'] = ':'
                        #Making the new feature
                        if (df.loc[j,st1]!= ':') and (df.loc[j,st1]!= ': ') and (totalgeneration.loc[(i,j),'Total electricity generated GWh']!= ':') and (s.loc[(i,j),'Electricity generated from natural gas GWh']!= ':'):    
                                gasgenerationfraction.loc[(i,j),'Fraction of electricity generated by gas'] = float(df.loc[j,st1]) / totalgeneration.loc[(i,j),'Total electricity generated GWh']
                        else:
                                gasgenerationfraction.loc[(i,j),'Fraction of electricity generated by gas'] = ':'

#We only keep the information on gas
df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,df.columns.str.contains('GWh')]

#Merging and cleaning
s = pd.merge(s,totalgeneration, left_index=True, right_index=True)
s = pd.merge(s,gasgenerationfraction, left_index=True, right_index=True)
s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

display(s)

  totalgeneration.loc[i,j] = ':'
  gasgenerationfraction.loc[(i,j),'Fraction of electricity generated by gas'] = ':'


Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AL,2008-01,,,,n,,
AL,2008-02,,,,n,,
AL,2008-03,,,,n,,
AL,2008-04,,,,n,,
AL,2008-05,,,,n,,
...,...,...,...,...,...,...,...
MD,2023-10,56.098,341.081,303.988,25.592,37.600,0.680638
MD,2023-11,34.553,311.614,304.493,73.918,80.085,0.922994
MD,2023-12,31.656,316.417,347.600,120.680,126.124,0.956836
MD,2024-01,32.177,303.787,392.548,126.431,142.357,0.888126


### Gas imports, exports and consumption

In [68]:
df= pd.read_csv('Gas import, export, consumption per country.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#We will keep just the columns with import, export and electricity available to market
df.rename(columns=lambda col: col.replace('IPRD,', 'Gas produced Mm3'), inplace=True)
df.rename(columns=lambda col: col.replace('TOS,', 'Gas from other sources Mm3'), inplace=True)
df.rename(columns=lambda col: col.replace('IMP,', 'Gas imported Mm3'), inplace=True)
df.rename(columns=lambda col: col.replace('EXP,', 'Gas exported Mm3'), inplace=True)
df.rename(columns=lambda col: col.replace('IC_OBS,', 'Gas consumed Mm3'), inplace=True)

df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,df.columns.str.contains('Mm3')]
df = df.loc[:,~df.columns.str.contains('TJ_GCV,')]

#Giving better names
df.rename(columns=lambda col: col.replace('M,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('G3000,', ''), inplace=True)
df.rename(columns=lambda col: col.replace('MIO_M3,', ''), inplace=True)

display(df)

"freq,nrg_bal,siec,unit,geo\TIME_PERIOD",Gas exported Mm3AL,Gas exported Mm3AT,Gas exported Mm3BE,Gas exported Mm3BG,Gas exported Mm3CY,Gas exported Mm3CZ,Gas exported Mm3DE,Gas exported Mm3DK,Gas exported Mm3EE,Gas exported Mm3EL,...,Gas from other sources Mm3PL,Gas from other sources Mm3PT,Gas from other sources Mm3RO,Gas from other sources Mm3RS,Gas from other sources Mm3SE,Gas from other sources Mm3SI,Gas from other sources Mm3SK,Gas from other sources Mm3TR,Gas from other sources Mm3UK,Gas from other sources Mm3XK
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-01,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2008-02,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2008-03,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2008-04,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
2008-05,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,:,:,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10,0.000,229.524,2886.500,1478.627,0.000,18.004,1160.321,668.954,20.100,1002.828,...,:,0.000,0.000,0.000,0.000,:,0.000,0.000,:,:
2023-11,0.000,482.022,2872.100,1283.671,0.000,52.275,746.120,799.742,0.000,952.190,...,:,0.000,0.000,0.000,0.000,:,0.000,0.000,:,:
2023-12,0.000,608.401,3009.200,1230.721,0.000,31.821,811.852,719.219,0.000,875.503,...,:,0.000,0.000,0.000,0.000,:,0.000,0.000,:,:
2024-01,0.000,960.692,2462.400,1193.181,0.000,20.941,751.483,788.464,0.000,914.502,...,:,0.000,0.000,0.000,0.000,:,0.000,0.000,:,:


In [69]:
#Putting data in the dataframe

kt = []
for k in df.columns:
    kt.append(k[:-2])
kt = list(set(kt))

for i in country_mapping:
        for k in kt:
                st1 = k + i
                if st1 in df.columns:
                        for j in df.index:
                                #this cleans p, n e labels
                                df.loc[j,st1] = df.loc[j,st1].replace(' ', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('e', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('p', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('n', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('z', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('u', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('d', '')
                                df.loc[j,st1] = df.loc[j,st1].replace('c', '')
                                #Putting gas data in s
                                if st1 in df.columns:
                                        s.loc[(i,j),k] = df.loc[j,st1]
                                else:
                                        s.loc[(i,j),k] = ':'

s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AL,2008-01,,,,n,,,,,,,
AL,2008-02,,,,n,,,,,,,
AL,2008-03,,,,n,,,,,,,
AL,2008-04,,,,n,,,,,,,
AL,2008-05,,,,n,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
MD,2023-10,56.098,341.081,303.988,25.592,37.600,0.680638,0.000,0.000,42.300,43.000,0.000
MD,2023-11,34.553,311.614,304.493,73.918,80.085,0.922994,,,90.800,89.800,
MD,2023-12,31.656,316.417,347.600,120.680,126.124,0.956836,,,144.400,143.700,
MD,2024-01,32.177,303.787,392.548,126.431,142.357,0.888126,0.000,0.000,157.941,161.161,0.000


### Gas prices for households

In [70]:
df= pd.read_csv('Gas prices for households.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#We will keep just the columns with price excluding tax and choose the metric for mid range consumers as they are the heaviest consumers in all countries,
#it seems representative the total and average price is available only for a short period of time. Information on share of consumers in each "range" is from other datasets
#in eurostat; this same choice was made for other metrics.
df.rename(columns=lambda col: col.replace('KWH,X_TAX,EUR,', 'Gas price for households, no tax €/kWh'), inplace=True)
df = df.loc[:,df.columns.str.contains('€')]

df = df.loc[:,df.columns.str.contains('GJ20-199,')]
df = df.loc[:,~df.columns.str.endswith('20')]

#Giving better names
df.rename(columns=lambda col: col.replace('S,4100,GJ20-199,', ''), inplace=True)

display(df)

"freq,product,nrg_cons,unit,tax,currency,geo\TIME_PERIOD","Gas price for households, no tax €/kWhAL","Gas price for households, no tax €/kWhAT","Gas price for households, no tax €/kWhBA","Gas price for households, no tax €/kWhBE","Gas price for households, no tax €/kWhBG","Gas price for households, no tax €/kWhCZ","Gas price for households, no tax €/kWhDE","Gas price for households, no tax €/kWhDK","Gas price for households, no tax €/kWhEA","Gas price for households, no tax €/kWhEE",...,"Gas price for households, no tax €/kWhPL","Gas price for households, no tax €/kWhPT","Gas price for households, no tax €/kWhRO","Gas price for households, no tax €/kWhRS","Gas price for households, no tax €/kWhSE","Gas price for households, no tax €/kWhSI","Gas price for households, no tax €/kWhSK","Gas price for households, no tax €/kWhTR","Gas price for households, no tax €/kWhUA","Gas price for households, no tax €/kWhUK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-S1,:,:,:,:,:,:,:,0.0406,0.0427,:,...,:,:,:,:,0.0547,:,:,:,:,:
2007-S2,:,0.0446,:,0.0401,0.0269,0.0304,0.0456,0.0462,0.0435,0.0222,...,0.0329,0.0622,0.0227,:,0.0518,0.0396,0.0348,0.0268,:,0.034
2008-S1,:,0.0428,:,0.0468,0.0295,0.0369,0.0479,0.0516,0.0452,0.0266,...,0.0341,0.0595,0.0214,:,0.0532,0.0437,0.036,0.0264,:,0.0377
2008-S2,:,0.0452,:,0.0589,0.0326,0.0444,0.0581,0.0532,0.0523,0.0302,...,0.0422,0.0599,0.0221,:,0.0612,0.0564,0.0391,0.0384,:,0.0458
2009-S1,:,0.0473,:,0.0487,0.0394,0.0416,0.0485,0.0388,0.0492,0.0324,...,0.0319,0.0564,0.0174,:,0.0509,0.052,0.0388,0.0321,:,0.0406
2009-S2,:,0.0449,:,0.0412,0.029,0.0397,0.0435,0.0426,0.0424,0.0289,...,0.0377,0.0566,0.0146,:,0.0553,0.042,0.04,0.0251,:,0.0406
2010-S1,:,0.0451,0.0291,0.0423,0.0306,0.0391,0.0415,0.0473,0.0412,0.0278,...,0.0348,0.0565,0.015,:,0.0592,0.0452,0.0366,0.0263,:,0.0386
2010-S2,:,0.0434,0.035,0.0479,0.0359,0.0431,0.042,0.0482,0.0457,0.0312,...,0.0414,0.0589,0.0145,:,0.0627,0.0516,0.0375,0.0273,:,0.0402
2011-S1,:,0.0512,0.0354,0.0507,0.0358,0.0454,0.0435,0.053,0.0447,0.0326,...,0.0377,0.0567,0.0149,:,0.066,0.0512,0.0388,0.0235,:,0.0405
2011-S2,:,0.0534,0.0426,0.0586,0.0393,0.0496,0.0478,0.0526,0.0518,0.0342,...,0.0407,0.0643,0.0144,:,0.0649,0.0616,0.0427,0.024,:,0.0498


In [71]:
display(df)

"freq,product,nrg_cons,unit,tax,currency,geo\TIME_PERIOD","Gas price for households, no tax €/kWhAL","Gas price for households, no tax €/kWhAT","Gas price for households, no tax €/kWhBA","Gas price for households, no tax €/kWhBE","Gas price for households, no tax €/kWhBG","Gas price for households, no tax €/kWhCZ","Gas price for households, no tax €/kWhDE","Gas price for households, no tax €/kWhDK","Gas price for households, no tax €/kWhEA","Gas price for households, no tax €/kWhEE",...,"Gas price for households, no tax €/kWhPL","Gas price for households, no tax €/kWhPT","Gas price for households, no tax €/kWhRO","Gas price for households, no tax €/kWhRS","Gas price for households, no tax €/kWhSE","Gas price for households, no tax €/kWhSI","Gas price for households, no tax €/kWhSK","Gas price for households, no tax €/kWhTR","Gas price for households, no tax €/kWhUA","Gas price for households, no tax €/kWhUK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-S1,:,:,:,:,:,:,:,0.0406,0.0427,:,...,:,:,:,:,0.0547,:,:,:,:,:
2007-S2,:,0.0446,:,0.0401,0.0269,0.0304,0.0456,0.0462,0.0435,0.0222,...,0.0329,0.0622,0.0227,:,0.0518,0.0396,0.0348,0.0268,:,0.034
2008-S1,:,0.0428,:,0.0468,0.0295,0.0369,0.0479,0.0516,0.0452,0.0266,...,0.0341,0.0595,0.0214,:,0.0532,0.0437,0.036,0.0264,:,0.0377
2008-S2,:,0.0452,:,0.0589,0.0326,0.0444,0.0581,0.0532,0.0523,0.0302,...,0.0422,0.0599,0.0221,:,0.0612,0.0564,0.0391,0.0384,:,0.0458
2009-S1,:,0.0473,:,0.0487,0.0394,0.0416,0.0485,0.0388,0.0492,0.0324,...,0.0319,0.0564,0.0174,:,0.0509,0.052,0.0388,0.0321,:,0.0406
2009-S2,:,0.0449,:,0.0412,0.029,0.0397,0.0435,0.0426,0.0424,0.0289,...,0.0377,0.0566,0.0146,:,0.0553,0.042,0.04,0.0251,:,0.0406
2010-S1,:,0.0451,0.0291,0.0423,0.0306,0.0391,0.0415,0.0473,0.0412,0.0278,...,0.0348,0.0565,0.015,:,0.0592,0.0452,0.0366,0.0263,:,0.0386
2010-S2,:,0.0434,0.035,0.0479,0.0359,0.0431,0.042,0.0482,0.0457,0.0312,...,0.0414,0.0589,0.0145,:,0.0627,0.0516,0.0375,0.0273,:,0.0402
2011-S1,:,0.0512,0.0354,0.0507,0.0358,0.0454,0.0435,0.053,0.0447,0.0326,...,0.0377,0.0567,0.0149,:,0.066,0.0512,0.0388,0.0235,:,0.0405
2011-S2,:,0.0534,0.0426,0.0586,0.0393,0.0496,0.0478,0.0526,0.0518,0.0342,...,0.0407,0.0643,0.0144,:,0.0649,0.0616,0.0427,0.024,:,0.0498


In [72]:
#Reindexing the dataframe
for i in range(len(df.index)):
       if df.index[i].endswith('S1 '):
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '01-01'
       else:
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '07-01'

df = df.set_index('sdate')
df.index = pd.to_datetime(df.index)

#Upsampling to get data for each month
df = df.resample('M').ffill()

df.index = df.index.strftime('%Y-%m') + ' '
df.index.name = 'date'

#Putting data in the dataframe

for i in country_mapping:
                st = 'Gas price for households, no tax €/kWh' + i
                if st in df.columns:
                        for j in df.index:
                                #this cleans p, n e labels
                                df.loc[j,st] = df.loc[j,st].replace(' ', '')
                                df.loc[j,st] = df.loc[j,st].replace('e', '')
                                df.loc[j,st] = df.loc[j,st].replace('p', '')
                                df.loc[j,st] = df.loc[j,st].replace('n', '')
                                df.loc[j,st] = df.loc[j,st].replace('z', '')
                                df.loc[j,st] = df.loc[j,st].replace('u', '')
                                df.loc[j,st] = df.loc[j,st].replace('d', '')
                                df.loc[j,st] = df.loc[j,st].replace('c', '')
                                #Putting gas data in s
                                if st in df.columns:
                                        s.loc[(i,j),'Gas price for households, no tax €/kWh'] = df.loc[j,st]
                                else:
                                        s.loc[(i,j),'Gas price for households, no tax €/kWh'] = ':'

                #no need for an else statement, it puts NaN by default

s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

s = s.sort_index(ascending=True)
display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh"
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AL,2007-01,,,,,,,,,,,,
AL,2007-02,,,,,,,,,,,,
AL,2007-03,,,,,,,,,,,,
AL,2007-04,,,,,,,,,,,,
AL,2007-05,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,n
XK,2023-11,,,,,,,,,,,,n
XK,2023-12,,,,,,,,,,,,n
XK,2024-01,,,,,,,,,,,,n


### Gas prices for industries

In [73]:
df= pd.read_csv('Gas prices for industry.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#We will keep just the columns with price excluding tax and choose the metric for 
#Consumers from 999999 GJ/y to 100000 GJ/y for same reasons as before, most countries 
#have around 20% or above volumetric consumption from this category

df.rename(columns=lambda col: col.replace('KWH,X_TAX,EUR,', 'Gas price for industries, no tax €/kWh'), inplace=True)
df = df.loc[:,df.columns.str.contains('€')]

df = df.loc[:,df.columns.str.contains('GJ100000-999999,')]
df = df.loc[:,~df.columns.str.endswith('20')]

#Giving better names
df.rename(columns=lambda col: col.replace('S,4100,GJ100000-999999,', ''), inplace=True)

display(df)

"freq,product,nrg_cons,unit,tax,currency,geo\TIME_PERIOD","Gas price for industries, no tax €/kWhAL","Gas price for industries, no tax €/kWhAT","Gas price for industries, no tax €/kWhBA","Gas price for industries, no tax €/kWhBE","Gas price for industries, no tax €/kWhBG","Gas price for industries, no tax €/kWhCZ","Gas price for industries, no tax €/kWhDE","Gas price for industries, no tax €/kWhDK","Gas price for industries, no tax €/kWhEA","Gas price for industries, no tax €/kWhEE",...,"Gas price for industries, no tax €/kWhPL","Gas price for industries, no tax €/kWhPT","Gas price for industries, no tax €/kWhRO","Gas price for industries, no tax €/kWhRS","Gas price for industries, no tax €/kWhSE","Gas price for industries, no tax €/kWhSI","Gas price for industries, no tax €/kWhSK","Gas price for industries, no tax €/kWhTR","Gas price for industries, no tax €/kWhUA","Gas price for industries, no tax €/kWhUK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-S1,:,:,:,:,:,:,:,0.0194,0.026,:,...,:,:,:,:,0.0299,:,:,:,:,0.0249
2007-S2,:,:,:,0.0256,0.0165,0.0229,0.0279,0.0221,0.0263,0.0173,...,0.0226,0.021,0.0225,:,0.0337,0.0244,0.0265,0.0234,:,0.0212
2008-S1,:,:,:,0.0311,0.0195,0.0284,0.0334,0.0247,0.0299,0.0222,...,0.0267,0.0237,0.0209,:,0.0403,0.0303,0.031,0.0237,:,0.0251
2008-S2,:,:,:,0.0328,0.0248,0.0356,0.0368,0.0255,0.0347,0.0279,...,0.0298,0.0261,0.0229,:,0.0381,0.039,0.0428,0.0315,:,0.0288
2009-S1,:,0.0247,:,0.0289,0.0293,0.0293,0.0318,0.0216,0.0304,0.0249,...,0.0249,0.0256,0.016,:,0.0261,:,0.0358,0.0263,:,0.0248
2009-S2,:,0.0243,:,0.0279,0.0188,0.0229,0.0289,0.0173,0.0264,0.0202,...,0.0256,0.023,0.0131,:,0.0332,0.0244,0.0286,0.0207,:,0.0188
2010-S1,:,0.0235,:,0.0231,0.0216,0.0262,0.0292,0.0216,0.0263,0.0258,...,0.0266,0.026,0.0147,:,0.0347,0.0374,0.0284,0.0215,:,0.0194
2010-S2,:,0.0262,:,0.0235,0.0274,0.0304,0.0291,0.0271,0.0268,0.0257,...,0.0285,0.0289,0.0138,:,0.0374,0.0315,0.0289,0.022,:,0.0207
2011-S1,:,0.028,:,0.0258,0.0265,0.0281,0.0306,0.0303,0.0279,0.0257,...,0.0287,0.0288,0.0157,:,0.0391,:,0.029,0.0202,:,0.0229
2011-S2,:,0.0294,0.0421,0.0269,0.0293,0.0315,0.0321,0.0298,0.0299,0.0291,...,0.0274,0.0335,0.0178,:,0.041,:,0.0312,0.02,:,0.0249


In [74]:
#Reindexing the dataframe
for i in range(len(df.index)):
       if df.index[i].endswith('S1 '):
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '01-01'
       else:
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '07-01'

df = df.set_index('sdate')
df.index = pd.to_datetime(df.index)

#Upsampling to get data for each month
df = df.resample('M').ffill()

df.index = df.index.strftime('%Y-%m') + ' '
df.index.name = 'date'

#Putting data in the dataframe

for i in country_mapping:
                st = 'Gas price for industries, no tax €/kWh' + i
                if st in df.columns:
                        for j in df.index:
                                #this cleans p, n e labels
                                df.loc[j,st] = df.loc[j,st].replace(' ', '')
                                df.loc[j,st] = df.loc[j,st].replace('e', '')
                                df.loc[j,st] = df.loc[j,st].replace('p', '')
                                df.loc[j,st] = df.loc[j,st].replace('n', '')
                                df.loc[j,st] = df.loc[j,st].replace('z', '')
                                df.loc[j,st] = df.loc[j,st].replace('u', '')
                                df.loc[j,st] = df.loc[j,st].replace('d', '')
                                df.loc[j,st] = df.loc[j,st].replace('c', '')
                                #Putting gas data in s
                                if st in df.columns:
                                        s.loc[(i,j),'Gas price for industries, no tax €/kWh'] = df.loc[j,st]
                                else:
                                        s.loc[(i,j),'Gas price for industries, no tax €/kWh'] = ':'

                #no need for an else statement, it puts NaN by default

s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

s = s.sort_index(ascending=True)
display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh","Gas price for industries, no tax €/kWh"
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AL,2007-01,,,,,,,,,,,,,
AL,2007-02,,,,,,,,,,,,,
AL,2007-03,,,,,,,,,,,,,
AL,2007-04,,,,,,,,,,,,,
AL,2007-05,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,n,n
XK,2023-11,,,,,,,,,,,,n,n
XK,2023-12,,,,,,,,,,,,n,n
XK,2024-01,,,,,,,,,,,,n,n


### Electricity prices for households

In [75]:
df= pd.read_csv('Electricity prices for households.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#We will keep just the columns with price excluding tax and choose the metric for the band from 2500 kWh to 4999 kWh

df.rename(columns=lambda col: col.replace('KWH,X_TAX,EUR,', 'Electricity price for households, no tax €/kWh'), inplace=True)
df = df.loc[:,df.columns.str.contains('€')]

df = df.loc[:,df.columns.str.contains('KWH2500-4999,')]
df = df.loc[:,~df.columns.str.endswith('20')]

#Giving better names
df.rename(columns=lambda col: col.replace('S,6000,KWH2500-4999,', ''), inplace=True)

display(df)

"freq,product,nrg_cons,unit,tax,currency,geo\TIME_PERIOD","Electricity price for households, no tax €/kWhAL","Electricity price for households, no tax €/kWhAT","Electricity price for households, no tax €/kWhBA","Electricity price for households, no tax €/kWhBE","Electricity price for households, no tax €/kWhBG","Electricity price for households, no tax €/kWhCY","Electricity price for households, no tax €/kWhCZ","Electricity price for households, no tax €/kWhDE","Electricity price for households, no tax €/kWhDK","Electricity price for households, no tax €/kWhEA",...,"Electricity price for households, no tax €/kWhPT","Electricity price for households, no tax €/kWhRO","Electricity price for households, no tax €/kWhRS","Electricity price for households, no tax €/kWhSE","Electricity price for households, no tax €/kWhSI","Electricity price for households, no tax €/kWhSK","Electricity price for households, no tax €/kWhTR","Electricity price for households, no tax €/kWhUA","Electricity price for households, no tax €/kWhUK","Electricity price for households, no tax €/kWhXK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-S1,:,:,:,:,:,:,0.0963,0.1227,:,0.1238,...,:,:,:,0.0968,:,:,:,:,:,:
2007-S2,:,0.1255,:,0.1286,0.0603,0.1349,0.099,0.1279,0.1027,0.1121,...,0.1206,0.0954,:,0.1013,0.0861,0.1152,0.0706,:,0.1411,:
2008-S1,:,0.1271,:,0.15,0.0593,0.1528,0.1167,0.1299,0.1124,0.1124,...,0.1074,0.0885,:,0.1085,0.0911,0.1194,0.0782,:,0.1394,:
2008-S2,:,0.1268,:,0.1619,0.0685,0.1754,0.1189,0.1341,0.1262,0.1172,...,0.1066,0.092,:,0.1137,0.0919,0.1283,0.0969,:,0.153,:
2009-S1,:,0.138,:,0.1431,0.0685,0.1336,0.1212,0.1401,0.1115,0.1204,...,0.1264,0.0814,:,0.104,0.1056,0.1294,0.0909,:,0.1399,:
2009-S2,:,0.138,:,0.139,0.0685,0.1409,0.1277,0.1359,0.0963,0.1197,...,0.1383,0.0815,:,0.1059,0.105,0.1311,0.0936,:,0.1341,:
2010-S1,:,0.1427,0.0634,0.1449,0.0675,0.1597,0.1236,0.1381,0.1041,0.1206,...,0.1093,0.0856,:,0.1195,0.1057,0.1277,0.1067,:,0.1321,:
2010-S2,:,0.1396,0.0629,0.146,0.0692,0.169,0.1279,0.137,0.1096,0.125,...,0.1061,0.0839,:,0.128,0.1058,0.1376,0.1092,:,0.138,:
2011-S1,0.096,0.1442,0.0637,0.1572,0.0688,0.1731,0.1372,0.1406,0.1173,0.1289,...,0.1015,0.0848,:,0.1376,0.1079,0.1372,0.0978,:,0.1365,:
2011-S2,0.0964,0.1444,0.0671,0.1595,0.0727,0.2035,0.1345,0.1395,0.1201,0.1313,...,0.1068,0.0823,:,0.134,0.1149,0.1395,0.0919,:,0.1509,:


In [76]:
#Reindexing the dataframe
for i in range(len(df.index)):
       if df.index[i].endswith('S1 '):
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '01-01'
       else:
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '07-01'

df = df.set_index('sdate')
df.index = pd.to_datetime(df.index)

#Upsampling to get data for each month
df = df.resample('M').ffill()

df.index = df.index.strftime('%Y-%m') + ' '
df.index.name = 'date'

#Putting data in the dataframe

for i in country_mapping:
                st = 'Electricity price for households, no tax €/kWh' + i
                if st in df.columns:
                        for j in df.index:
                                #this cleans p, n e labels
                                df.loc[j,st] = df.loc[j,st].replace(' ', '')
                                df.loc[j,st] = df.loc[j,st].replace('e', '')
                                df.loc[j,st] = df.loc[j,st].replace('p', '')
                                df.loc[j,st] = df.loc[j,st].replace('n', '')
                                df.loc[j,st] = df.loc[j,st].replace('z', '')
                                df.loc[j,st] = df.loc[j,st].replace('u', '')
                                df.loc[j,st] = df.loc[j,st].replace('d', '')
                                df.loc[j,st] = df.loc[j,st].replace('c', '')
                                #Putting gas data in s
                                if st in df.columns:
                                        s.loc[(i,j),'Electricity price for households, no tax €/kWh'] = df.loc[j,st]
                                else:
                                        s.loc[(i,j),'Electricity price for households, no tax €/kWh'] = ':'

                #no need for an else statement, it puts NaN by default

s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

s = s.sort_index(ascending=True)
display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh","Gas price for industries, no tax €/kWh","Electricity price for households, no tax €/kWh"
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AL,2007-01,,,,,,,,,,,,,,
AL,2007-02,,,,,,,,,,,,,,
AL,2007-03,,,,,,,,,,,,,,
AL,2007-04,,,,,,,,,,,,,,
AL,2007-05,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,n,n,n
XK,2023-11,,,,,,,,,,,,n,n,n
XK,2023-12,,,,,,,,,,,,n,n,n
XK,2024-01,,,,,,,,,,,,n,n,n


### Electricity price for industry

In [77]:
df= pd.read_csv('Electricity prices for industry.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()
df.index.name = 'date'

#We will keep just the columns with price excluding tax and choose the metric for the band from 2000 MWh to 19999 MWh

df.rename(columns=lambda col: col.replace('KWH,X_TAX,EUR,', 'Electricity price for industries, no tax €/kWh'), inplace=True)
df = df.loc[:,df.columns.str.contains('€')]

df = df.loc[:,df.columns.str.contains('MWH2000-19999,')]
df = df.loc[:,~df.columns.str.endswith('20')]

#Giving better names
df.rename(columns=lambda col: col.replace('S,6000,MWH2000-19999,', ''), inplace=True)

display(df)

"freq,product,nrg_cons,unit,tax,currency,geo\TIME_PERIOD","Electricity price for industries, no tax €/kWhAL","Electricity price for industries, no tax €/kWhAT","Electricity price for industries, no tax €/kWhBA","Electricity price for industries, no tax €/kWhBE","Electricity price for industries, no tax €/kWhBG","Electricity price for industries, no tax €/kWhCY","Electricity price for industries, no tax €/kWhCZ","Electricity price for industries, no tax €/kWhDE","Electricity price for industries, no tax €/kWhDK","Electricity price for industries, no tax €/kWhEA",...,"Electricity price for industries, no tax €/kWhPT","Electricity price for industries, no tax €/kWhRO","Electricity price for industries, no tax €/kWhRS","Electricity price for industries, no tax €/kWhSE","Electricity price for industries, no tax €/kWhSI","Electricity price for industries, no tax €/kWhSK","Electricity price for industries, no tax €/kWhTR","Electricity price for industries, no tax €/kWhUA","Electricity price for industries, no tax €/kWhUK","Electricity price for industries, no tax €/kWhXK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-S1,:,:,:,:,:,:,:,0.0741,:,0.0723,...,:,:,:,0.0518,:,:,:,:,0.098,:
2007-S2,:,0.0655,:,0.0748,0.0501,0.1261,0.0778,0.0776,0.0761,0.0688,...,0.0637,0.079,:,0.0577,0.0709,0.0952,0.0596,:,0.0944,:
2008-S1,:,0.0768,:,0.0856,0.0491,0.1295,0.0913,0.0839,0.0705,0.0743,...,0.0698,0.0783,:,0.0614,0.0742,0.1083,0.0618,:,0.0844,:
2008-S2,:,0.0813,:,0.0838,0.0588,0.1694,0.093,0.0831,0.0829,0.0752,...,0.0704,0.0799,:,0.0681,0.077,0.1164,0.0769,:,0.0975,:
2009-S1,:,0.0909,:,0.0928,0.0588,0.1061,0.0929,0.0843,0.0606,0.0805,...,0.083,0.0734,:,0.0587,0.0773,0.1263,0.0699,:,0.0986,:
2009-S2,:,0.0904,:,0.0902,0.0578,0.134,0.0967,0.0833,0.0626,0.0776,...,0.0822,0.0714,:,0.0596,0.076,0.1256,0.0696,:,0.0866,:
2010-S1,:,0.0843,:,0.0837,0.0568,0.1378,0.0929,0.08,0.0717,0.0769,...,0.0759,0.0716,:,0.0713,0.077,0.1045,0.0796,:,0.0839,:
2010-S2,:,0.0846,:,0.0836,0.0588,0.1547,0.0957,0.079,0.0759,0.0794,...,0.0761,0.0694,:,0.073,0.0764,0.1073,0.0815,:,0.0853,:
2011-S1,:,0.0823,:,0.086,0.0574,0.1445,0.0984,0.0791,0.078,0.0813,...,0.0841,0.0703,:,0.0783,0.0755,0.1113,0.0678,:,0.0856,:
2011-S2,:,0.0824,:,0.0899,0.0582,0.1904,0.0956,0.0804,0.069,0.0817,...,0.0829,0.0703,:,0.0703,0.0771,0.1133,0.066,:,0.089,:


In [78]:
#Reindexing the dataframe
for i in range(len(df.index)):
       if df.index[i].endswith('S1 '):
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '01-01'
       else:
              df.loc[df.index[i],'sdate'] = df.index[i][:-3] + '07-01'

df = df.set_index('sdate')
df.index = pd.to_datetime(df.index)

#Upsampling to get data for each month
df = df.resample('M').ffill()

df.index = df.index.strftime('%Y-%m') + ' '
df.index.name = 'date'

#Putting data in the dataframe

for i in country_mapping:
                st = 'Electricity price for industries, no tax €/kWh' + i
                if st in df.columns:
                        for j in df.index:
                                #this cleans p, n e labels
                                df.loc[j,st] = df.loc[j,st].replace(' ', '')
                                df.loc[j,st] = df.loc[j,st].replace('e', '')
                                df.loc[j,st] = df.loc[j,st].replace('p', '')
                                df.loc[j,st] = df.loc[j,st].replace('n', '')
                                df.loc[j,st] = df.loc[j,st].replace('z', '')
                                df.loc[j,st] = df.loc[j,st].replace('u', '')
                                df.loc[j,st] = df.loc[j,st].replace('d', '')
                                df.loc[j,st] = df.loc[j,st].replace('c', '')
                                #Putting gas data in s
                                if st in df.columns:
                                        s.loc[(i,j),'Electricity price for industries, no tax €/kWh'] = df.loc[j,st]
                                else:
                                        s.loc[(i,j),'Electricity price for industries, no tax €/kWh'] = ':'

                #no need for an else statement, it puts NaN by default

s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

s = s.sort_index(ascending=True)
display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh","Gas price for industries, no tax €/kWh","Electricity price for households, no tax €/kWh","Electricity price for industries, no tax €/kWh"
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AL,2007-01,,,,,,,,,,,,,,,
AL,2007-02,,,,,,,,,,,,,,,
AL,2007-03,,,,,,,,,,,,,,,
AL,2007-04,,,,,,,,,,,,,,,
AL,2007-05,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,n,n,n,n
XK,2023-11,,,,,,,,,,,,n,n,n,n
XK,2023-12,,,,,,,,,,,,n,n,n,n
XK,2024-01,,,,,,,,,,,,n,n,n,n


### Population

In [79]:
df= pd.read_csv('Europe population.tsv', sep='\t')
df = df.set_index(df.columns[0])
df = df.transpose()

#Selecting only relevant years and total population
df.index = pd.to_datetime(df.index)
df = df.loc[pd.to_datetime('2008-01-31'):pd.to_datetime('2023-01-31')]

df = df.loc[:,df.columns.str.contains('TOTAL,T')]
df = df.loc[:,~df.columns.str.endswith('20')]
df = df.loc[:,~df.columns.str.endswith('19')]
df = df.loc[:,~df.columns.str.endswith('07')]
df = df.loc[:,~df.columns.str.endswith('28')]
df = df.loc[:,~df.columns.str.endswith('EFTA')]
df = df.loc[:,~df.columns.str.endswith('TOT')]
df = df.loc[:,~df.columns.str.endswith('31')]

#Upsampling to get data for each month
df = df.resample('M').ffill()

df.index = df.index.strftime('%Y-%m') + ' '
df.index.name = 'date'

#Giving better names
df.rename(columns=lambda col: col.replace('A,NR,TOTAL,T,', 'Population'), inplace=True)

display(df)

  df.index = pd.to_datetime(df.index)


"freq,unit,age,sex,geo\TIME_PERIOD",PopulationAD,PopulationAL,PopulationAM,PopulationAT,PopulationAZ,PopulationBA,PopulationBE,PopulationBG,PopulationBY,PopulationCH,...,PopulationRS,PopulationRU,PopulationSE,PopulationSI,PopulationSK,PopulationSM,PopulationTR,PopulationUA,PopulationUK,PopulationXK
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01,84484,2936355,3237976,8335003,8896900,3843998,10753080,7467119,9671912,7701856,...,7334937,141903979,9256347,2032362,5382401,31269,71517100,45963359,62042343,2180686
2009-02,84484,2936355,3237976,8335003,8896900,3843998,10753080,7467119,9671912,7701856,...,7334937,141903979,9256347,2032362,5382401,31269,71517100,45963359,62042343,2180686
2009-03,84484,2936355,3237976,8335003,8896900,3843998,10753080,7467119,9671912,7701856,...,7334937,141903979,9256347,2032362,5382401,31269,71517100,45963359,62042343,2180686
2009-04,84484,2936355,3237976,8335003,8896900,3843998,10753080,7467119,9671912,7701856,...,7334937,141903979,9256347,2032362,5382401,31269,71517100,45963359,62042343,2180686
2009-05,84484,2936355,3237976,8335003,8896900,3843998,10753080,7467119,9671912,7701856,...,7334937,141903979,9256347,2032362,5382401,31269,71517100,45963359,62042343,2180686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09,:,2793592,:,8978929,10156366,:,11617623,6838937,:,8738791,...,6797105,:,10452326,2107180,5434712,:,84680273,40997698,:,1773971
2022-10,:,2793592,:,8978929,10156366,:,11617623,6838937,:,8738791,...,6797105,:,10452326,2107180,5434712,:,84680273,40997698,:,1773971
2022-11,:,2793592,:,8978929,10156366,:,11617623,6838937,:,8738791,...,6797105,:,10452326,2107180,5434712,:,84680273,40997698,:,1773971
2022-12,:,2793592,:,8978929,10156366,:,11617623,6838937,:,8738791,...,6797105,:,10452326,2107180,5434712,:,84680273,40997698,:,1773971


In [80]:
#Putting data in the dataframe

for i in country_mapping:
                st = 'Population' + i
                if st in df.columns:
                        for j in df.index:
                                #this cleans p, n e labels
                                df.loc[j,st] = df.loc[j,st].replace(' ', '')
                                df.loc[j,st] = df.loc[j,st].replace('e', '')
                                df.loc[j,st] = df.loc[j,st].replace('p', '')
                                df.loc[j,st] = df.loc[j,st].replace('n', '')
                                df.loc[j,st] = df.loc[j,st].replace('z', '')
                                df.loc[j,st] = df.loc[j,st].replace('u', '')
                                df.loc[j,st] = df.loc[j,st].replace('d', '')
                                df.loc[j,st] = df.loc[j,st].replace('c', '')
                                df.loc[j,st] = df.loc[j,st].replace('b', '')
                                #Putting gas data in s
                                if st in df.columns:
                                        s.loc[(i,j),'Population'] = df.loc[j,st]
                                else:
                                        s.loc[(i,j),'Population'] = ':'

                #no need for an else statement, it puts NaN by default

s.replace(':', np.nan, inplace=True)
s.replace(': ', np.nan, inplace=True)

s = s.sort_index(ascending=True)
display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh","Gas price for industries, no tax €/kWh","Electricity price for households, no tax €/kWh","Electricity price for industries, no tax €/kWh",Population
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AL,2007-01,,,,,,,,,,,,,,,,
AL,2007-02,,,,,,,,,,,,,,,,
AL,2007-03,,,,,,,,,,,,,,,,
AL,2007-04,,,,,,,,,,,,,,,,
AL,2007-05,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,n,n,n,n,
XK,2023-11,,,,,,,,,,,,n,n,n,n,
XK,2023-12,,,,,,,,,,,,n,n,n,n,
XK,2024-01,,,,,,,,,,,,n,n,n,n,


### Gas Stocks data

In [81]:
#Antonio's data
#gas stocks

df_gs = pd.read_csv('estat_nrg_stk_gasm.tsv', sep='\t')

df_gs = df_gs.set_index(df_gs.columns[0])
df_gs = df_gs.transpose()
df_gs.index.name = 'date'
# Getting rid of predicted and estimated labels
df_gs = df_gs.replace('e', '')
df_gs = df_gs.replace('p', '')
df_gs[:] = df_gs[:].replace(' ', '')

df_gs.rename(columns=lambda col: col.replace('G3000,', ''), inplace=True)
df_gs.rename(columns=lambda col: col.replace('M,', ''), inplace=True)

df_gs = df_gs.loc[:, df_gs.columns.str.contains('TJ_GCV')]
df_gs = df_gs.loc[:, ~df_gs.columns.str.contains('STKCL_ABR,')]
df_gs = df_gs.loc[:, ~df_gs.columns.str.contains('STKCL_CG,')]
df_gs = df_gs.loc[:, ~df_gs.columns.str.contains('STKOP_ABR,')]
df_gs = df_gs.loc[:, ~df_gs.columns.str.contains('STKOP_CG,')]
df_gs = df_gs.loc[:, ~df_gs.columns.str.contains('STKOP_NAT,')]

df_gs.rename(columns=lambda col: col.replace('TJ_GCV,', ''), inplace=True)

df_gs1 = df_gs

'''
for country_code in df_gs.columns.str[-2:].unique():
    country_columns = [col for col in df_gs.columns if col[-2:] == country_code]
    df_gs1['TJ_GCV,' + country_code] = df_gs[country_columns].sum(axis=1)
'''


display(df_gs)

"freq,stk_flow,siec,unit,geo\TIME_PERIOD","STKCL_NAT,AL","STKCL_NAT,AT","STKCL_NAT,BE","STKCL_NAT,BG","STKCL_NAT,CY","STKCL_NAT,CZ","STKCL_NAT,DE","STKCL_NAT,DK","STKCL_NAT,EE","STKCL_NAT,EL",...,"STKCL_NAT,PT","STKCL_NAT,RO","STKCL_NAT,RS","STKCL_NAT,SE","STKCL_NAT,SI","STKCL_NAT,SK","STKCL_NAT,TR","STKCL_NAT,UA","STKCL_NAT,UK","STKCL_NAT,XK"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-01,:,92342.000,30185.000,16443.000,0.000,64566.000,0.000,82715.000,0.000,2541.000,...,6507.000,4634.000,:,0.000,0.000,52195.000,14412.000,:,166681.000,:
2008-02,:,73945.000,20517.000,13906.000,0.000,53077.000,0.000,76141.000,0.000,3165.000,...,6534.000,5413.000,:,0.000,0.000,49555.000,15788.000,:,127935.000,:
2008-03,:,56621.000,17266.000,11682.000,0.000,44186.000,0.000,69740.000,0.000,3825.000,...,6673.000,6760.000,:,0.000,0.000,47710.000,13277.000,:,113866.000,:
2008-04,:,61031.000,16707.000,12249.000,0.000,47756.000,0.000,69504.000,0.000,3431.000,...,7675.000,9040.000,:,0.000,0.000,49384.000,16044.000,:,131279.000,:
2008-05,:,82686.000,26768.000,14870.000,0.000,65124.000,0.000,76246.000,0.000,2571.000,...,7528.000,16192.000,:,0.000,0.000,59565.000,16499.000,:,201994.000,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10,0.000,349514.890,40601.100,20724.193,0.000,135054.219,886439.000,35507.064 p,0.000,6028.738,...,21824.438,125193.000,29115.000,872.925,:,175462.075,206295.385,:,:,:
2023-11,0.000,340927.573,38132.200,19179.292,0.000,127888.712,858236.000,33663.484 p,0.000,2687.634,...,18247.146,118221.000,29189.000,1051.694,:,166114.148,204424.962,:,:,:
2023-12,0.000,327013.225,34870.100,17441.643,0.000,119855.792,808729.000,33412.613,0.000,3782.360,...,19600.746 p,101350.000,26818.000,1529.200,:,151184.832,208465.690,:,:,:
2024-01,0.000,287403.475,24922.000,14368.378,0.000,96516.835,662684.000,28326.137,0.000,3449.177,...,17069.578 p,77624.000 p,23892.000,1319.786,:,140698.664,199957.835,:,:,:


In [82]:
gas_stocks = pd.DataFrame(0, index=mi, columns=['Total Gas Stocks (TJ/GCV)'])

for i in country_mapping:
        
    st = 'STKCL_NAT,' + i
    if st in df_gs1.columns:

        for j in df_gs1.index:
            
            
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace(' ', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('e', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('p', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('n', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('z', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('u', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('d', '')
            df_gs1.loc[j, st] = df_gs1.loc[j, st].replace('c', '')
           
            # Making the new feature
            if (df_gs1.loc[j, st] != ':') and (df_gs1.loc[j, st] != ': '):    
                s.loc[(i, j), 'Total Gas Stocks (TJ/GCV)'] = float(df_gs1.loc[j, st])
            else:
                s.loc[(i, j),'Total Gas Stocks (TJ/GCV)' ] = ':'
     
    else:
        s.loc[(i, j),'Total Gas Stocks (TJ/GCV)' ] = ':'
        
s.replace(':', np.nan, inplace=True)
#s = pd.merge(s,gas_stocks, left_index=True, right_index=True)

display(s)

Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh","Gas price for industries, no tax €/kWh","Electricity price for households, no tax €/kWh","Electricity price for industries, no tax €/kWh",Population,Total Gas Stocks (TJ/GCV)
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AL,2007-01,,,,,,,,,,,,,,,,,n
AL,2007-02,,,,,,,,,,,,,,,,,n
AL,2007-03,,,,,,,,,,,,,,,,,n
AL,2007-04,,,,,,,,,,,,,,,,,n
AL,2007-05,,,,,,,,,,,,,,,,,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,n,n,n,n,,
XK,2023-11,,,,,,,,,,,,n,n,n,n,,
XK,2023-12,,,,,,,,,,,,n,n,n,n,,
XK,2024-01,,,,,,,,,,,,n,n,n,n,,


### Mean Temperature Data

In [87]:
df_mt = pd.read_csv('mean_temperature.csv')

#Adapting to s dataframe date 

#df_mt['date'] = pd.to_datetime(df_mt['date']).dt.to_period('M')

# Rename columns using the country_mapping dictionary

country_mapping_rev = {
    'Austria': 'AT',
    'Belgium': 'BE',
    'Bulgaria': 'BG',
    'Switzerland': 'CH',
    'Czech Republic': 'CZ',
    'Germany': 'DE',
    'Denmark': 'DK',
    'Estonia': 'EE',
    'Spain': 'ES',
    'Finland': 'FI',
    'France': 'FR',
    'Great Britain': 'GB',
    'Greece': 'GR',
    'Croatia': 'HR',
    'Hungary': 'HU',
    'Ireland': 'IE',
    'Italy': 'IT',
    'Lithuania': 'LT',
    'Luxembourg': 'LU',
    'Latvia': 'LV',
    'Malta': 'MT',
    'Netherlands': 'NL',
    'Norway': 'NO',
    'Poland': 'PL',
    'Portugal': 'PT',
    'Romania': 'RO',
    'Russia': 'RU',
    'Sweden': 'SE',
    'Slovenia': 'SI',
    'Slovakia': 'SK',
    'United Kingdom': 'UK'
}


df_mt.rename(columns=country_mapping_rev, inplace=True)


df_mt['date'] = df_mt['date'].str.slice(stop=7) + ' '

df_mt.set_index('date', inplace=True)

display(df_mt.head)
df_mt.info()

<bound method NDFrame.head of                  AT         BE         BG         CH         CZ         DE  \
date                                                                         
2015-01   -0.811476   2.251527   0.258144  -1.176280  -0.480534   1.000013   
2015-02   -1.869132   2.051071   0.741122  -3.048363  -1.308914  -0.158522   
2015-03    3.172863   5.430645   3.814645   3.849481   3.582897   4.697487   
2015-04    7.401744   8.680649   8.719247   7.553757   7.459043   8.089296   
2015-05   12.552173  12.134363  16.477702  11.913712  12.579737  12.344438   
2015-06   16.911244  15.656911  18.572703  16.284660  16.863342  16.102493   
2015-07   21.039379  18.243516  23.918960  20.124702  21.520522  19.973409   
2015-08   20.723409  18.599251  23.669962  18.399460  22.573809  20.382401   
2015-09   13.035799  12.908044  19.765953  11.470611  14.108021  13.176499   
2015-10    7.500723   9.368270  11.042272   7.607362   7.797493   8.223023   
2015-11    5.459817   9.071417   8

<class 'pandas.core.frame.DataFrame'>
Index: 60 entries, 2015-01  to 2019-12 
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      60 non-null     float64
 1   BE      60 non-null     float64
 2   BG      60 non-null     float64
 3   CH      60 non-null     float64
 4   CZ      60 non-null     float64
 5   DE      60 non-null     float64
 6   DK      60 non-null     float64
 7   EE      60 non-null     float64
 8   ES      60 non-null     float64
 9   FI      60 non-null     float64
 10  FR      60 non-null     float64
 11  GB      60 non-null     float64
 12  GR      60 non-null     float64
 13  HR      60 non-null     float64
 14  HU      60 non-null     float64
 15  IE      60 non-null     float64
 16  IT      60 non-null     float64
 17  LT      60 non-null     float64
 18  LU      60 non-null     float64
 19  LV      60 non-null     float64
 20  NL      60 non-null     float64
 21  NO      60 non-null     float64
 

In [88]:


for i in country_mapping:
    
    if i in df_mt.columns:
            for j in df_mt.index:
                
           
                s.loc[(i, j), 'Average Temperature (°C)'] = float(df_mt.loc[j,i])
            
    else:
        s.loc[(i,j), 'Average Temperature (°C)'] = ':'


for i in country_mapping:
    if i in df_mt.columns:
        for j in df_mt.index:
            # Check if the index 'j' exists in DataFrame 's'
            if (i, j) in s.index:
                # Convert the value to float and assign it to 's'
                s.loc[(i, j), 'Average Temperature (°C)'] = float(df_mt.loc[j, i])
            else:
                # Handle the case where the index 'j' does not exist in DataFrame 's'
                # This may happen if 'j' exists in df_mt but not in s
                pass
    else:
        # Assign ':' to 's' if the column 'i' does not exist in df_mt
        for j in df_mt.index:
            s.loc[(i, j), 'Average Temperature (°C)'] = ':'













s.replace('n', np.nan, inplace=True)

s.replace(':', np.nan, inplace=True)

#s = pd.merge(s, average_temp, left_index=True, right_index=True)

display(s)

  s.loc[(i,j), 'Average Temperature (°C)'] = ':'


Unnamed: 0_level_0,Unnamed: 1_level_0,Electricity import (GWh),Electricity export (GWh),Electricity available (GWh),Electricity generated from natural gas GWh,Total electricity generated GWh,Fraction of electricity generated by gas,Gas exported Mm3,Gas produced Mm3,Gas consumed Mm3,Gas imported Mm3,Gas from other sources Mm3,"Gas price for households, no tax €/kWh","Gas price for industries, no tax €/kWh","Electricity price for households, no tax €/kWh","Electricity price for industries, no tax €/kWh",Population,Total Gas Stocks (TJ/GCV),Average Temperature (°C)
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
AL,2007-01,,,,,,,,,,,,,,,,,,
AL,2007-02,,,,,,,,,,,,,,,,,,
AL,2007-03,,,,,,,,,,,,,,,,,,
AL,2007-04,,,,,,,,,,,,,,,,,,
AL,2007-05,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XK,2023-10,198.320,243.327,513.611,0.000,468.603,0.0,,,,,,,,,,,,
XK,2023-11,,,,,,,,,,,,,,,,,,
XK,2023-12,,,,,,,,,,,,,,,,,,
XK,2024-01,,,,,,,,,,,,,,,,,,


In [89]:
s.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8834 entries, ('AL', '2007-01 ') to ('XK', '2024-02 ')
Data columns (total 18 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Electricity import (GWh)                        7081 non-null   object 
 1   Electricity export (GWh)                        7081 non-null   object 
 2   Electricity available (GWh)                     7082 non-null   object 
 3   Electricity generated from natural gas GWh      3075 non-null   object 
 4   Total electricity generated GWh                 3101 non-null   float64
 5   Fraction of electricity generated by gas        3006 non-null   float64
 6   Gas exported Mm3                                3916 non-null   object 
 7   Gas produced Mm3                                3911 non-null   object 
 8   Gas consumed Mm3                                4077 non-null   object 
 9   Gas import

### Exporting into a CSV

In [90]:
s.to_csv('Data.csv', index=True)