# Markowitz Analysis on Eurostat Data
Ellie Cox

This file will conduct markowitz and black-litterman analysis on energy supply data retrieved from eurostat. 
First with the full data, then taking the average when excluding one country at a time, the average when excluding one year of data at a time, and lastly using a randomly selected 75\% of the data

## Read Packages

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pypfopt as pyp
import math
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt import plotting
import cvxopt as opt
from cvxopt import blas, solvers
solvers.options['show_progress'] = False

## Read Data

In [182]:
# Read data using pandas
data = pd.read_csv("/Users/elizabeth/Documents/Master's Project/Data/EU_TotalEnergySupply.csv")

# Create list of column names
data.columns.values.tolist()
# Rename columns to make life easier
data.columns = [c.replace(' ', '_') for c in data.columns] # remove spaces
data.columns = [c.replace('(', '') for c in data.columns] # remove open parenthesis
data.columns = [c.replace(')', '') for c in data.columns] # remove close parenthesis
data.columns.values.tolist()

# Get rid of ':' and shorten other names
data = data.replace([':'],'')
data = data.replace(['European Union - 27 countries (from 2020)'],'EU')
data = data.replace(['Euro area - 19 countries  (from 2015)'],'Euro area')
data = data.replace(['Germany (until 1990 former territory of the FRG)'],'Germany')
data = data.replace(['Kosovo (under United Nations Security Council Resolution 1244/99)'],'Kosovo')

# Change Data type to numeric
data[data.columns[2:]] = data[data.columns[2:]].apply(pd.to_numeric, errors ='coerce')

## Missing Data
Missing Data is handled in 2 ways:

    1) Replacing missing data with the country's average
    2) Dropping it

In [186]:
## 1) Impute data with the average
frames = []
for i in list(set(data['Country'])):
            df_country = data[data['Country'] == i] 
            df_country['Total_GWH'].fillna(df_country['Total_GWH'].mean(),inplace = True)
            df_country['Solid_fossil_fuels'].fillna(df_country['Solid_fossil_fuels'].mean(), inplace = True)
            df_country['Peat_and_peat_products'].fillna(df_country['Peat_and_peat_products'].mean(), inplace = True)
            df_country['Solar_Thermal'].fillna(df_country['Solar_Thermal'].mean(), inplace = True)
            df_country['Oil_and_petroleum_products'].fillna(df_country['Oil_and_petroleum_products'].mean(), inplace = True)
            df_country['Natural_gas'].fillna(df_country['Natural_gas'].mean(), inplace = True)
            df_country['Renewables_and_biofuels'].fillna(df_country['Renewables_and_biofuels'].mean(), inplace = True)
            df_country['Nuclear_heat'].fillna(df_country['Nuclear_heat'].mean(),inplace = True)
            df_country['Hydro'].fillna(df_country['Hydro'].mean(),inplace = True)
            df_country['Geothermal'].fillna(df_country['Geothermal'].mean(),inplace = True)
            df_country['Ambient_Heat'].fillna(df_country['Ambient_Heat'].mean(),inplace = True)
            df_country['Tide_wave_and_ocean'].fillna(df_country['Tide_wave_and_ocean'].mean(),inplace = True)
            df_country['Wind'].fillna(df_country['Wind'].mean(),inplace = True)
            df_country['Biofuels_solid'].fillna(df_country['Biofuels_solid'].mean(),inplace = True)
            df_country['Biofuels_other'].fillna(df_country['Biofuels_other'].mean(),inplace = True)
            df_country['Biofuels'].fillna(df_country['Biofuels'].mean(),inplace = True)
            frames.append(df_country)
            final_df = pd.concat(frames)
#final_df[(final_df.Country == 'United Kingdom') & (final_df.Year == 2020)]
data_impute = final_df
data_impute.shape

(420, 18)

In [123]:
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(data_impute)

## Full Data
Using aggregated renewables for now

In [187]:
cdat = data_impute.loc[:,['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]

In [188]:
n = len(cdat.T) # n assets
returns = np.array(cdat).T
    
N = len(cdat) # n obs
mus = [(1/n) + t/N for t in range(N)]#[np.mean(returns, axis=0)]#[10**(5.0 * t/N - 1.0) for t in range(N)]
    
# Convert to cvxopt matrices
S = opt.matrix(np.cov(returns))
pbar = opt.matrix(np.mean(returns, axis = 1))
    
# Create constraint matrices
G = -opt.matrix(np.eye(n)) # negative nxn identity matrix
h = opt.matrix(0.0, (n,1))
A = opt.matrix(1.0, (1,n))
b = opt.matrix(1.0)
    
# Calculate efficient frontier weights using quadratic programming
portfolios = [solvers.qp(mu*S, -pbar, G, h, A, b)['x']
                  for mu in mus]
## Calculate risk and returns for frontier
ret = [blas.dot(pbar, x) for x in portfolios]
risks = [np.sqrt(blas.dot(x, S*x)) for x in portfolios]
## Calculate the 2nd degree polynomail of the frontier curve
m1 = np.polyfit(ret, risks, 2)
x1 = np.sqrt(m1[2] / m1[0])
## Calculate the optimal portfolio
wt = solvers.qp(opt.matrix(x1 * S), -pbar, G, h, A, b)['x']
ret = np.zeros((n,1))
rsk = np.zeros((n,1))
for j in range(n):
    ret[j] = np.sum(np.array(wt)[j] * np.mean(returns, axis = 1))
    rsk = wt.T @ np.cov(returns) @ wt


  exec(code_obj, self.user_global_ns, self.user_ns)


In [153]:
np.array(wt).shape

(6, 1)

In [160]:
print(np.asarray(wt),ret,rsk)

[[5.20240713e-11]
 [9.99999999e-01]
 [1.10133542e-10]
 [5.78058399e-11]
 [1.92247506e-10]
 [3.46906499e-10]] [[6.35393641e-05]
 [1.22134547e+06]
 [1.34511103e-04]
 [7.06009010e-05]
 [2.34800621e-04]
 [4.23692682e-04]] [[39562171.71864054]]


## Repeat average excluding one country at a time

In [172]:
def markowitz(ret_mat):
    '''
    Calculates the markowitz optimal portfolio weights, returns, and risks for one set of return data
    returns are of shape: n_obs x n_assets
    '''
    n = len(ret_mat.T) # n assets
    return_vec = np.array(ret_mat).T
    
    N = len(ret_mat) # n obs
    mus = [(1/n) + t/N for t in range(N)]#[np.mean(returns, axis=0)]#[10**(5.0 * t/N - 1.0) for t in range(N)]
    
    # Convert to cvxopt matrices
    S = opt.matrix(np.cov(return_vec))
    pbar = opt.matrix(np.mean(return_vec, axis = 1))
    
    # Create constraint matrices
    G = -opt.matrix(np.eye(n)) # negative nxn identity matrix
    h = opt.matrix(0.0, (n,1))
    A = opt.matrix(1.0, (1,n))
    b = opt.matrix(1.0)
    
    # Calculate efficient frontier weights using quadratic programming
    portfolios = [solvers.qp(mu*S, -pbar, G, h, A, b)['x']
                  for mu in mus]
    ## Calculate risk and returns for frontier
    ret = [blas.dot(pbar, x) for x in portfolios]
    risks = [np.sqrt(blas.dot(x, S*x)) for x in portfolios]
    ## Calculate the 2nd degree polynomail of the frontier curve
    m1 = np.polyfit(ret, risks, 2)
    x1 = np.sqrt(m1[2] / m1[0])
    ## Calculate the optimal portfolio
    wt = solvers.qp(opt.matrix(x1 * S), -pbar, G, h, A, b)['x']
    ret = np.zeros((n,1))
    rsk = np.zeros((n,1))
    for j in range(n):
        ret[j] = np.sum(np.array(wt)[j] * np.mean(returns, axis = 1))
        rsk = wt.T @ np.cov(returns) @ wt
    return np.asarray(wt), ret, rsk

In [190]:
cdat = data_impute.loc[:,['Country','Year','Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]

In [191]:
weight_res = np.zeros((len(data.groupby('Country')),len(cdat.T)-2))
return_res = np.zeros((len(data.groupby('Country')),len(cdat.T)-2))
risks_res = np.zeros((len(data.groupby('Country')),len(cdat.T)-2))

for i in range(len(data.groupby('Country'))):
    country_dat = cdat.loc[(cdat.Country != cdat.Country.unique()[i])]
    country_dat = country_dat.loc[:,['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]
    weights, returns, risks = markowitz(country_dat)
    weight_res[i,:] = weights.T
    return_res[i,:] = np.array(returns).T
    risks_res[i,:] = np.array(risks).T

  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowi

In [199]:
print('weights (mean) = '+str(np.mean(weight_res, axis=0)),
      '\nreturns (mean) = '+str(np.mean(return_res, axis=0)),
      '\nrisks (mean) ='+str(np.mean(risks_res, axis=0)))

weights (mean) = [8.72009909e-11 9.99999999e-01 1.28920129e-10 7.59508692e-11
 4.41303026e-10 6.23036381e-10] 
returns (mean) = [1.04244229e-04 1.19544776e+06 1.54117279e-04 9.07952966e-05
 5.27554715e-04 7.44807447e-04] 
risks (mean) =[nan nan nan nan nan nan]


In [192]:
weight_res = np.zeros((len(data.groupby('Year')),len(cdat.T)-2))
return_res = np.zeros((len(data.groupby('Year')),len(cdat.T)-2))
risks_res = np.zeros((len(data.groupby('Year')),len(cdat.T)-2))

for i in range(len(data.groupby('Year'))):
    year_dat = cdat.loc[(cdat.Year != cdat.Year.unique()[i])]
    year_dat = year_dat.loc[:,['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]
    weights, returns, risks = markowitz(year_dat)
    weight_res[i,:] = weights.T
    return_res[i,:] = np.array(returns).T
    risks_res[i,:] = np.array(risks).T

  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np

In [200]:
print('weights (mean) = '+str(np.mean(weight_res, axis=0)),
      '\nreturns (mean) = '+str(np.mean(return_res, axis=0)),
      '\nrisks (mean) ='+str(np.mean(risks_res, axis=0)))

weights (mean) = [8.72009909e-11 9.99999999e-01 1.28920129e-10 7.59508692e-11
 4.41303026e-10 6.23036381e-10] 
returns (mean) = [1.04244229e-04 1.19544776e+06 1.54117279e-04 9.07952966e-05
 5.27554715e-04 7.44807447e-04] 
risks (mean) =[nan nan nan nan nan nan]


## Repeat now dropping missing data

In [175]:
data_drop = data.dropna()
data_drop.shape    

(413, 18)

In [193]:
cdat = data_drop.loc[:,['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]

In [194]:
n = len(cdat.T) # n assets
returns = np.array(cdat).T
    
N = len(cdat) # n obs
mus = [(1/n) + t/N for t in range(N)]#[np.mean(returns, axis=0)]#[10**(5.0 * t/N - 1.0) for t in range(N)]
    
# Convert to cvxopt matrices
S = opt.matrix(np.cov(returns))
pbar = opt.matrix(np.mean(returns, axis = 1))
    
# Create constraint matrices
G = -opt.matrix(np.eye(n)) # negative nxn identity matrix
h = opt.matrix(0.0, (n,1))
A = opt.matrix(1.0, (1,n))
b = opt.matrix(1.0)
    
# Calculate efficient frontier weights using quadratic programming
portfolios = [solvers.qp(mu*S, -pbar, G, h, A, b)['x']
                  for mu in mus]
## Calculate risk and returns for frontier
ret = [blas.dot(pbar, x) for x in portfolios]
risks = [np.sqrt(blas.dot(x, S*x)) for x in portfolios]
## Calculate the 2nd degree polynomail of the frontier curve
m1 = np.polyfit(ret, risks, 2)
x1 = np.sqrt(m1[2] / m1[0])
## Calculate the optimal portfolio
wt = solvers.qp(opt.matrix(x1 * S), -pbar, G, h, A, b)['x']
ret = np.zeros((n,1))
rsk = np.zeros((n,1))
for j in range(n):
    ret[j] = np.sum(np.array(wt)[j] * np.mean(returns, axis = 1))
    rsk = wt.T @ np.cov(returns) @ wt

  exec(code_obj, self.user_global_ns, self.user_ns)


In [203]:
print('weights = '+str(wt),
      '\nreturns = '+str(ret),
      '\nrisks (mean) ='+str(rsk))

weights = [ 3.96e-11]
[ 1.00e+00]
[ 6.82e-11]
[ 1.95e-11]
[ 2.34e-10]
[ 3.44e-10]
 
returns = [[4.73611556e-05]
 [1.19544776e+06]
 [8.14834554e-05]
 [2.33189393e-05]
 [2.80026965e-04]
 [4.11535725e-04]] 
risks (mean) =[[37246969.01880678]]


## Now drop one country at a time

In [195]:
cdat = data_drop.loc[:,['Country','Year','Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]

In [196]:
weight_res = np.zeros((len(data.groupby('Country')),len(cdat.T)-2))
return_res = np.zeros((len(data.groupby('Country')),len(cdat.T)-2))
risks_res = np.zeros((len(data.groupby('Country')),len(cdat.T)-2))

for i in range(len(data.groupby('Country'))):
    country_dat = cdat.loc[(cdat.Country != cdat.Country.unique()[i])]
    country_dat = country_dat.loc[:,['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]
    weights, returns, risks = markowitz(country_dat)
    weight_res[i,:] = weights.T
    return_res[i,:] = np.array(returns).T
    risks_res[i,:] = np.array(risks).T

  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_dat)
  weights, returns, risks = markowitz(country_

In [204]:
print('weights (mean) = '+str(np.mean(weight_res, axis=0)),
      '\nreturns (mean) = '+str(np.mean(return_res, axis=0)),
      '\nrisks (mean) ='+str(np.mean(risks_res, axis=0)))

weights (mean) = [8.72009909e-11 9.99999999e-01 1.28920129e-10 7.59508692e-11
 4.41303026e-10 6.23036381e-10] 
returns (mean) = [1.04244229e-04 1.19544776e+06 1.54117279e-04 9.07952966e-05
 5.27554715e-04 7.44807447e-04] 
risks (mean) =[nan nan nan nan nan nan]


## Drop one year at a time

In [197]:
weight_res = np.zeros((len(data.groupby('Year')),len(cdat.T)-2))
return_res = np.zeros((len(data.groupby('Year')),len(cdat.T)-2))
risks_res = np.zeros((len(data.groupby('Year')),len(cdat.T)-2))

for i in range(len(data.groupby('Year'))):
    year_dat = cdat.loc[(cdat.Year != cdat.Year.unique()[i])]
    year_dat = year_dat.loc[:,['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                  'Natural_gas', 'Nuclear_heat', 'Renewables_and_biofuels']]
    weights, returns, risks = markowitz(year_dat)
    weight_res[i,:] = weights.T
    return_res[i,:] = np.array(returns).T
    risks_res[i,:] = np.array(risks).T

  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  weights, returns, risks = markowitz(year_dat)
  rsk = wt.T @ np.cov(returns) @ wt
  c *= np.true_divide(1, fact)
  c *= np

In [205]:
print('weights (mean) = '+str(np.mean(weight_res, axis=0)),
      '\nreturns (mean) = '+str(np.mean(return_res, axis=0)),
      '\nrisks (mean) ='+str(np.mean(risks_res, axis=0)))

weights (mean) = [8.72009909e-11 9.99999999e-01 1.28920129e-10 7.59508692e-11
 4.41303026e-10 6.23036381e-10] 
returns (mean) = [1.04244229e-04 1.19544776e+06 1.54117279e-04 9.07952966e-05
 5.27554715e-04 7.44807447e-04] 
risks (mean) =[nan nan nan nan nan nan]
