# Black-Litterman Analysis on Eurostat Data
Ellie Cox

This file conducts Markowitz and Black-Litterman analysis on energy supply data retrieved from Eurostat. 
The analysis is run first on the full data, then taking the average when excluding one country at a time, the average when excluding one year of data at a time, and lastly using a randomly selected 80\% of the data (i.e. dropping a random 20\%).

## Load Packages

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stat
import matplotlib.pyplot as plt
from pylab import rcParams
import random
import pickle
%matplotlib inline

In [None]:
import pypfopt as pyp
import warnings
warnings.filterwarnings("ignore")

## Read Data

In [None]:
# Load the total energy supply table
data = pd.read_csv("/Users/elizabeth/Documents/Master's Project/Data/EU_TotalEnergySupply.csv")

# Normalise the column names in a single pass:
# spaces become underscores, parentheses are removed
data.columns = [
    c.replace(' ', '_').replace('(', '').replace(')', '')
    for c in data.columns
]

# Blank out ':' cells and shorten the long country/region labels
data = (
    data.replace([':'], '')
    .replace(['European Union - 27 countries (from 2020)'], 'EU')
    .replace(['Euro area - 19 countries  (from 2015)'], 'Euro area')
    .replace(['Germany (until 1990 former territory of the FRG)'], 'Germany')
    .replace(['Kosovo (under United Nations Security Council Resolution 1244/99)'], 'Kosovo')
)

# Convert every column from the third one onwards to numeric;
# errors='coerce' turns cells that cannot be parsed (e.g. the blanked ones) into NaN
data[data.columns[2:]] = data[data.columns[2:]].apply(pd.to_numeric, errors='coerce')

## Missing Data
Missing Data is handled in 2 ways:

    1) Replacing missing data with the country's average
    2) Dropping it

## Impute Data

In [None]:
## 1) Impute data with the average
# Columns whose missing values are replaced by the mean of the same column
# within the same country.
impute_cols = [
    'Total_GWH', 'Solid_fossil_fuels', 'Peat_and_peat_products', 'Solar_Thermal',
    'Oil_and_petroleum_products', 'Natural_gas', 'Renewables_and_biofuels',
    'Nuclear_heat', 'Hydro', 'Geothermal', 'Ambient_Heat', 'Tide_wave_and_ocean',
    'Wind', 'Biofuels_solid', 'Biofuels_other', 'Biofuels',
]

frames = []
# unique() keeps first-appearance order, so the row order of the result is
# reproducible (iterating a set of strings is not stable between runs).
for i in data['Country'].unique():
    # Work on an explicit copy: filling a column of a filtered slice in place
    # is chained assignment and is not guaranteed to modify the frame.
    df_country = data[data['Country'] == i].copy()
    # fillna with a Series fills each column with the value stored under its name;
    # a column that is entirely missing for this country has a NaN mean and stays NaN.
    df_country[impute_cols] = df_country[impute_cols].fillna(df_country[impute_cols].mean())
    frames.append(df_country)

# Concatenate once, after all per-country frames have been collected
final_df = pd.concat(frames)
data_impute = final_df

## Define Black-Litterman Function

In [None]:
def bl(n_assets, n_obs, return_vec, seed=None):
    '''
    Run one Black-Litterman estimation with four randomly generated views.

    The prior is the column-wise mean of return_vec. The covariance passed to the
    model is a Ledoit-Wolf shrinkage estimate computed on data simulated from a
    zero-mean multivariate normal with the sample covariance of return_vec.
    Each view is a random Dirichlet-weighted combination of all assets (rows of P),
    and the view values Q are the mean returns of the first four columns.

    Inputs: 
    n_assets: Number of assets (columns of return_vec); must be at least 4
    n_obs: Number of observations to simulate for the covariance estimate
    return_vec: A matrix of returns of shape n_obs x n_assets (a np.array)
    seed: Optional seed for the random number generator; None (the default)
          gives a different draw on every call
    Returns: 
    weights: cleaned BL weights as returned by clean_weights()
    bl_return: BL posterior returns
    S_bl: BL posterior covariance
    '''
    n_views = 4
    # A single generator drives both the simulated data and the view matrix,
    # so passing a seed makes the whole call reproducible.
    rng = np.random.default_rng(seed)

    # Market returns; with R_f = 0 the same vector is the equilibrium risk premium
    market_prior = pd.Series(np.mean(return_vec, axis = 0))
    sample_cov = np.cov(return_vec.T)

    simulated = rng.multivariate_normal(np.zeros(n_assets), cov = sample_cov, size = n_obs)
    # NOTE(review): PyPortfolioOpt documents CovarianceShrinkage as taking prices
    # unless returns_data=True is passed - confirm the default is intended here.
    S = pyp.risk_models.CovarianceShrinkage(simulated).ledoit_wolf()
    # NOTE(review): risk_free_rate=0.5 here, while the prior above assumes R_f = 0,
    # and the first argument is documented as market prices - confirm both.
    delta = pyp.black_litterman.market_implied_risk_aversion(market_prior, risk_free_rate=0.5)

    # View values: mean returns of the first n_views columns, as a column vector
    Q = np.reshape(np.mean(return_vec, axis = 0),(n_assets,1))[0:n_views,:]
    # Picking matrix: each row is a random point on the simplex over all assets
    P = rng.dirichlet(np.ones(n_assets), size=n_views)

    model = pyp.BlackLittermanModel(S, pi=market_prior, Q = Q, P = P)
    bl_return = model.bl_returns()

    # bl_weights stores the weights on the model; clean_weights reads them back
    model.bl_weights(delta)
    weights = model.clean_weights()

    S_bl = model.bl_cov()
    return weights, bl_return, S_bl

## Analysis on Imputed Data

In [None]:
# Energy-source columns of the imputed data used for the optimization
cdat = data_impute[[
    'Solid_fossil_fuels',
    'Peat_and_peat_products',
    'Oil_and_petroleum_products',
    'Natural_gas',
    'Nuclear_heat',
    'Hydro',
    'Solar_Thermal',
    'Geothermal',
    'Ambient_Heat',
    'Tide_wave_and_ocean',
    'Biofuels_solid',
    'Biofuels_other',
]]

In [None]:
## Single optimization on the full imputed data set: the result of one run is
## reported rather than an average over several runs

## View: column medians. bl() takes no view argument, so this is only relevant
## when absolute confidence is used
view = np.array(np.median(cdat, axis = 0))

full_wt, full_rt, full_rsk = bl(cdat.shape[1], cdat.shape[0], np.array(cdat))

## Keep only the weight values, in the order the assets are returned
w_full = list(full_wt.values())
weight_full = w_full

## Calculate average over dropping one country at a time

In [None]:
## Imputed data with 'Country' and 'Year' kept so rows can be filtered on them
cdat = data_impute[[
    'Country',
    'Year',
    'Solid_fossil_fuels',
    'Peat_and_peat_products',
    'Oil_and_petroleum_products',
    'Natural_gas',
    'Nuclear_heat',
    'Hydro',
    'Solar_Thermal',
    'Geothermal',
    'Ambient_Heat',
    'Tide_wave_and_ocean',
    'Biofuels_solid',
    'Biofuels_other',
]]

In [None]:
## Energy-source columns handed to bl(); 'Country' and 'Year' are only used for filtering
feature_cols = ['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                'Natural_gas', 'Nuclear_heat', 'Hydro', 'Solar_Thermal', 'Geothermal', 
                'Ambient_Heat', 'Tide_wave_and_ocean', 'Biofuels_solid', 'Biofuels_other']

## Take the countries from the frame that is actually filtered, computed once,
## so the result arrays and the loop always agree in length
countries = cdat.Country.unique()

## One row per excluded country, one column per energy source
weight_country = np.zeros((len(countries), len(feature_cols)))
return_res = np.zeros((len(countries), len(feature_cols)))
risks_res = np.zeros((len(countries), len(feature_cols)))

for i, country in enumerate(countries):
    ## All rows except those of the excluded country
    country_dat = cdat.loc[cdat.Country != country, feature_cols]
    ## Set view - the median; bl() takes no view argument, so this is currently unused
    view = np.array(np.median(country_dat, axis = 0))
    weights, returns, risks = bl(country_dat.shape[1], country_dat.shape[0], np.array(country_dat))
    w = list(weights.values())
    weight_country[i,:] = w
    return_res[i,:] = np.array(returns).T
    ## Only the diagonal of the BL covariance is stored
    risks_res[i,:] = np.diagonal(np.array(risks))
    

## Calculate average over dropping one year at a time

In [None]:
## Energy-source columns handed to bl(); 'Country' and 'Year' are only used for filtering
feature_cols = ['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                'Natural_gas', 'Nuclear_heat', 'Hydro', 'Solar_Thermal', 'Geothermal', 
                'Ambient_Heat', 'Tide_wave_and_ocean', 'Biofuels_solid', 'Biofuels_other']

## Take the years from the frame that is actually filtered, computed once,
## so the result arrays and the loop always agree in length
years = cdat.Year.unique()

## One row per excluded year, one column per energy source
weight_year = np.zeros((len(years), len(feature_cols)))
return_res = np.zeros((len(years), len(feature_cols)))
risks_res = np.zeros((len(years), len(feature_cols)))

for i, year in enumerate(years):
    ## All rows except those of the excluded year
    year_dat = cdat.loc[cdat.Year != year, feature_cols]
    ## Set view - the median; bl() takes no view argument, so this is currently unused
    view = np.array(np.median(year_dat, axis = 0))
    weights, returns, risks = bl(year_dat.shape[1], year_dat.shape[0], np.array(year_dat))
    w = list(weights.values())
    weight_year[i,:] = w
    return_res[i,:] = np.array(returns).T
    ## Only the diagonal of the BL covariance is stored
    risks_res[i,:] = np.diagonal(np.array(risks))

## Drop Missing Data

In [None]:
# 2) Drop every row that has at least one missing value
data_drop = data.dropna(axis=0, how='any')

Full results

In [None]:
# Energy-source columns of the data with incomplete rows dropped
cdat = data_drop[[
    'Solid_fossil_fuels',
    'Peat_and_peat_products',
    'Oil_and_petroleum_products',
    'Natural_gas',
    'Nuclear_heat',
    'Hydro',
    'Solar_Thermal',
    'Geothermal',
    'Ambient_Heat',
    'Tide_wave_and_ocean',
    'Biofuels_solid',
    'Biofuels_other',
]]

In [None]:
## Run one optimization - this returns the results for the full data set and will report the 
## results of one optimization instead of an average 

## Set view - the median; bl() takes no view argument, so this is currently unused
view = np.array(np.median(cdat, axis = 0))

full_wt, full_rt, full_rsk = bl(cdat.shape[1], cdat.shape[0], np.array(cdat))

## Keep only the weight values. `weight_full` is deliberately NOT rebound here:
## it holds the imputed-data result that is pickled in the "Save Results" cell.
w_full = list(full_wt.values())
weight_full_drop = w_full

Calculate average over dropping one country at a time

In [None]:
## Dropped-NaN data with 'Country' and 'Year' kept so rows can be filtered on them
cdat = data_drop[[
    'Country',
    'Year',
    'Solid_fossil_fuels',
    'Peat_and_peat_products',
    'Oil_and_petroleum_products',
    'Natural_gas',
    'Nuclear_heat',
    'Hydro',
    'Solar_Thermal',
    'Geothermal',
    'Ambient_Heat',
    'Tide_wave_and_ocean',
    'Biofuels_solid',
    'Biofuels_other',
]]

In [None]:
## Energy-source columns handed to bl(); 'Country' and 'Year' are only used for filtering
feature_cols = ['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                'Natural_gas', 'Nuclear_heat', 'Hydro', 'Solar_Thermal', 'Geothermal', 
                'Ambient_Heat', 'Tide_wave_and_ocean', 'Biofuels_solid', 'Biofuels_other']

## Use the countries that remain in `cdat`: dropping incomplete rows can remove
## whole countries, so counting them from the unfiltered `data` could index past
## the end of cdat.Country.unique()
countries = cdat.Country.unique()

## One row per excluded country, one column per energy source
weight_country_drop = np.zeros((len(countries), len(feature_cols)))
return_res = np.zeros((len(countries), len(feature_cols)))
risks_res = np.zeros((len(countries), len(feature_cols)))

for i, country in enumerate(countries):
    ## All rows except those of the excluded country
    country_dat = cdat.loc[cdat.Country != country, feature_cols]
    ## Set view - the median; bl() takes no view argument, so this is currently unused
    view = np.array(np.median(country_dat, axis = 0))
    weights, returns, risks = bl(country_dat.shape[1], country_dat.shape[0], np.array(country_dat))
    w = list(weights.values())
    weight_country_drop[i,:] = w
    return_res[i,:] = np.array(returns).T
    ## Only the diagonal of the BL covariance is stored
    risks_res[i,:] = np.diagonal(np.array(risks))

Calculate average over dropping one year at a time

In [None]:
## Energy-source columns handed to bl(); 'Country' and 'Year' are only used for filtering
feature_cols = ['Solid_fossil_fuels','Peat_and_peat_products','Oil_and_petroleum_products', 
                'Natural_gas', 'Nuclear_heat', 'Hydro', 'Solar_Thermal', 'Geothermal', 
                'Ambient_Heat', 'Tide_wave_and_ocean', 'Biofuels_solid', 'Biofuels_other']

## Use the years that remain in `cdat`: dropping incomplete rows can remove
## whole years, so counting them from the unfiltered `data` could index past
## the end of cdat.Year.unique()
years = cdat.Year.unique()

## One row per excluded year, one column per energy source
weight_year_drop = np.zeros((len(years), len(feature_cols)))
return_res = np.zeros((len(years), len(feature_cols)))
risks_res = np.zeros((len(years), len(feature_cols)))

for i, year in enumerate(years):
    ## All rows except those of the excluded year
    year_dat = cdat.loc[cdat.Year != year, feature_cols]
    ## Set view - the median; bl() takes no view argument, so this is currently unused
    view = np.array(np.median(year_dat, axis = 0))
    weights, returns, risks = bl(year_dat.shape[1], year_dat.shape[0], np.array(year_dat))
    w = list(weights.values())
    weight_year_drop[i,:] = w
    return_res[i,:] = np.array(returns).T
    ## Only the diagonal of the BL covariance is stored
    risks_res[i,:] = np.diagonal(np.array(risks))

## Drop random 20%

In [None]:
# Energy-source columns of the imputed data, used for the random subsampling runs
cdat = data_impute[[
    'Solid_fossil_fuels',
    'Peat_and_peat_products',
    'Oil_and_petroleum_products',
    'Natural_gas',
    'Nuclear_heat',
    'Hydro',
    'Solar_Thermal',
    'Geothermal',
    'Ambient_Heat',
    'Tide_wave_and_ocean',
    'Biofuels_solid',
    'Biofuels_other',
]]


In [None]:
n_runs = 100

## One row per run, one column per energy source
weight_rand = np.zeros((n_runs, cdat.shape[1]))
return_res = np.zeros((n_runs, cdat.shape[1]))
risks_res = np.zeros((n_runs, cdat.shape[1]))

## Size of each subsample: 80% of the FULL data, fixed before the loop
n_rows = cdat.shape[0]
_80_perct = int(n_rows*4/5)

for i in range(n_runs):
    # Randomly take out 20% of the data. The subsample goes into its own variable:
    # reassigning `cdat` here would make every run sample from the previous run's
    # subsample, shrinking the data by 20% per iteration.
    subsample = cdat.iloc[random.sample(range(n_rows), _80_perct)]

    ## Set view - the median; bl() takes no view argument, so this is currently unused
    view = np.array(np.median(subsample, axis = 0))

    full_wt, full_rt, full_rsk = bl(subsample.shape[1], subsample.shape[0], np.array(subsample))
    ## `weight_full` is deliberately NOT rebound here: it holds the full-data
    ## result that is pickled in the "Save Results" cell.
    w_full = list(full_wt.values())
    weight_rand[i,:] = w_full
    return_res[i,:] = full_rt
    ## Only the diagonal of the BL covariance is stored
    risks_res[i,:] = np.diagonal(np.array(full_rsk))
    print(i)

In [None]:
# Energy-source columns of the dropped-NaN data, used for the random subsampling runs
cdat = data_drop[[
    'Solid_fossil_fuels',
    'Peat_and_peat_products',
    'Oil_and_petroleum_products',
    'Natural_gas',
    'Nuclear_heat',
    'Hydro',
    'Solar_Thermal',
    'Geothermal',
    'Ambient_Heat',
    'Tide_wave_and_ocean',
    'Biofuels_solid',
    'Biofuels_other',
]]

In [None]:
n_runs = 100

## One row per run, one column per energy source
weight_rand_drop = np.zeros((n_runs, cdat.shape[1]))
return_res = np.zeros((n_runs, cdat.shape[1]))
risks_res = np.zeros((n_runs, cdat.shape[1]))

## Size of each subsample: 80% of the FULL data, fixed before the loop
n_rows = cdat.shape[0]
_80_perct = int(n_rows*4/5)

for i in range(n_runs):
    # Randomly take out 20% of the data. The subsample goes into its own variable:
    # reassigning `cdat` here would make every run sample from the previous run's
    # subsample, shrinking the data by 20% per iteration.
    subsample = cdat.iloc[random.sample(range(n_rows), _80_perct)]

    ## Set view - the median; bl() takes no view argument, so this is currently unused
    view = np.array(np.median(subsample, axis = 0))

    full_wt, full_rt, full_rsk = bl(subsample.shape[1], subsample.shape[0], np.array(subsample))
    ## `weight_full` is deliberately NOT rebound here: it holds the full-data
    ## result that is pickled in the "Save Results" cell.
    w_full = list(full_wt.values())
    weight_rand_drop[i,:] = w_full
    return_res[i,:] = full_rt
    ## Only the diagonal of the BL covariance is stored
    risks_res[i,:] = np.diagonal(np.array(full_rsk))
    print(i)

## Save Results

In [None]:
# Weights from every analysis: the four imputed-data results first, then the
# four dropped-NaN results, each in the order full / country / year / random.
# NOTE(review): verify that `weight_full` still holds the imputed full-data weights
# at this point - any later cell that rebinds the name changes what is pickled.
BLResults_Weights = [weight_full, weight_country, weight_year, weight_rand, 
                              weight_full_drop, weight_country_drop, weight_year_drop, weight_rand_drop]

file_name = "EUBLResultsWeights.pkl"
# The context manager closes the file even if pickling raises
with open(file_name, "wb") as open_file:
    pickle.dump(BLResults_Weights, open_file)