In [1]:
from IPython.display import display, HTML
import plotly.graph_objects as go
from plotly import express as px
import pandas as pd
import datetime as dt
import numpy as np
import glob
import os
import re
import warnings

warnings.simplefilter("ignore")

### Goal

Get an overview of Wintershall Noordzee B.V. assets in the North Sea, looking at:
1. Fields and licenses owned by Wintershall since 2015
2. Production of gas, oil and condensates since 2015
3. Share of gas, oil and condensate production in the Dutch North Sea
4. Share of Gazprom

### Data sets

1. License history Netherlands
2. License history with shares UK 2012-2020
3. License history UK
4. Production data Netherlands per field / license

### Steps

1. Get all fields and licenses
2. Compare production data set with historical licenses NL
3. Query UK data for Wintershall Noordzee B.V.
4. Get share of Wintershall in fields, so look for co-use
5. Get share of production in Dutch North Sea production

## Historical licenses

In [2]:
PATH = '/Users/dt/Documents/Projecten/northsea/data/'

In [3]:
# Import historical licenses from Netherlands

df_license = pd.read_excel(PATH + 'licenses/nl_licenses/nl_production_licenses.xlsx', 
                           sheet_name='license_raw', 
                           usecols=['observation_date',
                                    'mutation_start',
                                    'mutation_end',
                                    'mutation_type',
                                    'licensee',
                                    'licensee_normalized',
                                    'license',
                                    'license_normalized',
                                    'nr'])

df_license.sort_values(by='observation_date', ascending=False).head()

Unnamed: 0,observation_date,mutation_start,mutation_end,mutation_type,nr,licensee,licensee_normalized,license,license_normalized
831,2022-01-01,NaT,NaT,,88,Neptune Energy Participation Netherlands B.V.,Neptune Energy Group,D12a,D12a
1425,2022-01-01,NaT,NaT,,72,TAQA Offshore B.V.,TAQA,P18c,P18c
869,2022-01-01,NaT,NaT,,45,ONE-Dyas B.V.,ONE-Dyas,L11c,L11c
1409,2022-01-01,NaT,NaT,,70,TAQA Offshore B.V.,TAQA,"P15c, P15g, P15h, P15i & P15j","P15c, P15g, P15h, P15i & P15j"
1411,2022-01-01,NaT,NaT,,71,TAQA Offshore B.V.,TAQA,P18a,P18a


In [4]:
def assign_start_date(row):
    if row['mutation_start'] is pd.NaT:
        value = row['observation_date']
    else:
        value = row['mutation_start']
    return value

def assign_end_date(row):
    if row['mutation_end'] is pd.NaT:
        value = row['observation_date'] + pd.offsets.DateOffset(years=1)
    else:
        value = row['mutation_end']
    return value

df_license['start_date'] = df_license.apply(assign_start_date, axis=1)
df_license['end_date'] = df_license.apply(assign_end_date, axis=1)
df_license.head()

Unnamed: 0,observation_date,mutation_start,mutation_end,mutation_type,nr,licensee,licensee_normalized,license,license_normalized,start_date,end_date
0,2021-01-01,2019-12-20,NaT,Acquired,63,Aceiro Energy B.V.,Aceiro Energy B.V.,P08a,P08a,2019-12-20,2022-01-01
1,2022-01-01,NaT,NaT,,64,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a,P09a,2022-01-01,2023-01-01
2,2017-01-01,NaT,NaT,,65,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a & P09b,P09a,2017-01-01,2018-01-01
3,2016-01-01,NaT,NaT,,65,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a & P09b,P09a,2016-01-01,2017-01-01
4,2015-01-01,NaT,NaT,,59,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a & P09b,P09a,2015-01-01,2016-01-01


In [5]:
# Get min and max observation dates

dfg_license = df_license.groupby(['licensee_normalized', 'license_normalized'])
dfg_license = df_license.agg(start_date=('start_date', np.min), end_date=('end_date', np.max))

# Re-index the dataframe

dfg_license = df_license.reset_index()

# And see the result

dfg_license.head()

Unnamed: 0,index,observation_date,mutation_start,mutation_end,mutation_type,nr,licensee,licensee_normalized,license,license_normalized,start_date,end_date
0,0,2021-01-01,2019-12-20,NaT,Acquired,63,Aceiro Energy B.V.,Aceiro Energy B.V.,P08a,P08a,2019-12-20,2022-01-01
1,1,2022-01-01,NaT,NaT,,64,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a,P09a,2022-01-01,2023-01-01
2,2,2017-01-01,NaT,NaT,,65,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a & P09b,P09a,2017-01-01,2018-01-01
3,3,2016-01-01,NaT,NaT,,65,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a & P09b,P09a,2016-01-01,2017-01-01
4,4,2015-01-01,NaT,NaT,,59,Aceiro Energy B.V.,Aceiro Energy B.V.,P09a & P09b,P09a,2015-01-01,2016-01-01


## Historical Production data

In [6]:
# Import production license data

dfs= []
for file in glob.glob(PATH + 'production/nl_production/per_license/lice*.xlsx'):
    commodity = os.path.basename(file)[7:-18]
    df = pd.read_excel(file, skiprows=1, engine='openpyxl')
    df['type'] = commodity
    dfs.append(df)
    
df_production = pd.concat(dfs)
len(df_production)

2452

In [7]:
# Melt month columns into one colum

df_production = df_production.melt(id_vars = ['LICENCE', 'OPERATOR', 'YEAR', 'type'], var_name='month', value_name='1000_sm3')

# Clean NaNs
df_production = df_production.dropna(subset=['YEAR'])

# Convert month name and year to date column

df_production['YEAR'] = df_production['YEAR'].astype('int')

df_production['date'] = pd.to_datetime(df_production['YEAR'].astype(str) + '-' + df_production['month'], format='%Y-%b')

# Drop columns and rename

df_production = df_production.rename(columns={'LICENCE': 'license',
                                              'OPERATOR': 'operator_name'})

df_production = df_production.drop(['YEAR', 'month'], axis=1)

df_production['country'] = 'Netherlands'

df_production['year'] = pd.to_datetime(df_production['date']).dt.to_period('Y')

# See the result

len(df_production)

28704

In [7]:
# Some licenses are missing from the production per license data set, but they are present in the production per field
# data, so let's add those as well.
# The fields are ['A15-A', 'A18-FA', 'B18-FA', 'J03-C', 'L08-A', 'N07-FA']

fields = ['A15-A', 'A18-FA', 'B18-FA', 'J03-C', 'L08-A', 'N07-FA']

dfs= []
for file in glob.glob(PATH + 'production/nl_production/per_field/field*.xlsx'):
    commodity = os.path.basename(file)[5:-18]
    df = pd.read_excel(file, skiprows=1, engine='openpyxl')
    df['type'] = commodity
    df = df[df['FIELD'].isin(fields)]
    dfs.append(df)
    
df_fields = pd.concat(dfs)
len(df_fields)

53

In [8]:
# Melt month columns into one colum

df_fields = df_fields.melt(id_vars = ['FIELD', 'OPERATOR', 'YEAR', 'type'], var_name='month', value_name='1000_sm3')

# Clean NaNs
df_fields = df_fields.dropna(subset=['YEAR'])

# Convert month name and year to date column

df_fields['YEAR'] = df_fields['YEAR'].astype('int')

df_fields['date'] = pd.to_datetime(df_fields['YEAR'].astype(str) + '-' + df_fields['month'], format='%Y-%b')

# Drop columns and rename

df_fields = df_fields.rename(columns={'FIELD': 'license',
                                      'OPERATOR': 'operator_name'})

df_fields = df_fields.drop(['YEAR', 'month'], axis=1)

df_fields['country'] = 'Netherlands'

df_fields['year'] = pd.to_datetime(df_fields['date']).dt.to_period('Y')

# See the result

len(df_fields)

636

In [9]:
df_fields.head()

Unnamed: 0,license,operator_name,type,1000_sm3,date,country,year
0,N07-FA,Nederlandse Aardolie Maatschappij B.V.,Condensate,,2011-01-01,Netherlands,2011
1,L08-A,Wintershall Noordzee B.V.,Condensate,0.0,2007-01-01,Netherlands,2007
2,A18-FA,Petrogas E&P Netherlands B.V.,Gas,60014.035231,2020-01-01,Netherlands,2020
3,N07-FA,Nederlandse Aardolie Maatschappij B.V.,Gas,5618.150266,2020-01-01,Netherlands,2020
4,A18-FA,Petrogas E&P Netherlands B.V.,Gas,18002.418796,2016-01-01,Netherlands,2016


In [10]:
# Bring it all together

df_production = pd.concat([df_production, df_fields])
len(df_production)

29340

## Try filtering out Wintershall

In [8]:
com = list(set(df_license[df_license['licensee_normalized'].str.contains('Gaz')]['license_normalized']))

In [9]:
com.sort()
com

['D12a',
 'D12b',
 'D15a',
 'D18a',
 'E15a',
 'E15b',
 'E18a',
 'F13a',
 'F16a & F16b',
 'F17a-diep',
 'G17c & G17d',
 'K08 & K11a',
 'K18b',
 'L05b',
 'L05c',
 'L06a',
 'L06b',
 'L08-A',
 'L08b, L08d & L08e',
 'L12a',
 'L12b & L15b',
 'L12b & L15d',
 'L12c',
 'L12d',
 'L13',
 'L15d',
 'L16a',
 'P06a',
 'P09a',
 'P09a, P09b & P09d',
 'P09c',
 'P12a',
 'P15a, P15b, P15d, P15e & P15f',
 'P15c, P15g, P15h, P15i & P15j',
 'Q01c-diep',
 'Q04a',
 'Q05d']

In [10]:
df_production = df_production[df_production['date'] >= '2015-01-01']
com_prod = df_production[df_production['license'].isin(com)]
len(com_prod)

3684

In [11]:
df_fields[df_fields['license'].str.contains('D12')]

NameError: name 'df_fields' is not defined

In [12]:
com_prod.head()

Unnamed: 0,license,operator_name,type,1000_sm3,date,country,year
3,D12a,Wintershall Noordzee B.V.,Gas,1783.185064,2020-01-01,Netherlands,2020
4,D15a,Neptune Energy Netherlands B.V.,Gas,0.0,2020-01-01,Netherlands,2020
5,D18a,Neptune Energy Netherlands B.V.,Gas,773.897248,2020-01-01,Netherlands,2020
11,F16a & F16b,Wintershall Noordzee B.V.,Gas,6579.237962,2020-01-01,Netherlands,2020
14,G17c & G17d,Neptune Energy Netherlands B.V.,Gas,8273.584414,2020-01-01,Netherlands,2020


In [13]:
com_prod = com_prod[com_prod['type'] == 'Gas']

prod_win = pd.DataFrame(com_prod.groupby('license')['1000_sm3'].sum())

In [14]:
prod_win['1000_sm3'] = prod_win['1000_sm3'] / 1000000

In [15]:
prod_win

Unnamed: 0_level_0,1000_sm3
license,Unnamed: 1_level_1
D12a,1.846068
D15a,0.022851
D18a,0.158088
E18a,0.1643
F16a & F16b,0.688225
G17c & G17d,0.729444
K08 & K11a,2.266053
K18b,2.445637
L05b,0.883055
L06a,0.878612


In [25]:
prod_win.to_csv(PATH + 'licenses/nl_licenses/production_wintershall.csv')

In [16]:
df = pd.merge(pd.DataFrame(com_prod.groupby('year')['1000_sm3'].sum()), 
         pd.DataFrame(df_production.groupby('year')['1000_sm3'].sum()),
         on='year',
         how='left')



df['perc'] = df['1000_sm3_x'] / df['1000_sm3_y'] * 100

df['1000_sm3_x'] = df['1000_sm3_x'] / 1000000
df['1000_sm3_y'] = df['1000_sm3_y'] / 1000000

df = df.rename(columns={'1000_sm3_x': 'prod_wintershall_1b_sm3',
                        '1000_sm3_y': 'prod_total_1b_sm3',
                        'perc': 'percentage_wintershall'
                       })

df.round({'prod_wintershall_1b_sm3': 2,
          'prod_total_1b_sm3': 2,
          'percentage_wintershall':2})

Unnamed: 0_level_0,prod_wintershall_1b_sm3,prod_total_1b_sm3,percentage_wintershall
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,3.04,16.33,18.63
2016,2.77,15.19,18.25
2017,2.53,13.72,18.48
2018,2.45,12.24,19.99
2019,2.18,10.89,19.99
2020,2.4,10.39,23.08
2021,2.29,9.83,23.32
2022,0.72,4.22,17.09


In [None]:
df

In [75]:
# Add other companies

df = pd.merge(pd.DataFrame(com_prod.groupby('year')['1000_sm3'].sum()),
              df,
              on='year',
              how='left')
df

Unnamed: 0_level_0,1000_sm3,prod_nam_1b_sm3,prod_mb_holding_1b_sm3,prod_neptune_1b_sm3,prod_wintershall_1b_sm3,prod_total_1b_sm3,percentage_wintershall,percentage_neptune,percentage_mb_holding,percentage_nam
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015,4362674.0,4.131121,1.353808,6e-06,3.196988,16.444348,19.441258,3.7e-05,8.232666,25.121826
2016,4071590.0,4.056274,1.450924,6e-06,2.898167,15.831753,18.30604,3.6e-05,9.164642,25.621132
2017,3268972.0,3.610541,1.442553,5e-06,2.617191,14.482252,18.071717,3.2e-05,9.960832,24.930796
2018,2758704.0,3.300525,1.385217,4e-06,2.515117,13.009947,19.33226,3.3e-05,10.647368,25.36924
2019,2466780.0,2.813531,1.428968,3e-06,2.235543,11.640138,19.205473,2.7e-05,12.276216,24.170946
2020,2428859.0,2.413663,1.351392,3e-06,2.47164,11.082943,22.301293,3e-05,12.193436,21.778174
2021,1978137.0,2.184387,1.307054,4e-06,2.349176,10.370388,22.652733,3.4e-05,12.603709,21.063698
2022,823593.4,0.92692,0.357807,1e-06,0.732002,4.219175,17.349416,3.4e-05,8.480495,21.969228


In [76]:
df['1000_sm3'] = df['1000_sm3'] / 1000000

df['perc'] = df['1000_sm3'] / df['prod_total_1b_sm3'] * 100


df = df.rename(columns={'1000_sm3': 'prod_totalenergies_1b_sm3',
                        '1000_sm3_y': 'prod_nam_sm3',
                        'perc': 'percentage_totalenergies'
                       })

df.round({'prod_totalenergies_1b_sm3': 2,
          'prod_total_1b_sm3': 2,
          'percentage_totalenergies':2})

Unnamed: 0_level_0,prod_totalenergies_1b_sm3,prod_nam_1b_sm3,prod_mb_holding_1b_sm3,prod_neptune_1b_sm3,prod_wintershall_1b_sm3,prod_total_1b_sm3,percentage_wintershall,percentage_neptune,percentage_mb_holding,percentage_nam,percentage_totalenergies
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015,4.36,4.131121,1.353808,6e-06,3.196988,16.44,19.441258,3.7e-05,8.232666,25.121826,26.53
2016,4.07,4.056274,1.450924,6e-06,2.898167,15.83,18.30604,3.6e-05,9.164642,25.621132,25.72
2017,3.27,3.610541,1.442553,5e-06,2.617191,14.48,18.071717,3.2e-05,9.960832,24.930796,22.57
2018,2.76,3.300525,1.385217,4e-06,2.515117,13.01,19.33226,3.3e-05,10.647368,25.36924,21.2
2019,2.47,2.813531,1.428968,3e-06,2.235543,11.64,19.205473,2.7e-05,12.276216,24.170946,21.19
2020,2.43,2.413663,1.351392,3e-06,2.47164,11.08,22.301293,3e-05,12.193436,21.778174,21.92
2021,1.98,2.184387,1.307054,4e-06,2.349176,10.37,22.652733,3.4e-05,12.603709,21.063698,19.07
2022,0.82,0.92692,0.357807,1e-06,0.732002,4.22,17.349416,3.4e-05,8.480495,21.969228,19.52


## Infrastructure

In [None]:
# Import platforms

df_infra = pd.read_excel(PATH + 'licenses/nl_licenses/nl_production_licenses.xlsx', 
                         sheet_name='platforms')

df_infra.head()

In [None]:
df_infra = df_infra.dropna(subset=['operator'])

In [None]:
winfra = df_infra[df_infra['operator'].str.contains('Wintersh')]

In [None]:
winfra[winfra['year_end'].isna()]['platform'].nunique()

In [None]:
df_pijpleidingen = pd.read_excel(PATH + 'licenses/nl_licenses/nl_production_licenses.xlsx',
                                sheet_name='pipelines')

df_pijpleidingen.head()

In [None]:
df_pijpleidingen = df_pijpleidingen.dropna(subset=['operator'])
wpijp = df_pijpleidingen[df_pijpleidingen['operator'].str.contains('Wint')]
wpijp.head()

In [None]:
wpijp = wpijp.drop_duplicates(subset=['from'])
wpijp['to'].unique()

## All companies

So now compare these dataframes:
1. df_license (historical licenses)
2. nl_production_licenses (license production)

In [None]:
merge = pd.merge(df_license, 
                 df_production, 
                 left_on='license_normalized', 
                 right_on='license',
                 how='outer')

merge.head()

In [None]:
merge = pd.merge(df_license,
                df_production,
                 how='outer',
                 left_on='license_normalized',
                 right_on='license'
                )

len(merge)

In [None]:
# Filter on start_date, end_date and license_name

merge = merge.query('start_date <= date and end_date >= date and license_y == license_normalized' )
len(merge)

In [None]:
# first explore the differences between historical licenses and production licenses

diff = list(set(dfg_license.license) - set(merge.license_normalized))
diff.sort()
diff

In [None]:
# Clean it up

# Select relevant columns

df_total = merge[['license_normalized', 
                  'operator_name', 
                  'type', '1000_sm3', 
                  'date', 
                  'country',
                  'licensee',
                  'licensee_normalized', 
                  'start_date', 
                  'end_date', 
                  'year']]

# Rename columns

df_total = df_total.rename(columns={'license_normalized': 'license', 
                                    'date': 'production_date', 
                                    'start_date': 'license_start_date', 
                                    'end_date': 'license_end_date'})

# Filter date range

df_total = df_total[df_total['production_date'] >= '2015-01-01']

df_total.head()

In [None]:
# Write to file (be aware of date filter!)

df_total.to_csv(PATH + 'licenses/nl_licenses/matched_production.csv', index=False)

# Write total production df to file

df_production.to_csv(PATH + 'licenses/nl_licenses/nl_production_licenses.csv', index=False)

## Analysis

### Datasets

1. df_license: historical licenses EEZ NL from 2015-01-01 onwards
2. dfg_license: grouped by license and licensee (normalized)
3. df_total: production grouped by company
3. df_production: historical production EEZ NL from 2003-01-01 onwards

### Questions

- How much gas is yearly produced since 2015?
- How much gas is yearly produced by each company?
- What are the main trends? 
- How does the gas production of 2022 compare to previous years?
- What is the relative market share of each company (ignoring license shares)?
- What infrastructure do these companies have? 
- Where is this infrastructure located? 
- Which infrastructure can we assume to be decommissioned in the near future (yes, it's fuzzy)?

In [21]:
# Production per year - table

yearly_production = df_production.groupby('year', as_index=False)['1000_sm3'].sum()
yearly_production['1b_sm3'] = yearly_production['1000_sm3'] / 1000000
yearly_production

Unnamed: 0,year,1000_sm3,1b_sm3
0,2015,16444350.0,16.444348
1,2016,15831750.0,15.831753
2,2017,14482250.0,14.482252
3,2018,13009950.0,13.009947
4,2019,11640140.0,11.640138
5,2020,11082940.0,11.082943
6,2021,10370390.0,10.370388
7,2022,4219175.0,4.219175


In [None]:
# Production per year - graph
yearly_production = df_production.groupby('date', as_index=False)['1000_sm3'].sum()
yearly_production['1b_sm3'] = yearly_production['1000_sm3'] / 1000000
fig = px.line(yearly_production, x='date', y='1b_sm3')
fig.show()

In [None]:
# Production per company (not normalized)

production_company = pd.pivot_table(df_total, index=['license', 'year', 'licensee'], values='1000_sm3', aggfunc='sum')
production_company = production_company.reset_index()
production_company.head()

In [None]:
production_company.columns

In [None]:
test = production_company.groupby(['license','year', '1000_sm3'])['licensee'].count()
test = pd.DataFrame(test).reset_index()
test.head()

In [None]:
test['prod_1000_sm3'] = test['1000_sm3'] / test['licensee']
len(test)

In [None]:
test.head()

In [None]:
#df_total['year'] = df_total['year'].astype('int64')
#test['year'] = test['year'].astype('int64')

production_company = pd.merge(df_total[['license', 'year', 'licensee']],
                             test[['license', 'year', 'prod_1000_sm3']],
                              on=['license', 'year'],
                              how='left')
len(production_company)

In [None]:
production_company.head()

In [None]:
production_company = production_company.drop_duplicates(subset=['license', 'year', 'licensee'])

In [None]:
production_company.head(20)

In [None]:
p = production_company.groupby(['licensee', 'year'])['prod_1000_sm3'].sum()
p = p.reset_index()
fig = px.bar(p,
              x=p['year'].dt.strftime('%Y'),
              y='prod_1000_sm3',
              color='licensee')
fig.show()

In [None]:
p.head(20)

In [None]:
## Flotsam and jetsam

# License names are different when there is only one digit, e.g. P9 in historical license set is P09 in production license set. Modify by regex in historical license

def pad_license(d):
    digit = re.match('[A-Z](\d{1,2})*', d).group(1)
    result = d.replace(digit, digit.zfill(2))
    return result 

df_license['license'] = df_license['license'].apply(lambda x: pad_license(x))