In [1]:
import pandas as pd
import numpy as np

## Load BP data

In [119]:
# Consolidated narrow-format CSV of the 2022 BP Statistical Review.
BP_STATS_REVIEW_2022_URL = 'https://www.bp.com/content/dam/bp/business-sites/en/global/corporate/xlsx/energy-economics/statistical-review/bp-stats-review-2022-consolidated-dataset-narrow-format.csv'

df = pd.read_csv(BP_STATS_REVIEW_2022_URL)
df.head()

Unnamed: 0,Country,Year,ISO3166_alpha3,ISO3166_numeric,Region,SubRegion,OPEC,EU,OECD,CIS,Var,Value
0,Brazil,2004,BRA,76.0,S. & Cent. America,South America,0.0,0.0,0.0,0.0,biodiesel_cons_kboed,0.0
1,Brazil,2005,BRA,76.0,S. & Cent. America,South America,0.0,0.0,0.0,0.0,biodiesel_cons_kboed,0.014737
2,Brazil,2006,BRA,76.0,S. & Cent. America,South America,0.0,0.0,0.0,0.0,biodiesel_cons_kboed,1.016887
3,Brazil,2007,BRA,76.0,S. & Cent. America,South America,0.0,0.0,0.0,0.0,biodiesel_cons_kboed,5.953947
4,Brazil,2008,BRA,76.0,S. & Cent. America,South America,0.0,0.0,0.0,0.0,biodiesel_cons_kboed,16.534379


In [120]:
# Print column dtypes and non-null counts of the raw BP frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277095 entries, 0 to 277094
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Country          277095 non-null  object 
 1   Year             277095 non-null  int64  
 2   ISO3166_alpha3   277095 non-null  object 
 3   ISO3166_numeric  193275 non-null  float64
 4   Region           193275 non-null  object 
 5   SubRegion        193275 non-null  object 
 6   OPEC             193275 non-null  float64
 7   EU               193275 non-null  float64
 8   OECD             193275 non-null  float64
 9   CIS              193275 non-null  float64
 10  Var              277095 non-null  object 
 11  Value            277095 non-null  float64
dtypes: float64(6), int64(1), object(5)
memory usage: 25.4+ MB


In [121]:
# List every distinct variable name available in the narrow-format dataset.
df['Var'].unique()

array(['biodiesel_cons_kboed', 'biodiesel_cons_pj',
       'biodiesel_prod_kboed', 'biodiesel_prod_pj', 'biofuels_cons_ej',
       'biofuels_cons_kbd', 'biofuels_cons_kboed', 'biofuels_cons_pj',
       'biofuels_prod_kbd', 'biofuels_prod_kboed', 'biofuels_prod_pj',
       'biogeo_ej', 'biogeo_twh', 'biogeo_twh_net', 'co2_combust_mtco2',
       'co2_combust_pc', 'co2_combust_per_ej', 'co2_mtco2', 'coalcons_ej',
       'coalprod_ej', 'coalprod_mt', 'cobalt_kt', 'cobaltres_kt',
       'diesel_gasoil_cons_kbd', 'elect_twh', 'electbyfuel_coal',
       'electbyfuel_gas', 'electbyfuel_hydro', 'electbyfuel_nuclear',
       'electbyfuel_oil', 'electbyfuel_other', 'electbyfuel_ren_power',
       'electbyfuel_total', 'ethanol_cons_kboed', 'ethanol_cons_pj',
       'ethanol_prod_kboed', 'ethanol_prod_pj', 'fuel_oil_cons_kbd',
       'gascons_bcfd', 'gascons_bcm', 'gascons_ej', 'gasflared_bcm',
       'gasflared_mtco2', 'gasoline_cons_kbd', 'gasprod_bcfd',
       'gasprod_bcm', 'gasprod_ej', 'gasre

# Proven Reserves

## Old data (Postgres)

In [123]:
# Previous reserves export, used as the reference for the comparison below.
OLD_RESERVES_CSV = '../data/2016_old/FOSSIL_RESERVES_bp_fossil_with_zones_prod.csv'

df_old = pd.read_csv(OLD_RESERVES_CSV)
df_old

Unnamed: 0,group_type,group_name,energy_source,year,proven_reserves,proven_reserves_unit
0,group,EU28,Gas,1989,3.1,Bcm
1,zone,Asia and Oceania,Oil,2016,46.5,Gb
2,zone,Africa,Gas,1995,8.7,Bcm
3,group,OECD,Oil,2004,243.1,Gb
4,group,EU28,Oil,2014,4.6,Gb
...,...,...,...,...,...,...
4906,country,Yemen,Oil,2014,3.0,Gb
4907,country,Yemen,Oil,2015,3.0,Gb
4908,country,Yemen,Oil,2016,3.0,Gb
4909,country,Yemen,Oil,2017,3.0,Gb


## Clean New data

In [127]:
class BpFossilProvenReservesCleaner:
    """Reshape the narrow-format BP dataset into a proven-reserves table.

    ``clean_data`` keeps only the gas and oil reserve series and returns the
    columns ``group_name``, ``year``, ``energy_source``, ``proven_reserves``,
    ``proven_reserves_unit`` and ``group_type``.

    Every step returns a new DataFrame and never modifies its argument, so a
    step can be re-run on the same frame without compounding its effect
    (notably the x1000 gas conversion).
    """

    # Identifier columns removed at the start of the pipeline.
    ID_COLUMNS = ['ISO3166_numeric', 'ISO3166_alpha3']
    # Membership / geography columns removed at the end of the pipeline.
    MEMBERSHIP_COLUMNS = ['OPEC', 'EU', 'OECD', 'CIS', 'Region', 'SubRegion']

    # BP variable name -> energy source label used in the output.
    ENERGY_SOURCE_BY_VAR = {'gasreserves_tcm': 'Gas', 'oilreserves_bbl': 'Oil'}
    # BP variable name -> unit label used in the output.
    UNIT_BY_VAR = {'gasreserves_tcm': 'Bcm', 'oilreserves_bbl': 'Gb'}
    # Multiplier applied to gas values (tcm -> bcm).
    TCM_TO_BCM = 1000

    # BP aggregate names -> output names, classified as 'group'.
    GROUP_REPLACEMENTS = {'Total OECD': 'OECD', 'Total OPEC': 'OPEC',
                          'Total EU': 'EU28', 'Total CIS': 'CIS'}
    # BP aggregate names -> output names, classified as 'zone'.
    ZONE_REPLACEMENTS = {'Total S. & Cent. America': 'Central and South America',
                         'Total North America': 'North America', 'Total Africa': 'Africa',
                         'Total Europe': 'Europe', 'Total World': 'World',
                         'Total Middle East': 'Middle East', 'Total Asia Pacific': 'Asia Pacific'}

    def __init__(self):
        """The cleaner holds no per-instance state."""

    def drop_unnecessary_columns(self, df):
        """Return a copy of ``df`` without the ISO identifier columns.

        Raises ``KeyError`` if one of the columns is missing.
        """
        return df.drop(columns=self.ID_COLUMNS)

    def filter_oil_and_gas(self, df):
        """Return only the gas/oil reserve rows of ``df``, re-indexed from 0."""
        is_reserve_row = df['Var'].isin(list(self.ENERGY_SOURCE_BY_VAR))
        return df[is_reserve_row].reset_index(drop=True)

    def add_and_convert_units(self, df):
        """Return a copy of ``df`` with a unit column and gas values in bcm.

        ``proven_reserves_unit`` is 'Bcm' for gas rows, 'Gb' for oil rows and
        '' for any other variable. Gas values are multiplied by 1000 (tcm to
        bcm); all other values are passed through unchanged.
        """
        is_gas = df['Var'] == 'gasreserves_tcm'
        units = df['Var'].map(self.UNIT_BY_VAR).fillna('')
        # Build the converted column instead of multiplying in place so the
        # caller's frame keeps its original values.
        values = df['Value'].where(~is_gas, df['Value'] * self.TCM_TO_BCM)
        return df.assign(proven_reserves_unit=units, Value=values)

    def replace_and_rename_columns(self, df):
        """Return a copy of ``df`` with energy-source labels and output column names."""
        labelled = df.assign(Var=df['Var'].replace(self.ENERGY_SOURCE_BY_VAR))
        return labelled.rename(columns={'Country': 'group_name', 'Year': 'year',
                                        'Value': 'proven_reserves', 'Var': 'energy_source'})

    def classify_group_types(self, df):
        """Return a copy of ``df`` with normalised names and a ``group_type`` column.

        ``group_type`` is 'group' for OECD/OPEC/EU28/CIS, 'zone' for the
        regional totals (including World), and 'country' otherwise.
        """
        names = (df['group_name']
                 .replace(self.GROUP_REPLACEMENTS)
                 .replace(self.ZONE_REPLACEMENTS))
        # `assign` creates a new frame, so the .loc writes below do not touch `df`.
        classified = df.assign(group_name=names, group_type='country')
        classified.loc[names.isin(list(self.GROUP_REPLACEMENTS.values())), 'group_type'] = 'group'
        classified.loc[names.isin(list(self.ZONE_REPLACEMENTS.values())), 'group_type'] = 'zone'
        return classified

    def clean_data(self, df):
        """Run all cleaning steps on the raw BP frame and return the result.

        ``df`` is left unmodified. Raises ``KeyError`` if an expected column
        is missing from the input.
        """
        df = self.drop_unnecessary_columns(df)
        df = self.filter_oil_and_gas(df)
        df = self.add_and_convert_units(df)
        df = self.replace_and_rename_columns(df)
        df = self.classify_group_types(df)
        return df.drop(columns=self.MEMBERSHIP_COLUMNS)

In [128]:
# Run the full cleaning pipeline on the raw BP frame `df` and display the result.
cleaner = BpFossilProvenReservesCleaner()
df_new = cleaner.clean_data(df)
df_new

Unnamed: 0,group_name,year,energy_source,proven_reserves,proven_reserves_unit,group_type
0,Algeria,1980,Gas,3581.4625,Bcm,country
1,Algeria,1981,Gas,3540.0750,Bcm,country
2,Algeria,1982,Gas,3477.5125,Bcm,country
3,Algeria,1983,Gas,3398.5875,Bcm,country
4,Algeria,1984,Gas,3312.9250,Bcm,country
...,...,...,...,...,...,...
5787,Yemen,2016,Oil,3.0000,Gb,country
5788,Yemen,2017,Oil,3.0000,Gb,country
5789,Yemen,2018,Oil,3.0000,Gb,country
5790,Yemen,2019,Oil,3.0000,Gb,country


## Old & New data sets Comparison (until the most recent year of old data)

In [118]:
# Keep only the years that the old dataset also covers.
last_old_year = df_old['year'].max()
df_new = df_new.loc[df_new['year'] <= last_old_year].reset_index(drop=True)

### Gas

In [69]:
# Gas-only rows of each dataset.
df_old_gas = df_old.loc[df_old['energy_source'].eq('Gas')]
df_new_gas = df_new.loc[df_new['energy_source'].eq('Gas')]

#### Interpretation
Deductions and potential causes:

- **Scale and unit discrepancy**: The most striking observation is the difference in scale of the `proven_reserves` values between the two datasets. This could be due to a unit discrepancy (e.g., barrels vs. thousands of barrels or, for gas, trillion vs. billion cubic metres) or a data entry error.

- **Data collection or reporting changes**: The methodology of data collection or reporting might have changed over time, leading to these discrepancies.

- **Data processing error**: There might have been an error in the data processing steps in the new dataset, such as a mistake in unit conversion or a formula error.
    
#### Correction
After checking other sources, I found that the old data has a bug: its gas values contain a unit-conversion mistake and are off by a factor of ~1000.

In [89]:
# Summary statistics of the gas rows, old dataset first.
old_gas_stats = df_old_gas.describe()
new_gas_stats = df_new_gas.describe()
print("Old Data Descriptive Statistics:\n", old_gas_stats)
print("New Data Descriptive Statistics:\n", new_gas_stats)

Old Data Descriptive Statistics:
               year  proven_reserves
count  2434.000000      2434.000000
mean   1999.528348         8.722843
std      11.145168        22.362486
min    1980.000000         0.100000
25%    1990.000000         0.300000
50%    2000.000000         1.200000
75%    2009.000000         5.000000
max    2018.000000       195.300000
New Data Descriptive Statistics:
               year  proven_reserves
count  2712.000000      2712.000000
mean   1999.401549      7782.697149
std      11.209611     23611.703967
min    1980.000000         0.000000
25%    1990.000000       204.794545
50%    2000.000000       726.071537
75%    2009.000000      3985.481822
max    2018.000000    189053.380400


In [90]:
# Distinct group names (countries, zones, groups) in each gas dataset.
unique_new_gas = set(df_new_gas['group_name'].unique())
unique_old_gas = set(df_old_gas['group_name'].unique())

# Names present in both datasets; only these can be compared like for like.
common_group_names = unique_new_gas & unique_old_gas

print(common_group_names)

{'Algeria', 'Pakistan', 'Romania', 'Australia', 'OECD', 'Azerbaijan', 'Uzbekistan', 'Brazil', 'EU28', 'Africa', 'Poland', 'China', 'Iran', 'Libya', 'Turkmenistan', 'Bolivia', 'Saudi Arabia', 'Norway', 'Iraq', 'Kazakhstan', 'Peru', 'Canada', 'Europe', 'Ukraine', 'Egypt', 'Venezuela', 'Indonesia', 'United Kingdom', 'Colombia', 'Malaysia', 'Mexico', 'Yemen', 'Denmark', 'Kuwait', 'Germany', 'Argentina', 'Oman', 'Papua New Guinea', 'Bahrain', 'Israel', 'India', 'Netherlands', 'Thailand', 'Bangladesh', 'North America', 'Syria', 'Italy', 'World', 'Central and South America', 'Nigeria', 'United Arab Emirates', 'Middle East', 'Qatar'}


In [88]:
# Average new/old ratio of the yearly mean gas reserves, restricted to the
# group names present in both datasets.
new_gas_yearly_mean = (
    df_new_gas[df_new_gas.group_name.isin(common_group_names)]
    .groupby('year')['proven_reserves']
    .mean()
)
old_gas_yearly_mean = (
    df_old_gas[df_old_gas.group_name.isin(common_group_names)]
    .groupby('year')['proven_reserves']
    .mean()
)
# Dividing the year-indexed Series (rather than their raw `.values`) makes
# pandas align on year, so a year missing from one dataset cannot shift the
# comparison onto the wrong years; unmatched years become NaN and are skipped
# by `.mean()`.
(new_gas_yearly_mean / old_gas_yearly_mean).mean()

954.1208530840005

### Oil

In [91]:
# Oil-only rows of each dataset.
df_old_oil = df_old.loc[df_old['energy_source'].eq('Oil')]
df_new_oil = df_new.loc[df_new['energy_source'].eq('Oil')]

#### Interpretation

- **General consistency**: The mean values and the year range are consistent across the two datasets, which suggests that they are largely comparable.

- **Slight variations**: The slight variations in standard deviation, quartiles, and the maximum values could be attributed to the inclusion of new data points, changes in estimation methods, or revisions in existing data.

- **Data quality and reporting**: The presence of zero values in the new dataset possibly reflects more precise or different reporting standards.

In [92]:
# Summary statistics of the oil rows, old dataset first.
old_oil_stats = df_old_oil.describe()
new_oil_stats = df_new_oil.describe()
print("Old Data Descriptive Statistics:\n", old_oil_stats)
print("New Data Descriptive Statistics:\n", new_oil_stats)

Old Data Descriptive Statistics:
               year  proven_reserves
count  2477.000000      2477.000000
mean   1999.615260        85.562253
std      11.114152       222.662068
min    1980.000000         0.100000
25%    1990.000000         1.400000
50%    2000.000000         5.100000
75%    2009.000000        57.000000
max    2018.000000      1718.500000
New Data Descriptive Statistics:
               year  proven_reserves
count  2788.000000      2788.000000
mean   1999.371593        86.815955
std      11.181688       236.256808
min    1980.000000         0.000000
25%    1990.000000         0.830000
50%    1999.500000         3.684500
75%    2009.000000        46.069707
max    2018.000000      1736.144020
