In [7]:
import pandas as pd

In [8]:
path = "../../data/_raw/eia/eia_api_intl_2020_2021.csv"
df_eia = pd.read_csv(path)



#add the name of the source
df_eia["data_source"] = "eia"

#column selection
df_eia = df_eia[["data_source", "period", "productName", "activityName", "countryRegionId", "countryRegionName", "unitName", "value", "unit"]]

#delete duplicates. Duplicates have been found on different idProduct for identical rows.
# on all columns except value because there is 1 duplicate for USA, 2020, Coal Prod with values that are almost exactly the same
df_eia = df_eia.drop_duplicates(["data_source", "period", "productName", "activityName", "countryRegionId", "countryRegionName", "unitName", "unit"])

#colums renaming
df_eia.rename(columns={
    'period': 'year',
    'productName': 'product_name',
    'activityName': 'activity_name',
    'countryRegionId': 'country_iso3',
    'countryRegionName': 'country_name',
    'unitName': 'unit_name',
}, inplace=True)

In [None]:
# to find duplicates
df_eia[:].value_counts(sort=True)

In [5]:
# we found duplicates : rows are exactly the same except for productId

df_eia[
    (df_eia["country_iso3"]=='FJI') & 
    (df_eia["activity_name"]=='Emissions') & 
    (df_eia["product_name"]=='Petroleum and other liquids') &
    (df_eia["unit"]=='MMTCD')
    ]

Unnamed: 0,year,productId,product_name,activityId,activity_name,country_iso3,country_name,countryRegionTypeId,countryRegionTypeName,dataFlagId,dataFlagDescription,unit_name,value,unit,data_source
65050,2021,4006,Petroleum and other liquids,8,Emissions,FJI,Fiji,c,Country,,,million metric tonnes carbon dioxide,1.6547908836221423,MMTCD,eia
65709,2021,5,Petroleum and other liquids,8,Emissions,FJI,Fiji,c,Country,,,million metric tonnes carbon dioxide,1.6547908836221423,MMTCD,eia


In [None]:
# to find duplicates
df_eia[:].value_counts(sort=True)

In [11]:
#calculate number of rows for each productName
df_eia["productName"].value_counts(sort=True)

#=> NEXT STEP : subset df by productName, understand data and normalize it

Coal                                    5789
Anthracite                              4722
Metallurgical coal                      4710
Metallurgical coke                      4631
Dry natural gas                         4563
Subbituminous                           4239
Bituminous                              4161
Lignite                                 4160
Electricity                             3569
Petroleum and other liquids             2337
Refined petroleum products              1825
Other petroleum liquids                 1782
Jet fuel                                1577
Distillate fuel oil                     1567
Kerosene                                1565
Residual fuel oil                       1434
Motor gasoline                          1404
Primary energy                          1305
Tide and wave                           1250
Liquefied Petroleum Gases               1239
Wind                                    1222
Non-hydroelectric renewables            1219
Crude oil 

In [10]:
# check of all product name that has coal in its name. We'll only keep the 'Coal' and compare with SDP later
#Coal                  5789
#Metallurgical coal    4710
#Coal and coke          461
#df_eia[df_eia["productName"].str.contains('coal', case = False)]["productName"].value_counts(sort=True)

#creation of the df for the Coal
df_eia_coal = df_eia[df_eia["product_name"]=='Coal']

In [17]:
#calculate number of rows for each productName
df_eia_coal["activityName"].value_counts(sort=True)

Production     1719
Consumption    1419
Exports        1296
Imports        1127
Reserves        228
Name: activityName, dtype: int64

In [18]:
#add source name


#select columns
df_eia_coal = df_eia_coal[["", ""]]

df_eia_coal.head()

Unnamed: 0,period,productId,productName,activityId,activityName,countryRegionId,countryRegionName,countryRegionTypeId,countryRegionTypeName,dataFlagId,dataFlagDescription,unitName,value,unit
1,2021,7,Coal,1,Production,ZAF,South Africa,c,Country,,,terajoules,5254411.855803305,TJ
9,2021,7,Coal,1,Production,ZAF,South Africa,c,Country,,,million metric tons of oil equivalent,125.49947103961276,MTOE
12,2021,7,Coal,1,Production,DEU,Germany,c,Country,,,terajoules,1196512.3729902192,TJ
27,2021,7,Coal,1,Production,DEU,Germany,c,Country,,,thousand short tons,139173.90502983,TST
35,2021,7,Coal,1,Production,DEU,Germany,c,Country,,,million metric tons of oil equivalent,28.57820704267334,MTOE


In [14]:
#found new duplicates
df_eia_coal.groupby(["country_iso3", "year", "activity_name"])["data_source"].count().sort_values(ascending=True)



country_iso3  year  activity_name
ABW           2020  Reserves         1
MEX           2020  Reserves         1
              2021  Reserves         1
MKD           2020  Reserves         1
              2021  Reserves         1
                                    ..
DNK           2021  Imports          5
MMR           2021  Production       5
                    Exports          5
DMA           2021  Production       5
KWT           2020  Imports          5
Name: data_source, Length: 885, dtype: int64

In [16]:
#specific row
df_eia_coal[
    (df_eia_coal["country_iso3"]=="ABW") &
    (df_eia_coal["year"]==2020) &
    (df_eia_coal["activity_name"]=="Reserves")
]

Unnamed: 0,data_source,year,product_name,activity_name,country_iso3,country_name,unit_name,value,unit
117872,eia,2020,Coal,Reserves,ABW,Aruba,million short tons,0,MST


In [None]:
#specific row that had a duplicate
df_eia_coal[
    (df_eia_coal["country_iso3"]=="USA") &
    (df_eia_coal["year"]==2020) &
    (df_eia_coal["activity_name"]=="Consumption") &
    (df_eia_coal["unit"]=="QBTU")
]

In [17]:
#analysing use of unit
df_eia_coal.groupby(["unit", "unit_name"])["data_source"].count().sort_values(ascending=False)
#all units frequency use is same order of magnitude

unit  unit_name                            
MST   million short tons                       457
QBTU  quadrillion Btu                          426
MT    1000 metric tons                         392
MTOE  million metric tons of oil equivalent    390
TST   thousand short tons                      390
TJ    terajoules                               389
Name: data_source, dtype: int64

In [33]:
#clean value column
values_to_replace = ["--", "ie"]
df_eia_coal["value"] = df_eia_coal["value"].replace(values_to_replace, 0)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eia_coal["value"] = df_eia_coal["value"].replace(values_to_replace, 0)


In [35]:
#somme pour calculer le ratio par unite d'energy
#df_eia_coal.groupby(["unit", "unit_name"])["value"].sum()
#df_eia_coal.info()
df_eia_coal["value"].astype(float)
#df_eia_coal[df_eia_coal["value"]=='ie']

2             42.699999
11             0.000000
14             0.000000
94        100369.212636
95             0.000000
              ...      
118952         2.204620
118953      2777.821200
118957         0.000000
118958      3206.619790
118963         0.000000
Name: value, Length: 2444, dtype: float64

In [37]:
df_eia_coal[df_eia_coal["activity_name"] != "Reserves"]

Unnamed: 0,data_source,year,product_name,activity_name,country_iso3,country_name,unit_name,value,unit
2,eia,2020,Coal,Production,POL,Poland,million metric tons of oil equivalent,42.69999895485965,MTOE
11,eia,2021,Coal,Production,MNP,Northern Mariana Islands,1000 metric tons,0,MT
14,eia,2021,Coal,Production,KIR,Kiribati,1000 metric tons,0,MT
94,eia,2020,Coal,Production,POL,Poland,1000 metric tons,100369.21263571049,MT
95,eia,2020,Coal,Production,MDA,Moldova,terajoules,0,TJ
...,...,...,...,...,...,...,...,...,...
102057,eia,2020,Coal,Exports,HKG,Hong Kong,thousand short tons,0,TST
102062,eia,2020,Coal,Exports,HKG,Hong Kong,1000 metric tons,0,MT
102079,eia,2020,Coal,Exports,HKG,Hong Kong,million metric tons of oil equivalent,0,MTOE
102089,eia,2020,Coal,Exports,HKG,Hong Kong,terajoules,0,TJ


In [23]:
df_eia_coal[df_eia_coal["unit"]=="MST"].head(20)

Unnamed: 0,data_source,year,product_name,activity_name,country_iso3,country_name,unit_name,value,unit
117609,eia,2020,Coal,Reserves,CHE,Switzerland,million short tons,0,MST
117615,eia,2021,Coal,Reserves,CRI,Costa Rica,million short tons,0,MST
117616,eia,2021,Coal,Reserves,BGR,Bulgaria,million short tons,2608.06546,MST
117617,eia,2020,Coal,Reserves,NLD,Netherlands,million short tons,547.84807,MST
117630,eia,2021,Coal,Reserves,DZA,Algeria,million short tons,65.03629,MST
117631,eia,2021,Coal,Reserves,MEX,Mexico,million short tons,1334.89741,MST
117634,eia,2021,Coal,Reserves,GMB,"Gambia, The",million short tons,0,MST
117635,eia,2021,Coal,Reserves,NPL,Nepal,million short tons,1.10231,MST
117656,eia,2021,Coal,Reserves,MWI,Malawi,million short tons,2.20462,MST
117657,eia,2021,Coal,Reserves,ISL,Iceland,million short tons,0,MST


In [24]:
#df_eia_coal[df_eia_coal["unit" != ""]].sort_values(["country_iso3", "activity_name", "year", "unit"]).head(50)

#il semble que l'unité MST soit utilisée que pour l'activité "Reserve"
df_eia_coal[df_eia_coal["activity_name"] == "Reserves"].sort_values(["country_iso3", "activity_name", "year", "unit"]).head(50)


#les unités 



Unnamed: 0,data_source,year,product_name,activity_name,country_iso3,country_name,unit_name,value,unit
117872,eia,2020,Coal,Reserves,ABW,Aruba,million short tons,0.0,MST
117937,eia,2021,Coal,Reserves,ABW,Aruba,million short tons,0.0,MST
118519,eia,2020,Coal,Reserves,AFG,Afghanistan,million short tons,72.75246,MST
118597,eia,2021,Coal,Reserves,AFG,Afghanistan,million short tons,72.75246,MST
118647,eia,2020,Coal,Reserves,AGO,Angola,million short tons,0.0,MST
118711,eia,2021,Coal,Reserves,AGO,Angola,million short tons,0.0,MST
117938,eia,2020,Coal,Reserves,ALB,Albania,million short tons,575.40582,MST
118016,eia,2021,Coal,Reserves,ALB,Albania,million short tons,575.40582,MST
118522,eia,2020,Coal,Reserves,ARE,United Arab Emirates,million short tons,0.0,MST
118596,eia,2021,Coal,Reserves,ARE,United Arab Emirates,million short tons,0.0,MST
