In [1]:
import numpy as np
import pandas as pd
import itertools

import constants as consts

## Determining the Change in Cost of Gas for Brazilian Residents by State

In this notebook, we will determine if the cost of gas has risen for Brazilian residents after inflation has been taken into account. To determine this, we will use the previously created inflation dataframe, and also account for wage growth in Brazil.

From [this](https://www.statista.com/statistics/941201/growth-rate-average-monthly-income-domestic-workers-brazil/) link, we can determine that wages from 2013 to 2019 have grown by 5.3% by adding the values above each bar in the chart. We will determine if gas prices have risen by a greater amount to determine if Brazilian workers are paying more for gas in 2019 when compared to 2013.

In [2]:
df = pd.read_csv('../../data/gas_prices_brazil/brazil_gas_inflation.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
0,0,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.825,1.0,0.825
1,1,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.711,1.0,1.711
2,2,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,27.165,1.0,27.165001
3,3,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.249,1.0,1.249
4,4,2004-05-15,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,0.763,1.0,0.763


In [4]:
# Making a copy prevents the 'SettingWithCopyWarning' warning.

cost_df = df[(df['Last Day of Week'] == '2013-01-12') | (df['Last Day of Week'] == '2019-06-29')].copy()

In [5]:
cost_df.reset_index(inplace = True)

In [6]:
# We can drop the 'Adjusted Mean Distribution Price' column because that value reflects inflation relative to 2004.

cost_df.drop(['Adjusted Mean Distribution Price',
              'Weeks Since First Day',
              'Unnamed: 0',
              'index'], 
               axis = 1, inplace = True)

In [7]:
# To determine inflation from 2013 to 2019, we will use the value of 1R$ in 2019 relative to 2004 from the already
# obtained data, and divide it by the value of 1R$ in 2013 relative to 2004.

r_inf = consts.inf_rates[2019] / consts.inf_rates[2013]
r_inf

1.40625

From this, we can conclude that inflation grew by roughly 41%, which closely resembles the calculation from [here](https://www.in2013dollars.com/brazil/inflation/2013?endYear=2019&amount=1).

In [8]:
cost_df

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874
1,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,2.384
2,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GLP,31.420
3,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.954
4,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,1.653
...,...,...,...,...,...,...,...
293,2019-06-29,2019,6,SUL,SANTA CATARINA,GASOLINA COMUM,3.784
294,2019-06-29,2019,6,SUL,SANTA CATARINA,GLP,51.702
295,2019-06-29,2019,6,SUL,SANTA CATARINA,GNV,1.998
296,2019-06-29,2019,6,SUL,SANTA CATARINA,ÓLEO DIESEL,3.023


In [9]:
# Checking if there are any zeros in our cost dataframe.

cost_df[cost_df['Mean Distribution Price'] == 0]

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price
7,2013-01-12,2013,1,CENTRO OESTE,GOIAS,GNV,0.0
9,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ÓLEO DIESEL S10,0.0
19,2013-01-12,2013,1,CENTRO OESTE,MATO GROSSO DO SUL,GNV,0.0
36,2013-01-12,2013,1,NORDESTE,CEARA,GLP,0.0
50,2013-01-12,2013,1,NORDESTE,PARAIBA,ÓLEO DIESEL S10,0.0
64,2013-01-12,2013,1,NORDESTE,RIO GRANDE DO NORTE,GLP,0.0
78,2013-01-12,2013,1,NORTE,ACRE,ÓLEO DIESEL S10,0.0
79,2013-01-12,2013,1,NORTE,AMAPA,ETANOL HIDRATADO,0.0
106,2013-01-12,2013,1,NORTE,TOCANTINS,ÓLEO DIESEL S10,0.0
112,2013-01-12,2013,1,SUDESTE,ESPIRITO SANTO,ÓLEO DIESEL S10,0.0


### Replacing the Zero Values

Because there are 13 missing values, using Excel to change these values would not take too much time, and would likely be the best way to proceed in a working environment. However, if there were significantly more zero values, replacing them with Python would be necessary, and doing so will be left to the programmer as an exercise.

In [10]:
# We need the same_cols_df for a function that will be able to replace zero values in cost_df.

same_cols_df = df.copy()
same_cols_df.drop(df.columns.difference(list(cost_df.columns)), axis = 1, inplace = True) # Keeps columns from cost_df
same_cols_df.head()

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price
0,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.825
1,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.711
2,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,27.165
3,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.249
4,2004-05-15,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,0.763


In [11]:
def cost_query_not_zero(tup):
    
    # The tuple that is passed to this function needs to be of the following form.
    # (State, Type of Product)
    
    return same_cols_df[
        (same_cols_df['State'] == tup[0]) & 
        (same_cols_df['Type of Product'] == tup[1]) &
        (same_cols_df['Mean Distribution Price'] != 0) &
        (same_cols_df['Year'] >= 2013)
    ]

In [12]:
# This is how we will obtain the 2013 values that need to be replaced. Note the the dataframes are in chronological
# order, which is relevant for indexing.

cost_query_not_zero(('GOIAS', 'GNV')).iloc[0]

Last Day of Week             2013-01-19
Year                               2013
Month                                 1
Macro Region               CENTRO OESTE
State                             GOIAS
Type of Product                     GNV
Mean Distribution Price           1.909
Name: 55916, dtype: object

In [13]:
# This is how we will obtain the 2019 values that need to be replaced.

cost_query_not_zero(('ESPIRITO SANTO', 'GNV')).iloc[-1]

Last Day of Week               2019-06-15
Year                                 2019
Month                                   6
Macro Region                      SUDESTE
State                      ESPIRITO SANTO
Type of Product                       GNV
Mean Distribution Price             2.242
Name: 106486, dtype: object

In [14]:
list(cost_df[cost_df['Mean Distribution Price'] == 0].index)

[7, 9, 19, 36, 50, 64, 78, 79, 106, 112, 259, 265, 283]

In [15]:
cost_df.iloc[7]

Last Day of Week             2013-01-12
Year                               2013
Month                                 1
Macro Region               CENTRO OESTE
State                             GOIAS
Type of Product                     GNV
Mean Distribution Price             0.0
Name: 7, dtype: object

In [16]:
cost_df.iloc[7]['Year']

2013

In [17]:
# Iterate over the indicies with zero values, and replace the values appropriately.

for ind in list(cost_df[cost_df['Mean Distribution Price'] == 0].index):
    if cost_df.iloc[ind]['Year'] == 2013:
        cost_df.iloc[ind] = cost_query_not_zero((cost_df.iloc[ind]['State'], cost_df.iloc[ind]['Type of Product'])).iloc[0]
    else:
        cost_df.iloc[ind] = cost_query_not_zero((cost_df.iloc[ind]['State'], cost_df.iloc[ind]['Type of Product'])).iloc[-1]

In [18]:
# Checking that the replacement worked as intended.
# The numbers are the indices where the values were replaced.

for i in [7, 9, 19, 36, 50, 64, 78, 79, 106, 112, 259, 265, 283]:
    print (cost_df.iloc[i]['Mean Distribution Price'])

1.909
2.058
1.42
46.0
1.958
31.5
2.284
1.973
1.998
1.929
2.242
2.822
1.909


In [19]:
# If the year is 2019, adjust for inflation. Otherwise, do not adjust for inflation.

inf_adj_dist_price = []

for i in range (len(cost_df)):
    if cost_df.loc[i, 'Year'] == 2019:
        inf_adj_dist_price.append(cost_df.loc[i, 'Mean Distribution Price'] / r_inf)
    else:
        inf_adj_dist_price.append(cost_df.loc[i, 'Mean Distribution Price'])

In [20]:
cost_df['Adjusted Mean Distribution Price'] = inf_adj_dist_price

In [21]:
# Check to make sure our calculation is working as intended.

cost_df.sample(10)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
223,2019-06-29,2019,6,NORTE,ACRE,GASOLINA COMUM,4.198,2.985244
53,2013-01-12,2013,1,NORDESTE,PERNAMBUCO,GLP,28.303,28.303
134,2013-01-12,2013,1,SUL,PARANA,GNV,1.394,1.394
171,2019-06-29,2019,6,NORDESTE,ALAGOAS,GASOLINA COMUM,4.026,2.862933
74,2013-01-12,2013,1,NORTE,ACRE,ETANOL HIDRATADO,2.213,2.213
124,2013-01-12,2013,1,SUDESTE,RIO DE JANEIRO,ÓLEO DIESEL S10,1.977,1.977
50,2013-01-19,2013,1,NORDESTE,PARAIBA,ÓLEO DIESEL S10,1.958,1.958
173,2019-06-29,2019,6,NORDESTE,ALAGOAS,GNV,2.451,1.742933
88,2013-01-12,2013,1,NORTE,PARA,ETANOL HIDRATADO,2.148,2.148
221,2019-06-29,2019,6,NORDESTE,SERGIPE,ÓLEO DIESEL S10,3.324,2.363733


In [22]:
print (cost_df['Year'].value_counts())

2019    149
2013    148
2015      1
Name: Year, dtype: int64


In [23]:
# We need to correct the row with the 2015 value.

cost_df.iloc[36]

Last Day of Week                    2015-12-05
Year                                      2015
Month                                       11
Macro Region                          NORDESTE
State                                    CEARA
Type of Product                            GLP
Mean Distribution Price                   46.0
Adjusted Mean Distribution Price          46.0
Name: 36, dtype: object

In [24]:
r_inf_2015 = consts.inf_rates[2019] / consts.inf_rates[2015] # Adjusting for inflation accordingly.
cost_df.at[36, 'Adjusted Mean Distribution Price'] = cost_df.iloc[36]['Adjusted Mean Distribution Price'] * r_inf_2015
cost_df.iloc[36]

Last Day of Week                    2015-12-05
Year                                      2015
Month                                       11
Macro Region                          NORDESTE
State                                    CEARA
Type of Product                            GLP
Mean Distribution Price                   46.0
Adjusted Mean Distribution Price     55.645161
Name: 36, dtype: object

We will need to note that this specific value is the only value that will have a different inflation rate when compared to the others. 

### Calculating the Change

In [25]:
cost_df[
    (cost_df['State'] == 'SAO PAULO') & 
    (cost_df['Type of Product'] == 'GASOLINA COMUM')
]

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
126,2013-01-12,2013,1,SUDESTE,SAO PAULO,GASOLINA COMUM,2.268,2.268
275,2019-06-29,2019,6,SUDESTE,SAO PAULO,GASOLINA COMUM,3.713,2.640356


In [26]:
# We need all combinations of State and Type of Product to calculate the change.
# The number of combinations should be the product of the number of elements in the list, and should also be the 
# number of rows in the dataframe divided by two.

states = list(cost_df['State'].unique())
products = list(cost_df['Type of Product'].unique())

print (len(states), len(products))

sp = [states, products]

combos = list(itertools.product(*sp))
print (len(combos))
print (len(cost_df))

27 6
162
298


In [34]:
for combo in combos:
    if cost_query_not_zero(combo).empty:
        print (f"{combo} has no data from 2013 to 2019.")
    elif len(cost_query_not_zero(combo)) == 1:
        print (f"{combo} is missing one value.")

('DISTRITO FEDERAL', 'GNV') is missing one value.
('MARANHAO', 'GNV') has no data from 2013 to 2019.
('PIAUI', 'GNV') has no data from 2013 to 2019.
('ACRE', 'GNV') has no data from 2013 to 2019.
('AMAPA', 'GNV') has no data from 2013 to 2019.
('PARA', 'GNV') has no data from 2013 to 2019.
('RONDONIA', 'GNV') has no data from 2013 to 2019.
('RORAIMA', 'GNV') has no data from 2013 to 2019.
('TOCANTINS', 'GNV') has no data from 2013 to 2019.


#### Addressing the Issue of Missing GNV Prices

Using Tableau to explore the data, and by noticing that there should be two rows per combination in our `cost_df` dataframe, it was observed that prices for GNV are missing for multiple regions for the years 2013 to 2019. For this reason, we will not be able to adjust for inflation for these values, and the calculations will remain omitted from our data.

Recall that this data is based off of the table that resulted from the `inflation_cleaning` notebook, and values that originally had a value of '-' were replaced with 0. The `cost_query_not_zero` function explicitly checks for nonzero values, so if there were no nonzero values, then the values would be omitted from the `cost_df` dataframe.

In [36]:
cost_df

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874,1.874000
1,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,2.384,2.384000
2,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GLP,31.420,31.420000
3,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.954,1.954000
4,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,1.653,1.653000
...,...,...,...,...,...,...,...,...
293,2019-06-29,2019,6,SUL,SANTA CATARINA,GASOLINA COMUM,3.784,2.690844
294,2019-06-29,2019,6,SUL,SANTA CATARINA,GLP,51.702,36.765867
295,2019-06-29,2019,6,SUL,SANTA CATARINA,GNV,1.998,1.420800
296,2019-06-29,2019,6,SUL,SANTA CATARINA,ÓLEO DIESEL,3.023,2.149689


In [37]:
combos[0] 

('DISTRITO FEDERAL', 'ETANOL HIDRATADO')

In [40]:
cost_df[(cost_df['State'] == combos[0][0]) & (cost_df['Type of Product'] == combos[0][1])]

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874,1.874
149,2019-06-29,2019,6,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,2.801,1.991822


In [46]:
percent_change = lambda original, new: ((new - original) / original) * 100

In [47]:
percent_change(1.87, 1.99)

6.417112299465234