In [1]:
import numpy as np
import pandas as pd
import itertools

import constants as consts

## Determining the Change in Cost of Gas for Brazilian Residents by State

In this notebook, we will determine if the cost of gas has risen for Brazilian residents after inflation has been taken into account. To determine this, we will use the previously created inflation dataframe, and also account for wage growth in Brazil.

From [this](https://www.statista.com/statistics/941201/growth-rate-average-monthly-income-domestic-workers-brazil/) link, we can determine that wages from 2013 to 2019 have grown by 5.3% by adding the values above or below each bar in the chart. We will determine if gas prices have risen by a greater amount after adjusting for inflation to determine if Brazilian workers are paying more for gas in 2019 when compared to 2013.

In [2]:
df = pd.read_csv('../../data/gas_prices_brazil/brazil_gas_inflation.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
0,0,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.825,1.0,0.825
1,1,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.711,1.0,1.711
2,2,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,27.165,1.0,27.165001
3,3,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.249,1.0,1.249
4,4,2004-05-15,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,0.763,1.0,0.763


In [4]:
# Making a copy prevents the 'SettingWithCopyWarning' warning.

cost_df = df[(df['Last Day of Week'] == '2013-01-12') | (df['Last Day of Week'] == '2019-06-29')].copy()

In [5]:
cost_df.reset_index(inplace = True)

In [6]:
# We can drop the 'Adjusted Mean Distribution Price' column because that value reflects inflation relative to 2004.

cost_df.drop(['Adjusted Mean Distribution Price',
              'Weeks Since First Day',
              'Unnamed: 0',
              'index'], 
               axis = 1, inplace = True)

In [7]:
# To determine inflation from 2013 to 2019, we will use the value of 1R$ in 2019 relative to 2004 from the already
# obtained data, and divide it by the value of 1R$ in 2013 relative to 2004.

r_inf = consts.inf_rates[2019] / consts.inf_rates[2013]
r_inf

1.40625

From this, we can conclude that inflation grew by roughly 41%, which closely resembles the calculation from [here](https://www.in2013dollars.com/brazil/inflation/2013?endYear=2019&amount=1).

In [8]:
cost_df

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874
1,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,2.384
2,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GLP,31.420
3,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.954
4,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,1.653
...,...,...,...,...,...,...,...
293,2019-06-29,2019,6,SUL,SANTA CATARINA,GASOLINA COMUM,3.784
294,2019-06-29,2019,6,SUL,SANTA CATARINA,GLP,51.702
295,2019-06-29,2019,6,SUL,SANTA CATARINA,GNV,1.998
296,2019-06-29,2019,6,SUL,SANTA CATARINA,ÓLEO DIESEL,3.023


In [9]:
# Checking if there are any zeros in our cost dataframe.

cost_df[cost_df['Mean Distribution Price'] == 0]

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price
7,2013-01-12,2013,1,CENTRO OESTE,GOIAS,GNV,0.0
9,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ÓLEO DIESEL S10,0.0
19,2013-01-12,2013,1,CENTRO OESTE,MATO GROSSO DO SUL,GNV,0.0
36,2013-01-12,2013,1,NORDESTE,CEARA,GLP,0.0
50,2013-01-12,2013,1,NORDESTE,PARAIBA,ÓLEO DIESEL S10,0.0
64,2013-01-12,2013,1,NORDESTE,RIO GRANDE DO NORTE,GLP,0.0
78,2013-01-12,2013,1,NORTE,ACRE,ÓLEO DIESEL S10,0.0
79,2013-01-12,2013,1,NORTE,AMAPA,ETANOL HIDRATADO,0.0
106,2013-01-12,2013,1,NORTE,TOCANTINS,ÓLEO DIESEL S10,0.0
112,2013-01-12,2013,1,SUDESTE,ESPIRITO SANTO,ÓLEO DIESEL S10,0.0


### Replacing the Zero Values

Because there are 13 missing values, using Excel to change these values would not take too much time, and would likely be the best way to proceed in a working environment. However, if there were significantly more zero values, replacing them with Python would be necessary, and doing so will be left to the programmer as an exercise.

In [10]:
# We need the same_cols_df for a function that will be able to replace zero values in cost_df.

same_cols_df = df.copy()
same_cols_df.drop(df.columns.difference(list(cost_df.columns)), axis = 1, inplace = True) # Keeps columns from cost_df
same_cols_df.head()

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price
0,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.825
1,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.711
2,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,27.165
3,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.249
4,2004-05-15,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,0.763


In [11]:
def cost_query_not_zero(tup):
    
    # The tuple that is passed to this function needs to be of the following form.
    # (State, Type of Product)
    
    return same_cols_df[
        (same_cols_df['State'] == tup[0]) & 
        (same_cols_df['Type of Product'] == tup[1]) &
        (same_cols_df['Mean Distribution Price'] != 0) &
        (same_cols_df['Year'] >= 2013)
    ]

In [12]:
# This is how we will obtain the 2013 values that need to be replaced. Note the the dataframes are in chronological
# order, which is relevant for indexing.

cost_query_not_zero(('GOIAS', 'GNV')).iloc[0]

Last Day of Week             2013-01-19
Year                               2013
Month                                 1
Macro Region               CENTRO OESTE
State                             GOIAS
Type of Product                     GNV
Mean Distribution Price           1.909
Name: 55916, dtype: object

In [13]:
# This is how we will obtain the 2019 values that need to be replaced.

cost_query_not_zero(('ESPIRITO SANTO', 'GNV')).iloc[-1]

Last Day of Week               2019-06-15
Year                                 2019
Month                                   6
Macro Region                      SUDESTE
State                      ESPIRITO SANTO
Type of Product                       GNV
Mean Distribution Price             2.242
Name: 106486, dtype: object

In [14]:
list(cost_df[cost_df['Mean Distribution Price'] == 0].index)

[7, 9, 19, 36, 50, 64, 78, 79, 106, 112, 259, 265, 283]

In [15]:
cost_df.iloc[7]

Last Day of Week             2013-01-12
Year                               2013
Month                                 1
Macro Region               CENTRO OESTE
State                             GOIAS
Type of Product                     GNV
Mean Distribution Price             0.0
Name: 7, dtype: object

In [16]:
cost_df.iloc[7]['Year']

2013

In [17]:
# Iterate over the indices with zero values, and replace the values appropriately.

for ind in list(cost_df[cost_df['Mean Distribution Price'] == 0].index):
    if cost_df.iloc[ind]['Year'] == 2013:
        cost_df.iloc[ind] = cost_query_not_zero((cost_df.iloc[ind]['State'], cost_df.iloc[ind]['Type of Product'])).iloc[0]
    else:
        cost_df.iloc[ind] = cost_query_not_zero((cost_df.iloc[ind]['State'], cost_df.iloc[ind]['Type of Product'])).iloc[-1]


In [18]:
# Checking that the replacement worked as intended.
# The numbers are the indices where the values were replaced.

for i in [7, 9, 19, 36, 50, 64, 78, 79, 106, 112, 259, 265, 283]:
    print (cost_df.iloc[i]['Mean Distribution Price'])

1.909
2.058
1.42
46.0
1.958
31.5
2.284
1.973
1.998
1.929
2.242
2.822
1.909


In [19]:
# If the year is 2019, adjust for inflation. Otherwise, do not adjust for inflation.

inf_adj_dist_price = []

for i in range (len(cost_df)):
    if cost_df.loc[i, 'Year'] == 2019:
        inf_adj_dist_price.append(round(cost_df.loc[i, 'Mean Distribution Price'] / r_inf, 2))
    else:
        inf_adj_dist_price.append(round(cost_df.loc[i, 'Mean Distribution Price'], 2))

In [20]:
cost_df['Adjusted Mean Distribution Price'] = inf_adj_dist_price

In [21]:
# Check to make sure our calculation is working as intended.

cost_df.sample(10)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
190,2019-06-29,2019,6,NORDESTE,MARANHAO,GLP,57.642,40.99
55,2013-01-12,2013,1,NORDESTE,PERNAMBUCO,ÓLEO DIESEL,1.904,1.9
276,2019-06-29,2019,6,SUDESTE,SAO PAULO,GLP,49.214,35.0
289,2019-06-29,2019,6,SUL,RIO GRANDE DO SUL,GNV,2.451,1.74
224,2019-06-29,2019,6,NORTE,ACRE,GLP,70.993,50.48
198,2019-06-29,2019,6,NORDESTE,PARAIBA,ÓLEO DIESEL S10,3.268,2.32
201,2019-06-29,2019,6,NORDESTE,PERNAMBUCO,GLP,53.498,38.04
61,2013-01-12,2013,1,NORDESTE,PIAUI,ÓLEO DIESEL S10,1.945,1.94
98,2013-01-12,2013,1,NORTE,RORAIMA,GASOLINA COMUM,2.413,2.41
111,2013-01-12,2013,1,SUDESTE,ESPIRITO SANTO,ÓLEO DIESEL,1.92,1.92


In [22]:
# Verifying that the years are either 2013 or 2019.

print (cost_df['Year'].value_counts())

2019    149
2013    148
2015      1
Name: Year, dtype: int64


In [23]:
# We need to correct the row with the 2015 value.

cost_df.iloc[36]

Last Day of Week                    2015-12-05
Year                                      2015
Month                                       11
Macro Region                          NORDESTE
State                                    CEARA
Type of Product                            GLP
Mean Distribution Price                   46.0
Adjusted Mean Distribution Price          46.0
Name: 36, dtype: object

In [24]:
r_inf_2015 = consts.inf_rates[2019] / consts.inf_rates[2015] # Adjusting for inflation accordingly.
cost_df.at[36, 'Adjusted Mean Distribution Price'] = cost_df.iloc[36]['Adjusted Mean Distribution Price'] * r_inf_2015
cost_df.iloc[36]

Last Day of Week                    2015-12-05
Year                                      2015
Month                                       11
Macro Region                          NORDESTE
State                                    CEARA
Type of Product                            GLP
Mean Distribution Price                   46.0
Adjusted Mean Distribution Price     55.645161
Name: 36, dtype: object

We will need to note that this specific value is the only value that will have a different inflation rate when compared to the others. 

### Calculating the Change in Percent

In [25]:
cost_df[
    (cost_df['State'] == 'SAO PAULO') & 
    (cost_df['Type of Product'] == 'GASOLINA COMUM')
]

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
126,2013-01-12,2013,1,SUDESTE,SAO PAULO,GASOLINA COMUM,2.268,2.27
275,2019-06-29,2019,6,SUDESTE,SAO PAULO,GASOLINA COMUM,3.713,2.64


In [26]:
# We need all combinations of State and Type of Product to calculate the change.
# The number of combinations should be the product of the number of elements in the list, and should also be the 
# number of rows in the dataframe divided by two.

states = list(cost_df['State'].unique())
products = list(cost_df['Type of Product'].unique())

print (len(states), len(products))

sp = [states, products]

combos = list(itertools.product(*sp))
print (len(combos))
print (len(cost_df))

27 6
162
298


In [27]:
# If the dataframe that results from running the cost_query_not_zero function with the combination parameters is 
# empty, then the data does not exist for that combination. If the dataframe has one value, then the information
# is incomplete.

for combo in combos:
    if cost_query_not_zero(combo).empty:
        print (f"{combo} has no data from 2013 to 2019.")
    elif len(cost_query_not_zero(combo)) == 1:
        print (f"{combo} is missing one value.")

('DISTRITO FEDERAL', 'GNV') is missing one value.
('MARANHAO', 'GNV') has no data from 2013 to 2019.
('PIAUI', 'GNV') has no data from 2013 to 2019.
('ACRE', 'GNV') has no data from 2013 to 2019.
('AMAPA', 'GNV') has no data from 2013 to 2019.
('PARA', 'GNV') has no data from 2013 to 2019.
('RONDONIA', 'GNV') has no data from 2013 to 2019.
('RORAIMA', 'GNV') has no data from 2013 to 2019.
('TOCANTINS', 'GNV') has no data from 2013 to 2019.


#### Addressing the Issue of Missing GNV Prices

Using Tableau to explore the data, and by noticing that there should be two rows per combination in our `cost_df` dataframe, it was observed that prices for GNV are missing for multiple regions for the years 2013 to 2019. For this reason, we will not be able to adjust for inflation for these values, and the calculations will remain omitted from our data.

Recall that this data is based off of the table that resulted from the `inflation_cleaning` notebook, and values that originally had a value of '-' were replaced with 0. The `cost_query_not_zero` function checks for nonzero values, so if there were no nonzero values, then the values would be omitted from the `cost_df` dataframe.

In [28]:
cost_df

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874,1.87
1,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,2.384,2.38
2,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GLP,31.420,31.42
3,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.954,1.95
4,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,1.653,1.65
...,...,...,...,...,...,...,...,...
293,2019-06-29,2019,6,SUL,SANTA CATARINA,GASOLINA COMUM,3.784,2.69
294,2019-06-29,2019,6,SUL,SANTA CATARINA,GLP,51.702,36.77
295,2019-06-29,2019,6,SUL,SANTA CATARINA,GNV,1.998,1.42
296,2019-06-29,2019,6,SUL,SANTA CATARINA,ÓLEO DIESEL,3.023,2.15


In [29]:
percent_change = lambda original, new: ((new - original) / original) * 100

In [30]:
def before_after_query(tup):
    
    # The tuple passed into this function must be of the form (State, Type of Product)
    
    return cost_df[
        (cost_df['State'] == tup[0]) & 
        (cost_df['Type of Product'] == tup[1])
    ]

In [31]:
before_after_query(combos[0])

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874,1.87
149,2019-06-29,2019,6,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,2.801,1.99


In [32]:
# This is how we will index the before and after values from each combination.

print (list(before_after_query(combos[0])['Adjusted Mean Distribution Price'])[0])
print (list(before_after_query(combos[0])['Adjusted Mean Distribution Price'])[-1])

1.87
1.99


In [33]:
# This was used to see why there was an error.

# print(f"The change in percent for {combos[4]} is {percent_change(list(before_after_query(combos[4])['Adjusted Mean Distribution Price'])[0], list(before_after_query(combos[4])['Adjusted Mean Distribution Price'])[-1])}.")

In [34]:
combos[4]

('DISTRITO FEDERAL', 'GNV')

In [35]:
# The IndexError occurs when the (State, Type of Product) combination is missing from the dataframe.
# These are the values that we will need to add to cost_df

for combo in combos:
    try:
        print(f"The change in percent for {combo} is {percent_change(list(before_after_query(combo)['Adjusted Mean Distribution Price'])[0], list(before_after_query(combo)['Adjusted Mean Distribution Price'])[-1])}.")
    except IndexError:
        pass
    

The change in percent for ('DISTRITO FEDERAL', 'ETANOL HIDRATADO') is 6.417112299465234.
The change in percent for ('DISTRITO FEDERAL', 'GASOLINA COMUM') is 17.226890756302527.
The change in percent for ('DISTRITO FEDERAL', 'GLP') is 17.56842775302354.
The change in percent for ('DISTRITO FEDERAL', 'ÓLEO DIESEL') is 19.999999999999996.
The change in percent for ('DISTRITO FEDERAL', 'ÓLEO DIESEL S10') is 0.0.
The change in percent for ('GOIAS', 'ETANOL HIDRATADO') is 3.63636363636364.
The change in percent for ('GOIAS', 'GASOLINA COMUM') is 18.852459016393443.
The change in percent for ('GOIAS', 'GLP') is 20.763723150357976.
The change in percent for ('GOIAS', 'ÓLEO DIESEL') is 18.974358974358967.
The change in percent for ('GOIAS', 'GNV') is 0.0.
The change in percent for ('GOIAS', 'ÓLEO DIESEL S10') is 15.533980582524265.
The change in percent for ('MATO GROSSO', 'ETANOL HIDRATADO') is -1.9354838709677438.
The change in percent for ('MATO GROSSO', 'GASOLINA COMUM') is 9.88142292490118

The change in percent for ('RIO DE JANEIRO', 'ÓLEO DIESEL') is 17.55319148936171.
The change in percent for ('RIO DE JANEIRO', 'GNV') is 36.8.
The change in percent for ('RIO DE JANEIRO', 'ÓLEO DIESEL S10') is 14.646464646464649.
The change in percent for ('SAO PAULO', 'ETANOL HIDRATADO') is 2.5477707006369448.
The change in percent for ('SAO PAULO', 'GASOLINA COMUM') is 16.299559471365644.
The change in percent for ('SAO PAULO', 'GLP') is 23.109391487864933.
The change in percent for ('SAO PAULO', 'ÓLEO DIESEL') is 16.04278074866309.
The change in percent for ('SAO PAULO', 'GNV') is 26.119402985074615.
The change in percent for ('SAO PAULO', 'ÓLEO DIESEL S10') is 15.02590673575131.
The change in percent for ('PARANA', 'ETANOL HIDRATADO') is 3.4682080924855523.
The change in percent for ('PARANA', 'GASOLINA COMUM') is 13.278008298755179.
The change in percent for ('PARANA', 'GLP') is 27.148110999665665.
The change in percent for ('PARANA', 'ÓLEO DIESEL') is 11.64021164021164.
The chang

In [36]:
# Get the 2019 indices that need to be replaced.
# Then, create a dictionary of the form { index: (combination) } that needs to be updated in cost_df

ind_2019 = list(cost_df[cost_df['Year'] == 2019].index)
updates = { k: (cost_df.iloc[k]['State'], cost_df.iloc[k]['Type of Product']) for k in ind_2019 }
updates

{149: ('DISTRITO FEDERAL', 'ETANOL HIDRATADO'),
 150: ('DISTRITO FEDERAL', 'GASOLINA COMUM'),
 151: ('DISTRITO FEDERAL', 'GLP'),
 152: ('DISTRITO FEDERAL', 'ÓLEO DIESEL'),
 153: ('DISTRITO FEDERAL', 'ÓLEO DIESEL S10'),
 154: ('GOIAS', 'ETANOL HIDRATADO'),
 155: ('GOIAS', 'GASOLINA COMUM'),
 156: ('GOIAS', 'GLP'),
 157: ('GOIAS', 'ÓLEO DIESEL'),
 158: ('GOIAS', 'ÓLEO DIESEL S10'),
 159: ('MATO GROSSO', 'ETANOL HIDRATADO'),
 160: ('MATO GROSSO', 'GASOLINA COMUM'),
 161: ('MATO GROSSO', 'GLP'),
 162: ('MATO GROSSO', 'ÓLEO DIESEL'),
 163: ('MATO GROSSO', 'ÓLEO DIESEL S10'),
 164: ('MATO GROSSO DO SUL', 'ETANOL HIDRATADO'),
 165: ('MATO GROSSO DO SUL', 'GASOLINA COMUM'),
 166: ('MATO GROSSO DO SUL', 'GLP'),
 167: ('MATO GROSSO DO SUL', 'GNV'),
 168: ('MATO GROSSO DO SUL', 'ÓLEO DIESEL'),
 169: ('MATO GROSSO DO SUL', 'ÓLEO DIESEL S10'),
 170: ('ALAGOAS', 'ETANOL HIDRATADO'),
 171: ('ALAGOAS', 'GASOLINA COMUM'),
 172: ('ALAGOAS', 'GLP'),
 173: ('ALAGOAS', 'GNV'),
 174: ('ALAGOAS', 'ÓLEO DIESE

In [37]:
# Create a new column of all zeros so that the column exists, then run a for loop to adjust the values accordingly.

cost_df['Adjusted Percent Increase'] = 0.0

In [38]:
# Reference for indexing the needed values for the percent_change function.

list(before_after_query(updates[149])['Adjusted Mean Distribution Price'])

[1.87, 1.99]

In [39]:
for key in updates:
    cost_df.loc[key, 'Adjusted Percent Increase'] = round(percent_change(
        list(before_after_query(updates[key])['Adjusted Mean Distribution Price'])[0],
        list(before_after_query(updates[key])['Adjusted Mean Distribution Price'])[-1]
    ), 2)
    
cost_df    

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price,Adjusted Percent Increase
0,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.874,1.87,0.00
1,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,2.384,2.38,0.00
2,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,GLP,31.420,31.42,0.00
3,2013-01-12,2013,1,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.954,1.95,0.00
4,2013-01-12,2013,1,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,1.653,1.65,0.00
...,...,...,...,...,...,...,...,...,...
293,2019-06-29,2019,6,SUL,SANTA CATARINA,GASOLINA COMUM,3.784,2.69,14.47
294,2019-06-29,2019,6,SUL,SANTA CATARINA,GLP,51.702,36.77,13.42
295,2019-06-29,2019,6,SUL,SANTA CATARINA,GNV,1.998,1.42,-11.25
296,2019-06-29,2019,6,SUL,SANTA CATARINA,ÓLEO DIESEL,3.023,2.15,11.98


In [40]:
# Recall that the source listed underneath the first line in the notebook claims that wages have grown by 5.3%
# Recall that updates.keys() is the list of indicies where the year is 2019.
# Setting the value to 0.0 prevents the column from being created with integer values when we want floats.

cost_df['Adjusted Percent After Wage Growth'] = 0.0

In [41]:
for ind in list(updates.keys()):
    cost_df.at[ind, 'Adjusted Percent After Wage Growth'] = round(cost_df.at[ind, 'Adjusted Percent Increase'] - 5.3, 2)
    

In [42]:
# Checking the calculation

cost_df.sample(10)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price,Adjusted Percent Increase,Adjusted Percent After Wage Growth
13,2013-01-12,2013,1,CENTRO OESTE,MATO GROSSO,GNV,1.43,1.43,0.0,0.0
236,2019-06-29,2019,6,NORTE,PARA,ETANOL HIDRATADO,3.377,2.4,11.63,6.33
245,2019-06-29,2019,6,NORTE,RONDONIA,ÓLEO DIESEL S10,3.396,2.41,0.0,-5.3
42,2013-01-12,2013,1,NORDESTE,MARANHAO,GLP,34.968,34.97,0.0,0.0
127,2013-01-12,2013,1,SUDESTE,SAO PAULO,GLP,28.429,28.43,0.0,0.0
168,2019-06-29,2019,6,CENTRO OESTE,MATO GROSSO DO SUL,ÓLEO DIESEL,3.126,2.22,7.77,2.47
248,2019-06-29,2019,6,NORTE,RORAIMA,GLP,72.39,51.48,28.12,22.82
204,2019-06-29,2019,6,NORDESTE,PERNAMBUCO,ÓLEO DIESEL S10,3.106,2.21,12.76,7.46
44,2013-01-12,2013,1,NORDESTE,MARANHAO,ÓLEO DIESEL S10,2.055,2.06,0.0,0.0
208,2019-06-29,2019,6,NORDESTE,PIAUI,ÓLEO DIESEL,3.222,2.29,15.66,10.36


In [43]:
# Checking for discrepancies in the calculation.

cost_df[(cost_df['Year'] != 2013) & (cost_df['Adjusted Percent Increase'] == 0)]

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price,Adjusted Percent Increase,Adjusted Percent After Wage Growth
36,2015-12-05,2015,11,NORDESTE,CEARA,GLP,46.0,55.645161,0.0,0.0
153,2019-06-29,2019,6,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL S10,3.42,2.43,0.0,-5.3
230,2019-06-29,2019,6,NORTE,AMAPA,ÓLEO DIESEL S10,4.015,2.86,0.0,-5.3
245,2019-06-29,2019,6,NORTE,RONDONIA,ÓLEO DIESEL S10,3.396,2.41,0.0,-5.3
