In [1]:
import numpy as np
import pandas as pd

import constants as consts

In [2]:
# Allows for .py files to automatically reload.
%reload_ext autoreload
%autoreload 2

## Inflation Exploration

In this notebook, we will determine if gas prices have kept up with inflation. Inflation data was gathered [here](https://www.in2013dollars.com/brazil/inflation/2004?endYear=2019&amount=1).

In [3]:
df = pd.read_csv('../../data/gas_prices_brazil/brazil_gas_cleaned.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,First Day of Week,Last Day of Week,Macro Region,State,Type of Product,Number of Stations,Unit of Measurement,Mean Market Value,Std Dev,...,Mean Distribution Price,Distribution Standard Deviation,Distribution Min Price,Distribution Max Price,Distribution Variation Coefficient,Month,Year,Weeks Since First Day,Percent of Total Population in 2020,Population
0,0,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,127,R$/l,1.288,0.016,...,0.825,0.11,0.4201,0.9666,0.133,5,2004,1,1.4%,2051146
1,1,2004-05-16,2004-05-22,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,144,R$/l,1.271,0.039,...,0.823,0.111,0.4094,1.1931,0.135,5,2004,2,1.4%,2051146
2,2,2004-05-23,2004-05-29,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,129,R$/l,1.282,0.024,...,0.818,0.137,0.3879,1.0336,0.167,5,2004,3,1.4%,2051146
3,3,2004-05-30,2004-06-05,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,144,R$/l,1.373,0.051,...,0.894,0.147,0.4094,1.4206,0.164,5,2004,4,1.4%,2051146
4,4,2004-06-06,2004-06-12,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,129,R$/l,1.373,0.059,...,0.951,0.125,0.5169,1.115,0.131,6,2004,5,1.4%,2051146


In [5]:
df.columns

Index(['Unnamed: 0', 'First Day of Week', 'Last Day of Week', 'Macro Region',
       'State', 'Type of Product', 'Number of Stations', 'Unit of Measurement',
       'Mean Market Value', 'Std Dev', 'Min Price Observed',
       'Max Price Observed', 'Mean Price Margin', 'Variation Coefficient',
       'Mean Distribution Price', 'Distribution Standard Deviation',
       'Distribution Min Price', 'Distribution Max Price',
       'Distribution Variation Coefficient', 'Month', 'Year',
       'Weeks Since First Day', 'Percent of Total Population in 2020',
       'Population'],
      dtype='object')

In [6]:
# Keeps all of the columns in the list.
df.drop(df.columns.difference([
    'First Day of Week', 'Macro Region', 'State', 'Type of Product', 'Mean Distribution Price', 'Month', 'Year', 'Weeks Since First Day'
]), axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,First Day of Week,Macro Region,State,Type of Product,Mean Distribution Price,Month,Year,Weeks Since First Day
0,2004-05-09,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.825,5,2004,1
1,2004-05-16,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.823,5,2004,2
2,2004-05-23,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.818,5,2004,3
3,2004-05-30,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.894,5,2004,4
4,2004-06-06,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.951,6,2004,5


In [8]:
consts.inf_rates[2005]

1.07

In [9]:
df['Year'].map(type).value_counts()

<class 'int'>    106823
Name: Year, dtype: int64

In [10]:
df['Mean Distribution Price'].map(type).value_counts()

<class 'str'>    106823
Name: Mean Distribution Price, dtype: int64

In [11]:
df[df['Mean Distribution Price'] == '-']

Unnamed: 0,First Day of Week,Macro Region,State,Type of Product,Mean Distribution Price,Month,Year,Weeks Since First Day
291,2010-01-03,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,-,1,2010,296
292,2010-01-10,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,-,1,2010,297
294,2010-01-24,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,-,1,2010,299
305,2010-04-11,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,-,4,2010,310
306,2010-04-18,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,-,4,2010,311
...,...,...,...,...,...,...,...,...
106612,2018-10-21,SUL,SANTA CATARINA,GNV,-,10,2018,755
106636,2018-11-18,SUL,SANTA CATARINA,GNV,-,11,2018,759
106642,2018-11-25,SUL,SANTA CATARINA,GNV,-,11,2018,760
106660,2018-12-16,SUL,SANTA CATARINA,GNV,-,12,2018,763


In [12]:
# We need to convert the values in the 'Mean Distribution Price' column to floats.
df['Mean Distribution Price'] = df['Mean Distribution Price'].str.replace('-', '0').fillna(0).astype(float)
df['Mean Distribution Price'] = pd.to_numeric(df['Mean Distribution Price'], downcast = 'float')
df['Mean Distribution Price'].map(type).value_counts()

<class 'float'>    106823
Name: Mean Distribution Price, dtype: int64

In [13]:
inf_adj_dist_price = []

for i in range (len(df)):
    inf_adj_dist_price.append(df.loc[i, 'Mean Distribution Price'] / consts.inf_rates[df.loc[i, 'Year']])

In [14]:
df['Adjusted Mean Distribution Price'] = inf_adj_dist_price

In [15]:
df[df['Mean Distribution Price'] == 0]

Unnamed: 0,First Day of Week,Macro Region,State,Type of Product,Mean Distribution Price,Month,Year,Weeks Since First Day,Adjusted Mean Distribution Price
291,2010-01-03,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.0,1,2010,296,0.0
292,2010-01-10,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.0,1,2010,297,0.0
294,2010-01-24,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.0,1,2010,299,0.0
305,2010-04-11,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.0,4,2010,310,0.0
306,2010-04-18,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.0,4,2010,311,0.0
...,...,...,...,...,...,...,...,...,...
106612,2018-10-21,SUL,SANTA CATARINA,GNV,0.0,10,2018,755,0.0
106636,2018-11-18,SUL,SANTA CATARINA,GNV,0.0,11,2018,759,0.0
106642,2018-11-25,SUL,SANTA CATARINA,GNV,0.0,11,2018,760,0.0
106660,2018-12-16,SUL,SANTA CATARINA,GNV,0.0,12,2018,763,0.0


In [16]:
df.groupby(['Year', 'Month', 'Macro Region', 'State', 'Type of Product']).mean().drop('Weeks Since First Day', axis = 1).reset_index()

Unnamed: 0,Year,Month,Macro Region,State,Type of Product,Mean Distribution Price,Adjusted Mean Distribution Price
0,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,0.840000,0.840000
1,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.722500,1.722500
2,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,28.396000,28.396000
3,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.251500,1.251500
4,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,0.755500,0.755500
...,...,...,...,...,...,...,...
24945,2019,6,SUL,SANTA CATARINA,GASOLINA COMUM,3.811000,1.693778
24946,2019,6,SUL,SANTA CATARINA,GLP,51.582253,22.925445
24947,2019,6,SUL,SANTA CATARINA,GNV,1.909500,0.848667
24948,2019,6,SUL,SANTA CATARINA,ÓLEO DIESEL,3.057750,1.359000
