In [1]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import shapiro
from scipy.stats import ttest_rel

import constants as consts

In [2]:
# Load the CSV at the relative path below into the working dataframe.
df = pd.read_csv(
    filepath_or_buffer = '../../data/gas_prices_brazil/brazil_gas_inflation.csv'
)

In [3]:
# Preview the first five rows.
df.head(n = 5)

Unnamed: 0.1,Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Percent of Total Population in 2020,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
0,0,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.4%,0.825,1.0,0.825
1,1,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.4%,1.711,1.0,1.711
2,2,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,1.4%,27.165,1.0,27.165001
3,3,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.4%,1.249,1.0,1.249
4,4,2004-05-15,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,3.4%,0.763,1.0,0.763


In [4]:
# Drop the 'Unnamed: 0' column (it appears to be a saved copy of the row index).
# Reassigning instead of using inplace = True, together with errors = 'ignore',
# keeps this cell safe to re-run: a second run no longer raises a KeyError
# because the column is already gone.
df = df.drop(columns = 'Unnamed: 0', errors = 'ignore')

## Preparing the Data

We will start by splitting the data into two groups: states with a high percentage of the total population and states with a low percentage.

In [5]:
# Convert 'Percent of Total Population in 2020' from strings such as '1.4%'
# to floats such as 1.4.
#
# The conversion is vectorized instead of looping over every row, so it is
# fast, does not depend on the index labels being 0..len(df)-1, and leaves the
# column with a proper float dtype.
# Casting to str first makes the cell idempotent: on a re-run the values are
# already floats, there is no percent sign to strip, and they convert back to
# the same floats instead of raising a TypeError.

df['Percent of Total Population in 2020'] = (
    df['Percent of Total Population in 2020']
    .astype(str)
    .str.rstrip('%')
    .astype(float)
)

In [6]:
# A state counts as "high population" when it holds at least 4% of the 2020
# population; every state below that cutoff is "low population".
# The 4% cutoff was chosen arbitrarily.

high_pop_df = df.loc[df['Percent of Total Population in 2020'] >= 4.0]
low_pop_df = df.loc[df['Percent of Total Population in 2020'] < 4.0]

In [7]:
# Show ten random rows of the high population group.
# random_state is fixed so that the same rows are displayed on every run.
high_pop_df.sample(10, random_state = 0)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Percent of Total Population in 2020,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
25037,2008-04-05,2008,3,NORTE,PARA,GLP,4.1,26.348,204.0,21.596721
94480,2017-12-02,2017,11,SUL,RIO GRANDE DO SUL,ETANOL HIDRATADO,5.4,3.285,708.0,1.57177
482,2004-06-05,2004,5,SUL,RIO GRANDE DO SUL,ETANOL HIDRATADO,5.4,0.941,4.0,0.941
33449,2009-07-25,2009,7,NORDESTE,PERNAMBUCO,GLP,4.5,29.022,272.0,22.673437
36928,2010-02-20,2010,2,NORDESTE,CEARA,GNV,4.3,1.408,302.0,1.050746
43239,2011-02-05,2011,1,SUDESTE,RIO DE JANEIRO,GLP,8.2,27.459,352.0,19.202098
31382,2009-03-28,2009,3,SUDESTE,MINAS GERAIS,GASOLINA COMUM,10.1,2.145,255.0,1.675781
5098,2005-02-26,2005,2,NORDESTE,BAHIA,GNV,7.1,0.772,42.0,0.721495
38419,2010-05-15,2010,5,NORDESTE,BAHIA,GNV,7.1,1.411,314.0,1.052985
42001,2010-11-27,2010,11,SUL,PARANA,ETANOL HIDRATADO,5.4,1.464,342.0,1.092537


In [8]:
# Show ten random rows of the low population group.
# random_state is fixed so that the same rows are displayed on every run.
low_pop_df.sample(10, random_state = 0)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Percent of Total Population in 2020,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
61650,2013-10-05,2013,9,NORTE,TOCANTINS,ETANOL HIDRATADO,0.7,1.74,491.0,1.0875
36488,2010-01-23,2010,1,SUDESTE,ESPIRITO SANTO,ETANOL HIDRATADO,1.9,1.919,298.0,1.43209
41322,2010-10-23,2010,10,NORDESTE,RIO GRANDE DO NORTE,GNV,1.7,1.296,337.0,0.967164
72371,2015-02-07,2015,2,SUL,SANTA CATARINA,GLP,3.4,39.516,561.0,21.245161
101126,2018-10-13,2018,10,CENTRO OESTE,DISTRITO FEDERAL,GLP,1.4,53.281,753.0,24.667129
13337,2006-06-17,2006,6,SUDESTE,ESPIRITO SANTO,GNV,1.9,0.758,110.0,0.682883
18919,2007-04-28,2007,4,NORDESTE,SERGIPE,GNV,1.1,0.941,155.0,0.818261
80148,2016-02-13,2016,2,CENTRO OESTE,GOIAS,ÓLEO DIESEL S10,3.4,2.9,614.0,1.435644
95899,2018-02-10,2018,2,NORDESTE,PARAIBA,GASOLINA COMUM,1.9,3.625,718.0,1.678241
25745,2008-05-17,2008,5,NORDESTE,PARAIBA,GASOLINA COMUM,1.9,2.146,210.0,1.759016


The query below selects the rows for a single state and product type. The first and last of those rows provide the before and after values for the t-test.

In [9]:
def state_prod_query(df, state, prod_type):
    """Return the rows of df for one state and one product type.

    Rows whose 'Adjusted Mean Distribution Price' equals 0 are left out.
    The original index labels and column order are kept.
    """

    # One boolean mask per condition; a row is kept only if all three hold.
    matches_state = df['State'] == state
    matches_product = df['Type of Product'] == prod_type
    has_nonzero_price = df['Adjusted Mean Distribution Price'] != 0

    return df[matches_state & matches_product & has_nonzero_price]

Next, we create the dataframe that will be used to run the t-test.

In [10]:
# Creating a list of the information needed and then creating a dataframe from the list is the preferred way
# to create the necessary dataframe.

# See the link below for more information.
# https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it

# The 'Before Year' and 'After Year' columns are used to make sure that the years for comparison are the same.

def ttest_prep(df, num_col, state_list, prod_type):
    """Build the per-state before/after table used for the paired t-test.

    For each state in state_list, the rows returned by
    state_prod_query(df, state, prod_type) are fetched once. The first of
    those rows supplies the "before" values and the last row supplies the
    "after" values. No sorting is done here, so rows are taken in the order
    in which they appear in df; the caller must make sure that order is
    chronological.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to query. Must contain a 'Year' column, the num_col column and
        the columns that state_prod_query filters on.
    num_col : str
        Name of the numerical column to compare (num_col is short for
        numerical column).
    state_list : iterable of str
        States to include. Can be altered as necessary.
    prod_type : str
        Product type to include. Can be altered as necessary.

    Returns
    -------
    pandas.DataFrame
        One row per state that has matching data, with the columns 'State',
        'Before Year', 'After Year', 'Before Price', 'After Price' and
        'Difference' (after price minus before price). The year columns are
        there to check that the years being compared are the same across
        states.

    Warns
    -----
    UserWarning
        If a state has no matching rows. Such a state is left out of the
        result instead of raising an IndexError.
    """

    state_before_after = []

    for state in state_list:
        # Query once per state; every call scans the whole dataframe.
        state_rows = state_prod_query(df, state, prod_type)

        # Without this guard, .iloc[0] raises an IndexError on an empty
        # result (a state with no data for this product type).
        if state_rows.empty:
            warnings.warn(
                f'No data for {state} with product type {prod_type}; state skipped.',
                stacklevel = 2
            )
            continue

        first_row = state_rows.iloc[0]
        last_row = state_rows.iloc[-1]

        state_before_after.append([
            state,
            first_row['Year'],
            last_row['Year'],
            first_row[num_col],
            last_row[num_col],
        ])

    ttest_df = pd.DataFrame(
        state_before_after,
        columns = ['State', 'Before Year', 'After Year', 'Before Price', 'After Price']
    )

    ttest_df['Difference'] = ttest_df['After Price'] - ttest_df['Before Price']

    return ttest_df

In [11]:
# Before/after analysis for 'ÓLEO DIESEL', run separately for the high and
# the low population states.
# Need to repeat this process for all product types.

high_oleo_ttest = ttest_prep(
    df = high_pop_df,
    num_col = 'Adjusted Mean Distribution Price',
    state_list = list(high_pop_df['State'].unique()),
    prod_type = 'ÓLEO DIESEL'
)

low_oleo_ttest = ttest_prep(
    df = low_pop_df,
    num_col = 'Adjusted Mean Distribution Price',
    state_list = list(low_pop_df['State'].unique()),
    prod_type = 'ÓLEO DIESEL'
)

# Normality check on the before/after differences (high first, then low).
print(
    shapiro(high_oleo_ttest['Difference']),
    shapiro(low_oleo_ttest['Difference']),
    sep = '\n'
)

print('\n')

# Paired t-test of the after prices against the before prices
# (high first, then low).
print(
    ttest_rel(high_oleo_ttest['After Price'], high_oleo_ttest['Before Price']),
    ttest_rel(low_oleo_ttest['After Price'], low_oleo_ttest['Before Price']),
    sep = '\n'
)

print('\n')

# Mean difference as a fraction of the mean before price
# (high first, then low).
print(
    high_oleo_ttest['Difference'].mean() / high_oleo_ttest['Before Price'].mean(),
    low_oleo_ttest['Difference'].mean() / low_oleo_ttest['Before Price'].mean(),
    sep = '\n'
)

ShapiroResult(statistic=0.8792933225631714, pvalue=0.15435367822647095)
ShapiroResult(statistic=0.9824665188789368, pvalue=0.9718320965766907)


Ttest_relResult(statistic=9.888257686379502, pvalue=9.23045177857013e-06)
Ttest_relResult(statistic=9.668424607526658, pvalue=2.532215292628761e-08)


0.1455291805731164
0.1430331159350287


In [12]:
# Viewing the high population 'ÓLEO DIESEL' before/after dataframe for reference.

high_oleo_ttest

Unnamed: 0,State,Before Year,After Year,Before Price,After Price,Difference
0,BAHIA,2004,2019,1.18,1.405778,0.225778
1,CEARA,2004,2019,1.362,1.487111,0.125111
2,PERNAMBUCO,2004,2019,1.226,1.456,0.23
3,PARA,2004,2019,1.268,1.496,0.228
4,MINAS GERAIS,2004,2019,1.201,1.436444,0.235445
5,RIO DE JANEIRO,2004,2019,1.192,1.378667,0.186667
6,SAO PAULO,2004,2019,1.21,1.354222,0.144222
7,PARANA,2004,2019,1.231,1.320444,0.089444
8,RIO GRANDE DO SUL,2004,2019,1.203,1.349778,0.146778


In [13]:
# Viewing the low population 'ÓLEO DIESEL' before/after dataframe for reference.
low_oleo_ttest

Unnamed: 0,State,Before Year,After Year,Before Price,After Price,Difference
0,DISTRITO FEDERAL,2004,2019,1.249,1.461333,0.212333
1,GOIAS,2004,2019,1.28,1.448889,0.168889
2,MATO GROSSO,2004,2019,1.405,1.497778,0.092778
3,MATO GROSSO DO SUL,2004,2019,1.363,1.389333,0.026333
4,ALAGOAS,2004,2019,1.216,1.446222,0.230222
5,MARANHAO,2004,2019,1.23,1.453778,0.223778
6,PARAIBA,2004,2019,1.199,1.437778,0.238778
7,PIAUI,2004,2019,1.226,1.432,0.206
8,RIO GRANDE DO NORTE,2004,2019,1.208,1.461778,0.253778
9,SERGIPE,2004,2019,1.225,1.469778,0.244778


In [14]:
# Report every combination of 'State' and 'Type of Product' in the low
# population states for which state_prod_query returns no rows.

for state in low_pop_df['State'].unique():
    for prod in low_pop_df['Type of Product'].unique():
        if not state_prod_query(low_pop_df, state, prod).empty:
            continue
        print (f'Missing Data for {state} with product type {prod}')

Missing Data for ACRE with product type GNV
Missing Data for RONDONIA with product type GNV
Missing Data for RORAIMA with product type GNV


In [15]:
# Report every combination of 'State' and 'Type of Product' in the high
# population states for which state_prod_query returns no rows.

for state in high_pop_df['State'].unique():
    for prod in high_pop_df['Type of Product'].unique():
        if not state_prod_query(high_pop_df, state, prod).empty:
            continue
        print (f'Missing Data for {state} with product type {prod}')

Because the above cell did not print any results, we can conclude that there is no missing data for the high population states.

In [16]:
# Low population states to use for the GNV comparison.
# ACRE, RONDONIA and RORAIMA are not listed; the missing-data check reported
# no GNV data for them.
low_gnv_states = [
    'DISTRITO FEDERAL', 'GOIAS', 'MATO GROSSO', 'MATO GROSSO DO SUL',
    'ALAGOAS', 'MARANHAO', 'PARAIBA', 'PIAUI', 'RIO GRANDE DO NORTE',
    'SERGIPE', 'AMAPA', 'AMAZONAS', 'TOCANTINS', 'ESPIRITO SANTO',
    'SANTA CATARINA',
]

low_gnv_ttest = ttest_prep(
    df = low_pop_df,
    num_col = 'Adjusted Mean Distribution Price',
    state_list = low_gnv_states,
    prod_type = 'GNV'
)

low_gnv_ttest

Unnamed: 0,State,Before Year,After Year,Before Price,After Price,Difference
0,DISTRITO FEDERAL,2019,2019,1.273778,1.273778,0.0
1,GOIAS,2012,2013,1.206623,1.195625,-0.010998
2,MATO GROSSO,2006,2017,0.900901,0.923445,0.022544
3,MATO GROSSO DO SUL,2004,2019,0.842,0.903556,0.061556
4,ALAGOAS,2004,2019,0.789,1.089333,0.300333
5,MARANHAO,2007,2009,1.453913,1.477344,0.023431
6,PARAIBA,2004,2019,0.792,1.363556,0.571556
7,PIAUI,2004,2009,1.279,1.064844,-0.214156
8,RIO GRANDE DO NORTE,2004,2019,0.722,1.224889,0.502889
9,SERGIPE,2004,2019,0.833,1.119111,0.286111


In [17]:
# GNV before/after table for the high population states, for comparison.
ttest_prep(
    df = high_pop_df,
    num_col = 'Adjusted Mean Distribution Price',
    state_list = list(high_pop_df['State'].unique()),
    prod_type = 'GNV'
)

Unnamed: 0,State,Before Year,After Year,Before Price,After Price,Difference
0,BAHIA,2004,2019,0.705,1.021333,0.316333
1,CEARA,2004,2019,0.83,1.235556,0.405556
2,PERNAMBUCO,2004,2019,0.859,0.847556,-0.011444
3,PARA,2009,2010,1.544531,0.916418,-0.628113
4,MINAS GERAIS,2004,2019,0.754,1.254222,0.500222
5,RIO DE JANEIRO,2004,2019,0.608,1.068444,0.460444
6,SAO PAULO,2004,2019,0.627,1.055111,0.428111
7,PARANA,2004,2019,0.913,0.848444,-0.064556
8,RIO GRANDE DO SUL,2004,2019,0.914,1.089333,0.175333


Because the GNV data for the low population states is too inconsistent relative to the high population states (for many states the before and after years do not span 2004 to 2019), we will omit GNV from our analysis.

In [20]:
# Maps each product type (GNV excluded) to a dictionary holding the
# before/after dataframe of the high and of the low population states.
prod_type_dict = {}
no_gnv = ['ETANOL HIDRATADO', 'GASOLINA COMUM', 'GLP', 'ÓLEO DIESEL', 'ÓLEO DIESEL S10']

for prod in no_gnv:
    prod_type_dict[prod] = {
        group_label: ttest_prep(
            group_df,
            'Adjusted Mean Distribution Price',
            list(group_df['State'].unique()),
            prod
        )
        for group_label, group_df in (
            ('High Population', high_pop_df),
            ('Low Population', low_pop_df),
        )
    }

In [24]:
# Example of indexing the dictionary: the first key is the product type and
# the second key is the population group ('High Population' or 'Low Population').

prod_type_dict['ETANOL HIDRATADO']['High Population']

Unnamed: 0,State,Before Year,After Year,Before Price,After Price,Difference
0,BAHIA,2004,2019,0.957,1.244889,0.287889
1,CEARA,2004,2019,1.1,1.473333,0.373333
2,PERNAMBUCO,2004,2019,0.947,1.410667,0.463667
3,PARA,2004,2019,1.378,1.500889,0.122889
4,MINAS GERAIS,2004,2019,0.816,1.121333,0.305333
5,RIO DE JANEIRO,2004,2019,0.786,1.467556,0.681556
6,SAO PAULO,2004,2019,0.57,1.004889,0.434889
7,PARANA,2004,2019,0.667,1.120889,0.453889
8,RIO GRANDE DO SUL,2004,2019,0.892,1.554222,0.662222
