In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import shapiro
from scipy.stats import ttest_rel

import constants as consts

In [2]:
df = pd.read_csv('../../data/gas_prices_brazil/brazil_gas_inflation.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Percent of Total Population in 2020,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
0,0,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,1.4%,0.825,1.0,0.825
1,1,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.4%,1.711,1.0,1.711
2,2,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,GLP,1.4%,27.165,1.0,27.165001
3,3,2004-05-15,2004,5,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,1.4%,1.249,1.0,1.249
4,4,2004-05-15,2004,5,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,3.4%,0.763,1.0,0.763


In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
# Need to convert 'Percent of Total Population in 2020' to float.
# Remove percent sign, then float(entry)

for ind in range(len(df)):
    df.at[ind, 'Percent of Total Population in 2020'] = float(df.at[ind, 'Percent of Total Population in 2020'][0:-1])

In [6]:
# Defining high population as having at least 4% of the population in 2020 was arbitrarily chosen.

high_pop_df = df[df['Percent of Total Population in 2020'] >= 4.0]
low_pop_df = df[df['Percent of Total Population in 2020'] < 4.0]

In [7]:
high_pop_df.sample(10)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Percent of Total Population in 2020,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
72561,2015-02-21,2015,2,NORDESTE,BAHIA,ÓLEO DIESEL S10,7.1,2.593,563.0,1.394086
33389,2009-07-18,2009,7,SUDESTE,SAO PAULO,ÓLEO DIESEL,21.9,1.721,271.0,1.344531
96968,2018-03-31,2018,3,NORDESTE,PERNAMBUCO,ÓLEO DIESEL S10,4.5,2.965,725.0,1.372685
60531,2013-08-17,2013,8,NORDESTE,PERNAMBUCO,GNV,4.5,1.144,484.0,0.715
22950,2007-12-08,2007,12,SUL,PARANA,GASOLINA COMUM,5.4,2.13,187.0,1.852174
55389,2012-12-22,2012,12,NORDESTE,BAHIA,GASOLINA COMUM,7.1,2.362,450.0,1.564238
29894,2009-01-03,2008,12,SUDESTE,RIO DE JANEIRO,GLP,8.2,25.507,243.0,20.907377
98248,2018-05-26,2018,5,SUDESTE,SAO PAULO,GASOLINA COMUM,21.9,3.793,733.0,1.756019
79956,2016-01-30,2016,1,SUDESTE,RIO DE JANEIRO,ETANOL HIDRATADO,8.2,2.91,612.0,1.440594
5914,2005-04-09,2005,4,SUDESTE,MINAS GERAIS,GNV,10.1,0.833,48.0,0.778505


In [8]:
low_pop_df.sample(10)

Unnamed: 0,Last Day of Week,Year,Month,Macro Region,State,Type of Product,Percent of Total Population in 2020,Mean Distribution Price,Weeks Since First Day,Adjusted Mean Distribution Price
88813,2017-03-18,2017,3,NORDESTE,MARANHAO,ÓLEO DIESEL S10,3.4,2.876,671.0,1.376077
100323,2018-09-01,2018,8,NORTE,RORAIMA,ÓLEO DIESEL,0.3,3.091,747.0,1.431019
63152,2013-12-14,2013,12,NORTE,AMAPA,ÓLEO DIESEL,0.4,2.183,501.0,1.364375
42202,2010-12-11,2010,12,NORTE,ACRE,GASOLINA COMUM,0.4,2.535,344.0,1.891791
60997,2013-09-07,2013,9,NORDESTE,RIO GRANDE DO NORTE,ETANOL HIDRATADO,1.7,2.195,487.0,1.371875
16737,2006-12-23,2006,12,SUL,SANTA CATARINA,GLP,3.4,27.503,137.0,24.777478
62563,2013-11-16,2013,11,NORTE,RORAIMA,ÓLEO DIESEL,0.3,2.164,497.0,1.3525
22228,2007-11-03,2007,10,CENTRO OESTE,MATO GROSSO,GLP,1.7,34.738,182.0,30.206956
37341,2010-03-13,2010,3,NORTE,AMAZONAS,ÓLEO DIESEL,2.0,1.901,305.0,1.418657
28555,2008-10-25,2008,10,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,1.4,2.154,233.0,1.765574


In [9]:
high_pop_df['Year']

22        2004
23        2004
24        2004
25        2004
26        2004
          ... 
106812    2019
106813    2019
106814    2019
106815    2019
106816    2019
Name: Year, Length: 37586, dtype: int64

In [10]:
def state_prod_query(df, state, prod_type):
    
    # Requires that the dataframe passed has columns named 'State' and 'Type of Product'
    
    return df[
        (df['State'] == state) &
        (df['Type of Product'] == prod_type) &
        (df['Adjusted Mean Distribution Price'] != 0)
    ]

In [11]:
# Creating a list of the information needed and then creating a dataframe from the list is the preferred way
# to create the necessary dataframe.

# See the link below for more information.
# https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it

# The 'Before Year' and 'After Year' columns make sure that inflation is properly adjusted.

def ttest_prep(df, num_col, state_list, prod_type):
    
    # num_col is short for numerical column.
    # state_list and prod_type can be altered as necessary.
    
    state_before_after = []
    
    for state in state_list:
        state_before_after.append([
            state,
            state_prod_query(df, state, prod_type).iloc[0]['Year'], # Remove this line for improved reusability.
            state_prod_query(df, state, prod_type).iloc[-1]['Year'], # Remove this line for improved reusability.
            state_prod_query(df, state, prod_type).iloc[0][num_col],
            state_prod_query(df, state, prod_type).iloc[-1][num_col],
        ])
        
    ttest_df = pd.DataFrame(state_before_after, columns = ['State', 'Before Year', 'After Year', 'Before', 'After'])
    ttest_df['Difference'] = ttest_df['After'] - ttest_df['Before']
    
    return ttest_df

In [12]:
high_gc_ttest = ttest_prep(
    high_pop_df, 
    'Adjusted Mean Distribution Price', 
    list(high_pop_df['State'].unique()), 
    'GASOLINA COMUM'
)

low_gc_ttest = ttest_prep(
    low_pop_df, 
    'Adjusted Mean Distribution Price', 
    list(low_pop_df['State'].unique()), 
    'GASOLINA COMUM'
)

In [13]:
# View of the dataframes for reference.

high_gc_ttest

Unnamed: 0,State,Before Year,After Year,Before,After,Difference
0,BAHIA,2004,2019,1.73,1.753778,0.023778
1,CEARA,2004,2019,1.804,1.793333,-0.010667
2,PERNAMBUCO,2004,2019,1.726,1.753333,0.027333
3,PARA,2004,2019,1.935,1.8,-0.135
4,MINAS GERAIS,2004,2019,1.698,1.899556,0.201556
5,RIO DE JANEIRO,2004,2019,1.791,1.951111,0.160111
6,SAO PAULO,2004,2019,1.656,1.650222,-0.005778
7,PARANA,2004,2019,1.728,1.706222,-0.021778
8,RIO GRANDE DO SUL,2004,2019,1.793,1.826222,0.033222


In [14]:
low_gc_ttest

Unnamed: 0,State,Before Year,After Year,Before,After,Difference
0,DISTRITO FEDERAL,2004,2019,1.711,1.746222,0.035222
1,GOIAS,2004,2019,1.729,1.813778,0.084778
2,MATO GROSSO,2004,2019,1.886,1.74,-0.146
3,MATO GROSSO DO SUL,2004,2019,1.795,1.703111,-0.091889
4,ALAGOAS,2004,2019,1.74,1.789333,0.049333
5,MARANHAO,2004,2019,1.739,1.723556,-0.015444
6,PARAIBA,2004,2019,1.667,1.768444,0.101444
7,PIAUI,2004,2019,1.739,1.791556,0.052556
8,RIO GRANDE DO NORTE,2004,2019,1.715,1.775556,0.060555
9,SERGIPE,2004,2019,1.734,1.801778,0.067778


In [15]:
print (shapiro(high_gc_ttest['Difference']))
print (shapiro(low_gc_ttest['Difference']))

ShapiroResult(statistic=0.9126634001731873, pvalue=0.33496448397636414)
ShapiroResult(statistic=0.8634587526321411, pvalue=0.013907150365412235)


In [16]:
ttest_rel(high_gc_ttest['After'],
          high_gc_ttest['Before'])

Ttest_relResult(statistic=0.913521700220647, pvalue=0.3876707172350812)

In [17]:
# prod_type_dict = {}

# for prod in list(df['Type of Product'].unique()):
#     prod_type_dict[prod] = {
#         'High Population': ttest_prep(high_pop_df, 
#                                       'Adjusted Mean Distribution Price',
#                                       list(high_pop_df['State'].unique()),
#                                       prod),

#         'Low Population': ttest_prep(low_pop_df, 
#                                      'Adjusted Mean Distribution Price',
#                                      list(high_pop_df['State'].unique()),
#                                      prod)
                
#     }

In [18]:
print (list(low_pop_df['Type of Product'].unique()))
print (list(high_pop_df['Type of Product'].unique()))

['ETANOL HIDRATADO', 'GASOLINA COMUM', 'GLP', 'ÓLEO DIESEL', 'GNV', 'ÓLEO DIESEL S10']
['ETANOL HIDRATADO', 'GASOLINA COMUM', 'GLP', 'GNV', 'ÓLEO DIESEL', 'ÓLEO DIESEL S10']


In [19]:
# GNV may have too many missing values to determine an accurate conclusion.
# GNV is also what is raising an IndexError in In[16]

ttest_prep(low_pop_df, 
           'Adjusted Mean Distribution Price',
           list(low_pop_df['State'].unique()),
           'GNV'
          )

IndexError: single positional indexer is out-of-bounds