In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import unidecode as uni

plt.style.use('ggplot')

import constants as consts

In [2]:
# Allows for .py files to automatically reload.
%reload_ext autoreload
%autoreload 2

In [3]:
df = pd.read_csv('../../data/gas_prices_brazil/2004-2019.tsv', sep = '\t')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,DATA INICIAL,DATA FINAL,REGIÃO,ESTADO,PRODUTO,NÚMERO DE POSTOS PESQUISADOS,UNIDADE DE MEDIDA,PREÇO MÉDIO REVENDA,DESVIO PADRÃO REVENDA,...,PREÇO MÁXIMO REVENDA,MARGEM MÉDIA REVENDA,COEF DE VARIAÇÃO REVENDA,PREÇO MÉDIO DISTRIBUIÇÃO,DESVIO PADRÃO DISTRIBUIÇÃO,PREÇO MÍNIMO DISTRIBUIÇÃO,PREÇO MÁXIMO DISTRIBUIÇÃO,COEF DE VARIAÇÃO DISTRIBUIÇÃO,MÊS,ANO
0,0,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,127,R$/l,1.288,0.016,...,1.35,0.463,0.012,0.825,0.11,0.4201,0.9666,0.133,5,2004
1,1,2004-05-09,2004-05-15,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,387,R$/l,1.162,0.114,...,1.449,0.399,0.098,0.763,0.088,0.5013,1.05,0.115,5,2004
2,2,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO,ETANOL HIDRATADO,192,R$/l,1.389,0.097,...,1.76,0.419,0.07,0.97,0.095,0.5614,1.161,0.098,5,2004
3,3,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO DO SUL,ETANOL HIDRATADO,162,R$/l,1.262,0.07,...,1.509,0.432,0.055,0.83,0.119,0.5991,1.22242,0.143,5,2004
4,4,2004-05-09,2004-05-15,NORDESTE,ALAGOAS,ETANOL HIDRATADO,103,R$/l,1.181,0.078,...,1.4,0.24,0.066,0.941,0.077,0.7441,1.0317,0.082,5,2004


In [5]:
df.isnull().sum()

Unnamed: 0                       0
DATA INICIAL                     0
DATA FINAL                       0
REGIÃO                           0
ESTADO                           0
PRODUTO                          0
NÚMERO DE POSTOS PESQUISADOS     0
UNIDADE DE MEDIDA                0
PREÇO MÉDIO REVENDA              0
DESVIO PADRÃO REVENDA            0
PREÇO MÍNIMO REVENDA             0
PREÇO MÁXIMO REVENDA             0
MARGEM MÉDIA REVENDA             0
COEF DE VARIAÇÃO REVENDA         0
PREÇO MÉDIO DISTRIBUIÇÃO         0
DESVIO PADRÃO DISTRIBUIÇÃO       0
PREÇO MÍNIMO DISTRIBUIÇÃO        0
PREÇO MÁXIMO DISTRIBUIÇÃO        0
COEF DE VARIAÇÃO DISTRIBUIÇÃO    0
MÊS                              0
ANO                              0
dtype: int64

In [6]:
df.isna().sum()

Unnamed: 0                       0
DATA INICIAL                     0
DATA FINAL                       0
REGIÃO                           0
ESTADO                           0
PRODUTO                          0
NÚMERO DE POSTOS PESQUISADOS     0
UNIDADE DE MEDIDA                0
PREÇO MÉDIO REVENDA              0
DESVIO PADRÃO REVENDA            0
PREÇO MÍNIMO REVENDA             0
PREÇO MÁXIMO REVENDA             0
MARGEM MÉDIA REVENDA             0
COEF DE VARIAÇÃO REVENDA         0
PREÇO MÉDIO DISTRIBUIÇÃO         0
DESVIO PADRÃO DISTRIBUIÇÃO       0
PREÇO MÍNIMO DISTRIBUIÇÃO        0
PREÇO MÁXIMO DISTRIBUIÇÃO        0
COEF DE VARIAÇÃO DISTRIBUIÇÃO    0
MÊS                              0
ANO                              0
dtype: int64

## Cleaning the Data

In [7]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [8]:
# Check that column numbers are equal
print (len(list(df.columns.values)))
print (len(consts.new_column_values))

20
20


In [9]:
df.columns = consts.new_column_values

In [10]:
df.head()

Unnamed: 0,First Day of Week,Last Day of Week,Macro Region,State,Type of Product,Number of Stations,Unit of Measurement,Mean Market Value,Std Dev,Min Price Observed,Max Price Observed,Mean Price Margin,Variation Coefficient,Mean Distribution Price,Distribution Standard Deviation,Distribution Min Price,Distribution Max Price,Distribution Variation Coefficient,Month,Year
0,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,127,R$/l,1.288,0.016,1.19,1.35,0.463,0.012,0.825,0.11,0.4201,0.9666,0.133,5,2004
1,2004-05-09,2004-05-15,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,387,R$/l,1.162,0.114,0.89,1.449,0.399,0.098,0.763,0.088,0.5013,1.05,0.115,5,2004
2,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO,ETANOL HIDRATADO,192,R$/l,1.389,0.097,1.18,1.76,0.419,0.07,0.97,0.095,0.5614,1.161,0.098,5,2004
3,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO DO SUL,ETANOL HIDRATADO,162,R$/l,1.262,0.07,1.09,1.509,0.432,0.055,0.83,0.119,0.5991,1.22242,0.143,5,2004
4,2004-05-09,2004-05-15,NORDESTE,ALAGOAS,ETANOL HIDRATADO,103,R$/l,1.181,0.078,1.05,1.4,0.24,0.066,0.941,0.077,0.7441,1.0317,0.082,5,2004


In [11]:
for meas in list(df['Unit of Measurement'].unique()):
    print (f"Number of {meas}: {len(df[df['Unit of Measurement'] == meas])}")

Number of R$/l: 72603
Number of R$/13Kg: 21186
Number of R$/m3: 13034


In [12]:
print (df.groupby(['Macro Region', 'Type of Product', 'Unit of Measurement']).count()['State'].sum())
print (len(df))

106823
106823


In [13]:
df.groupby(['Macro Region', 'Type of Product', 'Unit of Measurement']).count()['State']

Macro Region  Type of Product   Unit of Measurement
CENTRO OESTE  ETANOL HIDRATADO  R$/l                   3140
              GASOLINA COMUM    R$/l                   3140
              GLP               R$/13Kg                3137
              GNV               R$/m3                  1523
              ÓLEO DIESEL       R$/l                   3140
              ÓLEO DIESEL S10   R$/l                   1349
NORDESTE      ETANOL HIDRATADO  R$/l                   7064
              GASOLINA COMUM    R$/l                   7065
              GLP               R$/13Kg                7061
              GNV               R$/m3                  5575
              ÓLEO DIESEL       R$/l                   7065
              ÓLEO DIESEL S10   R$/l                   3039
NORTE         ETANOL HIDRATADO  R$/l                   5404
              GASOLINA COMUM    R$/l                   5494
              GLP               R$/13Kg                5494
              GNV               R$/m3           

From the above information, we can conclude that certain types of gasoline have their own units of measurement.

In [14]:
print (df.iloc[0]['First Day of Week'])
print (type(df.iloc[0]['First Day of Week']))

2004-05-09
<class 'str'>


We will need to use datetime arithemetic to obtain the number of weeks since the first day that the data was collected.

In [15]:
# first_day is given a value of the 8th to make sure that (# of days) % 7 = 0.
first_day = pd.to_datetime('2004-05-08')
df['Weeks Since First Day'] = (pd.to_datetime(df['Last Day of Week']) - first_day).apply(lambda x: x / np.timedelta64(1, 'W')).astype('int')

In [16]:
df

Unnamed: 0,First Day of Week,Last Day of Week,Macro Region,State,Type of Product,Number of Stations,Unit of Measurement,Mean Market Value,Std Dev,Min Price Observed,...,Mean Price Margin,Variation Coefficient,Mean Distribution Price,Distribution Standard Deviation,Distribution Min Price,Distribution Max Price,Distribution Variation Coefficient,Month,Year,Weeks Since First Day
0,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,127,R$/l,1.288,0.016,1.190,...,0.463,0.012,0.825,0.11,0.4201,0.9666,0.133,5,2004,1
1,2004-05-09,2004-05-15,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,387,R$/l,1.162,0.114,0.890,...,0.399,0.098,0.763,0.088,0.5013,1.05,0.115,5,2004,1
2,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO,ETANOL HIDRATADO,192,R$/l,1.389,0.097,1.180,...,0.419,0.070,0.97,0.095,0.5614,1.161,0.098,5,2004,1
3,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO DO SUL,ETANOL HIDRATADO,162,R$/l,1.262,0.070,1.090,...,0.432,0.055,0.83,0.119,0.5991,1.22242,0.143,5,2004,1
4,2004-05-09,2004-05-15,NORDESTE,ALAGOAS,ETANOL HIDRATADO,103,R$/l,1.181,0.078,1.050,...,0.24,0.066,0.941,0.077,0.7441,1.0317,0.082,5,2004,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106818,2019-06-23,2019-06-29,NORDESTE,RIO GRANDE DO NORTE,GNV,7,R$/m3,3.574,0.065,3.499,...,0.818,0.018,2.756,0,2.756,2.756,0,6,2019,790
106819,2019-06-23,2019-06-29,SUL,RIO GRANDE DO SUL,GNV,23,R$/m3,3.401,0.129,3.230,...,0.95,0.038,2.451,0.402,1.9842,2.8661,0.164,6,2019,790
106820,2019-06-23,2019-06-29,SUL,SANTA CATARINA,GNV,24,R$/m3,2.912,0.190,2.699,...,0.914,0.065,1.998,0,1.9981,1.9981,0,6,2019,790
106821,2019-06-23,2019-06-29,SUDESTE,SAO PAULO,GNV,52,R$/m3,3.020,0.229,2.699,...,0.646,0.076,2.374,0.165,2.0179,2.5093,0.07,6,2019,790


## Adding Population Information

Finding information for each state per year proved difficult, however, population data that was collected every ten years was available. We proceed by adding population information to the dataframe.

In [17]:
br_pops = pd.read_html('https://en.wikipedia.org/wiki/List_of_Brazilian_states_by_population')
pop_df = br_pops[0]
pop_df.head()

Unnamed: 0,Rank,State,Population (2020)[2],Population (2010)[3],Population (2000)[4],Population approximately equal,"Percent of the total Brazil population, 2020[note 1]"
0,1.0,São Paulo,46289333,41262199,37032403,Argentina,21.9%
1,2.0,Minas Gerais,21292666,19597330,17891494,Sri Lanka,10.1%
2,3.0,Rio de Janeiro,17366189,15989929,14391282,Netherlands,8.2%
3,4.0,Bahia,14930634,14016906,13070250,Cambodia,7.1%
4,5.0,Paraná,11516840,10444526,9569458,Belgium,5.4%


In [18]:
pop_df.drop(['Rank', 'Population approximately equal'], axis = 1, inplace = True)

In [19]:
pop_df.rename(columns = {
    'Population (2020)[2]': '2020 Population',
    'Population (2010)[3]': '2010 Population',
    'Population (2000)[4]': '2000 Population',
    'Percent of the total Brazil population, 2020[note 1]': 'Percent of Total Population in 2020'
}, inplace = True)

In [20]:
pop_df

Unnamed: 0,State,2020 Population,2010 Population,2000 Population,Percent of Total Population in 2020
0,São Paulo,46289333,41262199,37032403,21.9%
1,Minas Gerais,21292666,19597330,17891494,10.1%
2,Rio de Janeiro,17366189,15989929,14391282,8.2%
3,Bahia,14930634,14016906,13070250,7.1%
4,Paraná,11516840,10444526,9569458,5.4%
5,Rio Grande do Sul,11422973,10693929,10187798,5.4%
6,Pernambuco,9616621,8796448,7918344,4.5%
7,Ceará,9187103,8452381,7430661,4.3%
8,Pará,8690745,7581051,6192307,4.1%
9,Santa Catarina,7252502,6248436,5356360,3.4%


In [21]:
print (pop_df.columns[1].split()[0])
print (type(pop_df.columns[1].split()[0]))

2020
<class 'str'>


In [22]:
df['Year']

0         2004
1         2004
2         2004
3         2004
4         2004
          ... 
106818    2019
106819    2019
106820    2019
106821    2019
106822    2019
Name: Year, Length: 106823, dtype: int64

In [23]:
# Gives a list of state names without accents and in upper case.
# This will make it easier to merge the dataframes on state name.

state_list = []

for state in pop_df['State']:
    state_list.append(uni.unidecode(state).upper())

state_list.pop() # Removes the unnecessary 'Total' row.

'TOTAL'

In [24]:
# Remove the 'Total' row.
pop_df.drop(27, inplace = True)

In [25]:
pop_df

Unnamed: 0,State,2020 Population,2010 Population,2000 Population,Percent of Total Population in 2020
0,São Paulo,46289333,41262199,37032403,21.9%
1,Minas Gerais,21292666,19597330,17891494,10.1%
2,Rio de Janeiro,17366189,15989929,14391282,8.2%
3,Bahia,14930634,14016906,13070250,7.1%
4,Paraná,11516840,10444526,9569458,5.4%
5,Rio Grande do Sul,11422973,10693929,10187798,5.4%
6,Pernambuco,9616621,8796448,7918344,4.5%
7,Ceará,9187103,8452381,7430661,4.3%
8,Pará,8690745,7581051,6192307,4.1%
9,Santa Catarina,7252502,6248436,5356360,3.4%


In [26]:
# Because the states in state_list appear in the same order as State in the dataframe, we do not need to worry
# about the order of states being misplaced.

pop_df.drop('State', axis = 1, inplace = True)
pop_df['State'] = state_list
pop_df

Unnamed: 0,2020 Population,2010 Population,2000 Population,Percent of Total Population in 2020,State
0,46289333,41262199,37032403,21.9%,SAO PAULO
1,21292666,19597330,17891494,10.1%,MINAS GERAIS
2,17366189,15989929,14391282,8.2%,RIO DE JANEIRO
3,14930634,14016906,13070250,7.1%,BAHIA
4,11516840,10444526,9569458,5.4%,PARANA
5,11422973,10693929,10187798,5.4%,RIO GRANDE DO SUL
6,9616621,8796448,7918344,4.5%,PERNAMBUCO
7,9187103,8452381,7430661,4.3%,CEARA
8,8690745,7581051,6192307,4.1%,PARA
9,7252502,6248436,5356360,3.4%,SANTA CATARINA


In [27]:
df.head()

Unnamed: 0,First Day of Week,Last Day of Week,Macro Region,State,Type of Product,Number of Stations,Unit of Measurement,Mean Market Value,Std Dev,Min Price Observed,...,Mean Price Margin,Variation Coefficient,Mean Distribution Price,Distribution Standard Deviation,Distribution Min Price,Distribution Max Price,Distribution Variation Coefficient,Month,Year,Weeks Since First Day
0,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,127,R$/l,1.288,0.016,1.19,...,0.463,0.012,0.825,0.11,0.4201,0.9666,0.133,5,2004,1
1,2004-05-09,2004-05-15,CENTRO OESTE,GOIAS,ETANOL HIDRATADO,387,R$/l,1.162,0.114,0.89,...,0.399,0.098,0.763,0.088,0.5013,1.05,0.115,5,2004,1
2,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO,ETANOL HIDRATADO,192,R$/l,1.389,0.097,1.18,...,0.419,0.07,0.97,0.095,0.5614,1.161,0.098,5,2004,1
3,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO DO SUL,ETANOL HIDRATADO,162,R$/l,1.262,0.07,1.09,...,0.432,0.055,0.83,0.119,0.5991,1.22242,0.143,5,2004,1
4,2004-05-09,2004-05-15,NORDESTE,ALAGOAS,ETANOL HIDRATADO,103,R$/l,1.181,0.078,1.05,...,0.24,0.066,0.941,0.077,0.7441,1.0317,0.082,5,2004,1


In [28]:
m_df = df.merge(pop_df, on = 'State')

In [29]:
m_df.head(10)

Unnamed: 0,First Day of Week,Last Day of Week,Macro Region,State,Type of Product,Number of Stations,Unit of Measurement,Mean Market Value,Std Dev,Min Price Observed,...,Distribution Min Price,Distribution Max Price,Distribution Variation Coefficient,Month,Year,Weeks Since First Day,2020 Population,2010 Population,2000 Population,Percent of Total Population in 2020
0,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,127,R$/l,1.288,0.016,1.19,...,0.4201,0.9666,0.133,5,2004,1,3055149,2570160,2051146,1.4%
1,2004-05-16,2004-05-22,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,144,R$/l,1.271,0.039,1.06,...,0.4094,1.1931,0.135,5,2004,2,3055149,2570160,2051146,1.4%
2,2004-05-23,2004-05-29,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,129,R$/l,1.282,0.024,1.17,...,0.3879,1.0336,0.167,5,2004,3,3055149,2570160,2051146,1.4%
3,2004-05-30,2004-06-05,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,144,R$/l,1.373,0.051,1.18,...,0.4094,1.4206,0.164,5,2004,4,3055149,2570160,2051146,1.4%
4,2004-06-06,2004-06-12,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,129,R$/l,1.373,0.059,1.09,...,0.5169,1.115,0.131,6,2004,5,3055149,2570160,2051146,1.4%
5,2004-06-13,2004-06-19,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,144,R$/l,1.355,0.069,1.09,...,0.4094,1.2331,0.161,6,2004,6,3055149,2570160,2051146,1.4%
6,2004-06-20,2004-06-26,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,130,R$/l,1.442,0.02,1.36,...,0.4631,1.1114,0.15,6,2004,7,3055149,2570160,2051146,1.4%
7,2004-06-27,2004-07-03,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,145,R$/l,1.433,0.037,1.24,...,0.4094,1.345,0.16,6,2004,8,3055149,2570160,2051146,1.4%
8,2004-07-04,2004-07-10,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,130,R$/l,1.412,0.087,1.05,...,0.4631,1.1073,0.128,7,2004,9,3055149,2570160,2051146,1.4%
9,2004-07-11,2004-07-17,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,144,R$/l,1.293,0.111,0.99,...,0.508,1.4089,0.144,7,2004,10,3055149,2570160,2051146,1.4%


In [30]:
len(m_df)

106823

In [31]:
# for i in range(len(m_df)):
# #     print (m_df.loc[i, 'Year'])
#     if m_df.loc[i, 'Year'] < 2010:
#         m_df.loc[i, 'Population'] = m_df.loc[i, '2000 Population']
        
#     if (m_df.loc[i, 'Year'] >= 2010 & m_df.loc[i, 'Year'] <= 2018):
#         m_df.loc[i, 'Population'] = m_df.loc[i, '2010 Population']
        
#     if m_df.loc[i, 'Year'] == 2019:
#         m_df.loc[i, 'Population'] = m_df['2020 Population']

In [32]:
m_df.loc[0, 'Year']

2004

In [33]:
pops = []

for i in range(len(m_df)):
    
    if m_df.loc[i, 'Year'] < 2010:
        pops.append(m_df.loc[i, '2000 Population'])
        
    elif (m_df.loc[i, 'Year'] >= 2010 and m_df.loc[i, 'Year'] <= 2018):
        pops.append(m_df.loc[i, '2010 Population'])
        
    else:
        pops.append(m_df.loc[i, '2020 Population'])
        
pops  

#     if m_df.loc[i, 'Year'] == 2019:
#         pops.append(m_df.loc[i, '2020 Population'])

[2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,
 2051146,


In [34]:
len(pops)

106823

In [35]:
m_df.loc[95000]

First Day of Week                      2005-06-26
Last Day of Week                       2005-07-02
Macro Region                                  SUL
State                                      PARANA
Type of Product                               GLP
Number of Stations                            553
Unit of Measurement                       R$/13Kg
Mean Market Value                          29.962
Std Dev                                     1.919
Min Price Observed                           25.0
Max Price Observed                           37.0
Mean Price Margin                           4.727
Variation Coefficient                       0.064
Mean Distribution Price                    25.235
Distribution Standard Deviation             2.264
Distribution Min Price                         20
Distribution Max Price                      31.56
Distribution Variation Coefficient           0.09
Month                                           6
Year                                         2005


In [36]:
m_df['Population'] = pops

In [37]:
m_df[m_df['Year'] == 2019]

Unnamed: 0,First Day of Week,Last Day of Week,Macro Region,State,Type of Product,Number of Stations,Unit of Measurement,Mean Market Value,Std Dev,Min Price Observed,...,Distribution Max Price,Distribution Variation Coefficient,Month,Year,Weeks Since First Day,2020 Population,2010 Population,2000 Population,Percent of Total Population in 2020,Population
3354,2019-01-06,2019-01-12,CENTRO OESTE,DISTRITO FEDERAL,ETANOL HIDRATADO,43,R$/l,3.270,0.091,3.109,...,3.15,0.033,1,2019,766,3055149,2570160,2051146,1.4%,3055149
3355,2019-01-06,2019-01-12,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL,24,R$/l,3.684,0.165,3.370,...,3.255,0.045,1,2019,766,3055149,2570160,2051146,1.4%,3055149
3356,2019-01-06,2019-01-12,CENTRO OESTE,DISTRITO FEDERAL,ÓLEO DIESEL S10,39,R$/l,3.784,0.149,3.470,...,3.3796,0.026,1,2019,766,3055149,2570160,2051146,1.4%,3055149
3357,2019-01-06,2019-01-12,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,47,R$/l,4.040,0.139,3.789,...,3.735,0.008,1,2019,766,3055149,2570160,2051146,1.4%,3055149
3358,2019-01-06,2019-01-12,CENTRO OESTE,DISTRITO FEDERAL,GLP,42,R$/13Kg,73.095,4.933,65.000,...,60.75,0.095,1,2019,766,3055149,2570160,2051146,1.4%,3055149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106818,2019-06-23,2019-06-29,SUL,SANTA CATARINA,ÓLEO DIESEL,96,R$/l,3.504,0.125,3.199,...,3.2038,0.03,6,2019,790,7252502,6248436,5356360,3.4%,7252502
106819,2019-06-23,2019-06-29,SUL,SANTA CATARINA,ÓLEO DIESEL S10,206,R$/l,3.594,0.150,3.259,...,3.3915,0.037,6,2019,790,7252502,6248436,5356360,3.4%,7252502
106820,2019-06-23,2019-06-29,SUL,SANTA CATARINA,GASOLINA COMUM,252,R$/l,4.087,0.173,3.770,...,3.998,0.022,6,2019,790,7252502,6248436,5356360,3.4%,7252502
106821,2019-06-23,2019-06-29,SUL,SANTA CATARINA,GLP,100,R$/13Kg,69.977,5.119,55.000,...,61.5,0.065,6,2019,790,7252502,6248436,5356360,3.4%,7252502


In [38]:
m_df.to_csv(path_or_buf = '../../data/gas_prices_brazil/brazil_gas_cleaned.csv')