# Caipora Project

__Objetivo__




 
__Data Source__

https://queimadas.dgi.inpe.br/queimadas/portal

https://ipsamazonia.org.br/

https://openaq.org/


__Data characteristics__

- Time Series;
- Geographic coordinates – Latitude/Longitude;
- Satellite Name

In [1]:
import os
import math
import unicodedata

import glob

import datetime
import gmaps

from pathlib import Path

import numpy as np
import pandas as pd

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

## Constants

In [2]:
WORKDIR = os.path.abspath(os.getcwd())

YEARS_AVAILABLE = [2014, 2018, 2021]

BRAZILIAN_STATES = [
    { "acronym": "AC", "name": "Acre" },
    { "acronym": "AL", "name": "Alagoas" },
    { "acronym": "AP", "name": "Amapá" },
    { "acronym": "AM", "name": "Amazonas" },
    { "acronym": "BA", "name": "Bahia" },
    { "acronym": "CE", "name": "Ceará" },
    { "acronym": "DF", "name": "Distrito Federal" },
    { "acronym": "ES", "name": "Espírito Santo" },
    { "acronym": "GO", "name": "Goiás" },
    { "acronym": "MA", "name": "Maranhão" },
    { "acronym": "MT", "name": "Mato Grosso" },
    { "acronym": "MS", "name": "Mato Grosso do Sul" },
    { "acronym": "MG", "name": "Minas Gerais" },
    { "acronym": "PA", "name": "Pará" },
    { "acronym": "PB", "name": "Paraíba" },
    { "acronym": "PR", "name": "Paraná" },
    { "acronym": "PE", "name": "Pernambuco" },
    { "acronym": "PI", "name": "Piauí" },
    { "acronym": "RJ", "name": "Rio de Janeiro" },
    { "acronym": "RN", "name": "Rio Grande do Norte" },
    { "acronym": "RS", "name": "Rio Grande do Sul" },
    { "acronym": "RO", "name": "Rondônia" },
    { "acronym": "RR", "name": "Roraima" },
    { "acronym": "SC", "name": "Santa Catarina" },
    { "acronym": "SP", "name": "São Paulo" },
    { "acronym": "SE", "name": "Sergipe" },
    { "acronym": "TO", "name": "Tocantins" }
]

## Get the data

### Hotspot data

__List files used in the analysis__

In [3]:
path = ''.join([WORKDIR, "/data/hotspot/**/*"]) 
hotspot_files = glob.glob(os.path.join(path, "*.csv"))

__Load into Pandas DataFrame__

In [4]:
hotspot_df = pd.concat(map(pd.read_csv, hotspot_files))
hotspot_df.head()

Unnamed: 0,datahora,satelite,pais,estado,municipio,bioma,diasemchuva,precipitacao,riscofogo,latitude,longitude,frp
0,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,CURURUPU,Amazonia,0.0,0.8,0.0,-1.87136,-44.78587,
1,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,MARACACUME,Amazonia,0.0,0.1,0.1,-1.82566,-45.8867,
2,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,BURITICUPU,Amazonia,0.0,1.1,0.1,-4.57874,-46.3866,
3,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,PAULO RAMOS,Amazonia,0.0,1.4,0.1,-4.59554,-45.66039,
4,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,ARAME,Amazonia,0.0,0.4,0.3,-5.2196,-46.12886,


### Amazon SPI data

![Amazon SPI Content Table](doc/images/table_contents_spi.png "Amazon SPI Content Table")

__List files used in the analysis__

In [5]:
path = ''.join([WORKDIR, "/data/spi/amazonia/detailed"])
spi_files = glob.glob(os.path.join(path, "*.csv"))

__Load into Pandas DataFrame__

In [6]:
spi_df = pd.concat(map(pd.read_csv, spi_files))
spi_df.head()

Unnamed: 0,Ano,Código IBGE,Município,Estado,IPS Amazônia,Ranking IPS,Necessidades Humanas Básicas,Fundamentos para o Bem-Estar,Oportunidades,Nutrição e cuidados médicos básicos,...,Gravidez na infância e adolescência,Trabalho Infantil,Vulnerabilidade familiar,Violência contra indígenas,Violência contra indígenas Taxa,Violência contra mulheres,Violência infantil,Violência infantil Taxa,Empregos ensino superior,Mulheres com empregos ensino superior
0,2014,1100015.0,Alta Floresta D'Oeste,RO,56.59417,197.0,60.195511,57.36934,52.217659,95.154708,...,20.234604,110.12616,18.768328,0.0,1.0,25.768768,0.0,1.0,11.266178,6.70513
1,2014,1100023.0,Ariquemes,RO,55.728511,264.0,59.059533,59.304304,48.821695,93.920755,...,23.497268,240.966315,22.890103,0.0,1.0,58.752026,62.994606,3.0,20.746646,12.852421
2,2014,1100031.0,Cabixi,RO,58.915724,90.0,74.178132,54.787265,47.781775,88.092664,...,16.0,110.304534,38.666667,0.0,1.0,34.55425,71.326676,3.0,14.788294,9.806974
3,2014,1100049.0,Cacoal,RO,61.717654,28.0,72.691093,60.613765,51.848105,91.956107,...,16.985463,55.112064,17.521041,0.681663,2.0,26.380795,20.894275,2.0,27.450437,16.058968
4,2014,1100056.0,Cerejeiras,RO,54.593926,361.0,62.606094,62.512401,38.663282,87.810163,...,21.2,313.969533,40.4,0.0,1.0,270.970782,51.813472,3.0,14.156443,8.937989


## Explore the data

### Describe the data I

__Hotspot__

In [7]:
hotspot_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diasemchuva,16943121.0,13.348092,65.530664,-999.0,2.0,5.0,20.0,120.0
precipitacao,16943121.0,0.859245,3.598378,0.0,0.0,0.0,0.1,203.7
riscofogo,16943121.0,-5.579153,79.406999,-999.0,0.6,1.0,1.0,1.0
latitude,19706174.0,-8.260597,4.323302,-18.039,-11.07466,-8.680406,-5.3,5.23
longitude,19706174.0,-53.774619,6.601975,-73.93146,-58.207,-53.091,-48.23736,-41.8
frp,5140138.0,19.285664,56.710458,-3.7,3.3,7.4,17.1,9722.6


__Amazon SPI__

In [8]:
spi_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Ano,2319.0,2.017667e+03,2.868060e+00,2.014000e+03,2.014000e+03,2.018000e+03,2.021000e+03,2.021000e+03
Código IBGE,2316.0,2.288414e+06,1.365403e+06,1.100015e+06,1.502798e+06,1.713254e+06,2.109278e+06,5.108956e+06
IPS Amazônia,2316.0,5.456015e+01,3.811311e+00,4.359790e+01,5.192879e+01,5.430994e+01,5.680819e+01,7.441633e+01
Ranking IPS,2316.0,3.865000e+02,2.229051e+02,1.000000e+00,1.937500e+02,3.865000e+02,5.792500e+02,7.720000e+02
Necessidades Humanas Básicas,2319.0,6.437410e+01,6.825254e+00,4.337849e+01,5.969696e+01,6.432076e+01,6.887852e+01,8.712833e+01
...,...,...,...,...,...,...,...,...
Violência contra mulheres,2316.0,6.649387e+01,1.025700e+02,0.000000e+00,3.697673e+00,2.414303e+01,8.353261e+01,6.320000e+02
Violência infantil,2316.0,4.888766e+01,1.090521e+02,0.000000e+00,0.000000e+00,1.426542e+01,5.048753e+01,2.606975e+03
Violência infantil Taxa,2316.0,1.982729e+00,1.017622e+00,1.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,5.000000e+00
Empregos ensino superior,2316.0,1.666922e+01,1.302646e+01,0.000000e+00,9.823585e+00,1.438967e+01,2.013205e+01,1.516212e+02


### Get information about data

__Hotspot__

In [9]:
hotspot_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19706174 entries, 0 to 318737
Data columns (total 12 columns):
 #   Column        Dtype  
---  ------        -----  
 0   datahora      object 
 1   satelite      object 
 2   pais          object 
 3   estado        object 
 4   municipio     object 
 5   bioma         object 
 6   diasemchuva   float64
 7   precipitacao  float64
 8   riscofogo     float64
 9   latitude      float64
 10  longitude     float64
 11  frp           float64
dtypes: float64(6), object(6)
memory usage: 1.9+ GB


__Amazon SPI__

In [10]:
spi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 772
Data columns (total 70 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Ano                                            2319 non-null   int64  
 1   Código IBGE                                    2316 non-null   float64
 2   Município                                      2316 non-null   object 
 3   Estado                                         2316 non-null   object 
 4   IPS Amazônia                                   2316 non-null   float64
 5   Ranking IPS                                    2316 non-null   float64
 6   Necessidades Humanas Básicas                   2319 non-null   float64
 7   Fundamentos para o Bem-Estar                   2319 non-null   float64
 8   Oportunidades                                  2319 non-null   float64
 9   Nutrição e cuidados médicos básicos            2319 n

### Percentage of missing data

In [11]:
def get_pct_missing_data(dataset):
    """
    Get Percentage of missing data
    
    Attributes
    ----------
    dataset : Pandas DataFrame
    """
    total = dataset.isnull().sum().sort_values(ascending=False)
    percent = dataset.isnull().sum() / dataset.isnull().count() * 100
    percent = (round(percent, 4)).sort_values(ascending=False)

    missing_data = pd.concat([total, percent], keys=["Total", '%'], axis=1)

    return missing_data

__Hotspot__

In [12]:
get_pct_missing_data(hotspot_df)

Unnamed: 0,Total,%
frp,14566036,73.9161
diasemchuva,2763053,14.0213
precipitacao,2763053,14.0213
riscofogo,2763053,14.0213
datahora,0,0.0
satelite,0,0.0
pais,0,0.0
estado,0,0.0
municipio,0,0.0
bioma,0,0.0


__Amazon SPI__

In [13]:
get_pct_missing_data(spi_df)

Unnamed: 0,Total,%
Homicídios,3,0.1294
Densidade telefonia fixa,3,0.1294
Mortalidade por doenças respiratórias,3,0.1294
Mortalidade por doenças circulatórias,3,0.1294
Mortalidade por câncer,3,0.1294
...,...,...
Qualidade do meio ambiente,0,0.0000
Liberdade individual e de escolha,0,0.0000
Tolerância e inclusão,0,0.0000
Acesso à educação superior,0,0.0000


## Prepare the data

### Standardize Country State Names

The feature __estado__ in the two datasets are diffenrents. In _hotspot_df_ the name of state is long and spi_df is short.

__Amazon SPI__

In [14]:
def get_full_name_state(acronym):
    """
    Retrieves the long name of state related to acronym.
    """
    lnames = [x.get('name') for x in BRAZILIAN_STATES if x.get('acronym') == acronym]
    return lnames[0] if lnames else np.nan


def purge_spec_chars(word):
    """
    Remove all special characters in the word.
    """
    wv = word if word is not np.nan else ""
    nfkd_form = unicodedata.normalize("NFKD", wv) 
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

In [15]:
# Get full name of state
spi_df["Estado"] = spi_df["Estado"].apply(get_full_name_state)

# Remove special characteres
spi_df["Estado"] = spi_df["Estado"].apply(purge_spec_chars)

# Capitalize the name
spi_df["Estado"] = spi_df["Estado"].apply(lambda x: x.upper())

### Standardize Cities Names

The feature __municipio__ in the two datasets are diffenrents.

In [16]:
# Remove special characteres
spi_df["Município"] = spi_df["Município"].apply(purge_spec_chars)

# Capitalize the name
spi_df["Município"] = spi_df["Município"].apply(lambda x: x.upper())

#### Some city names are written incorrectly. In this section I will fix it with the correct form

__hotspot__

In [17]:
hotspot_df.loc[(hotspot_df["municipio"] == "ELDORADO DO CARAJAS"), "municipio"] = "ELDORADO DOS CARAJAS"

__Amazon SPI__

In [18]:
# spi_df.loc[spi_df["Município"] == "BARREIRINHA", "Município"] = "BARREIRINHAS"

# Acre
spi_df.loc[spi_df["Município"] == "RIO BRANCO (ACRE)", "Município"] = "RIO BRANCO"

# Maranhão
spi_df.loc[spi_df["Município"] == "ARAGUANA (MARANHAO)", "Município"] = "ARAGUANA"
spi_df.loc[spi_df["Município"] == "PRESIDENTE MEDICI (MARANHAO)", "Município"] = "PRESIDENTE MEDICI"

# Mato Grosso
spi_df.loc[spi_df["Município"] == "POXOREO", "Município"] = "POXOREU"
spi_df.loc[spi_df["Município"] == "RIO BRANCO (MATO GROSSO)", "Município"] = "RIO BRANCO"

# Rondonia
spi_df.loc[spi_df["Município"] == "PRESIDENTE MEDICI (RONDONIA)", "Município"] = "PRESIDENTE MEDICI"

# Pará
spi_df.loc[spi_df["Município"] == "BOM JESUS DO TOCANTINS (PARA)", "Município"] = "BOM JESUS DO TOCANTINS"
spi_df.loc[spi_df["Município"] == "SANTA ISABEL DO PARA", "Município"] = "SANTA IZABEL DO PARA"

# Tocantins
spi_df.loc[spi_df["Município"] == "ARAGUANA (TOCANTINS)", "Município"] = "ARAGUANA"
spi_df.loc[spi_df["Município"] == "BOM JESUS DO TOCANTINS (TOCANTINS)", "Município"] = "BOM JESUS DO TOCANTINS"
spi_df.loc[spi_df["Município"] == "COUTO DE MAGALHAES", "Município"] = "COUTO MAGALHAES"
spi_df.loc[spi_df["Município"] == "SAO VALERIO DA NATIVIDADE", "Município"] = "SAO VALERIO"

#### Shows the difference between the number of cities in the two datasets

In [19]:
hotspot_regions = hotspot_df.groupby(['municipio', 'estado'])
spi_region = spi_df.groupby(['Município', 'Estado'])

n_hr = len(hotspot_regions)
n_sr = len(spi_region)

dic_cities = set(hotspot_regions.groups.keys()) ^ set(spi_region.groups.keys())

print("hotspot_df has {} cities and spi_df has {}. The Difference is {}.".format(n_hr, n_sr, (n_hr - n_sr)))
print("---------------------------------------------------------------------------------------------------")
print("Cities not found:\n{}".format(dic_cities))

hotspot_df has 808 cities and spi_df has 773. The Difference is 35.
---------------------------------------------------------------------------------------------------
Cities not found:
{('ANAPURUS', 'MARANHAO'), ('AGUA DOCE DO MARANHAO', 'MARANHAO'), ('TUTOIA', 'MARANHAO'), ('CHAPADINHA', 'MARANHAO'), ('SUCUPIRA DO RIACHAO', 'MARANHAO'), ('', ''), ('SANTANA DO MARANHAO', 'MARANHAO'), ('NINA RODRIGUES', 'MARANHAO'), ('SANTA QUITERIA DO MARANHAO', 'MARANHAO'), ('BELAGUA', 'MARANHAO'), ('PARNARAMA', 'MARANHAO'), ('PASSAGEM FRANCA', 'MARANHAO'), ('COELHO NETO', 'MARANHAO'), ('BARAO DE GRAJAU', 'MARANHAO'), ('ALDEIAS ALTAS', 'MARANHAO'), ('SAO BERNARDO', 'MARANHAO'), ('SAO FRANCISCO DO MARANHAO', 'MARANHAO'), ('BURITI', 'MARANHAO'), ('URBANO SANTOS', 'MARANHAO'), ('TIMON', 'MARANHAO'), ('HUMBERTO DE CAMPOS', 'MARANHAO'), ('MATOES', 'MARANHAO'), ('PRIMEIRA CRUZ', 'MARANHAO'), ('PAULINO NEVES', 'MARANHAO'), ('AFONSO CUNHA', 'MARANHAO'), ('BREJO', 'MARANHAO'), ('MATA ROMA', 'MARANHAO'), ('SAO

In [None]:
dic_cities_df = pd.DataFrame(dic_cities, columns=["municipio", "estado"])
dic_cities_df = dic_cities_df.dropna()

for index, row in dic_cities_df.iterrows():
    hotspot_df = hotspot_df.drop(hotspot_df[(hotspot_df.municipio == row.municipio) & (hotspot_df.estado == row.estado)].index)

### Parse datehour of string to date

__Hotspot__

In [None]:
hotspot_df['datahora'] = pd.to_datetime(hotspot_df['datahora'], format='%Y/%m/%d %H:%M:%S')

### Add features

__Hotspot__

In [None]:
hotspot_df["ano"] = pd.DatetimeIndex(hotspot_df['datahora']).year
# hotspot_df["mes"] = pd.DatetimeIndex(hotspot_df['datahora']).month
# hotspot_df["dia"] = pd.DatetimeIndex(hotspot_df['datahora']).day

__Amazon SPI__

In [None]:
mean_spi = spi_df.groupby("Ano")["IPS Amazônia"].transform("mean").rename("Media IPS")
spi_df = pd.concat([spi_df, mean_spi], axis=1)

### Fix or remove outliers (optional)

__Hotspot__

In [None]:
hotspot_df.loc[hotspot_df["riscofogo"] < 0, "riscofogo"] = 0
hotspot_df.loc[hotspot_df["diasemchuva"] < 0, "diasemchuva"] = 0
hotspot_df.loc[hotspot_df["frp"] < 0, "frp"] = 0

__Amazon SPI__

In [None]:
spi_df.dropna(subset=["Município", "Estado", "IPS Amazônia"], inplace=True)

### Fill in missing values (e.g., with zero, mean, median...) or drop their rows (or columns)

__Hotspot__

In [None]:
hotspot_df["frp"].fillna(value=0, inplace=True)
hotspot_df["riscofogo"].fillna(value=0, inplace=True)
hotspot_df["diasemchuva"].fillna(value=0, inplace=True)
hotspot_df["precipitacao"].fillna(value=0, inplace=True)

### Drop columns not used

__Hotspot__

In [None]:
hotspot_df = hotspot_df.drop(columns=["datahora", "satelite", "pais", "bioma", "latitude", "longitude"])
hotspot_df.head()

__Amazon SPI__

In [None]:
spi_df.drop("Código IBGE", axis=1, inplace=True)

# spi_df = spi_df[[
#     "Ano", "Município","Estado","Ranking IPS","IPS Amazônia","Media IPS",
#     "Necessidades Humanas Básicas", "Fundamentos para o Bem-Estar","Oportunidades", # 3 Dimensions
#     "Nutrição e cuidados médicos básicos","Água e saneamento","Moradia","Segurança pessoal", # Necessidades Humanas Básicas components
#     "Acesso ao conhecimento básico", "Acesso à informação e comunicação","Saúde e bem-estar","Qualidade do meio ambiente", # Fundamentos para o Bem-Estar components
#     "Direitos individuais","Liberdade individual e de escolha","Tolerância e inclusão","Acesso à educação superior" # Oportunidades components
# ]]
# spi_df.head()

### Rearrange columns pandas

__Amazon SPI__

In [None]:
column = spi_df.pop('Media IPS')
spi_df.insert(4, 'Media IPS', column)

### Describe the data II

__Hotspot__

In [None]:
hotspot_df.describe().transpose()

In [None]:
get_pct_missing_data(hotspot_df)

__Amazon SPI__

In [None]:
spi_df.describe().transpose()

In [None]:
get_pct_missing_data(spi_df)

### Data Transform

__hotspot__

In [None]:
hotspot_df.loc[hotspot_df.ano < 2014, "ano"] = 2014
hotspot_df.loc[(hotspot_df.ano > 2014) & (hotspot_df.ano < 2018), "ano"] = 2018
hotspot_df.loc[(hotspot_df.ano > 2018) & (hotspot_df.ano < 2021), "ano"] = 2021

#### Create _fococalor_ feature

__fococalor__: Hotspot is any temperature recorded above 47°C. It is not necessarily a fire spot or fire.

In [None]:
hotspot_size = hotspot_df.groupby(["ano", "estado", "municipio"]).size()
hotspot_size = hotspot_size.reset_index(name="fococalor")
hotspot_size

#### Getting anual mean of each cities by years

In [None]:
hotspot_df = hotspot_df.groupby(["ano", "estado", "municipio"]).mean()
hotspot_df = hotspot_df.reset_index()

#### Add fococalor to hotspot_df

In [None]:
hotspot_df["fococalor"] = hotspot_size["fococalor"]

#### Add categorical feature called "riscofogocat" that represents a calculated risk based on formula below:

_formula_

$$ \left(\frac{k_i}{\sum_{i=1}^{N} k} \right) |(d_i - p_i) r_i | * (f_i + 0.01) $$

```
where:

i:index of a row;
k = fococalor;
d = diasemchuva;
p = precipitacao;
r = riscofogo;
f = frp
```






__Categories__
1. __MUITO_ALTO__: greater than equals 0.9
2. __ALTO__: greater than equals 0.7 AND smaller than 0.9
3. __MODERADO__: greater than equals 0.5 AND smaller than 0.7
4. __BAIXO__: greater than equals 0.15 AND smaller than 0.5
5. __MUITO_BAIXO__: smaller than 0.15





In [None]:
def add_riskfirecat(dataset, labels=None):
    for year in YEARS_AVAILABLE:
        ds = dataset[dataset.ano == year]
        
        sum_hotspot = ds.fococalor.sum()
        fire_index = np.abs(np.divide(ds.diasemchuva, (ds.precipitacao + 0.01)) * ds.riscofogo) * (ds.frp + 0.01)
        result = np.multiply(np.divide(ds.fococalor, sum_hotspot), fire_index)
        
        categories = pd.qcut(result, 5, labels=labels)
        dataset.loc[dataset.ano == year, "riscofogocat"] = categories

add_riskfirecat(hotspot_df, ["MUITO_BAIXO", "BAIXO", "MODERADO", "ALTO", "MUITO_ALTO"])

hotspot_df[hotspot_df.ano == 2014].sort_values("riscofogo", ascending=False)

In [None]:
output = pd.merge(hotspot_df, spi_df, left_on=["ano","estado", "municipio"], right_on=["Ano", "Estado", "Município"])

In [None]:
output = output.drop(["Ano", "Município", "Estado"], axis=1)

### Parsing and rounding double values

In [None]:
output = output.round(2)
output = output.round({"diasemchuva":1})
output = output.astype({"Ranking IPS":"int"})

In [None]:
output

### Save data in a csv file

In [None]:
path = Path(f"{WORKDIR}/data/output/hotspot_spi.csv")
         
path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(path, index=False)