# Caipora Project

__Objetivo__




 
__Data Source__

https://queimadas.dgi.inpe.br/queimadas/portal

https://ipsamazonia.org.br/

https://openaq.org/


__Data characteristics__

- Time Series;
- Geographic coordinates – Latitude/Longitude;
- Satellite Name

In [1]:
import os
import math
import unicodedata

import glob

import datetime
import gmaps

from pathlib import Path

import numpy as np
import pandas as pd

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

## Constants

In [2]:
WORKDIR = os.path.abspath(os.getcwd())

BRAZILIAN_STATES = [
    { "acronym": "AC", "name": "Acre" },
    { "acronym": "AL", "name": "Alagoas" },
    { "acronym": "AP", "name": "Amapá" },
    { "acronym": "AM", "name": "Amazonas" },
    { "acronym": "BA", "name": "Bahia" },
    { "acronym": "CE", "name": "Ceará" },
    { "acronym": "DF", "name": "Distrito Federal" },
    { "acronym": "ES", "name": "Espírito Santo" },
    { "acronym": "GO", "name": "Goiás" },
    { "acronym": "MA", "name": "Maranhão" },
    { "acronym": "MT", "name": "Mato Grosso" },
    { "acronym": "MS", "name": "Mato Grosso do Sul" },
    { "acronym": "MG", "name": "Minas Gerais" },
    { "acronym": "PA", "name": "Pará" },
    { "acronym": "PB", "name": "Paraíba" },
    { "acronym": "PR", "name": "Paraná" },
    { "acronym": "PE", "name": "Pernambuco" },
    { "acronym": "PI", "name": "Piauí" },
    { "acronym": "RJ", "name": "Rio de Janeiro" },
    { "acronym": "RN", "name": "Rio Grande do Norte" },
    { "acronym": "RS", "name": "Rio Grande do Sul" },
    { "acronym": "RO", "name": "Rondônia" },
    { "acronym": "RR", "name": "Roraima" },
    { "acronym": "SC", "name": "Santa Catarina" },
    { "acronym": "SP", "name": "São Paulo" },
    { "acronym": "SE", "name": "Sergipe" },
    { "acronym": "TO", "name": "Tocantins" }
]

## Get the data

### Hotspot data

__List files used in the analysis__

In [3]:
path = ''.join([WORKDIR, "/data/hotspot/**/*"]) 
hotspot_files = glob.glob(os.path.join(path, "*.csv"))

__Load into Pandas DataFrame__

In [4]:
hotspot_df = pd.concat(map(pd.read_csv, hotspot_files))
hotspot_df.head()

Unnamed: 0,datahora,satelite,pais,estado,municipio,bioma,diasemchuva,precipitacao,riscofogo,latitude,longitude,frp
0,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,CURURUPU,Amazonia,0.0,0.8,0.0,-1.87136,-44.78587,
1,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,MARACACUME,Amazonia,0.0,0.1,0.1,-1.82566,-45.8867,
2,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,BURITICUPU,Amazonia,0.0,1.1,0.1,-4.57874,-46.3866,
3,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,PAULO RAMOS,Amazonia,0.0,1.4,0.1,-4.59554,-45.66039,
4,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,ARAME,Amazonia,0.0,0.4,0.3,-5.2196,-46.12886,


### Amazon SPI data

__List files used in the analysis__

In [5]:
path = ''.join([WORKDIR, "/data/spi/amazonia/detailed"])
spi_files = glob.glob(os.path.join(path, "*.csv"))

__Load into Pandas DataFrame__

In [6]:
spi_df = pd.concat(map(pd.read_csv, spi_files))
spi_df.head()

Unnamed: 0,Ano,Código IBGE,Município,Estado,IPS Amazônia,Ranking IPS,Necessidades Humanas Básicas,Fundamentos para o Bem-Estar,Oportunidades,Nutrição e cuidados médicos básicos,...,"Desmatamento recente 2019, 2019, 2020 (% área total do município)",Focos de calor 2020 (nª de focos/1.000 habitantes),Diversidade Partidária 2020 (% vereadores eleitos partidos diferentes),Transporte Público 2020 (nº de ônibus e micro-ônibus/1.000 habitantes),"Acesso à cultura, esporte e lazer 2018 (Categórica 1-10)",Gravidez na infância e adolescência 2019 (% de filhos de mães com até 19 anos),Trabalho Infantil 2019 (nº de famílias com ao menos 1 membro em trabalho infantil/1.000 famílias),Vulnerabilidade familiar 2019 (% de filhos de mães solteiras),Empregos ensino superior 2019 (% de empregos em relação ao total),Mulheres com empregos ensino superior 2019 (% de empregos em relação ao total)
0,2014,1100015.0,Alta Floresta D'Oeste,RO,56.59417,197.0,60.195511,57.36934,52.217659,95.154708,...,,,,,,,,,,
1,2014,1100023.0,Ariquemes,RO,55.728511,264.0,59.059533,59.304304,48.821695,93.920755,...,,,,,,,,,,
2,2014,1100031.0,Cabixi,RO,58.915724,90.0,74.178132,54.787265,47.781775,88.092664,...,,,,,,,,,,
3,2014,1100049.0,Cacoal,RO,61.717654,28.0,72.691093,60.613765,51.848105,91.956107,...,,,,,,,,,,
4,2014,1100056.0,Cerejeiras,RO,54.593926,361.0,62.606094,62.512401,38.663282,87.810163,...,,,,,,,,,,


## Explore the data

### Describe the data I

__Hotspot__

In [7]:
hotspot_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diasemchuva,16943121.0,13.348092,65.530664,-999.0,2.0,5.0,20.0,120.0
precipitacao,16943121.0,0.859245,3.598378,0.0,0.0,0.0,0.1,203.7
riscofogo,16943121.0,-5.579153,79.406999,-999.0,0.6,1.0,1.0,1.0
latitude,19706174.0,-8.260597,4.323302,-18.039,-11.07466,-8.680406,-5.3,5.23
longitude,19706174.0,-53.774619,6.601975,-73.93146,-58.207,-53.091,-48.23736,-41.8
frp,5140138.0,19.285664,56.710458,-3.7,3.3,7.4,17.1,9722.6


__Amazon SPI__

In [8]:
spi_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Ano,2319.0,2.017667e+03,2.868060e+00,2.014000e+03,2.014000e+03,2.018000e+03,2.021000e+03,2.021000e+03
Código IBGE,2316.0,2.288414e+06,1.365403e+06,1.100015e+06,1.502798e+06,1.713254e+06,2.109278e+06,5.108956e+06
IPS Amazônia,2316.0,5.456015e+01,3.811311e+00,4.359790e+01,5.192879e+01,5.430994e+01,5.680819e+01,7.441633e+01
Ranking IPS,2316.0,3.865000e+02,2.229051e+02,1.000000e+00,1.937500e+02,3.865000e+02,5.792500e+02,7.720000e+02
Necessidades Humanas Básicas,2319.0,6.437410e+01,6.825254e+00,4.337849e+01,5.969696e+01,6.432076e+01,6.887852e+01,8.712833e+01
...,...,...,...,...,...,...,...,...
Gravidez na infância e adolescência 2019 (% de filhos de mães com até 19 anos),772.0,2.384973e+01,6.295921e+00,5.172414e+00,1.955196e+01,2.412254e+01,2.815534e+01,4.545455e+01
Trabalho Infantil 2019 (nº de famílias com ao menos 1 membro em trabalho infantil/1.000 famílias),772.0,3.170346e+01,6.065901e+01,0.000000e+00,2.748239e+00,9.631206e+00,3.342557e+01,4.740000e+02
Vulnerabilidade familiar 2019 (% de filhos de mães solteiras),772.0,4.370381e+01,2.095850e+01,5.000000e+00,2.629215e+01,3.899752e+01,5.949844e+01,9.534161e+01
Empregos ensino superior 2019 (% de empregos em relação ao total),772.0,1.828965e+01,1.281655e+01,4.964422e+00,1.129523e+01,1.595505e+01,2.173583e+01,1.479940e+02


### Get information about data

__Hotspot__

In [9]:
hotspot_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19706174 entries, 0 to 318737
Data columns (total 12 columns):
 #   Column        Dtype  
---  ------        -----  
 0   datahora      object 
 1   satelite      object 
 2   pais          object 
 3   estado        object 
 4   municipio     object 
 5   bioma         object 
 6   diasemchuva   float64
 7   precipitacao  float64
 8   riscofogo     float64
 9   latitude      float64
 10  longitude     float64
 11  frp           float64
dtypes: float64(6), object(6)
memory usage: 1.9+ GB


__Amazon SPI__

In [10]:
spi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 772
Columns: 158 entries, Ano to Mulheres com empregos ensino superior 2019 (% de empregos em relação ao total)
dtypes: float64(155), int64(1), object(2)
memory usage: 2.8+ MB


### Percentage of missing data

In [11]:
def get_pct_missing_data(dataset):
    """
    Get Percentage of missing data
    
    Attributes
    ----------
    dataset : Pandas DataFrame
    """
    total = dataset.isnull().sum().sort_values(ascending=False)
    percent = dataset.isnull().sum() / dataset.isnull().count() * 100
    percent = (round(percent, 4)).sort_values(ascending=False)

    missing_data = pd.concat([total, percent], keys=["Total", '%'], axis=1)

    return missing_data

__Hotspot__

In [12]:
get_pct_missing_data(hotspot_df)

Unnamed: 0,Total,%
frp,14566036,73.9161
diasemchuva,2763053,14.0213
precipitacao,2763053,14.0213
riscofogo,2763053,14.0213
datahora,0,0.0
satelite,0,0.0
pais,0,0.0
estado,0,0.0
municipio,0,0.0
bioma,0,0.0


__Amazon SPI__

In [13]:
get_pct_missing_data(spi_df)

Unnamed: 0,Total,%
Moradias com iluminação adequada 2018 (% de domicilios),1547,66.7098
Vulnerabilidade familiar 2017 (% de filhos de mães solteiras),1547,66.7098
Desmatamento acumulado 2017 (% área total do município),1547,66.7098
"Desmatamento recente 2017, 2017, 2018 (% área total do município)",1547,66.7098
Focos de calor 2018 (nª de focos/1.000 habitantes),1547,66.7098
...,...,...
Segurança pessoal,0,0.0000
Acesso ao conhecimento básico,0,0.0000
Acesso à informação e comunicação,0,0.0000
Saúde e bem-estar,0,0.0000


## Prepare the data

### Standardize Country State Names

The feature __estado__ in the two datasets are diffenrents. In _hotspot_df_ the name of state is long and spi_df is short.

__Amazon SPI__

In [14]:
def get_full_name_state(acronym):
    """
    Retrieves the long name of state related to acronym.
    """
    lnames = [x.get('name') for x in BRAZILIAN_STATES if x.get('acronym') == acronym]
    return lnames[0] if lnames else np.nan


def purge_spec_chars(word):
    """
    Remove all special characters in the word.
    """
    wv = word if word is not np.nan else ""
    nfkd_form = unicodedata.normalize("NFKD", wv) 
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

In [15]:
# Get full name of state
spi_df["Estado"] = spi_df["Estado"].apply(get_full_name_state)

# Remove special characteres
spi_df["Estado"] = spi_df["Estado"].apply(purge_spec_chars)

# Capitalize the name
spi_df["Estado"] = spi_df["Estado"].apply(lambda x: x.upper())

### Standardize Cities Names

The feature __municipio__ in the two datasets are diffenrents.

In [16]:
# Remove special characteres
spi_df["Município"] = spi_df["Município"].apply(purge_spec_chars)

# Capitalize the name
spi_df["Município"] = spi_df["Município"].apply(lambda x: x.upper())

#### Some city names are written incorrectly. In this section I will fix it with the correct form

__hotspot__

In [17]:
hotspot_df.loc[(hotspot_df["municipio"] == "ELDORADO DO CARAJAS"), "municipio"] = "ELDORADO DOS CARAJAS"

__Amazon SPI__

In [18]:
# spi_df.loc[spi_df["Município"] == "BARREIRINHA", "Município"] = "BARREIRINHAS"

# Acre
spi_df.loc[spi_df["Município"] == "RIO BRANCO (ACRE)", "Município"] = "RIO BRANCO"

# Maranhão
spi_df.loc[spi_df["Município"] == "ARAGUANA (MARANHAO)", "Município"] = "ARAGUANA"
spi_df.loc[spi_df["Município"] == "PRESIDENTE MEDICI (MARANHAO)", "Município"] = "PRESIDENTE MEDICI"

# Mato Grosso
spi_df.loc[spi_df["Município"] == "POXOREO", "Município"] = "POXOREU"
spi_df.loc[spi_df["Município"] == "RIO BRANCO (MATO GROSSO)", "Município"] = "RIO BRANCO"

# Rondonia
spi_df.loc[spi_df["Município"] == "PRESIDENTE MEDICI (RONDONIA)", "Município"] = "PRESIDENTE MEDICI"

# Pará
spi_df.loc[spi_df["Município"] == "BOM JESUS DO TOCANTINS (PARA)", "Município"] = "BOM JESUS DO TOCANTINS"
spi_df.loc[spi_df["Município"] == "SANTA ISABEL DO PARA", "Município"] = "SANTA IZABEL DO PARA"

# Tocantins
spi_df.loc[spi_df["Município"] == "ARAGUANA (TOCANTINS)", "Município"] = "ARAGUANA"
spi_df.loc[spi_df["Município"] == "BOM JESUS DO TOCANTINS (TOCANTINS)", "Município"] = "BOM JESUS DO TOCANTINS"
spi_df.loc[spi_df["Município"] == "COUTO DE MAGALHAES", "Município"] = "COUTO MAGALHAES"
spi_df.loc[spi_df["Município"] == "SAO VALERIO DA NATIVIDADE", "Município"] = "SAO VALERIO"

#### Shows the difference between the number of cities in the two datasets

In [19]:
hotspot_regions = hotspot_df.groupby(['municipio', 'estado'])
spi_region = spi_df.groupby(['Município', 'Estado'])

n_hr = len(hotspot_regions)
n_sr = len(spi_region)

dic_cities = set(hotspot_regions.groups.keys()) ^ set(spi_region.groups.keys())

print("hotspot_df has {} cities and spi_df has {}. The Difference is {}.".format(n_hr, n_sr, (n_hr - n_sr)))
print("---------------------------------------------------------------------------------------------------")
print("Cities not found:\n{}".format(dic_cities))

hotspot_df has 808 cities and spi_df has 773. The Difference is 35.
---------------------------------------------------------------------------------------------------
Cities not found:
{('SANTANA DO MARANHAO', 'MARANHAO'), ('', ''), ('LAGOA DO MATO', 'MARANHAO'), ('SANTA QUITERIA DO MARANHAO', 'MARANHAO'), ('BREJO', 'MARANHAO'), ('SUCUPIRA DO RIACHAO', 'MARANHAO'), ('SAO FRANCISCO DO MARANHAO', 'MARANHAO'), ('HUMBERTO DE CAMPOS', 'MARANHAO'), ('SAO BERNARDO', 'MARANHAO'), ('CAXIAS', 'MARANHAO'), ('AGUA DOCE DO MARANHAO', 'MARANHAO'), ('ANAPURUS', 'MARANHAO'), ('NINA RODRIGUES', 'MARANHAO'), ('MILAGRES DO MARANHAO', 'MARANHAO'), ('PAULINO NEVES', 'MARANHAO'), ('BARAO DE GRAJAU', 'MARANHAO'), ('ARAIOSES', 'MARANHAO'), ('PASSAGEM FRANCA', 'MARANHAO'), ('BELAGUA', 'MARANHAO'), ('TUTOIA', 'MARANHAO'), ('SAO BENEDITO DO RIO PRETO', 'MARANHAO'), ('CHAPADINHA', 'MARANHAO'), ('PRIMEIRA CRUZ', 'MARANHAO'), ('PARNARAMA', 'MARANHAO'), ('MATA ROMA', 'MARANHAO'), ('COELHO NETO', 'MARANHAO'), ('SANT

In [20]:
dic_cities_df = pd.DataFrame(dic_cities, columns=["municipio", "estado"])
dic_cities_df = dic_cities_df.dropna()

for index, row in dic_cities_df.iterrows():
    hotspot_df = hotspot_df.drop(hotspot_df[(hotspot_df.municipio == row.municipio) & (hotspot_df.estado == row.estado)].index)

### Parse datehour of string to date

__Hotspot__

In [21]:
hotspot_df['datahora'] = pd.to_datetime(hotspot_df['datahora'], format='%Y/%m/%d %H:%M:%S')

### Add features

__Hotspot__

In [22]:
hotspot_df["ano"] = pd.DatetimeIndex(hotspot_df['datahora']).year
# hotspot_df["mes"] = pd.DatetimeIndex(hotspot_df['datahora']).month
# hotspot_df["dia"] = pd.DatetimeIndex(hotspot_df['datahora']).day

### Fix or remove outliers (optional)

__Hotspot__

In [23]:
hotspot_df.loc[hotspot_df["riscofogo"] < 0, "riscofogo"] = 0
hotspot_df.loc[hotspot_df["diasemchuva"] < 0, "diasemchuva"] = 0
hotspot_df.loc[hotspot_df["frp"] < 0, "frp"] = 0

__Amazon SPI__

In [24]:
spi_df.dropna(subset=["Município", "Estado", "IPS Amazônia"], inplace=True)

### Fill in missing values (e.g., with zero, mean, median...) or drop their rows (or columns)

__Hotspot__

In [25]:
hotspot_df["frp"].fillna(value=0, inplace=True)
hotspot_df["riscofogo"].fillna(value=0, inplace=True)
hotspot_df["diasemchuva"].fillna(value=0, inplace=True)
hotspot_df["precipitacao"].fillna(value=0, inplace=True)

### Drop columns not used

__Hotspot__

In [26]:
hotspot_df = hotspot_df.drop(columns=["datahora", "satelite", "pais", "bioma", "latitude", "longitude"])
hotspot_df.head()

Unnamed: 0,estado,municipio,diasemchuva,precipitacao,riscofogo,frp,ano
3,MARANHAO,PAULO RAMOS,0.0,1.4,0.1,0.0,2018
11,MARANHAO,BURITICUPU,0.0,1.1,0.3,0.0,2018
16,MARANHAO,BACURI,0.0,0.0,0.0,11.0,2018
17,MARANHAO,PEDRO DO ROSARIO,0.0,0.4,0.2,14.2,2018
19,MARANHAO,ANAJATUBA,0.0,0.8,0.1,65.5,2018


__Amazon SPI__

_Dimensions and their components:_

1. Necessidades Humanas Básicas
    * Nutrição e cuidados médicos básicos
    * Água e saneamento
    * Moradia
    * Segurança pessoal
2. Fundamentos para o Bem-Estar
    * Acesso ao conhecimento básico
    * Acesso à informação e comunicação
    * Saúde e bem-estar
    * Qualidade do meio ambiente
3. Oportunidades
    * Direitos individuais
    * Liberdade individual e de escolha
    * Tolerância e inclusão
    * Acesso à educação superior

In [27]:
spi_df = spi_df[[
    "Ano", "Município","Estado","IPS Amazônia","Ranking IPS",
    "Necessidades Humanas Básicas", "Fundamentos para o Bem-Estar","Oportunidades", # 3 Dimensions
    "Nutrição e cuidados médicos básicos","Água e saneamento","Moradia","Segurança pessoal", # Necessidades Humanas Básicas components
    "Acesso ao conhecimento básico", "Acesso à informação e comunicação","Saúde e bem-estar","Qualidade do meio ambiente", # Fundamentos para o Bem-Estar components
    "Direitos individuais","Liberdade individual e de escolha","Tolerância e inclusão","Acesso à educação superior" # Oportunidades components
]]
spi_df.head()

Unnamed: 0,Ano,Município,Estado,IPS Amazônia,Ranking IPS,Necessidades Humanas Básicas,Fundamentos para o Bem-Estar,Oportunidades,Nutrição e cuidados médicos básicos,Água e saneamento,Moradia,Segurança pessoal,Acesso ao conhecimento básico,Acesso à informação e comunicação,Saúde e bem-estar,Qualidade do meio ambiente,Direitos individuais,Liberdade individual e de escolha,Tolerância e inclusão,Acesso à educação superior
0,2014,ALTA FLORESTA D'OESTE,RONDONIA,56.59417,197.0,60.195511,57.36934,52.217659,95.154708,26.7253,74.756358,44.145679,67.279579,12.246689,79.040866,70.910227,35.009508,69.313167,97.10723,7.44073
1,2014,ARIQUEMES,RONDONIA,55.728511,264.0,59.059533,59.304304,48.821695,93.920755,30.394326,83.599855,28.323196,69.307394,32.951213,79.71472,55.24389,29.002168,75.927282,76.373664,13.983665
2,2014,CABIXI,RONDONIA,58.915724,90.0,74.178132,54.787265,47.781775,88.092664,32.626943,77.918497,98.074422,75.614583,15.284836,74.519566,53.730077,32.391236,69.318097,79.090076,10.327693
3,2014,CACOAL,RONDONIA,61.717654,28.0,72.691093,60.613765,51.848105,91.956107,70.286301,88.352513,40.169451,78.123863,32.410825,74.847956,57.072414,28.67385,80.286243,80.457931,17.974396
4,2014,CEREJEIRAS,RONDONIA,54.593926,361.0,62.606094,62.512401,38.663282,87.810163,32.596208,79.234759,50.783248,73.719665,29.292791,70.785929,76.251219,28.424049,64.043321,52.550253,9.635506


In [28]:
spi_df.sort_values(["Estado", "Município"])

Unnamed: 0,Ano,Município,Estado,IPS Amazônia,Ranking IPS,Necessidades Humanas Básicas,Fundamentos para o Bem-Estar,Oportunidades,Nutrição e cuidados médicos básicos,Água e saneamento,Moradia,Segurança pessoal,Acesso ao conhecimento básico,Acesso à informação e comunicação,Saúde e bem-estar,Qualidade do meio ambiente,Direitos individuais,Liberdade individual e de escolha,Tolerância e inclusão,Acesso à educação superior
52,2014,ACRELANDIA,ACRE,54.641460,357.0,62.070081,57.083374,44.770925,93.844101,33.225781,76.599839,44.610605,69.920813,12.540836,89.820978,56.050871,21.912884,60.375032,88.161807,8.633976
52,2018,ACRELANDIA,ACRE,52.378261,555.0,63.410558,52.442985,41.281239,92.372944,41.122931,86.087333,34.059025,66.892830,12.696702,76.063784,54.118622,27.516797,64.407504,65.087510,8.113147
52,2021,ACRELANDIA,ACRE,53.704085,447.0,65.404443,57.168104,38.539709,94.408658,39.098194,80.431839,47.679083,74.858325,14.324709,85.637740,53.851641,27.236275,56.986957,61.686595,8.249007
53,2014,ASSIS BRASIL,ACRE,53.478734,446.0,61.932376,59.029676,39.474150,88.564634,47.359008,55.451652,56.354211,50.448176,18.736928,75.527140,91.406462,21.594557,62.796183,68.290771,5.215090
53,2018,ASSIS BRASIL,ACRE,54.679562,362.0,66.736706,57.977678,39.324301,90.476435,54.527203,69.152385,52.790803,45.869844,15.838253,81.014939,89.187675,19.611379,72.321974,58.191214,7.172637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,2018,WANDERLANDIA,TOCANTINS,53.490276,457.0,58.828293,55.610930,46.031604,86.142675,35.500180,90.598179,23.072136,75.615553,9.648831,71.837143,65.342193,24.390652,59.518123,89.110017,11.107624
448,2021,WANDERLANDIA,TOCANTINS,54.307443,390.0,63.443215,57.441927,42.037186,86.277592,35.500180,91.313659,40.681428,77.027995,10.920105,76.643000,65.176608,17.349272,48.181703,90.870568,11.747203
449,2014,XAMBIOA,TOCANTINS,51.622412,590.0,64.645318,50.726865,39.495054,89.741147,37.401687,86.647441,44.790998,71.216617,18.232100,60.707397,52.751345,27.293412,57.218554,66.472214,6.996035
449,2018,XAMBIOA,TOCANTINS,53.446151,464.0,63.966321,51.611447,44.760686,92.960267,37.401687,91.614549,33.888780,69.734559,14.646713,69.379721,52.684795,30.255791,63.869062,74.434087,10.483803


### Describe the data II

__Hotspot__

In [29]:
hotspot_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diasemchuva,6759109.0,15.546962,27.767763,0.0,0.0,4.0,14.0,120.0
precipitacao,6759109.0,0.977803,4.046112,0.0,0.0,0.0,0.2,203.7
riscofogo,6759109.0,0.651073,0.403029,0.0,0.2,0.9,1.0,1.0
frp,6759109.0,5.735754,32.560079,0.0,0.0,0.0,2.0,9722.6
ano,6759109.0,2018.171407,2.608099,2012.0,2017.0,2019.0,2020.0,2021.0


In [30]:
get_pct_missing_data(hotspot_df)

Unnamed: 0,Total,%
estado,0,0.0
municipio,0,0.0
diasemchuva,0,0.0
precipitacao,0,0.0
riscofogo,0,0.0
frp,0,0.0
ano,0,0.0


__Amazon SPI__

In [31]:
spi_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Ano,2316.0,2017.666667,2.868061,2014.0,2014.0,2018.0,2021.0,2021.0
IPS Amazônia,2316.0,54.560151,3.811311,43.597897,51.928793,54.309938,56.80819,74.416331
Ranking IPS,2316.0,386.5,222.905145,1.0,193.75,386.5,579.25,772.0
Necessidades Humanas Básicas,2316.0,64.382508,6.825672,43.378488,59.720342,64.323796,68.882983,87.128332
Fundamentos para o Bem-Estar,2316.0,54.987886,5.494759,39.293879,50.947697,54.704587,58.360695,78.701385
Oportunidades,2316.0,44.31006,6.03064,21.269663,41.082509,45.19826,48.364121,70.141618
Nutrição e cuidados médicos básicos,2316.0,89.044665,5.530187,16.214906,87.492299,90.387805,92.28094,99.990615
Água e saneamento,2316.0,39.05314,15.85546,1.163647,28.615377,38.016425,48.419468,92.106201
Moradia,2316.0,76.378695,14.611324,9.720165,67.823184,78.639396,87.459554,99.50406
Segurança pessoal,2316.0,53.053531,18.69657,0.0,39.436642,49.811377,64.307924,100.0


In [32]:
get_pct_missing_data(spi_df)

Unnamed: 0,Total,%
Ano,0,0.0
Município,0,0.0
Tolerância e inclusão,0,0.0
Liberdade individual e de escolha,0,0.0
Direitos individuais,0,0.0
Qualidade do meio ambiente,0,0.0
Saúde e bem-estar,0,0.0
Acesso à informação e comunicação,0,0.0
Acesso ao conhecimento básico,0,0.0
Segurança pessoal,0,0.0


### Data Transform

__hotspot__

In [33]:
hotspot_df.loc[hotspot_df.ano < 2014, "ano"] = 2014
hotspot_df.loc[(hotspot_df.ano > 2014) & (hotspot_df.ano < 2018), "ano"] = 2018
hotspot_df.loc[(hotspot_df.ano > 2018) & (hotspot_df.ano < 2021), "ano"] = 2021

#### Create _fococalor_ feature

__fococalor__: Hotspot is any temperature recorded above 47°C. It is not necessarily a fire spot or fire.

In [34]:
hotspot_size = hotspot_df.groupby(["ano", "estado", "municipio"]).size()
hotspot_size = hotspot_size.reset_index(name="fococalor")
hotspot_size

Unnamed: 0,ano,estado,municipio,fococalor
0,2014,ACRE,ACRELANDIA,441
1,2014,ACRE,ASSIS BRASIL,327
2,2014,ACRE,BRASILEIA,1120
3,2014,ACRE,BUJARI,339
4,2014,ACRE,CAPIXABA,455
...,...,...,...,...
2311,2021,TOCANTINS,TOCANTINOPOLIS,799
2312,2021,TOCANTINS,TUPIRAMA,180
2313,2021,TOCANTINS,TUPIRATINS,690
2314,2021,TOCANTINS,WANDERLANDIA,622


#### Getting anual mean of each cities by years

In [35]:
hotspot_df = hotspot_df.groupby(["ano", "estado", "municipio"]).mean()
hotspot_df = hotspot_df.reset_index()

#### Add fococalor to hotspot_df

In [36]:
hotspot_df["fococalor"] = hotspot_size["fococalor"]
hotspot_df

Unnamed: 0,ano,estado,municipio,diasemchuva,precipitacao,riscofogo,frp,fococalor
0,2014,ACRE,ACRELANDIA,0.922902,0.376644,0.230839,0.000000,441
1,2014,ACRE,ASSIS BRASIL,1.743119,0.458410,0.149541,0.000000,327
2,2014,ACRE,BRASILEIA,1.429464,0.364107,0.188661,0.000000,1120
3,2014,ACRE,BUJARI,0.840708,0.364012,0.121239,0.000000,339
4,2014,ACRE,CAPIXABA,0.826374,0.102198,0.084176,0.000000,455
...,...,...,...,...,...,...,...,...
2311,2021,TOCANTINS,TOCANTINOPOLIS,23.246558,0.321151,0.850814,9.033292,799
2312,2021,TOCANTINS,TUPIRAMA,39.522222,0.941111,0.805556,7.527222,180
2313,2021,TOCANTINS,TUPIRATINS,43.468116,0.413913,0.888551,9.987971,690
2314,2021,TOCANTINS,WANDERLANDIA,17.697749,0.416720,0.800322,4.854662,622


In [37]:
output = pd.merge(hotspot_df, spi_df, left_on=["ano","estado", "municipio"], right_on=["Ano", "Estado", "Município"])

In [38]:
output = output.drop(["Ano", "Município", "Estado"], axis=1)

### Parsing and rounding double values

In [39]:
output = output.round(2)
output = output.round({"diasemchuva":1})
output = output.astype({"Ranking IPS":"int"})

In [40]:
output

Unnamed: 0,ano,estado,municipio,diasemchuva,precipitacao,riscofogo,frp,fococalor,IPS Amazônia,Ranking IPS,...,Moradia,Segurança pessoal,Acesso ao conhecimento básico,Acesso à informação e comunicação,Saúde e bem-estar,Qualidade do meio ambiente,Direitos individuais,Liberdade individual e de escolha,Tolerância e inclusão,Acesso à educação superior
0,2014,ACRE,ACRELANDIA,0.9,0.38,0.23,0.00,441,54.64,357,...,76.60,44.61,69.92,12.54,89.82,56.05,21.91,60.38,88.16,8.63
1,2014,ACRE,ASSIS BRASIL,1.7,0.46,0.15,0.00,327,53.48,446,...,55.45,56.35,50.45,18.74,75.53,91.41,21.59,62.80,68.29,5.22
2,2014,ACRE,BRASILEIA,1.4,0.36,0.19,0.00,1120,50.05,692,...,71.35,39.08,79.48,23.60,80.69,74.71,17.70,62.01,0.00,6.95
3,2014,ACRE,BUJARI,0.8,0.36,0.12,0.00,339,54.02,405,...,68.66,70.37,54.28,9.62,79.25,63.17,19.18,54.60,86.42,9.37
4,2014,ACRE,CAPIXABA,0.8,0.10,0.08,0.00,455,53.15,479,...,72.30,47.44,55.57,13.55,81.29,57.89,18.46,60.37,100.00,7.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311,2021,TOCANTINS,TOCANTINOPOLIS,23.2,0.32,0.85,9.03,799,53.76,441,...,88.02,53.41,61.92,21.05,77.33,81.74,52.61,55.15,8.02,11.19
2312,2021,TOCANTINS,TUPIRAMA,39.5,0.94,0.81,7.53,180,53.14,486,...,90.28,49.77,87.32,20.03,49.07,68.11,20.60,52.21,57.06,14.56
2313,2021,TOCANTINS,TUPIRATINS,43.5,0.41,0.89,9.99,690,60.37,63,...,83.47,97.42,68.42,9.51,77.47,67.89,16.74,52.59,83.34,15.03
2314,2021,TOCANTINS,WANDERLANDIA,17.7,0.42,0.80,4.85,622,54.31,390,...,91.31,40.68,77.03,10.92,76.64,65.18,17.35,48.18,90.87,11.75


### Save data in a csv file

In [41]:
path = Path(f"{WORKDIR}/data/output/hotspot_spi.csv")
         
path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(path, index=False)