# Caipora Project

__Objective__




 
__Data Source__

https://queimadas.dgi.inpe.br/queimadas/portal

https://ipsamazonia.org.br/

https://openaq.org/


__Data characteristics__

- Time series;
- Geographic coordinates – latitude/longitude;
- Satellite name.

In [1]:
import os
import math
import unicodedata

import glob

import datetime
import gmaps

import numpy as np
import pandas as pd

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

## Prerequisite

In [2]:
# Absolute path of the current working directory; the data paths used by
# the loading cells are built from it.
WORKDIR = os.path.abspath(os.curdir)

## Get the data

### Hotspot data

__List files used in the analysis__

In [3]:
# The trailing "*" matches every sub-directory of data/states, so the final
# glob pattern is <WORKDIR>/data/states/*/*.csv.
path = f"{WORKDIR}/data/states/*"
hotspot_pattern = os.path.join(path, "*.csv")
hotspot_files = glob.glob(hotspot_pattern)

__Load into Pandas DataFrame__

In [4]:
# Read each hotspot CSV lazily and stack all rows into one DataFrame.
# NOTE(review): per-file row labels are kept (no ignore_index), so index
# values repeat across files.
hotspot_frames = (pd.read_csv(csv_file) for csv_file in hotspot_files)
hotspot_df = pd.concat(hotspot_frames)
hotspot_df.head()

Unnamed: 0,datahora,satelite,pais,estado,municipio,bioma,diasemchuva,precipitacao,riscofogo,latitude,longitude,frp
0,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,CURURUPU,Amazonia,0.0,0.8,0.0,-1.87136,-44.78587,
1,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,MARACACUME,Amazonia,0.0,0.1,0.1,-1.82566,-45.8867,
2,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,BURITICUPU,Amazonia,0.0,1.1,0.1,-4.57874,-46.3866,
3,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,PAULO RAMOS,Amazonia,0.0,1.4,0.1,-4.59554,-45.66039,
4,2018/01/01 04:06:00,NPP-375D,Brasil,MARANHAO,ARAME,Amazonia,0.0,0.4,0.3,-5.2196,-46.12886,


In [5]:
# Distinct state names present in the hotspot data.
hotspot_df["estado"].unique()

array(['MARANHAO', 'MATO GROSSO', 'ACRE', 'TOCANTINS', 'AMAZONAS',
       'RORAIMA', 'AMAPA', 'RONDONIA', 'PARA'], dtype=object)

### Amazon SPI data

__List files used in the analysis__

In [6]:
# All SPI CSVs sit directly inside data/spi/amazonia/compactado.
path = f"{WORKDIR}/data/spi/amazonia/compactado"
spi_pattern = os.path.join(path, "*.csv")
spi_files = glob.glob(spi_pattern)

__Load into Pandas DataFrame__

In [7]:
# Read each SPI CSV lazily and stack all rows into one DataFrame.
spi_frames = (pd.read_csv(csv_file) for csv_file in spi_files)
spi_df = pd.concat(spi_frames)
spi_df.head()

Unnamed: 0,cibge,municipio,estado,ano,ips,rankips,nhbs,fbes,opts
0,1100015.0,ALTA FLORESTA D'OESTE,RONDONIA,2014.0,56.59417,59.0,60.195511,57.36934,52.217659
1,1100023.0,ARIQUEMES,RONDONIA,2014.0,55.728511,73.0,59.059533,59.304304,48.821695
2,1100031.0,CABIXI,RONDONIA,2014.0,58.915724,92.0,74.178132,54.787265,47.781775
3,1100049.0,CACOAL,RONDONIA,2014.0,61.717654,72.0,72.691093,60.613765,51.848105
4,1100056.0,CEREJEIRAS,RONDONIA,2014.0,54.593926,59.0,62.606094,62.512401,38.663282


## Explore the data

### Describe the data I

__Hotspot__

In [8]:
# Summary statistics of the numeric columns, one row per column.
hotspot_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diasemchuva,11060285.0,7.623764,70.880709,-999.0,1.0,4.0,12.0,120.0
precipitacao,11060285.0,1.03443,3.854016,0.0,0.0,0.0,0.3,203.7
riscofogo,11060285.0,-5.310438,77.318383,-999.0,0.4,0.9,1.0,1.0
latitude,12651992.0,-7.019591,3.980039,-16.29,-9.85041,-7.715,-3.879,5.23
longitude,12651992.0,-56.207436,6.320051,-73.93146,-61.002,-55.22246,-51.21,-43.64537
frp,3390008.0,19.308732,59.70576,-3.7,3.0,7.2,16.8,9722.6


__Amazon SPI__

In [9]:
# Summary statistics of the numeric columns, one row per column.
spi_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cibge,2316.0,2288414.0,1365403.0,1100015.0,1502798.0,1713254.0,2109278.0,5108956.0
ano,2316.0,2017.667,2.868061,2014.0,2014.0,2018.0,2021.0,2021.0
ips,2316.0,54.56015,3.811311,43.5979,51.92879,54.30994,56.80819,74.41633
rankips,2316.0,48.3329,28.86438,0.0,23.0,47.0,73.0,99.0
nhbs,2319.0,64.3741,6.825254,43.37849,59.69696,64.32076,68.87852,87.12833
fbes,2319.0,54.99606,5.495901,39.29388,50.94986,54.70515,58.37245,78.70139
opts,2319.0,44.29596,6.039469,21.26966,41.06181,45.19378,48.3621,70.14162


### Get information about data

__Hotspot__

In [10]:
# Print the index range, per-column dtypes and memory footprint.
hotspot_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12651992 entries, 0 to 318737
Data columns (total 12 columns):
 #   Column        Dtype  
---  ------        -----  
 0   datahora      object 
 1   satelite      object 
 2   pais          object 
 3   estado        object 
 4   municipio     object 
 5   bioma         object 
 6   diasemchuva   float64
 7   precipitacao  float64
 8   riscofogo     float64
 9   latitude      float64
 10  longitude     float64
 11  frp           float64
dtypes: float64(6), object(6)
memory usage: 1.2+ GB


__Amazon SPI__

In [11]:
# Print the index range, per-column non-null counts, dtypes and memory footprint.
spi_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2319 entries, 0 to 772
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   cibge      2316 non-null   float64
 1   municipio  2316 non-null   object 
 2   estado     2316 non-null   object 
 3   ano        2316 non-null   float64
 4   ips        2316 non-null   float64
 5   rankips    2316 non-null   float64
 6   nhbs       2319 non-null   float64
 7   fbes       2319 non-null   float64
 8   opts       2319 non-null   float64
dtypes: float64(7), object(2)
memory usage: 181.2+ KB


### Percentage of missing data

In [12]:
def get_pct_missing_data(dataset):
    """
    Summarise the missing values of each column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataset``, ordered by descending number of
        missing values, with two columns: ``"Total"`` (count of null
        entries) and ``"%"`` (that count as a percentage of the number of
        rows, rounded to 4 decimal places).
    """
    # Build the null mask only once: every isnull() call allocates a boolean
    # frame as large as the data itself.
    null_counts = dataset.isnull().sum()

    total = null_counts.sort_values(ascending=False)

    # Every column has the same number of rows, so len(dataset) is the
    # denominator for all of them.
    percent = (null_counts / len(dataset) * 100).round(4)
    percent = percent.sort_values(ascending=False)

    # concat aligns both Series on the column names.
    return pd.concat([total, percent], keys=["Total", "%"], axis=1)

__Hotspot__

In [13]:
# Missing-value count and percentage per hotspot column, before cleaning.
get_pct_missing_data(hotspot_df)

Unnamed: 0,Total,%
frp,9261984,73.2057
diasemchuva,1591707,12.5807
precipitacao,1591707,12.5807
riscofogo,1591707,12.5807
datahora,0,0.0
satelite,0,0.0
pais,0,0.0
estado,0,0.0
municipio,0,0.0
bioma,0,0.0


__Amazon SPI__

In [14]:
# Missing-value count and percentage per SPI column, before cleaning.
get_pct_missing_data(spi_df)

Unnamed: 0,Total,%
cibge,3,0.1294
municipio,3,0.1294
estado,3,0.1294
ano,3,0.1294
ips,3,0.1294
rankips,3,0.1294
nhbs,0,0.0
fbes,0,0.0
opts,0,0.0


## Prepare the data

### Fix or remove outliers (optional)

__Hotspot__

In [15]:
# Replace negative readings with 0 in both columns.
# NOTE(review): the raw minimum of these columns is -999.0, presumably a
# "no data" sentinel - confirm that 0 is the intended replacement.
for clamped_column in ("riscofogo", "diasemchuva"):
    is_negative = hotspot_df[clamped_column] < 0
    hotspot_df.loc[is_negative, clamped_column] = 0

__Amazon SPI__

In [16]:
# Discard rows that lack any of the identifying fields or the IPS score.
required_columns = ["cibge", "municipio", "estado", "ips"]
spi_df = spi_df.dropna(subset=required_columns)

In [17]:
def purge_spec_chars(word):
    """
    Strip diacritics (accents, cedillas, tildes...) from a string.

    The text is decomposed with Unicode NFKD normalisation, which splits
    each accented character into its base character followed by combining
    marks; the combining marks are then discarded.

    Parameters
    ----------
    word : str
        Text to clean.

    Returns
    -------
    str
        ``word`` in NFKD form with every combining character removed.
    """
    decomposed = unicodedata.normalize("NFKD", word)
    base_chars = (char for char in decomposed if not unicodedata.combining(char))
    return "".join(base_chars)

In [18]:
# Remove accents from municipality names.
# NOTE(review): presumably so they match the unaccented names used in the
# hotspot data - confirm.
spi_df["municipio"] = spi_df["municipio"].map(purge_spec_chars)

### Fill in missing values (e.g., with zero, mean, median...) or drop their rows (or columns)

__Hotspot__

In [19]:
# Fill the missing measurements with 0.
# This is done with a single DataFrame-level fillna instead of
# hotspot_df[col].fillna(..., inplace=True): the latter is chained assignment
# on a column selection, which raises a FutureWarning on pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.
hotspot_df.fillna(
    value={"frp": 0, "riscofogo": 0, "diasemchuva": 0, "precipitacao": 0},
    inplace=True,
)

### Parse the date-time strings into datetime values

__Hotspot__

In [20]:
# Convert the 'datahora' strings (e.g. "2018/01/01 04:06:00") to datetime
# values, using an explicit format rather than format inference.
hotspot_df['datahora'] = pd.to_datetime(hotspot_df['datahora'], format='%Y/%m/%d %H:%M:%S')

### Add Year and Month columns

__Hotspot__

In [21]:
# Derive calendar year and month columns from the parsed timestamp.
hotspot_timestamps = hotspot_df["datahora"].dt
hotspot_df["ano"] = hotspot_timestamps.year
hotspot_df["mes"] = hotspot_timestamps.month

### Describe the data II

__Hotspot__

In [22]:
# Summary statistics of the numeric columns after cleaning, one row per column.
hotspot_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
diasemchuva,12651992.0,10.617849,19.514854,0.0,0.0,3.0,10.0,120.0
precipitacao,12651992.0,0.904291,3.619731,0.0,0.0,0.0,0.2,203.7
riscofogo,12651992.0,0.613064,0.401937,0.0,0.2,0.8,1.0,1.0
latitude,12651992.0,-7.019591,3.980039,-16.29,-9.85041,-7.715,-3.879,5.23
longitude,12651992.0,-56.207436,6.320051,-73.93146,-61.002,-55.22246,-51.21,-43.64537
frp,12651992.0,5.173632,32.066891,-3.7,0.0,0.0,1.1,9722.6
ano,12651992.0,2017.520379,2.79264,2012.0,2015.0,2018.0,2020.0,2022.0
mes,12651992.0,8.714163,1.990419,1.0,8.0,9.0,10.0,12.0


In [23]:
# Re-check missing values after the fill step.
get_pct_missing_data(hotspot_df)

Unnamed: 0,Total,%
datahora,0,0.0
satelite,0,0.0
pais,0,0.0
estado,0,0.0
municipio,0,0.0
bioma,0,0.0
diasemchuva,0,0.0
precipitacao,0,0.0
riscofogo,0,0.0
latitude,0,0.0


__Amazon SPI__

In [24]:
# Summary statistics of the numeric columns after cleaning, one row per column.
spi_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cibge,2316.0,2288414.0,1365403.0,1100015.0,1502798.0,1713254.0,2109278.0,5108956.0
ano,2316.0,2017.667,2.868061,2014.0,2014.0,2018.0,2021.0,2021.0
ips,2316.0,54.56015,3.811311,43.5979,51.92879,54.30994,56.80819,74.41633
rankips,2316.0,48.3329,28.86438,0.0,23.0,47.0,73.0,99.0
nhbs,2316.0,64.38251,6.825672,43.37849,59.72034,64.3238,68.88298,87.12833
fbes,2316.0,54.98789,5.494759,39.29388,50.9477,54.70459,58.36069,78.70139
opts,2316.0,44.31006,6.03064,21.26966,41.08251,45.19826,48.36412,70.14162


In [25]:
# Re-check missing values after dropping incomplete rows.
get_pct_missing_data(spi_df)

Unnamed: 0,Total,%
cibge,0,0.0
municipio,0,0.0
estado,0,0.0
ano,0,0.0
ips,0,0.0
rankips,0,0.0
nhbs,0,0.0
fbes,0,0.0
opts,0,0.0
