Here, we try to determine how complete the data in the SNISB dataset is and to estimate how compliant the dams are with the national safety protocols.

In [1]:
import warnings
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point
from unidecode import unidecode

warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')

In [2]:
def read_dams(path="../../data/brazil/snisb/dam-report-07022021.csv"):
    '''
    Reads the dam safety dataset using
    the appropriate configurations.
    ---
    Parameters:

    path -> Location of the SNISB dam report CSV. Defaults to
            the report used in this analysis, so calls without
            arguments behave exactly as before.

    Returns:

    A dataframe with the raw contents of the report.
    '''

    # The report is ';'-separated and decoded with the Latin5 codec.
    # Its first two lines are skipped, so the third line of the file
    # is the one used as the header.
    dams = pd.read_csv(path, encoding='Latin5', sep=';', skiprows=[0, 1])

    return dams


def make_gdf(df):
    '''
    Converts the dataframe to a geodataframe
    using the columns longitude and latitude.
    ---
    Parameters:

    df -> The dam safety dataframe. It must expose numeric
          longitude and latitude columns. It is not modified.

    Returns:

    A new geodataframe holding the same columns plus a
    geometry column with one point per row. The CRS is
    left for the caller to set.
    '''

    # Build every point in a single vectorized call instead of a
    # row-by-row apply; this also behaves well on an empty dataframe.
    points = gpd.points_from_xy(df.longitude, df.latitude)

    # Copy first so the caller's dataframe does not gain a
    # geometry column as a side effect of this call.
    return gpd.GeoDataFrame(df.copy(), geometry=points)


def crs_to_area(gdf):
    '''
    Reprojects a geodataframe to the
    Brasil_Albers_Equal_Area CRS, which
    is meant for area calculations.
    ---
    Parameters:

    gdf -> A geodataframe

    Returns:

    Whatever gdf.to_crs returns for the
    WKT definition below.
    '''

    # WKT definition of the target projection, split into adjacent
    # literals for readability; together they form one single string.
    albers_wkt = (
        'PROJCS["Brasil_Albers_Equal_Area",'
        'GEOGCS["GCS_WGS_1984",'
        'DATUM["D_SIRGAS_2000",SPHEROID["GRS_1980",6378137.0,298.257222101]],'
        'PRIMEM["Greenwich",0.0],'
        'UNIT["Degree",0.0174532925199433]],'
        'PROJECTION["Albers"],'
        'PARAMETER["false_easting",5000000.0],'
        'PARAMETER["false_northing",10000000.0],'
        'PARAMETER["central_meridian",-54.0],'
        'PARAMETER["standard_parallel_1",-2.0],'
        'PARAMETER["standard_parallel_2",-22.0],'
        'PARAMETER["latitude_of_origin",-12.0],'
        'UNIT["Meter",1.0]]'
    )

    return gdf.to_crs(albers_wkt)

def crs_to_coords(gdf):
    '''
    Reprojects a geodataframe to EPSG:4674,
    used in this notebook as the Brazilian
    standard for geographic coordinates.
    ---
    Parameters:

    gdf -> A geodataframe

    Returns:

    Whatever gdf.to_crs returns for EPSG:4674.
    '''

    target_crs = "EPSG:4674"
    reprojected = gdf.to_crs(target_crs)

    return reprojected

def standardize_columns(df):
    '''
    Normalizes the column names: transliterates
    special characters with unidecode, lowercases
    the result and trims surrounding whitespace.
    The column index of df is replaced in place.
    ---
    Parameters:

    df -> The dam safety dataframe

    Returns:

    The same dataframe, with the renamed columns.
    '''

    def _clean(name):
        # Same three steps, in the same order: transliterate, lowercase, trim.
        return unidecode(name).lower().strip()

    df.columns = df.columns.map(_clean)

    return df

def fix_separators(df):
    '''
    The latitude, longitude and capacidade_hm3 columns
    are stored as strings that use ',' as the decimal
    separator. This function swaps the separator for '.'
    and casts each of those columns to float. The columns
    are overwritten in the given dataframe.
    ---
    Parameters:

    df -> The dam safety dataframe

    Returns:

    The same dataframe, with the three columns as floats.
    '''

    comma_decimal_columns = ("latitude", "longitude", "capacidade_hm3")

    for column in comma_decimal_columns:
        as_text = df[column].str.replace(",", ".")
        df[column] = as_text.astype(float)

    return df

def classify_risky_dams(df):
    '''
    Flags the dams whose risk category and
    associated potential damage are both 'Alto'.
    The flag is stored in a new boolean column,
    high_risk_high_damage, added to the given
    dataframe.
    ---
    Parameters:

    df -> The dam safety dataframe

    Returns:

    The same dataframe, with the new column.
    '''

    risk_is_high = df['categoria_de_risco'].eq('Alto')
    damage_is_high = df['dano_potencial_associado'].eq('Alto')

    df['high_risk_high_damage'] = np.where(risk_is_high & damage_is_high, True, False)

    return df


def create_buffer(gdf, r):
    '''
    Replaces the geometries of the given
    gdf with buffers of radius r around
    them. The gdf is changed in place.
    ---
    Parameters:

    gdf -> A geodataframe
    r -> The buffer radius, passed as is
         to the geometry buffer method

    Returns:

    The same gdf, now holding the buffers.
    '''

    buffered = gdf.geometry.buffer(r)
    gdf.geometry = buffered

    return gdf



In [3]:
def get_dams():
    '''
    Prepares the dam safety dataset by running
    every preparation step in sequence: read,
    standardize the column names, fix the decimal
    separators, build the geometries, flag the
    risky dams, set the CRS and reproject for
    area calculations.

    NOTICE THAT IN THIS SCRIPT WE ARE NOT
    MASKING OUT THE LOW RISK DAMS AND
    ARE NOT CREATING BUFFERS AROUND THE
    POINTS.
    ---
    Returns:

    The prepared geodataframe.
    '''

    return (
        read_dams()
        .pipe(standardize_columns)
        .pipe(fix_separators)
        .pipe(make_gdf)
        .pipe(classify_risky_dams)
        # Disabled on purpose: keep only rows where high_risk_high_damage is True.
        .set_crs("EPSG:4674")  # Brazilian standard projection
        .pipe(crs_to_area)
        # Disabled on purpose: create_buffer(dams, 1000)
    )

In [4]:
# Build the cleaned feather file only when it is missing, so the
# notebook survives Restart & Run All on a fresh checkout. Later
# runs skip the CSV cleaning and just load the cached file.

dams_feather = Path("../../data/brazil/snisb/all-dams-clean.feather")
if not dams_feather.exists():
    get_dams().to_feather(dams_feather)

In [5]:
# Load the prepared dams geodataframe from the feather file.
dams = gpd.read_feather("../../data/brazil/snisb/all-dams-clean.feather")

Of all dams, what share falls into each level of information completeness (`completude`)?

In [6]:
dams["completude"].value_counts(normalize=True)

mínima    0.581805
ótima     0.156465
boa       0.101429
baixa     0.082069
média     0.078232
Name: completude, dtype: float64

And of the high risk, high potential damage dams?

In [7]:
dams.loc[dams["high_risk_high_damage"], "completude"].value_counts(normalize=True)

baixa     0.333607
boa       0.326230
mínima    0.295082
ótima     0.045082
Name: completude, dtype: float64

How many dams are regulated by the PNSB?

In [8]:
dams["regulada_pela_pnsb"].value_counts()

Não Classificada    12884
Sim                  5489
Não                  4303
Name: regulada_pela_pnsb, dtype: int64

In [9]:
dams["regulada_pela_pnsb"].value_counts(normalize=True)

Não Classificada    0.568178
Sim                 0.242062
Não                 0.189760
Name: regulada_pela_pnsb, dtype: float64

Of the dams regulated by the PNSB, the national dam safety policy, what share has a safety plan?

In [10]:
dams.loc[dams["regulada_pela_pnsb"] == "Sim", "possui_plano_de_seguranassa"].value_counts(normalize=True)

Não    0.737293
Sim    0.262707
Name: possui_plano_de_seguranassa, dtype: float64

Of the dams regulated by the PNSB, how many are high risk and high potential damage?

In [11]:
dams.loc[dams["regulada_pela_pnsb"] == "Sim", "high_risk_high_damage"].value_counts(normalize=True)

False    0.777737
True     0.222263
Name: high_risk_high_damage, dtype: float64

In [12]:
dams.loc[dams["regulada_pela_pnsb"] == "Sim", "dano_potencial_associado"].value_counts(normalize=True)

Alto                0.678630
Médio               0.198397
Baixo               0.061942
Não Classificado    0.061031
Name: dano_potencial_associado, dtype: float64

Of the dams that should have an emergency plan, how many do?

In [13]:
dams.loc[dams["dano_potencial_associado"] == "Alto", "possui_pae"].value_counts(normalize=True)

Sim    0.536082
Não    0.463918
Name: possui_pae, dtype: float64

How many dams have a known responsible person?

In [14]:
dams["nome_do_empreendedor"].isna().value_counts(normalize=True)

False    0.50516
True     0.49484
Name: nome_do_empreendedor, dtype: float64

How many of the dangerous dams have a known date of last inspection?

In [15]:
dams.loc[dams["high_risk_high_damage"], "data_da_ultima_inspecao"].isna().value_counts(normalize=True)

True     0.863115
False    0.136885
Name: data_da_ultima_inspecao, dtype: float64

In [16]:
dams.loc[
    (dams["regulada_pela_pnsb"] == "Sim") & (dams["dano_potencial_associado"] == "Alto"),
    "data_da_ultima_inspecao",
].isna().value_counts(normalize=True)

True     0.817718
False    0.182282
Name: data_da_ultima_inspecao, dtype: float64

How many dams are fully compliant regarding the safety and emergency action plans?

In [20]:
# Which dams have both the safety plan and emergency plan requirements fulfilled?
# A dam counts as compliant when it is regulated by the PNSB, has a safety
# plan and, if its associated potential damage is "Alto", also has a PAE.
# Any other potential damage value is treated as not requiring a PAE.
condition = (dams.regulada_pela_pnsb=="Sim") & (dams.possui_plano_de_seguranassa=="Sim") & (
     ((dams.dano_potencial_associado=="Alto") & (dams.possui_pae=="Sim")) | (dams.dano_potencial_associado!="Alto")
)

# Take an explicit copy of the regulated subset: adding a column to a plain
# boolean-mask selection writes into a slice of `dams` and triggers
# pandas' SettingWithCopyWarning.
pnsb_dams = dams[dams.regulada_pela_pnsb=="Sim"].copy()

# `condition` covers every dam; the assignment aligns it on the index,
# so only the rows present in pnsb_dams are kept.
pnsb_dams['compliant'] = condition

display(pnsb_dams.compliant.value_counts())
display(pnsb_dams.compliant.value_counts(normalize=True))

False    4137
True     1352
Name: compliant, dtype: int64

False    0.753689
True     0.246311
Name: compliant, dtype: float64