In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import seaborn as sns

# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [25]:
# Election to process. Both values are interpolated into the input path
# (data/Brazil/...) and the output paths (data_output/Brazil/...).
year: int = 2018
election_round: str = "runoff"

In [26]:
# Load the raw results file for the configured year and round.
source_path = f"data/Brazil/{year}_{election_round}.csv.zip"
df = pd.read_csv(source_path)

# Key identifying a polling unit: "<sg_uf>-<nm_municipio>-<nr_zona>",
# e.g. "AC-PORTO ACRE-1". Every part is cast to str before joining.
df["polling_id"] = df["sg_uf"].astype(str).str.cat(
    [df["nm_municipio"].astype(str), df["nr_zona"].astype(str)],
    sep="-",
)
df.head()

Unnamed: 0,sg_uf,nr_zona,nm_municipio,cd_cargo,ds_cargo,nr_candidato,nm_candidato,nm_urna_candidato,sg_partido,ds_composicao_coligacao,nr_turno,ds_sit_totalizacao,dt_ult_totalizacao,sg_ue,sq_candidato,sq_eleicao_divulga,qt_votos_nominais,qt_votos_validos,polling_id
0,AC,1,PORTO ACRE,1,Presidente,17,JAIR MESSIAS BOLSONARO,JAIR BOLSONARO,PSL,PSL / PRTB,2,Eleito,2019-02-27 18:15:58,BR,280000614517,2022802018,7150,9209,AC-PORTO ACRE-1
1,AC,1,PORTO ACRE,1,Presidente,13,FERNANDO HADDAD,FERNANDO HADDAD,PT,PT / PC do B / PROS,2,Não Eleito,2019-02-27 18:15:58,BR,280000629808,2022802018,2059,9209,AC-PORTO ACRE-1
2,AC,1,RIO BRANCO,1,Presidente,17,JAIR MESSIAS BOLSONARO,JAIR BOLSONARO,PSL,PSL / PRTB,2,Eleito,2019-02-27 18:15:58,BR,280000614517,2022802018,80106,96658,AC-RIO BRANCO-1
3,AC,1,RIO BRANCO,1,Presidente,13,FERNANDO HADDAD,FERNANDO HADDAD,PT,PT / PC do B / PROS,2,Não Eleito,2019-02-27 18:15:58,BR,280000629808,2022802018,16552,96658,AC-RIO BRANCO-1
4,AC,2,CAPIXABA,1,Presidente,17,JAIR MESSIAS BOLSONARO,JAIR BOLSONARO,PSL,PSL / PRTB,2,Eleito,2019-02-27 18:15:58,BR,280000614517,2022802018,3895,4981,AC-CAPIXABA-2


In [27]:
# Position of each row inside its polling unit, ordered by nominal votes:
# 1 = most votes; tied rows share the smallest rank of the tie ("min").
votes_per_unit = df.groupby("polling_id")["qt_votos_nominais"]
df["rank"] = votes_per_unit.rank(method="min", ascending=False).astype(int)

In [28]:
# Total nominal votes per (polling unit, candidate).
tt = df.groupby(["polling_id", "nm_candidato"]).agg({"qt_votos_nominais": "sum"})

# Vote share = candidate total / total of the whole polling unit.
# transform("sum") returns each unit's total aligned to tt's own index, so the
# division is row-wise. The previous groupby(...).apply(lambda x: x / x.sum())
# gets the group key prepended to its index on pandas >= 2.0, and the result
# can then no longer be assigned back as a column of tt.
unit_totals = tt.groupby(level=0)["qt_votos_nominais"].transform("sum")
tt["rate"] = tt["qt_votos_nominais"] / unit_totals

# Flatten the index and keep only the merge keys plus the computed share.
tt = tt.reset_index()[["polling_id", "nm_candidato", "rate"]]

In [29]:
# Attach the per-unit vote share to every result row (inner join on the
# polling unit / candidate pair), then switch to the output column names.
output_names = {"qt_votos_nominais": "value", "nm_candidato": "candidate"}
df_filtered = (
    df
    .merge(tt, on=["polling_id", "nm_candidato"])
    .rename(columns=output_names)
)

df_filtered.head()

Unnamed: 0,sg_uf,nr_zona,nm_municipio,cd_cargo,ds_cargo,nr_candidato,candidate,nm_urna_candidato,sg_partido,ds_composicao_coligacao,nr_turno,ds_sit_totalizacao,dt_ult_totalizacao,sg_ue,sq_candidato,sq_eleicao_divulga,value,qt_votos_validos,polling_id,rank,rate
0,AC,1,PORTO ACRE,1,Presidente,17,JAIR MESSIAS BOLSONARO,JAIR BOLSONARO,PSL,PSL / PRTB,2,Eleito,2019-02-27 18:15:58,BR,280000614517,2022802018,7150,9209,AC-PORTO ACRE-1,1,0.776414
1,AC,1,PORTO ACRE,1,Presidente,13,FERNANDO HADDAD,FERNANDO HADDAD,PT,PT / PC do B / PROS,2,Não Eleito,2019-02-27 18:15:58,BR,280000629808,2022802018,2059,9209,AC-PORTO ACRE-1,2,0.223586
2,AC,1,RIO BRANCO,1,Presidente,17,JAIR MESSIAS BOLSONARO,JAIR BOLSONARO,PSL,PSL / PRTB,2,Eleito,2019-02-27 18:15:58,BR,280000614517,2022802018,80106,96658,AC-RIO BRANCO-1,1,0.828757
3,AC,1,RIO BRANCO,1,Presidente,13,FERNANDO HADDAD,FERNANDO HADDAD,PT,PT / PC do B / PROS,2,Não Eleito,2019-02-27 18:15:58,BR,280000629808,2022802018,16552,96658,AC-RIO BRANCO-1,2,0.171243
4,AC,2,CAPIXABA,1,Presidente,17,JAIR MESSIAS BOLSONARO,JAIR BOLSONARO,PSL,PSL / PRTB,2,Eleito,2019-02-27 18:15:58,BR,280000614517,2022802018,3895,4981,AC-CAPIXABA-2,1,0.781971


In [30]:
# Write the enriched results as a gzip-compressed CSV, without the row index.
results_path = f"data_output/Brazil/{year}_{election_round}.csv.gz"
df_filtered.to_csv(results_path, compression="gzip", index=False)

In [31]:
# Source column -> output column, in the order the columns are written.
# NOTE(review): "electors" is filled from qt_votos_validos, which by its name
# looks like a valid-vote count rather than an electorate size; confirm that
# this is what consumers of the location file expect.
location_columns = {
    "sg_uf": "region_id",
    "nr_zona": "polling_place",
    "nm_municipio": "commune",
    "qt_votos_validos": "electors",
    "polling_id": "polling_id",
}

# One row per distinct combination of the selected columns.
df_location = (
    df[list(location_columns)]
    .drop_duplicates()
    .rename(columns=location_columns)
)

location_path = f"data_output/Brazil/{year}_{election_round}_location.csv.gz"
df_location.to_csv(location_path, compression="gzip", index=False)