In [1]:
from helpers import create_polling_id
import os
import pandas as pd

# Step up one directory so the relative "data/..." and "data_output/..." paths
# used by the cells below resolve.
# Guarded on the "data" folder being visible: an unconditional chdir would climb
# one more level every time this cell is re-run in the same kernel.
# NOTE(review): assumes the notebook's own folder has no "data" subfolder - confirm.
if not os.path.isdir("data"):
    os.chdir("../")

In [2]:
# Election year: selects the input workbook and prefixes the output file names.
year = 2023

# header=6 is zero-based, i.e. the column names are read from the seventh row of the sheet.
source_path = f"data/Chile/{year}_PlebiscitoConstitucional_DatosPlebiscito.xlsx"
df = pd.read_excel(source_path, header=6)
df.head()

Unnamed: 0,Nro Región,Región,Circunscripción senatorial,Distrito,Comuna,Circunscripción electoral,Local,Mesa,Nro.Voto,Opciones,Votos,Selección
0,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,1.0,A FAVOR,154.0,
1,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,2.0,EN CONTRA,163.0,EN CONTRA
2,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,,VOTOS EN BLANCO,4.0,
3,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,,VOTOS NULOS,14.0,
4,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,66,1.0,A FAVOR,142.0,


In [3]:
# Spanish source headers -> English column names used by the rest of the notebook.
column_names = {
    "Nro Región": "region_id",
    "Región": "region",
    "Circunscripción senatorial": "senatorial_constituency",
    "Distrito": "district",
    "Comuna": "commune",
    "Circunscripción electoral": "electoral_district",
    "Local": "polling_place",
    "Mesa": "polling_station",
    "Nro.Voto": "candidate_order",
    "Selección": "option",
    "Opciones": "candidate",
    "Votos": "value",
}
df = df.rename(columns=column_names)

# Relabel the blank and null ballot rows; every other option keeps its original label.
ballot_labels = {
    "VOTOS EN BLANCO": "BLANK",
    "VOTOS NULOS": "SPOILT",
}
df["candidate"] = df["candidate"].replace(ballot_labels)

# Columns handed to the project helper that builds the polling-station identifier.
# NOTE(review): the displayed output suggests the helper joins the values with "-";
# confirm against helpers.create_polling_id.
id_columns = [
    "region_id",
    "senatorial_constituency",
    "district",
    "commune",
    "electoral_district",
    "polling_place",
    "polling_station",
]
df["polling_id"] = create_polling_id(df, columns=id_columns)

df.head()

Unnamed: 0,region_id,region,senatorial_constituency,district,commune,electoral_district,polling_place,polling_station,candidate_order,candidate,value,option,polling_id
0,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,1.0,A FAVOR,154.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
1,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,2.0,EN CONTRA,163.0,EN CONTRA,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
2,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,,BLANK,4.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
3,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,,SPOILT,14.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
4,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,66,1.0,A FAVOR,142.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...


Spoilt and blank votes

In [4]:
# Rows for blank and spoilt ballots, reduced to the columns kept in the output.
is_invalid_ballot = df["candidate"].isin(["SPOILT", "BLANK"])
df_spoilt = df.loc[is_invalid_ballot, ["candidate", "value", "polling_id"]].copy()

# flag_candidates = 0 marks these rows as not being an actual ballot option.
df_spoilt["flag_candidates"] = 0
df_spoilt.head()

Unnamed: 0,candidate,value,polling_id,flag_candidates
2,BLANK,4.0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,0
3,SPOILT,14.0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,0
6,BLANK,4.0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,0
7,SPOILT,11.0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,0
10,BLANK,2.0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,0


In [5]:
# Keep only the rows for actual ballot options; blank and spoilt ballots are
# collected separately in `df_spoilt`.
df_options = df[~df["candidate"].isin(["BLANK", "SPOILT"])]

# Total votes per option within each polling station.
df_filtered = (
    df_options
    .groupby(["polling_id", "candidate"], as_index=False)
    .agg({"value": "sum"})
)

# Share of each option among the option votes of its polling station.
# A vectorized transform replaces a per-group Python lambda; a station with
# zero option votes still yields NaN (0 / 0), exactly as before.
station_totals = df_filtered.groupby("polling_id")["value"].transform("sum")
df_filtered["rate"] = df_filtered["value"] / station_totals

# Position of each option within its station (1 = most votes); ties share the
# lowest rank because of method="min".
df_filtered["rank"] = (
    df_filtered.groupby("polling_id")["value"]
    .rank(method="min", ascending=False)
    .astype(int)
)

# flag_candidates = 1 marks these rows as actual ballot options.
df_filtered["flag_candidates"] = 1
df_filtered.head()

Unnamed: 0,polling_id,candidate,value,rate,rank,flag_candidates
0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,A FAVOR,135.0,0.421875,2,1
1,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,EN CONTRA,185.0,0.578125,1,1
2,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,A FAVOR,141.0,0.454839,2,1
3,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,EN CONTRA,169.0,0.545161,1,1
4,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,A FAVOR,133.0,0.423567,2,1


In [6]:
# Stack the option rows on top of the blank/spoilt rows.
# df_spoilt has no "rate" or "rank" columns, so those are NaN for its rows,
# which is why "rank" is displayed as a float below.
frames = [df_filtered, df_spoilt]
data = pd.concat(frames)

data.head()

Unnamed: 0,polling_id,candidate,value,rate,rank,flag_candidates
0,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,A FAVOR,135.0,0.421875,2.0,1
1,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,EN CONTRA,185.0,0.578125,1.0,1
2,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,A FAVOR,141.0,0.454839,2.0,1
3,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,EN CONTRA,169.0,0.545161,1.0,1
4,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...,A FAVOR,133.0,0.423567,2.0,1


In [7]:
# Write the combined results as a gzip-compressed CSV, without the index column.
results_path = f"data_output/Chile/{year}_plebiscite_polling_station.csv.gz"
data.to_csv(results_path, compression="gzip", index=False)

In [8]:
# Look at the renamed frame again before deriving the location table from it.
df.head(n=5)

Unnamed: 0,region_id,region,senatorial_constituency,district,commune,electoral_district,polling_place,polling_station,candidate_order,candidate,value,option,polling_id
0,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,1.0,A FAVOR,154.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
1,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,2.0,EN CONTRA,163.0,EN CONTRA,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
2,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,,BLANK,4.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
3,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,67,,SPOILT,14.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...
4,1,DE TARAPACA,CIRCUNSCRIPCION SENATORIAL 2,DISTRITO 2,ALTO HOSPICIO,ALTO HOSPICIO,ANEXO DE COLEGIO SAN ANTONIO DE MATILLA,66,1.0,A FAVOR,142.0,,1-CIRCUNSCRIPCION SENATORIAL 2-DISTRITO 2-ALTO...


In [9]:
# One row per distinct combination of polling_id and its location attributes.
location_columns = [
    "polling_id",
    "region_id",
    "region",
    "senatorial_constituency",
    "district",
    "commune",
    "electoral_district",
    "polling_place",
    "polling_station",
]
df_location = df[location_columns].drop_duplicates()

# Write the lookup table as a gzip-compressed CSV, without the index column.
location_path = f"data_output/Chile/{year}_plebiscite_polling_station_location.csv.gz"
df_location.to_csv(location_path, compression="gzip", index=False)