In [13]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [17]:
# Load the municipality table and keep only what the matching step needs:
#   tratamento - True when the municipality's time zone is not America/Sao_Paulo
#   coord      - (latitude, longitude) tuple converted from degrees to radians
df = (
    pd.read_csv("municipios.csv", index_col=0)
    .assign(
        tratamento=lambda mun: mun["fuso_horario"].ne("America/Sao_Paulo"),
        coord=lambda mun: list(
            zip(np.deg2rad(mun["latitude"]), np.deg2rad(mun["longitude"]))
        ),
    )
    .filter(items=["tratamento", "coord"])
)

In [19]:
# Preview the first five municipalities (treatment flag and radian coordinates).
df.head(n=5)

Unnamed: 0_level_0,tratamento,coord
id_municipio,Unnamed: 1_level_1,Unnamed: 2_level_1
5200050,False,"(-0.2924700587444458, -0.8629117261370205)"
5200100,False,"(-0.28269097894552153, -0.8500748294886021)"
5200134,False,"(-0.30361747667693356, -0.8792078653628914)"
5200159,False,"(-0.28645565414207336, -0.8755566365677194)"
5200175,False,"(-0.2614119247052067, -0.8339584591756866)"


In [71]:
# Load the turnout table and reduce it to an abstention rate:
# abstentions divided by eligible voters. The raw count columns are dropped.
abstencao = (
    pd.read_csv("abstencao.csv", index_col=0)
    .assign(abstencao=lambda votos: votos["abstencoes"].div(votos["aptos"]))
    .drop(columns=["aptos", "abstencoes", "secoes"])
)
abstencao.head()

Unnamed: 0,id_municipio,ano,turno,abstencao
0,1600303,2002,1,0.140636
1,1600600,2002,1,0.131125
2,1600303,2002,2,0.162194
3,1600600,2002,2,0.158141
4,5200258,2002,1,0.208052


In [35]:
# Split the municipalities by treatment status (tratamento is a boolean flag).
municipios_controle = df[df["tratamento"].eq(False)]
municipios_tratamento = df[df["tratamento"].eq(True)]

In [36]:
# Distance from every treated municipality to its nearest control municipality.
# `coord` holds (latitude, longitude) in radians, the input layout the haversine
# metric works with; the fitted tree contains only the control municipalities.
coordenadas = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', metric="haversine").fit(list(municipios_controle.coord))
distancias, indices = coordenadas.kneighbors(list(municipios_tratamento.coord))
# kneighbors returns arrays shaped (n_queries, n_neighbors). Flatten the single
# neighbour column explicitly instead of relying on pandas accepting a 2-D value
# for one column. Haversine distances are angles in radians, so multiplying by
# the Earth's mean radius (6371 km) converts them to kilometres.
municipios_tratamento = municipios_tratamento.assign(distancia = distancias.ravel() * 6371)
municipios_tratamento.head()

Unnamed: 0_level_0,tratamento,coord,distancia
id_municipio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2605459,True,"(-0.06701366195957427, -0.5656734278761262)",365.791137
5100102,True,"(-0.26518532654801846, -0.9837234169600679)",412.503233
5100201,True,"(-0.24523621319772324, -0.9103654831694943)",151.365068
5100250,True,"(-0.17220709943822532, -0.9788975815783036)",312.664214
5100300,True,"(-0.30220899597057416, -0.9288310666555943)",2.209545


In [37]:
# Distance from every control municipality to its nearest treated municipality.
# `coord` holds (latitude, longitude) in radians, the input layout the haversine
# metric works with; the fitted tree contains only the treated municipalities.
coordenadas = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', metric="haversine").fit(list(municipios_tratamento.coord))
distancias, indices = coordenadas.kneighbors(list(municipios_controle.coord))
# kneighbors returns arrays shaped (n_queries, n_neighbors). Flatten the single
# neighbour column explicitly instead of relying on pandas accepting a 2-D value
# for one column. Haversine distances are angles in radians, so multiplying by
# the Earth's mean radius (6371 km) converts them to kilometres.
municipios_controle = municipios_controle.assign(distancia = distancias.ravel() * 6371)
municipios_controle.head()

Unnamed: 0_level_0,tratamento,coord,distancia
id_municipio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5200050,False,"(-0.2924700587444458, -0.8629117261370205)",279.86959
5200100,False,"(-0.28269097894552153, -0.8500748294886021)",317.668256
5200134,False,"(-0.30361747667693356, -0.8792078653628914)",239.108406
5200159,False,"(-0.28645565414207336, -0.8755566365677194)",193.795343
5200175,False,"(-0.2614119247052067, -0.8339584591756866)",352.222984


In [47]:
# Show the full municipality table again; it is the frame the sample flag is attached to next.
df.head(n=5)

Unnamed: 0_level_0,tratamento,coord
id_municipio,Unnamed: 1_level_1,Unnamed: 2_level_1
5200050,False,"(-0.2924700587444458, -0.8629117261370205)"
5200100,False,"(-0.28269097894552153, -0.8500748294886021)"
5200134,False,"(-0.30361747667693356, -0.8792078653628914)"
5200159,False,"(-0.28645565414207336, -0.8755566365677194)"
5200175,False,"(-0.2614119247052067, -0.8339584591756866)"


In [80]:
# Build the analysis base: every municipality in `df`, plus an `incluso` flag
# that is 1 when the municipality lies less than 150 km from the nearest
# municipality of the opposite group, and 0 otherwise.
#
# The flag is derived from index membership rather than from a merge on all
# shared columns. That avoids using the tuple-of-floats `coord` column as a join
# key and cannot multiply rows if a key is ever duplicated.
ids_inclusos = (
    pd.concat([municipios_controle, municipios_tratamento])
    .query("distancia < 150")
    .index
)
base = (
    df.assign(incluso=df.index.isin(ids_inclusos).astype(int))
    # Turn the municipality id into a regular first column with a fresh 0..n-1
    # index, the same layout the previous right merge produced.
    .reset_index()
)
base.to_csv("base.csv")
base.head()

Unnamed: 0,id_municipio,tratamento,coord,incluso
0,5200050,False,"(-0.2924700587444458, -0.8629117261370205)",0
1,5200100,False,"(-0.28269097894552153, -0.8500748294886021)",0
2,5200134,False,"(-0.30361747667693356, -0.8792078653628914)",0
3,5200159,False,"(-0.28645565414207336, -0.8755566365677194)",0
4,5200175,False,"(-0.2614119247052067, -0.8339584591756866)",0


In [81]:
# Attach the abstention rates to the municipality base.
# The join key is named explicitly: `id_municipio` is the only column the two
# frames share, and spelling it out keeps the join stable if either frame later
# gains another column with a matching name. Inner join: municipalities without
# abstention rows are dropped, as are abstention rows for unknown municipalities.
abst = pd.merge(base, abstencao, on="id_municipio", how="inner")
abst.to_csv("base_abstencao.csv")
abst.head()

Unnamed: 0,id_municipio,tratamento,coord,incluso,ano,turno,abstencao
0,5200050,False,"(-0.2924700587444458, -0.8629117261370205)",0,2002,1,0.136529
1,5200050,False,"(-0.2924700587444458, -0.8629117261370205)",0,2002,2,0.199327
2,5200050,False,"(-0.2924700587444458, -0.8629117261370205)",0,2006,1,0.161905
3,5200050,False,"(-0.2924700587444458, -0.8629117261370205)",0,2006,2,0.203101
4,5200050,False,"(-0.2924700587444458, -0.8629117261370205)",0,2010,1,0.16151
