In [1]:
import numpy as np
import os
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

from comchoice.preprocessing.to_pairwise import to_pairwise
from comchoice.aggregate import borda, divisiveness, win_rate
from glob import glob



In [2]:
# Election under analysis; both values are interpolated into the data_output/ paths below.
country = "Romania"
year = 2009

# Column of the location table used as the aggregation level.
location_level = "county_name"

# Candidates are kept only if their share of the total first-round vote
# is strictly greater than this threshold.
RATE_THRESHOLD = 0.02

In [3]:
# Load the first-round results and lower-case every column name.
df = pd.read_csv(
    f"data_output/{country}/{year}_first_round.csv.gzip",
    compression="gzip",
).rename(columns=str.lower)
df.head()

Unnamed: 0,polling_id,candidate,value,rank,rate
0,261_1_1,Traian Băsescu,328,1,0.358862
1,261_1_1,Crin Antonescu,282,2,0.308534
2,261_1_1,Mircea Geoană,230,3,0.251641
3,261_1_1,Corneliu Vadim Tudor,32,4,0.035011
4,261_1_1,Sorin Oprescu,17,5,0.0186


In [4]:
# Total first-round votes per candidate across all polling places.
dd = df.groupby("candidate").agg({"value": "sum"})

# Share of the overall vote. Dividing the column directly yields a Series no
# matter how many columns `dd` has, whereas a frame-wide `apply` yields a
# DataFrame that can only be assigned to one column when `dd` has exactly one.
dd["rate"] = dd["value"] / dd["value"].sum()

# Names of the candidates whose overall share exceeds RATE_THRESHOLD.
# The groupby index is already unique, so no de-duplication is needed.
values = dd.loc[dd["rate"] > RATE_THRESHOLD].index.tolist()

In [5]:
# Load the runoff results and lower-case every column name.
df_runoff = pd.read_csv(
    f"data_output/{country}/{year}_runoff.csv.gzip",
    compression="gzip",
).rename(columns=str.lower)
df_runoff.head()

Unnamed: 0,polling_id,candidate,value,rank,rate
0,261_1_1,Traian Băsescu,493,1,0.536453
1,261_1_1,Mircea Geoană,426,2,0.463547
2,261_1_2,Traian Băsescu,412,1,0.575419
3,261_1_2,Mircea Geoană,304,2,0.424581
4,261_1_3,Traian Băsescu,491,1,0.554176


In [6]:
# Restrict both rounds to the candidates listed in `values`.
df = df.loc[lambda frame: frame["candidate"].isin(values)]
df_runoff = df_runoff.loc[lambda frame: frame["candidate"].isin(values)]

In [7]:
# Load the polling-place location table. Unlike the result tables, its column
# names are left exactly as stored in the file.
df_location = pd.read_csv(
    f"data_output/{country}/{year}_first_round_location.csv.gzip",
    compression="gzip",
)
df_location.head()

Unnamed: 0,location_id,location_id.1,Id,county_id,polling_place,address,location_id.2,county_id.1,county_name,election_id,polling_id
0,87312,ALBAC,87312,1,193,"Caminul cultural Albac, strada Closca, nr. 9, ...",87312,1,Alba,261,261_1_193
1,87312,ALBAC,87312,1,194,"Scoala cu clasele I - IV Cionesti, sat. Ciones...",87312,1,Alba,261,261_1_194
2,87312,ALBAC,87312,1,195,"Scoala cu clasele I - IV Costesti, sat. Costes...",87312,1,Alba,261,261_1_195
3,87312,ALBAC,87312,1,196,"Scoala cu clasele I - IV Rogoz, sat. Rogoz, Sa...",87312,1,Alba,261,261_1_196
4,87312,ALBAC,87312,1,197,"Scoala cu clasele I - IV Potionci, sat. Potion...",87312,1,Alba,261,261_1_197


In [8]:
def _vote_share_by_location(votes, locations, location_col):
    """Sum votes per (location, candidate) and add the within-location share.

    Parameters
    ----------
    votes : pandas.DataFrame
        Must contain the columns ``polling_id``, ``candidate`` and ``value``.
    locations : pandas.DataFrame
        Must contain ``polling_id`` and the column named by ``location_col``.
    location_col : str
        Name of the location column to aggregate on.

    Returns
    -------
    pandas.DataFrame
        Columns ``[location_col, "candidate", "value", "rate"]`` where
        ``rate`` is ``value`` divided by the total ``value`` of the location.
    """
    # De-duplicate the lookup so a polling place that appears more than once
    # in the location table does not multiply its votes in the merge.
    lookup = locations[["polling_id", location_col]].drop_duplicates()
    merged = pd.merge(votes, lookup, on="polling_id")

    totals = merged.groupby([location_col, "candidate"]).agg({"value": "sum"})
    # `transform` always returns a result aligned with the original index;
    # `groupby.apply` does not guarantee that across pandas versions.
    location_total = totals.groupby(level=0)["value"].transform("sum")
    totals["rate"] = totals["value"] / location_total
    return totals.reset_index()


# Per-location vote shares for the first round (df1) and the runoff (df2).
df1 = _vote_share_by_location(df, df_location, location_level)
df2 = _vote_share_by_location(df_runoff, df_location, location_level)

# Inner join keeps only (location, candidate) pairs present in both rounds.
# Default suffixes apply: *_x comes from the first round, *_y from the runoff.
df_rounds = pd.merge(df1, df2, on=[location_level, "candidate"])
df_rounds["diff"] = df_rounds["rate_y"] - df_rounds["rate_x"]

In [9]:
# Pairwise-comparison table, cached on disk: reuse the file when it exists,
# otherwise build it from `df` and save it.
# NOTE(review): the file name encodes only country and year, while `df` has
# already been filtered by candidate; a file written under a different
# RATE_THRESHOLD would be reused silently. Confirm whether that is intended.
path = f"data_output/{country}/{year}_pairwise.csv.gzip"

if os.path.isfile(path):
    df_pwc = pd.read_csv(path, compression="gzip")
else:
    df_pwc = to_pairwise(
        df,
        alternative="candidate",
        voter="polling_id",
        verbose=True,
    )
    df_pwc.to_csv(path, compression="gzip", index=False)

In [10]:
# Disabled alternative: measures spread as the standard deviation of `rank`
# per (location, candidate) and names the candidate column `alternative`.
# data = pd.merge(df_location[[location_level, "polling_id"]].drop_duplicates(), df.copy(), on="polling_id").copy()

# df_dv = data.groupby([location_level, "candidate"]).agg({"rank": "std"}).reset_index()\
#     .rename(columns={"rank": "value", "candidate": "alternative"})

In [11]:
# Attach the location column to every first-round row. The lookup is
# de-duplicated first so each polling place is matched once per distinct
# (location, polling_id) pair.
data = (
    df_location[[location_level, "polling_id"]]
    .drop_duplicates()
    .merge(df, on="polling_id")
)

# Standard deviation of the `rate` column within each (location, candidate)
# group, stored under the column name `value`.
df_dv = (
    data.groupby([location_level, "candidate"])["rate"]
    .std()
    .rename("value")
    .reset_index()
)

# Save the result; the file is overwritten on every run.
path = f"data_output/{country}/{year}_divisiveness_{location_level}.csv.gzip"
df_dv.to_csv(path, compression="gzip", index=False)

In [12]:
# Disabled plot: joins `df_rounds` with `df_dv` and draws a scatter of `value`
# (x, labelled "Divisiveness") against `diff` (y), coloured by candidate and
# restricted to rows with 0 < value < 1.
# df_test = pd.merge(df_rounds, df_dv, on=["candidate", location_level])
# df_test.head()

# sns.set(font_scale=2)
# plt.figure(figsize=(7, 7))

# g = sns.scatterplot(x="value", y="diff", hue="candidate", s=70,
#                data=df_test[(df_test["value"] > 0) & (df_test["value"] < 1)])
# g.set_xlabel("Divisiveness")
# # g.set_xlabel("Votes (%) First Round")
# g.set_ylabel("Growth Rate First Round and Runoff")

# # g.set_axis_labels("Divisiveness", "Growth Rate First Round and Runoff")
# plt.legend(title="", loc='upper center', bbox_to_anchor=(0.5, -0.15),
#           fancybox=False, shadow=False, frameon=False, ncol=1)

# # plt.xscale('log')
# # plt.yscale('log')

In [13]:

# Disabled alternative: computes divisiveness per location by calling
# `divisiveness(..., method=win_rate)` on the pairwise table, with an on-disk
# cache.
# NOTE(review): the cache path is the same one the std-of-rate cell writes to,
# so re-enabling this as-is would load that file instead of recomputing.
# path = f"data_output/{country}/{year}_divisiveness_{location_level}.csv.gzip"

# if not os.path.isfile(path):

#     output = []

#     data = pd.merge(df_location[[location_level, "polling_id"]].drop_duplicates(), df_pwc.copy(), on="polling_id").copy()
#     for i, tmp in data.groupby(location_level):
#         dv = divisiveness(
#             tmp,
#             method=win_rate,
#             voter="polling_id",
#             method_kws=dict(voter="polling_id")
#         )
#         dv[location_level] = i
#         output.append(dv)

#         print(i)


#     df_dv = pd.concat(output, ignore_index=True)
#     df_dv.to_csv(path, compression="gzip", index=False)
    
# else:
#     df_dv = pd.read_csv(path, compression="gzip")

# df_dv.head()