In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import seaborn as sns

# Lift pandas' column display limit so wide frames (the raw results sheet has
# several dozen columns) are shown in full instead of being elided.
pd.set_option("display.max_columns", None)

In [2]:
# Run parameters: together they select the input spreadsheet
# (data/France/{year}_{election_round}.xlsx) and name the exported files.
year = 2022
# "runoff" selects the two-candidate column layout further down; any other
# value falls through to the twelve-candidate layout.
election_round = "runoff"

In [3]:
# Load the raw results workbook for the configured year / round.
source_path = f"data/France/{year}_{election_round}.xlsx"
df = pd.read_excel(source_path)

# Build a textual key per polling station: "<commune name>-<station code>".
# NOTE(review): commune names may repeat across departments, in which case this
# key is not unique and later group-bys would pool unrelated stations — TODO confirm.
commune_name = df["Libellé de la commune"].astype(str)
station_code = df["Code du b.vote"].astype(str)
df["polling_id"] = commune_name + "-" + station_code

df.head()

Unnamed: 0,Code du département,Libellé du département,Code de la circonscription,Libellé de la circonscription,Code de la commune,Libellé de la commune,Code du b.vote,Inscrits,Abstentions,% Abs/Ins,Votants,% Vot/Ins,Blancs,% Blancs/Ins,% Blancs/Vot,Nuls,% Nuls/Ins,% Nuls/Vot,Exprimés,% Exp/Ins,% Exp/Vot,N°Panneau,Sexe,Nom,Prénom,Voix,% Voix/Ins,% Voix/Exp,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,polling_id
0,1,Ain,4,4ème circonscription,1,L'Abergement-Clémenciat,1,643,146,22.71,497,77.29,42,6.53,8.45,5,0.78,1.01,450,69.98,90.54,1,M,MACRON,Emmanuel,237,36.86,52.67,2,F,LE PEN,Marine,213,33.13,47.33,L'Abergement-Clémenciat-0001
1,1,Ain,5,5ème circonscription,2,L'Abergement-de-Varey,1,213,45,21.13,168,78.87,23,10.8,13.69,3,1.41,1.79,142,66.67,84.52,1,M,MACRON,Emmanuel,94,44.13,66.2,2,F,LE PEN,Marine,48,22.54,33.8,L'Abergement-de-Varey-0001
2,1,Ain,5,5ème circonscription,4,Ambérieu-en-Bugey,1,1130,322,28.5,808,71.5,71,6.28,8.79,22,1.95,2.72,715,63.27,88.49,1,M,MACRON,Emmanuel,360,31.86,50.35,2,F,LE PEN,Marine,355,31.42,49.65,Ambérieu-en-Bugey-0001
3,1,Ain,5,5ème circonscription,4,Ambérieu-en-Bugey,2,1129,313,27.72,816,72.28,45,3.99,5.51,18,1.59,2.21,753,66.7,92.28,1,M,MACRON,Emmanuel,426,37.73,56.57,2,F,LE PEN,Marine,327,28.96,43.43,Ambérieu-en-Bugey-0002
4,1,Ain,5,5ème circonscription,4,Ambérieu-en-Bugey,3,1213,303,24.98,910,75.02,87,7.17,9.56,21,1.73,2.31,802,66.12,88.13,1,M,MACRON,Emmanuel,449,37.02,55.99,2,F,LE PEN,Marine,353,29.1,44.01,Ambérieu-en-Bugey-0003


In [4]:
# The sheet repeats one group of columns per candidate, but only the first
# group carries headers; the vote counts of the following groups therefore
# load as "Unnamed: N". Map each vote-count column to its candidate.
# NOTE(review): this assumes every row lists the candidates in the same order —
# TODO confirm against the source file.
if election_round == "runoff":
    candidate_columns = {
        "Voix": "Emmanuel MACRON",
        "Unnamed: 32": "Marine LE PEN",
    }
else:
    candidate_columns = {
        "Voix": "Nathalie ARTHAUD",
        "Unnamed: 32": "Fabien ROUSSEL",
        "Unnamed: 39": "Emmanuel MACRON",
        "Unnamed: 46": "Jean LASSALLE",
        "Unnamed: 53": "Marine LE PEN",
        "Unnamed: 60": "Éric ZEMMOUR",
        "Unnamed: 67": "Jean-Luc MÉLENCHON",
        "Unnamed: 74": "Anne HIDALGO",
        "Unnamed: 81": "Yannick JADOT",
        "Unnamed: 88": "Valérie PÉCRESSE",
        "Unnamed: 95": "Philippe POUTOU",
        "Unnamed: 102": "Nicolas DUPONT-AIGNAN",
    }

# Rename on the full frame, then keep only the station key plus one
# vote-count column per candidate (in the order listed above).
df = df.rename(columns=candidate_columns)
df_filtered = df[["polling_id", *candidate_columns.values()]]

In [5]:
# Reshape from one column per candidate to one row per (polling station,
# candidate) pair, with the count stored under "Votes".
# Note: this overwrites df_filtered with its own melted form, so the cell is
# meant to run exactly once after the column-selection cell.
df_filtered = pd.melt(
    df_filtered,
    id_vars=["polling_id"],
    var_name="Candidate",
    value_name="Votes",
)

df_filtered.head()

Unnamed: 0,polling_id,Candidate,Votes
0,L'Abergement-Clémenciat-0001,Emmanuel MACRON,237
1,L'Abergement-de-Varey-0001,Emmanuel MACRON,94
2,Ambérieu-en-Bugey-0001,Emmanuel MACRON,360
3,Ambérieu-en-Bugey-0002,Emmanuel MACRON,426
4,Ambérieu-en-Bugey-0003,Emmanuel MACRON,449


In [6]:
# Position of each candidate within its polling station: 1 = most votes.
# method="min" gives tied candidates the same (lowest) position.
votes_by_station = df_filtered.groupby("polling_id")["Votes"]
df_filtered["rank"] = votes_by_station.rank(method="min", ascending=False).astype(int)

In [7]:
# Vote share of each candidate within its polling station.
#
# First total the votes per (polling station, candidate), then divide by the
# station-wide total. The station total comes from groupby(...).transform("sum"),
# which always returns a result aligned row-for-row with `tt`. The previous
# groupby(level=0).apply(lambda x: x / x.sum()) depended on `apply` handing back
# a like-indexed frame; newer pandas prepends the group key to the index for
# such functions, so the result no longer aligns on assignment.
tt = (
    df_filtered
    .groupby(["polling_id", "Candidate"])
    .agg({"Votes": "sum"})
    .reset_index()
)

station_totals = tt.groupby("polling_id")["Votes"].transform("sum")
# Stations with zero recorded votes yield NaN here (0 / 0), as before.
tt["rate"] = tt["Votes"] / station_totals

# Only the keys and the share are needed for the merge that follows.
tt = tt[["polling_id", "Candidate", "rate"]]

In [8]:
# Attach the per-station vote share to every (station, candidate) row, then
# switch to the column names used in the exported file.
export_names = {"Votes": "value", "Candidate": "candidate"}
df_filtered = (
    df_filtered
    .merge(tt, on=["polling_id", "Candidate"])
    .rename(columns=export_names)
)

In [9]:
# Write the long-format results as a gzip-compressed CSV, without the row index.
results_path = f"data_output/France/{year}_{election_round}.csv.gzip"
df_filtered.to_csv(results_path, index=False, compression="gzip")

In [30]:
# Show the final long-format table: one row per (polling station, candidate)
# with its vote count, rank and vote share.
df_filtered

Unnamed: 0,polling_id,candidate,value,rank,rate
0,L'Abergement-Clémenciat-0001,Emmanuel MACRON,237,1,0.526667
1,L'Abergement-de-Varey-0001,Emmanuel MACRON,94,1,0.661972
2,Ambérieu-en-Bugey-0001,Emmanuel MACRON,360,1,0.503497
3,Ambérieu-en-Bugey-0002,Emmanuel MACRON,426,1,0.565737
4,Ambérieu-en-Bugey-0003,Emmanuel MACRON,449,1,0.559850
...,...,...,...,...,...
139359,Zurich-0001,Marine LE PEN,1272,2,0.111208
139360,Taipei-0001,Marine LE PEN,132,2,0.177658
139361,Nour-Soultan-0001,Marine LE PEN,11,2,0.220000
139362,Monterrey-0001,Marine LE PEN,21,2,0.103960


In [10]:
# Source column -> exported column for the polling-station lookup table.
# The "circunscription" spelling is kept exactly as-is because it is part of
# the exported file's header.
location_columns = {
    "Code du département": "department_id",
    "Libellé du département": "department",
    "Code de la circonscription": "circunscription_id",
    "Libellé de la circonscription": "circunscription",
    "Code de la commune": "commune_id",
    "Libellé de la commune": "commune",
    "Code du b.vote": "polling_place",
    "Inscrits": "electors",
    "Abstentions": "abstentions",
}

# One row per polling station: administrative identifiers, registered voters,
# abstentions, and the polling_id key shared with the results file.
df_location = df[[*location_columns, "polling_id"]].rename(columns=location_columns)

location_path = f"data_output/France/{year}_{election_round}_location.csv.gzip"
df_location.to_csv(location_path, index=False, compression="gzip")

In [11]:
# df_gini = pd.read_excel("https://www.insee.fr/fr/statistiques/fichier/5371235/RPM2021-F18.xlsx", sheet_name="figure 2", header=2)
# df_gini = df_gini.head(101)
# df_gini.head()

In [12]:
# measure = "Weight"

# data1 = df_1st_round.groupby(["Code du département", "Candidate"]).agg({measure: "mean"}).reset_index()

# vmax = data1[measure].max()
# vmin = data1[measure].min()

# tmp = pd.merge(gdf, data1, left_on="code", right_on="Code du département")
# matplotlib.rcParams["figure.dpi"] = 200

# tmp = tmp[tmp["Candidate"] == "Marine LE PEN"].copy()

# fig, ax = plt.subplots()

# tmp.plot(ax=ax, edgecolor="#222222", cmap="plasma", linewidth=0.3, column=measure, legend=True)
# # color=aa["color"], 

# ax.axis("off")
# fig.tight_layout()

In [13]:
# data2 = df_1st_round.groupby(["Code du département", "Candidate"]).agg({"Weight": "std"}).reset_index()
# data2 = data2.groupby(["Code du département"]).agg({"Weight": "mean"}).reset_index()

# dd = pd.merge(data2, df_gini, left_on="Code du département", right_on="Code")
# dd[["Weight", "Taux de pauvreté (en\xa0%)"]].corr()

In [14]:
# measure = "Taux de pauvreté (en\xa0%)"

# tmp = pd.merge(gdf, dd, left_on="code", right_on="Code du département")
# matplotlib.rcParams["figure.dpi"] = 200

# fig, ax = plt.subplots()

# tmp.plot(ax=ax, edgecolor="#222222", cmap="plasma", linewidth=0.3, column=measure, legend=True)
# # color=aa["color"], 

# ax.axis("off")
# fig.tight_layout()

In [15]:
# df_gini2 = pd.read_excel("/Users/cnavarreteliz/Downloads/indic-struct-distrib-revenu-2016-SUPRA/FILO2016_DISP_DEP.xls",
#                          sheet_name="ENSEMBLE", header=5)

# df_gini2 = df_gini2[["CODGEO", "GI16", "RD", "S80S2016"]]
# df_gini2

In [16]:
# df_pop = df.groupby("Code du département").agg({"Inscrits": "sum", "Abstentions": "sum"}).reset_index()
# df_pop["Abstention Rate"] = df_pop["Abstentions"] / df_pop["Inscrits"]
# df_pop

In [17]:

# df_model = pd.merge(dd, df_gini2, left_on="Code", right_on="CODGEO")
# df_model = pd.merge(df_model, df_pop, on="Code du département")
# df_model["Taux de pauvreté (en\xa0%)"] = df_model["Taux de pauvreté (en\xa0%)"] / 100
# df_model["Inscrits Log"] = np.log10(df_model["Inscrits"])
# df_model.corr()

In [18]:
# Y = df_model["Weight"]
# X = df_model[["Taux de pauvreté (en\xa0%)", "GI16", "Inscrits Log", "Abstention Rate"]]
# X = sm.add_constant(X)
# model = sm.OLS(Y,X)
# results = model.fit(cov_type="hc2")

# print(results.summary())

In [19]:
# measure = "Weight"

# data1 = df_1st_round.groupby(["Code du département", "Candidate"]).agg({measure: "std"}).reset_index()

# tmp = pd.merge(gdf, data1, left_on="code", right_on="Code du département")
# matplotlib.rcParams["figure.dpi"] = 200

# tmp = tmp[tmp["Candidate"] == "Jean-Luc MÉLENCHON"].copy()

# fig, ax = plt.subplots()

# tmp.plot(ax=ax, edgecolor="#222222", cmap="plasma", linewidth=0.3, column=measure, legend=True)
# # color=aa["color"], 

# ax.axis("off")
# fig.tight_layout()

In [20]:
# data1 = df_1st_round.groupby(["Code du département", "Candidate"]).agg({"Weight": "std"}).reset_index()

# aa = data1.groupby(["Code du département", "Candidate"]).agg({"Weight": "sum"})
# aa = aa.sort_values(["Code du département", "Weight"], ascending=[False, False])
# aa = aa.groupby(["Code du département"]).head(1).reset_index()

# matplotlib.rcParams["figure.dpi"] = 200

# # tmp = tmp[tmp["Candidate"] == "Marine LE PEN"].copy()

# tmp = pd.merge(gdf, aa, left_on="code", right_on="Code du département")
# tmp["color"] = tmp["Candidate"].replace({
#     "Nathalie ARTHAUD": "#aa0201",
#     "Fabien ROUSSEL": "#dd0200",
#     "Emmanuel MACRON": "#ffd601",
#     "Jean LASSALLE": "#adc0fd",
#     "Marine LE PEN": "#014a76",
#     "Éric ZEMMOUR": "#0b0a66",
#     "Jean-Luc MÉLENCHON": "#c9462b",
#     "Anne HIDALGO": "#ed1651",
#     "Yannick JADOT": "#79b31e",
#     "Valérie PÉCRESSE": "#0066cc",
#     "Philippe POUTOU": "#c00b1e",
#     "Nicolas DUPONT-AIGNAN": "#0587cc"
# })

# fig, ax = plt.subplots()

# tmp.plot(ax=ax, edgecolor="#222222", linewidth=0.3, color=tmp["color"], 
#          column="Candidate", legend=True,
#         legend_kwds={'loc': 'center left', "bbox_to_anchor": (1, 0.5)})



# ax.axis("off")
# fig.tight_layout()

In [21]:
# data1 = df_filtered.groupby(["Code du département", "Candidate"]).agg({"Votes": "mean"}).reset_index()

# aa = data1.groupby(["Code du département", "Candidate"]).agg({"Votes": "sum"})
# aa = aa.sort_values(["Code du département", "Votes"], ascending=[False, False])
# aa = aa.groupby(["Code du département"]).head(1).reset_index()

# matplotlib.rcParams["figure.dpi"] = 200

# # tmp = tmp[tmp["Candidate"] == "Marine LE PEN"].copy()

# tmp = pd.merge(gdf, aa, left_on="code", right_on="Code du département")
# tmp["color"] = tmp["Candidate"].replace({
#     "Nathalie ARTHAUD": "#aa0201",
#     "Fabien ROUSSEL": "#dd0200",
#     "Emmanuel MACRON": "#ffd601",
#     "Jean LASSALLE": "#adc0fd",
#     "Marine LE PEN": "#014a76",
#     "Éric ZEMMOUR": "#0b0a66",
#     "Jean-Luc MÉLENCHON": "#c9462b",
#     "Anne HIDALGO": "#ed1651",
#     "Yannick JADOT": "#79b31e",
#     "Valérie PÉCRESSE": "#0066cc",
#     "Philippe POUTOU": "#c00b1e",
#     "Nicolas DUPONT-AIGNAN": "#0587cc"
# })

# fig, ax = plt.subplots()

# tmp.plot(ax=ax, edgecolor="#222222", color=tmp["color"], linewidth=0.3, column="Candidate", legend=True)
# # color=aa["color"], 

# ax.axis("off")
# fig.tight_layout()

In [22]:
# import matplotlib.pyplot as plt  # Graphics
# import numba
# import seaborn                   # Graphics
# import geopandas                 # Spatial data manipulation
# import rioxarray                 # Surface data manipulation
# import xarray                    # Surface data manipulation

# from matplotlib import colors
# from pysal.lib import weights    # Spatial weights
# import contextily                # Background tiles



In [23]:
# from esda.moran import Moran

In [24]:
# import pysal as ps

In [25]:
# db = pd.merge(tmp, df_model, left_on="code", right_on="Code")
# db.head()

In [26]:
# feature = "Weight" #"Taux de pauvreté (en\xa0%)" # "GI16"
# w = weights.Queen.from_dataframe(db)

# # Row-standardization
# w.transform = "R"

# db[f"w_{feature}"] = weights.spatial_lag.lag_spatial(w, db[feature])

# db[f"{feature}_std"] = (db[feature] - db[feature].mean() )
# db[f'w_{feature}_std'] = weights.spatial_lag.lag_spatial(w, db[f"{feature}_std"])#( db['w_Weight'] - db['Weight'].mean() )

# fig, ax = plt.subplots()
# matplotlib.rcParams["figure.dpi"] = 200

# db.plot(ax=ax, edgecolor="#222222", cmap="plasma", linewidth=0.3, column=f"w_{feature}", legend=True)
# # color=aa["color"], 

# ax.axis("off")
# fig.tight_layout()

In [27]:
# fig, ax = plt.subplots(figsize=(7, 7))

# g = sns.regplot(x="w_GI16", y="w_Weight_x", data=db)
# matplotlib.rcParams["figure.dpi"] = 72

# g.set_ylabel("Spatial Lag Disagreements")
# g.set_yscale("log")
# g.set_xlabel("Spatial Lag GINI")

In [28]:
# # Setup the figure and axis
# f, ax = plt.subplots(1, figsize=(6, 6))
# # Plot values
# seaborn.regplot(
#     x='Weight_std', y='w_Weight_std', data=db, ci=None
# )
# # Add vertical and horizontal lines
# plt.axvline(0, c='k', alpha=0.5)
# plt.axhline(0, c='k', alpha=0.5)
# # Add text labels for each quadrant
# # plt.text(20, 5, "HH", fontsize=25, c='r')
# # plt.text(12, -11, "HL", fontsize=25, c='r')
# # plt.text(-20, 8.0, "LH", fontsize=25, c='r')
# # plt.text(-25, -11.0, "LL", fontsize=25, c='r')
# # Display
# plt.show()

In [29]:
# datax = df_filtered.groupby(["Code du département", "Candidate"]).agg({"Votes": "sum"}).reset_index()