In [3]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import seaborn as sns

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None



In [4]:
# Election to process. `year` selects the input file and its layout;
# `election_round` is compared against "runoff" / "first_round" further down
# and is also part of the input/output file names.
year, election_round = 2007, "runoff"

In [5]:
# Read the raw results for `year` / `election_round` into `df` and attach a
# `polling_id` key to every row.
#
# Layouts handled:
#   * 2022        - Excel workbook, candidates spread over columns (wide).
#   * 2017        - ';'-separated text, candidates spread over columns (wide);
#                   also builds `df_filtered` here.
#   * 2012        - ';'-separated text, one row per polling station and
#                   candidate (long), filtered to the requested round.
#   * 2002 / 2007 - same long layout, header row located via `header=16`.
#
# Any other year raises ValueError instead of silently leaving `df` undefined
# (or stale from a previous run).
if year == 2022:
    df = pd.read_excel(f"data/France/{year}_{election_round}.xlsx")
    df["polling_id"] = (
        df["Code du département"].astype(str)
        + "-" + df["Code de la commune"].astype(str)
        + "-" + df["Code du b.vote"].astype(str)
    )

elif year == 2017:
    df = pd.read_csv(f"data/France/{year}_{election_round}.txt", delimiter=";", encoding="latin-1")
    # Move whatever pandas parsed as the index back into regular columns and
    # name all columns positionally (level_0, level_1, ...).
    # NOTE(review): presumably needed because data rows carry more fields than
    # the header names - confirm against the raw file.
    df = df.reset_index()
    df.columns = [f"level_{i}" for i in range(len(df.columns))]

    # Fix: the separator before the polling-place code used to be concatenated
    # twice ("-" + "-"), producing ids with a double dash unlike every other year.
    df["polling_id"] = (
        df["level_0"].astype(str)
        + "-" + df["level_4"].astype(str)
        + "-" + df["level_22"].astype(str)
    )

    # Positional vote columns -> candidate names.
    if election_round == "runoff":
        candidates = {
            "level_25": "Emmanuel MACRON",
            "level_32": "Marine LE PEN"
        }
    else:
        candidates = {
            "level_25": "Nicolas DUPONT-AIGNAN",
            "level_32": "Marine LE PEN",
            "level_39": "Emmanuel MACRON",
            "level_46": "Benoît HAMON",
            "level_53": "Nathalie ARTHAUD",
            "level_60": "Philippe POUTOU",
            "level_67": "Jacques CHEMINADE",
            "level_74": "Jean LASSALLE",
            "level_81": "Jean-Luc MÉLENCHON",
            "level_88": "François ASSELINEAU",
            "level_95": "François FILLON"
        }

    df = df.rename(columns=candidates)
    df_filtered = df[["polling_id"] + list(candidates.values())]

elif year == 2012:
    df = pd.read_csv(f"data/France/{year}.txt", delimiter=";", encoding="latin-1")
    # Anything other than "first_round" is treated as round 2.
    rnd = 1 if election_round == "first_round" else 2
    df = df.reset_index()
    df.columns = [f"level_{i}" for i in range(len(df.columns))]
    # level_1 holds the round number; keep only the requested round.
    df = df[df["level_1"] == rnd].copy()

    df["polling_id"] = (
        df["level_2"].astype(str)
        + "-" + df["level_3"].astype(str)
        + "-" + df["level_6"].astype(str)
        + "-" + df["level_7"].astype(str)
    )

    # "<first name> <LAST NAME>", matching the naming used for other years.
    df["Candidate"] = df["level_13"] + " " + df["level_12"]
    df = df.rename(columns={"level_15": "Votes"})

elif year in [2002, 2007]:
    df = pd.read_csv(f"data/France/{year}.txt", delimiter=";", header=16, encoding="latin-1")
    # Anything other than "first_round" is treated as round 2.
    rnd = 1 if election_round == "first_round" else 2
    df = df.reset_index()
    df.columns = [f"level_{i}" for i in range(len(df.columns))]
    # level_0 holds the round number; keep only the requested round.
    df = df[df["level_0"] == rnd].copy()
    # Left-pad the department code to two characters (e.g. "1" -> "01").
    df["level_1"] = df["level_1"].astype(str).str.zfill(2)

    # NOTE(review): the saved 2007 output shows ids such as "ZM-517-73.0" next
    # to "01-1-0001", so level_4 appears to be parsed with mixed types;
    # consider forcing a string dtype for that column when reading.
    df["polling_id"] = (
        df["level_1"].astype(str)
        + "-" + df["level_2"].astype(str)
        + "-" + df["level_4"].astype(str)
    )

    df["Candidate"] = df["level_10"] + " " + df["level_9"]
    df = df.rename(columns={"level_12": "Votes"})

else:
    raise ValueError(f"Unsupported year: {year!r} (expected 2002, 2007, 2012, 2017 or 2022)")

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,level_10,level_11,Votes,polling_id,Candidate
787404,2,1,1,L'Abergement-Clémenciat,1,596,534,512,8,ROYAL,Ségolène,ROYA,197,01-1-0001,Ségolène ROYAL
787405,2,1,1,L'Abergement-Clémenciat,1,596,534,512,12,SARKOZY,Nicolas,SARK,315,01-1-0001,Nicolas SARKOZY
787406,2,1,2,L'Abergement-de-Varey,1,205,183,178,8,ROYAL,Ségolène,ROYA,76,01-2-0001,Ségolène ROYAL
787407,2,1,2,L'Abergement-de-Varey,1,205,183,178,12,SARKOZY,Nicolas,SARK,102,01-2-0001,Nicolas SARKOZY
787408,2,1,4,Ambérieu-en-Bugey,1,1077,853,810,8,ROYAL,Ségolène,ROYA,394,01-4-0001,Ségolène ROYAL


In [4]:
# 2022 only: give the positional vote columns their candidate names, then
# keep a wide table made of `polling_id` plus one votes column per candidate.
if year == 2022:
    if election_round == "runoff":
        vote_columns = {
            "Voix": "Emmanuel MACRON",
            "Unnamed: 32": "Marine LE PEN",
        }
    else:
        vote_columns = {
            "Voix": "Nathalie ARTHAUD",
            "Unnamed: 32": "Fabien ROUSSEL",
            "Unnamed: 39": "Emmanuel MACRON",
            "Unnamed: 46": "Jean LASSALLE",
            "Unnamed: 53": "Marine LE PEN",
            "Unnamed: 60": "Éric ZEMMOUR",
            "Unnamed: 67": "Jean-Luc MÉLENCHON",
            "Unnamed: 74": "Anne HIDALGO",
            "Unnamed: 81": "Yannick JADOT",
            "Unnamed: 88": "Valérie PÉCRESSE",
            "Unnamed: 95": "Philippe POUTOU",
            "Unnamed: 102": "Nicolas DUPONT-AIGNAN",
        }

    df = df.rename(columns=vote_columns)
    # Column order: polling_id first, then candidates in mapping order.
    df_filtered = df[["polling_id", *vote_columns.values()]]

In [9]:
# Number of distinct values in `level_1` (the department code in the
# 2002/2007 layout); missing values, if any, count as one value.
df["level_1"].nunique(dropna=False)

101

In [6]:
# Bring every year to the same long layout: polling_id, Candidate, Votes.
if year in [2002, 2007, 2012]:
    # These files already hold one row per polling station and candidate.
    df_filtered = df.loc[:, ["polling_id", "Candidate", "Votes"]].copy()
elif year in [2017, 2022]:
    # Wide tables: unpivot the per-candidate columns into rows.
    df_filtered = pd.melt(
        df_filtered,
        id_vars=["polling_id"],
        var_name="Candidate",
        value_name="Votes",
    )

df_filtered.head()

Unnamed: 0,polling_id,Candidate,Votes
787404,01-1-0001,Ségolène ROYAL,197
787405,01-1-0001,Nicolas SARKOZY,315
787406,01-2-0001,Ségolène ROYAL,76
787407,01-2-0001,Nicolas SARKOZY,102
787408,01-4-0001,Ségolène ROYAL,394


In [10]:
# Rank candidates inside each polling station by votes (1 = most votes);
# with method="min", tied candidates share the lowest rank of the tie.
votes_by_station = df_filtered.groupby("polling_id")["Votes"]
df_filtered["rank"] = votes_by_station.rank(method="min", ascending=False).astype(int)

In [11]:
# Inspect the table ordered by rank (display only; nothing is reassigned).
df_filtered.sort_values(by="rank")

Unnamed: 0,polling_id,Candidate,Votes,rank
831231,35-49-0003,Nicolas SARKOZY,543,1
841715,44-84-0004,Nicolas SARKOZY,527,1
841713,44-84-0003,Nicolas SARKOZY,431,1
891637,77-96-0001,Nicolas SARKOZY,443,1
841710,44-84-0002,Ségolène ROYAL,462,1
...,...,...,...,...
845828,47-219-0001,Ségolène ROYAL,46,2
845826,47-218-0001,Ségolène ROYAL,57,2
845824,47-217-0001,Ségolène ROYAL,301,2
845841,47-224-0001,Nicolas SARKOZY,59,2


In [12]:
# Vote share ("rate") of each candidate within its polling station.
#
# Votes are first summed per (polling_id, Candidate); each sum is then divided
# by the polling station's total. `transform("sum")` returns the station total
# aligned row-by-row with `tt`, which avoids assigning the result of
# `groupby(...).apply(...)` to a column - that relied on apply returning a
# like-indexed frame, which is not the case on pandas >= 2.0 (group keys are
# prepended to the index).
# A station whose total is 0 yields NaN rates.
tt = (
    df_filtered
    .groupby(["polling_id", "Candidate"], as_index=False)
    .agg({"Votes": "sum"})
)
tt["rate"] = tt["Votes"] / tt.groupby("polling_id")["Votes"].transform("sum")
tt = tt[["polling_id", "Candidate", "rate"]]

In [13]:
# Attach the per-station vote share to every row, then switch to the output
# column names ("value" for the vote count, lower-case "candidate").
df_filtered = (
    df_filtered
    .merge(tt, on=["polling_id", "Candidate"])
    .rename(columns={"Votes": "value", "Candidate": "candidate"})
)

In [22]:
# Persist the long-format results as a gzip-compressed CSV (index omitted).
results_path = f"data_output/France/{year}_{election_round}.csv.gz"
df_filtered.to_csv(results_path, index=False, compression="gzip")

In [11]:
# Distinct candidate names present in the loaded data (long-format years only,
# where the `Candidate` column exists on `df`).
df["Candidate"].unique()

array(['JACQUES CHIRAC', 'JEAN-MARIE LE PEN'], dtype=object)

In [17]:
# Display the final table (columns: polling_id, candidate, value, rank, rate);
# pandas truncates the rendering to the first and last rows.
df_filtered

Unnamed: 0,polling_id,candidate,value,rank,rate
0,01-1-0001,Ségolène ROYAL,197,2,0.384766
1,01-1-0001,Nicolas SARKOZY,315,1,0.615234
2,01-2-0001,Ségolène ROYAL,76,2,0.426966
3,01-2-0001,Nicolas SARKOZY,102,1,0.573034
4,01-4-0001,Ségolène ROYAL,394,2,0.486420
...,...,...,...,...,...
131229,ZM-517-73.0,Nicolas SARKOZY,143,2,0.488055
131230,ZM-517-74.0,Ségolène ROYAL,142,1,0.614719
131231,ZM-517-74.0,Nicolas SARKOZY,89,2,0.385281
131232,ZM-517-99,Ségolène ROYAL,55,1,0.625000


In [23]:
# Join the results with the location table on the polling-station key.
# NOTE(review): `df_location` is only built in a later cell of this notebook,
# so this cell fails on a fresh kernel run from top to bottom.
asd = df_filtered.merge(df_location, on="polling_id")

In [24]:
# Number of distinct departments that survive the join with the location
# table; missing values, if any, count as one value.
asd["department_id"].nunique(dropna=False)

101

In [15]:
# Build `df_location`: one descriptive record per polling station, with the
# year-specific source columns renamed to the output names. Each mapping below
# is "source column -> output column"; the selection keeps the mapping's order.
if year == 2017:
    location_names = {
        "level_0": "department_id",
        "level_1": "department",
        "level_2": "circunscription_id",
        "level_3": "circunscription",
        "level_4": "commune_id",
        "level_5": "commune",
        "level_22": "polling_place",
        "level_7": "electors",
        "level_8": "abstentions",
    }
    # polling_id comes first for this year.
    df_location = df[["polling_id", *location_names]].rename(columns=location_names)

elif year == 2022:
    location_names = {
        "Code du département": "department_id",
        "Libellé du département": "department",
        "Code de la circonscription": "circunscription_id",
        "Libellé de la circonscription": "circunscription",
        "Code de la commune": "commune_id",
        "Libellé de la commune": "commune",
        "Code du b.vote": "polling_place",
        "Inscrits": "electors",
        "Abstentions": "abstentions",
    }
    df_location = df[[*location_names, "polling_id"]].rename(columns=location_names)

elif year == 2012:
    location_names = {
        "level_2": "department_id",
        "level_3": "commune_id",
        "level_4": "commune",
        "level_8": "electors",
    }
    # Long layout: the same station appears once per candidate, so deduplicate.
    df_location = (
        df[[*location_names, "polling_id"]]
        .drop_duplicates()
        .rename(columns=location_names)
    )

elif year in [2002, 2007]:
    location_names = {
        "level_1": "department_id",
        "level_2": "commune_id",
        "level_3": "commune",
        "level_4": "polling_station",
        "level_5": "inscrits",
        "level_6": "voters",
    }
    # Long layout: the same station appears once per candidate, so deduplicate.
    df_location = (
        df[[*location_names, "polling_id"]]
        .drop_duplicates()
        .rename(columns=location_names)
    )

# Replace commune_id with the department code (padded to 2 characters)
# followed by the commune code (padded to 3 characters).
department_code = df_location["department_id"].astype(str).str.zfill(2)
commune_code = df_location["commune_id"].astype(str).str.zfill(3)
df_location["commune_id"] = department_code + commune_code

location_path = f"data_output/France/{year}_{election_round}_location.csv.gz"
df_location.to_csv(location_path, index=False, compression="gzip")