In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import seaborn as sns

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None



In [2]:
# Notebook parameters: which election to process.
year = 2002  # supported: 2002, 2007, 2012, 2017, 2022
election_round = "runoff"  # "first_round" or "runoff"

# Fail fast on bad parameters. Downstream cells do not agree on how to treat an
# unknown round (some test `== "runoff"`, others `== "first_round"`), so a typo
# would silently pick a different round depending on the year.
if year not in (2002, 2007, 2012, 2017, 2022):
    raise ValueError(f"Unsupported year: {year!r}")
if election_round not in ("first_round", "runoff"):
    raise ValueError(f"Unsupported election_round: {election_round!r}")

In [3]:
# Read the raw results file for the configured `year` / `election_round`.
#
# Every branch leaves a frame `df` with a `polling_id` column:
#   * 2022: wide sheet, candidate columns are renamed in the next cell.
#   * 2017: wide file, candidate columns are renamed here and `df_filtered` is built.
#   * 2002 / 2007 / 2012: long file (one row per candidate), exposing
#     `Candidate` and `Votes` columns.
if year == 2022:
    df = pd.read_excel(f"data/France/{year}_{election_round}.xlsx")
    df["polling_id"] = df["Code du département"].astype(str) + "-" + df["Code de la commune"].astype(str) + "-" + df["Code du b.vote"].astype(str)

elif year == 2017:
    df = pd.read_csv(f"data/France/{year}_{election_round}.txt", delimiter=";", encoding="latin-1")
    # Columns are addressed by position from here on: flatten the index into
    # regular columns and give every column a positional `level_<n>` name.
    df = df.reset_index()
    df.columns = [f"level_{x}" for x in range(len(df.columns))]

    # department id - commune id - polling place (see the location cell for the
    # meaning of these positions). A single "-" separates each part, matching
    # the id format used for the other years.
    df["polling_id"] = df["level_0"].astype(str) + "-" + df["level_4"].astype(str)\
        + "-" + df["level_22"].astype(str)

    # Positional vote columns -> candidate names for the selected round.
    if election_round == "runoff":
        candidates = {
            "level_25": "Emmanuel MACRON",
            "level_32": "Marine LE PEN"
        }

    else:
        candidates = {
            "level_25": "Nicolas DUPONT-AIGNAN",
            "level_32": "Marine LE PEN",
            "level_39": "Emmanuel MACRON",
            "level_46": "Benoît HAMON",
            "level_53": "Nathalie ARTHAUD",
            "level_60": "Philippe POUTOU",
            "level_67": "Jacques CHEMINADE",
            "level_74": "Jean LASSALLE",
            "level_81": "Jean-Luc MÉLENCHON",
            "level_88": "François ASSELINEAU",
            "level_95": "François FILLON"
        }

    df = df.rename(columns=candidates)
    df_filtered = df[["polling_id"] + list(candidates.values())]

elif year == 2012:
    df = pd.read_csv(f"data/France/{year}.txt", delimiter=";", encoding="latin-1")
    rnd = 1 if election_round == "first_round" else 2
    df = df.reset_index()
    df.columns = [f"level_{x}" for x in range(len(df.columns))]
    # The file holds both rounds; `level_1` is the round number.
    df = df[df["level_1"] == rnd].copy()

    df["polling_id"] = df["level_2"].astype(str) + "-" + df["level_3"].astype(str)\
        + "-" + df["level_6"].astype(str) + "-" + df["level_7"].astype(str)

    df["Candidate"] = df["level_13"] + " " + df["level_12"]
    df = df.rename(columns={"level_15": "Votes"})

elif year in [2002, 2007]:
    # header=16: the column header sits on row 16 of the file (earlier rows are
    # presumably a preamble -- confirm against the raw file).
    df = pd.read_csv(f"data/France/{year}.txt", delimiter=";", header=16, encoding="latin-1")
    rnd = 1 if election_round == "first_round" else 2
    df = df.reset_index()
    df.columns = [f"level_{x}" for x in range(len(df.columns))]
    # The file holds both rounds; `level_0` is the round number.
    df = df[df["level_0"] == rnd].copy()
    # Zero-pad the department code to two characters ("1" -> "01").
    df["level_1"] = df["level_1"].astype(str).str.zfill(2)

    # NOTE(review): the displayed output mixes station codes such as "1" and
    # "0001" in polling_id; `level_4` may be parsed with mixed types. Consider
    # reading it as str -- confirm before changing.
    df["polling_id"] = df["level_1"].astype(str) + "-" + df["level_2"].astype(str)\
        + "-" + df["level_4"].astype(str)

    # First name followed by surname, e.g. "JACQUES CHIRAC".
    df["Candidate"] = df["level_10"] + " " + df["level_9"]
    df = df.rename(columns={"level_12": "Votes"})

else:
    # Without this, an unsupported year only surfaces as a NameError on `df`.
    raise ValueError(f"Unsupported year: {year!r} (expected 2002, 2007, 2012, 2017 or 2022)")

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,level_9,level_10,level_11,Votes,polling_id,Candidate
1026256,2,1,1,L'Abergement-Clémenciat,1,563,467,452,5,CHIRAC,JACQUES,CHIR,349,01-1-1,JACQUES CHIRAC
1026257,2,1,1,L'Abergement-Clémenciat,1,563,467,452,6,LE PEN,JEAN-MARIE,LEPE,103,01-1-1,JEAN-MARIE LE PEN
1026258,2,1,2,L'Abergement-de-Varey,1,196,160,143,5,CHIRAC,JACQUES,CHIR,95,01-2-1,JACQUES CHIRAC
1026259,2,1,2,L'Abergement-de-Varey,1,196,160,143,6,LE PEN,JEAN-MARIE,LEPE,48,01-2-1,JEAN-MARIE LE PEN
1026260,2,1,4,Ambérieu-en-Bugey,1,1388,1069,1003,5,CHIRAC,JACQUES,CHIR,813,01-4-1,JACQUES CHIRAC


In [4]:
# 2022 only: give the per-candidate vote columns readable names and keep just
# the polling id plus those columns (wide layout, one column per candidate).
# Presumably only the first candidate's vote column is labelled "Voix" in the
# sheet and the others come through as "Unnamed: N" -- confirm against the file.
if year == 2022:
    if election_round == "runoff":
        vote_columns = {
            "Voix": "Emmanuel MACRON",
            "Unnamed: 32": "Marine LE PEN",
        }
    else:
        vote_columns = {
            "Voix": "Nathalie ARTHAUD",
            "Unnamed: 32": "Fabien ROUSSEL",
            "Unnamed: 39": "Emmanuel MACRON",
            "Unnamed: 46": "Jean LASSALLE",
            "Unnamed: 53": "Marine LE PEN",
            "Unnamed: 60": "Éric ZEMMOUR",
            "Unnamed: 67": "Jean-Luc MÉLENCHON",
            "Unnamed: 74": "Anne HIDALGO",
            "Unnamed: 81": "Yannick JADOT",
            "Unnamed: 88": "Valérie PÉCRESSE",
            "Unnamed: 95": "Philippe POUTOU",
            "Unnamed: 102": "Nicolas DUPONT-AIGNAN",
        }

    df = df.rename(columns=vote_columns)
    # The mapping keeps insertion order, so the selected columns appear in the
    # same order as they are listed above.
    df_filtered = df[["polling_id", *vote_columns.values()]]

In [5]:
# Bring every year to the same long layout:
# one row per (polling_id, Candidate) with its vote count in `Votes`.
if year in (2002, 2007, 2012):
    # These files are already one row per candidate; just keep the needed columns.
    df_filtered = df.loc[:, ["polling_id", "Candidate", "Votes"]].copy()

elif year in (2017, 2022):
    # Wide layout (one column per candidate) -> long layout.
    df_filtered = pd.melt(
        df_filtered,
        id_vars=["polling_id"],
        var_name="Candidate",
        value_name="Votes",
    )

df_filtered.head()

Unnamed: 0,polling_id,Candidate,Votes
1026256,01-1-1,JACQUES CHIRAC,349
1026257,01-1-1,JEAN-MARIE LE PEN,103
1026258,01-2-1,JACQUES CHIRAC,95
1026259,01-2-1,JEAN-MARIE LE PEN,48
1026260,01-4-1,JACQUES CHIRAC,813


In [6]:
# Rank candidates inside each polling station by votes (1 = most votes).
# method="min": tied candidates share the lowest rank of the tie.
df_filtered["rank"] = (
    df_filtered
    .groupby("polling_id")["Votes"]
    .rank(method="min", ascending=False)
    .astype(int)
)

In [7]:
# Eyeball the ranking: station winners first, lower ranks after.
df_filtered.sort_values(by="rank")

Unnamed: 0,polling_id,Candidate,Votes,rank
1026256,01-1-1,JACQUES CHIRAC,349,1
1097536,58-115-0001,JACQUES CHIRAC,108,1
1097534,58-114-0001,JACQUES CHIRAC,72,1
1097532,58-113-0001,JACQUES CHIRAC,73,1
1097530,58-112-0001,JACQUES CHIRAC,277,1
...,...,...,...,...
1101601,59-650-0016,JEAN-MARIE LE PEN,195,2
1052735,24-92-0001,JEAN-MARIE LE PEN,37,2
1101599,59-650-0015,JEAN-MARIE LE PEN,139,2
1101607,59-650-0019,JEAN-MARIE LE PEN,107,2


In [8]:
# Vote share of each candidate within its polling station.
# `tt` ends up with one row per (polling_id, Candidate) and a `rate` column.
tt = df_filtered.groupby(["polling_id", "Candidate"]).agg({"Votes": "sum"})
# transform("sum") returns the station total aligned to tt's own index, so the
# division is a plain element-wise operation. The previous groupby-apply form
# returns a frame whose index gains an extra group level on pandas >= 2.0 and
# can no longer be assigned back as a column.
# Stations with zero total votes yield NaN, as before.
tt["rate"] = tt["Votes"] / tt.groupby(level=0)["Votes"].transform("sum")
tt = tt.reset_index()
tt = tt[["polling_id", "Candidate", "rate"]]

In [9]:
# Attach each candidate's vote share to the long results table, then switch to
# the output column names (`value`, `candidate`).
df_filtered = (
    df_filtered
    .merge(tt, on=["polling_id", "Candidate"])
    .rename(columns={"Votes": "value", "Candidate": "candidate"})
)

In [10]:
# Persist the per-station results as a gzip-compressed CSV (row index omitted).
df_filtered.to_csv(
    path_or_buf=f"data_output/France/{year}_{election_round}.csv.gz",
    index=False,
    compression="gzip",
)

In [11]:
# List the distinct candidates as a sanity check.
# Read them from the long results table: by this point it has a `candidate`
# column for every year, whereas `df` only has a `Candidate` column for
# 2002/2007/2012 (the 2017/2022 frames are wide, one column per candidate).
df_filtered["candidate"].unique()

array(['JACQUES CHIRAC', 'JEAN-MARIE LE PEN'], dtype=object)

In [12]:
# Build the per-polling-station location table and write it next to the results.
# Each branch selects the location columns available for that year's file and
# renames them. The `.copy()` calls make `df_location` an independent frame,
# so the `commune_id` assignment below never writes into a slice of `df`.
if year == 2017:

    df_location = df[["polling_id", "level_0", "level_1", "level_2", "level_3", "level_4", "level_5", "level_22", "level_7", "level_8"]].copy()

    df_location = df_location.rename(columns={
        "level_0": "department_id",
        "level_1": "department",
        "level_2": "circunscription_id",
        "level_3": "circunscription",
        "level_4": "commune_id",
        "level_5": "commune",
        "level_22": "polling_place",
        "level_7": "electors",
        "level_8": "abstentions"
    })

elif year == 2022:
    df_location = df[["Code du département", "Libellé du département", "Code de la circonscription", "Libellé de la circonscription",
       "Code de la commune", "Libellé de la commune", "Code du b.vote", "Inscrits", "Abstentions", "polling_id"]].copy()

    df_location = df_location.rename(columns={
        "Code du département": "department_id",
        "Libellé du département": "department",
        "Code de la circonscription": "circunscription_id",
        "Libellé de la circonscription": "circunscription",
        "Code de la commune": "commune_id",
        "Libellé de la commune": "commune",
        "Code du b.vote": "polling_place",
        "Inscrits": "electors",
        "Abstentions": "abstentions"
    })

elif year == 2012:
    # Long file: the same station repeats once per candidate, hence drop_duplicates.
    df_location = df[["level_2", "level_3", "level_4", "level_8", "polling_id"]].copy().drop_duplicates()

    df_location = df_location.rename(columns={
        "level_2": "department_id",
        "level_3": "commune_id",
        "level_4": "commune",
        "level_8": "electors"
    })

elif year in [2002, 2007]:
    # Long file: the same station repeats once per candidate, hence drop_duplicates.
    df_location = df[["level_1", "level_2", "level_3", "level_4", "level_5", "level_6", "polling_id"]].copy().drop_duplicates()

    # NOTE(review): these output names ("polling_station", "inscrits", "voters")
    # differ from the "polling_place" / "electors" names used for other years;
    # left unchanged so the existing output schema is preserved.
    df_location = df_location.rename(columns={
        "level_1": "department_id",
        "level_2": "commune_id",
        "level_3": "commune",
        "level_4": "polling_station",
        "level_5": "inscrits",
        "level_6": "voters"
    })

else:
    # Without this, an unsupported year only surfaces as a NameError on `df_location`.
    raise ValueError(f"Unsupported year: {year!r} (expected 2002, 2007, 2012, 2017 or 2022)")

# Replace the commune code with a combined code: 2-character zero-padded
# department id followed by the 3-character zero-padded commune id.
df_location["commune_id"] = df_location["department_id"].astype(str).str.zfill(2) + df_location["commune_id"].astype(str).str.zfill(3)
df_location.to_csv(f"data_output/France/{year}_{election_round}_location.csv.gz", compression="gzip", index=False)