# Study Grouped Catalan Elections Dataset

Load libraries:

In [1]:
import pandas as pd
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import textdistance
from tqdm import tqdm
from unidecode import unidecode
import logging

# Layout shared by every log record emitted from this notebook.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# Pretty-printer for nested Python structures (two spaces per nesting level).
pp = pprint.PrettyPrinter(indent=2)

# Configure the root logger: INFO level, timestamped records.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

Load the clean dataset:

In [2]:
# Load the processed (grouped) elections dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (e.g. produced by this project's own pipeline).
df = pd.read_pickle('../../data/processed/catalan-elections-grouped-data.pkl')
# Keep an independent copy of the frame exactly as it was loaded.
df_original = df.copy()

## Dataset Structure 

Visualize the structure of the dataset:

In [3]:
# Print a summary of the frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12339340 entries, 0 to 12339339
Data columns (total 24 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   index_autonumeric       int64         
 1   nom_eleccio             object        
 2   id_nivell_territorial   object        
 3   nom_nivell_territorial  object        
 4   territori_codi          object        
 5   territori_nom           object        
 6   seccio                  Int64         
 7   vots                    int32         
 8   escons                  float64       
 9   districte               Int64         
 10  mesa                    object        
 11  party_code              int32         
 12  party_name              object        
 13  party_abbr              object        
 14  party_color             object        
 15  clean_party_name        object        
 16  clean_party_abbr        object        
 17  type                    object        
 18  

First of all, we want to check that for every section in an election there aren't any repeated ``party_code`` values:

In [8]:
# Section-level rows only: those whose territorial level code is "SE".
df_section = df.loc[df["id_nivell_territorial"].eq("SE")]

In [9]:
def check_duplicated_party_codes(df, level="SE"):
    """Find party codes that appear more than once within one section of an election.

    Rows are grouped by ``nom_eleccio``, ``seccio``, ``districte`` and
    ``territori_codi``; a party code is reported when it occurs on two or more
    rows of the same group.

    Parameters
    ----------
    df : pandas.DataFrame
        Results holding the four grouping columns plus ``party_code`` and
        ``party_name`` (and ``id_nivell_territorial`` unless ``level`` is None).
    level : str or None, default "SE"
        Only rows whose ``id_nivell_territorial`` equals this value are
        checked. Pass ``None`` to check every row of ``df`` as given.

    Returns
    -------
    pandas.DataFrame
        One row per group that contains repeats, indexed by the grouping
        columns, with list-valued columns ``party_code`` (the unique repeated
        codes) and ``party_name`` (the unique names found on those rows).
        The frame is empty when nothing is repeated.
    """
    group_keys = ["nom_eleccio", "seccio", "districte", "territori_codi"]

    # Work on the frame that was passed in, not on a notebook-level global.
    if level is not None:
        df = df[df["id_nivell_territorial"] == level]

    # A row is a repeat when its (group, party_code) combination occurs more
    # than once; keep=False marks every occurrence, not only the later ones.
    is_repeat = df.duplicated(subset=group_keys + ["party_code"], keep=False)
    repeated_rows = df.loc[is_repeat]

    def unique_list(values):
        # Unique values in order of first appearance, as a plain Python list.
        return values.unique().tolist()

    # groupby skips rows with a missing grouping key by default, which matches
    # a plain group-wise check over the same keys.
    grouped = repeated_rows.groupby(group_keys)
    return pd.DataFrame(
        {
            "party_code": grouped["party_code"].agg(unique_list),
            "party_name": grouped["party_name"].agg(unique_list),
        }
    )


# Run the check on the section-level frame, i.e. the rows this analysis is about.
duplicated_party_codes = check_duplicated_party_codes(df_section)

In [10]:
# Show the groups that contain repeated party codes, with the affected codes and names.
duplicated_party_codes

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,party_code,party_name
nom_eleccio,seccio,districte,territori_codi,Unnamed: 4_level_1,Unnamed: 5_level_1
Eleccions Municipals 1979,1,1,8019,[3000000],[Altres partits]
Eleccions Municipals 1979,1,2,8019,[3000000],[Altres partits]
Eleccions Municipals 1979,1,3,8019,[3000000],[Altres partits]
Eleccions Municipals 1979,1,4,8019,[3000000],[Altres partits]
Eleccions Municipals 1979,1,5,8019,[3000000],[Altres partits]
...,...,...,...,...,...
Eleccions Municipals 2015,236,10,8019,[5000000],[Independents]
Eleccions Municipals 2015,237,10,8019,[5000000],[Independents]
Eleccions Municipals 2019,1,1,8171,[201924111],[AAE OSONA]
Eleccions Municipals 2019,1,1,8246,[201924111],[AAE OSONA]


In [23]:
# Section-level rows of territory 8171 in the 2019 municipal elections,
# restricted to the columns needed to see the repeated party codes.
df_section.loc[
    df_section["nom_eleccio"].eq("Eleccions Municipals 2019")
    & df_section["territori_codi"].eq(8171),
    ["territori_nom", "districte", "seccio", "party_code", "party_name", "vots"],
]

Unnamed: 0,territori_nom,districte,seccio,party_code,party_name,vots
12317528,Prats de Lluçanès,1,1,2019839,ESQUERRA REPUBLICANA DE CATALUNYA-ACORD MUNICIPAL,242
12317529,Prats de Lluçanès,1,1,201924111,AAE OSONA,113
12317530,Prats de Lluçanès,1,1,201924111,AAE OSONA,69
12317531,Prats de Lluçanès,1,1,20191031,JUNTS PER CATALUNYA-JUNTS,335
12317532,Prats de Lluçanès,1,2,2019839,ESQUERRA REPUBLICANA DE CATALUNYA-ACORD MUNICIPAL,262
12317533,Prats de Lluçanès,1,2,201924111,AAE OSONA,175
12317534,Prats de Lluçanès,1,2,201924111,AAE OSONA,69
12317535,Prats de Lluçanès,1,2,20191031,JUNTS PER CATALUNYA-JUNTS,344


In [12]:
# Join all lists of party codes and names into one list
all_party_codes = duplicated_party_codes['party_code'].sum()
all_party_names = duplicated_party_codes['party_name'].sum()

# Count the number of times each party code appears
party_code_counts = pd.Series(all_party_codes).value_counts()

# Count the number of times each party name appears
party_name_counts = pd.Series(all_party_names).value_counts()

print(party_code_counts)
print(party_name_counts)

3000000      20188
5000000       5748
201924111        3
Name: count, dtype: int64
Altres partits    20188
Independents       5748
AAE OSONA             3
Name: count, dtype: int64


In [38]:
# Load the raw results CSV and keep its section-level ("SE") rows.
# NOTE(review): the saved output of this cell echoes the read_csv line the way a
# Python warning does (possibly a DtypeWarning about mixed column types) — confirm,
# and consider passing explicit `dtype=` for the affected columns.
df_results = pd.read_csv('../../data/raw/catalan-elections-data.csv')
df_results_section = df_results.loc[df_results["id_nivell_territorial"].eq("SE")]

  df_results = pd.read_csv('../../data/raw/catalan-elections-data.csv')


In [36]:
# Same territory (8171) and election in the raw section-level data,
# restricted to the columns needed to see the repeated party codes.
df_results_section.loc[
    df_results_section["nom_eleccio"].eq("Eleccions Municipals 2019")
    & df_results_section["territori_codi"].eq(8171),
    ["territori_nom", "districte", "seccio", "party_code", "party_name", "vots"],
]

Unnamed: 0,territori_nom,districte,seccio,party_code,party_name,vots
12317528,Prats de Lluçanès,1,1,2019839,ESQUERRA REPUBLICANA DE CATALUNYA-ACORD MUNICIPAL,242
12317529,Prats de Lluçanès,1,1,201924111,AAE OSONA,113
12317530,Prats de Lluçanès,1,1,201924111,AAE OSONA,69
12317531,Prats de Lluçanès,1,1,20191031,JUNTS PER CATALUNYA-JUNTS,335
12317532,Prats de Lluçanès,1,2,2019839,ESQUERRA REPUBLICANA DE CATALUNYA-ACORD MUNICIPAL,262
12317533,Prats de Lluçanès,1,2,201924111,AAE OSONA,175
12317534,Prats de Lluçanès,1,2,201924111,AAE OSONA,69
12317535,Prats de Lluçanès,1,2,20191031,JUNTS PER CATALUNYA-JUNTS,344
