# Study Grouped Catalan Elections Dataset

Load libraries:

In [1]:
import pandas as pd
import pprint
import logging

# Shared pretty-printer for nested structures, indenting two spaces per level.
pp = pprint.PrettyPrinter(indent=2)

# Root logger configuration: INFO and above, rendered as
# "<timestamp> - <level> - <message>" with second-resolution timestamps.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

Load the clean dataset:

In [2]:
# Load the grouped elections DataFrame from its pickle file.
# NOTE(review): the relative path presumably assumes the notebook is run from
# its own directory -- confirm. Unpickling can execute arbitrary code, so this
# file should only ever come from a trusted source.
df = pd.read_pickle('../../data/processed/catalan-elections-grouped-data.pkl')

## Dataset Structure 

Visualize the structure of the dataset:

In [3]:
# Print the frame summary (index range, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4273269 entries, 0 to 4273268
Data columns (total 32 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   index_autonumeric       Int64         
 1   id_eleccio              string        
 2   nom_eleccio             string        
 3   id_nivell_territorial   string        
 4   nom_nivell_territorial  string        
 5   territori_codi          string        
 6   territori_nom           string        
 7   seccio                  int32         
 8   vots                    int32         
 9   escons                  Int64         
 10  districte               int32         
 11  mesa                    string        
 12  type                    string        
 13  year                    int32         
 14  round                   string        
 15  party_code              int32         
 16  party_name              object        
 17  party_abbr              object        
 18  pa

In [4]:
# Preview the first rows to see what an individual record looks like.
df.head()

Unnamed: 0,index_autonumeric,id_eleccio,nom_eleccio,id_nivell_territorial,nom_nivell_territorial,territori_codi,territori_nom,seccio,vots,escons,...,month,day,date,cens_electoral,vots_valids,vots_blancs,vots_nuls,votants,valid_votes_percentage,joined_code
0,117342,A19801,Eleccions al Parlament de Catalunya 1980,SE,Secció,8001,Abrera,1,2,0,...,3,20,1980-03-20,2302,1619,10,8,1627.0,0.123533,1.0
1,117343,A19801,Eleccions al Parlament de Catalunya 1980,SE,Secció,8001,Abrera,1,389,0,...,3,20,1980-03-20,2302,1619,10,8,1627.0,24.027177,4.0
2,117344,A19801,Eleccions al Parlament de Catalunya 1980,SE,Secció,8001,Abrera,1,407,0,...,3,20,1980-03-20,2302,1619,10,8,1627.0,25.138975,6.0
3,117345,A19801,Eleccions al Parlament de Catalunya 1980,SE,Secció,8001,Abrera,1,7,0,...,3,20,1980-03-20,2302,1619,10,8,1627.0,0.432366,8.0
4,117346,A19801,Eleccions al Parlament de Catalunya 1980,SE,Secció,8001,Abrera,1,81,0,...,3,20,1980-03-20,2302,1619,10,8,1627.0,5.003088,10.0


In [5]:
# Number of distinct party_code values (dropna=False keeps a missing value,
# if any, counted as its own entry).
df["party_code"].nunique(dropna=False)

902

In [6]:
# Number of distinct joined_code values (dropna=False keeps a missing value,
# if any, counted as its own entry).
df["joined_code"].nunique(dropna=False)

686

In [7]:
# True only if party_code equals joined_code on every row. Series.all() reduces
# the boolean mask in vectorized code instead of iterating it element by element
# with the builtin all(); bool() keeps the displayed result a plain Python bool.
bool((df["party_code"] == df["joined_code"]).all())

False

First of all, we want to check that no ``party_code`` is repeated within the same section of an election:

In [8]:
# Rows sharing the same (mundissec, party_code, nom_eleccio) combination with at
# least one other row. keep=False flags every member of a repeated group, not
# only the later occurrences.
key_columns = ["mundissec", "party_code", "nom_eleccio"]
report_columns = [
    "nom_eleccio",
    "mundissec",
    "party_code",
    "party_name",
    "vots",
    "valid_votes_percentage",
]
is_repeated = df.duplicated(subset=key_columns, keep=False)
duplicated_party_codes = df.loc[is_repeated, report_columns]

duplicated_party_codes

Unnamed: 0,nom_eleccio,mundissec,party_code,party_name,vots,valid_votes_percentage
3766883,Eleccions Municipals 1979,08019301001,3000000,Altres partits,52,6.161137
3766884,Eleccions Municipals 1979,08019301001,3000000,Altres partits,0,0.0
3766891,Eleccions Municipals 1979,08019301001,3000000,Altres partits,0,0.0
3766898,Eleccions Municipals 1979,08019301002,3000000,Altres partits,52,8.666667
3766899,Eleccions Municipals 1979,08019301002,3000000,Altres partits,0,0.0
...,...,...,...,...,...,...
4251885,Eleccions Municipals 2019,08171201001,201924111,AAE OSONA,69,8.961039
4251888,Eleccions Municipals 2019,08171201002,201924111,AAE OSONA,175,20.2781
4251889,Eleccions Municipals 2019,08171201002,201924111,AAE OSONA,69,7.995365
4258938,Eleccions Municipals 2019,082461001001,201924111,AAE OSONA,132,10.099464


In [9]:
# Distinct party names among the rows flagged as duplicates above.
duplicated_party_codes["party_name"].unique()

array(['Altres partits', 'Independents', 'AAE OSONA'], dtype=object)