In [30]:
import pandas as pd
import os

# Voting data

In [31]:
folder = "data"

file1_path = os.path.join(folder, "pst4.csv")
file2_path = os.path.join(folder, "pst4p.csv")

data_gen = pd.read_csv(file1_path)
data_gran = pd.read_csv(file2_path)

In [32]:
print("General data")
print("length", len(data_gen))
print("unique okrsek", len(data_gen['OKRSEK'].unique()))
print("unique obec", len(data_gen['OBEC'].unique()))
print("unique okres", len(data_gen['OKRES'].unique()))
print()
print("Granular data")
print("length", len(data_gran))
print("unique okrsek", len(data_gran['OKRSEK'].unique()))
print("unique obec", len(data_gran['OBEC'].unique()))
print("unique okres", len(data_gran['OKRES'].unique()))
print()
print("Sanity checks")
print(len(data_gen['OKRSEK'].unique()) == len(data_gran['OKRSEK'].unique()))
print(len(data_gen['OBEC'].unique()) == len(data_gran['OBEC'].unique()))
print(len(data_gen['OKRES'].unique()) == len(data_gran['OKRES'].unique()))

General data
length 14819
unique okrsek 1502
unique obec 6389
unique okres 78

Granular data
length 179877
unique okrsek 1502
unique obec 6389
unique okres 78

Sanity checks
True
True
True


In [33]:
df_gen = data_gen[['ID_OKRSKY', 'ODEVZ_OBAL', 'PL_HL_CELK']][data_gen['TYP_FORM'] == 1]

In [34]:
df_gen.head()

Unnamed: 0,ID_OKRSKY,ODEVZ_OBAL,PL_HL_CELK
0,1,508,502
1,2,303,301
2,3,298,296
3,4,167,163
4,5,517,515


In [35]:
df_gran = data_gran[['ID_OKRSKY', 'OKRSEK', 'OBEC', 'OKRES', 'KSTRANA', 'POC_HLASU']][data_gran['TYP_FORM'] == 2]

In [36]:
df_gran.head()

Unnamed: 0,ID_OKRSKY,OKRSEK,OBEC,OKRES,KSTRANA,POC_HLASU
0,1,1,500011,7204,3,2
1,1,1,500011,7204,4,1
2,1,1,500011,7204,6,41
3,1,1,500011,7204,8,5
4,1,1,500011,7204,10,4


In [37]:
df = pd.merge(df_gran, df_gen, how = 'left', on = 'ID_OKRSKY')

In [38]:
print("Merged data checks")
print("length:", len(data_gran) == len(df))
df.head()

Merged data checks
length: True


Unnamed: 0,ID_OKRSKY,OKRSEK,OBEC,OKRES,KSTRANA,POC_HLASU,ODEVZ_OBAL,PL_HL_CELK
0,1,1,500011,7204,3,2,508,502
1,1,1,500011,7204,4,1,508,502
2,1,1,500011,7204,6,41,508,502
3,1,1,500011,7204,8,5,508,502
4,1,1,500011,7204,10,4,508,502


In [39]:
df["PROC_HLASU_STRANA"] = df["POC_HLASU"] / df["PL_HL_CELK"]
df["PROC_NEPL_HLASU"] = 1 -  df["PL_HL_CELK"] / df["ODEVZ_OBAL"]

In [40]:
df.head()

Unnamed: 0,ID_OKRSKY,OKRSEK,OBEC,OKRES,KSTRANA,POC_HLASU,ODEVZ_OBAL,PL_HL_CELK,PROC_HLASU_STRANA,PROC_NEPL_HLASU
0,1,1,500011,7204,3,2,508,502,0.003984,0.011811
1,1,1,500011,7204,4,1,508,502,0.001992,0.011811
2,1,1,500011,7204,6,41,508,502,0.081673,0.011811
3,1,1,500011,7204,8,5,508,502,0.00996,0.011811
4,1,1,500011,7204,10,4,508,502,0.007968,0.011811


In [41]:
# aggregate voting data based on OBEC
# make one row per OBEC, party votes will be different columns in the same row (with the party code as column name)

df_obec_agg = df.groupby(['KSTRANA', 'OBEC']).agg(
    ODEVZ_OBAL = ('ODEVZ_OBAL', 'sum'),
    PL_HL_CELK = ('PL_HL_CELK', 'sum'),
    POC_HLASU = ('POC_HLASU', 'sum')
).reset_index()

df_fin = df_obec_agg.pivot(index='OBEC', columns='KSTRANA', values='POC_HLASU').reset_index().fillna(0)
df_obec_agg_data = df_obec_agg[['OBEC', 'ODEVZ_OBAL', 'PL_HL_CELK']].drop_duplicates()
df_fin = pd.merge(df_fin, df_obec_agg_data, how = 'left', on = 'OBEC')
df_fin['OBEC'] = df_fin['OBEC'].astype(int)


In [42]:
file3_path = os.path.join(folder, "pol_strany.csv")
party_info = pd.read_csv(file3_path, sep=';')
party_info = party_info[['Kandidátní listina.číslo', 'Kandidátní listina.název']][1:]
party_info['Kandidátní listina.číslo'] = party_info['Kandidátní listina.číslo'].astype(int)
party_info['Kandidátní listina.název'] = party_info['Kandidátní listina.název'].str.replace('\u200b', '', regex=False).str.strip()
party_name_dict = dict(zip(party_info['Kandidátní listina.číslo'], party_info['Kandidátní listina.název']))

In [43]:
for col in df_fin.columns:
    if col != 'OBEC' and col != 'ODEVZ_OBAL' and col != 'PL_HL_CELK':
        if col in party_name_dict:
            df_fin = df_fin.rename(columns={col: party_name_dict[col]})
        else:
            print(f"Warning: Party code {col} not found in party info")

In [44]:
# for each KSTRANA colum add a {int}_PROC column with the share of votes for that party in that OBEC
for col in df_fin.columns:
    if col != 'OBEC' and col != 'ODEVZ_OBAL' and col != 'PL_HL_CELK':
        df_fin[f"{col}_PROC"] = df_fin[col] / df_fin["PL_HL_CELK"]
df_fin["PROC_NEPL_HLASU"] = 1 -  df_fin["PL_HL_CELK"] / df_fin["ODEVZ_OBAL"]

In [45]:
print(len(df_fin))
df_fin.head()

14316


Unnamed: 0,OBEC,Rebelové,MZH,JaSaN,VÝZVA,SMSka,SPD,ČSSD,PŘÍSAHA,Levice,...,Volt_PROC,PB_PROC,AUTO_PROC,BPS_PROC,ANO_PROC,STAN_PROC,Kruh_PROC,Stačilo!_PROC,Voluntia_PROC,PROC_NEPL_HLASU
0,500011,0.0,1.0,2.0,2.0,0.0,93.0,3.0,18.0,0.0,...,0.0,0.0,0.253378,0.0,1.097973,0.466216,0.0,0.148649,0.010135,0.006711
1,500011,0.0,1.0,2.0,2.0,0.0,93.0,3.0,18.0,0.0,...,0.0,0.0,0.149402,0.0,0.64741,0.2749,0.0,0.087649,0.005976,0.011811
2,500011,0.0,1.0,2.0,2.0,0.0,93.0,3.0,18.0,0.0,...,0.0,0.0,0.093985,0.0,0.407268,0.172932,0.0,0.055138,0.003759,0.009926
3,500011,0.0,1.0,2.0,2.0,0.0,93.0,3.0,18.0,0.0,...,0.0,0.0,0.068244,0.0,0.295723,0.125569,0.0,0.040036,0.00273,0.009017
4,500011,0.0,1.0,2.0,2.0,0.0,93.0,3.0,18.0,0.0,...,0.0,0.0,0.125628,0.0,0.544389,0.231156,0.0,0.073702,0.005025,0.006656


# Demographic data

In [46]:
file4_path = os.path.join(folder, "dem_obec.xlsx")

demog = pd.read_excel(file4_path, skiprows=5)
mask_okres = demog.iloc[:, 0].str.startswith("Okres", na=False)
demog.loc[mask_okres, "okres_name"] = demog.loc[mask_okres].iloc[:, 0].str.replace("Okres ", "", regex=False)
demog["okres_name"] = demog["okres_name"].ffill()
demog = demog[~mask_okres].copy()

demog = demog.rename(columns={
    "Unnamed: 0": "okres_code",     
    "Unnamed: 1": "obec_code",         
    "Unnamed: 2": "obec_name",        
    "Unnamed: 3": "pop_total",
    "Unnamed: 4": "pop_men",
    "Unnamed: 5": "pop_women",
    "Unnamed: 6": "age_avg_total",
    "Unnamed: 7": "age_avg_men",
    "Unnamed: 8": "age_avg_women"
})

for c in ["okres_code", "obec_name", "okres_name"]:
    if c in demog.columns:
        demog[c] = demog[c].astype("string").str.strip()

# integer-like
for c in ["obec_code", "pop_total", "pop_men", "pop_women"]:
    if c in demog.columns:
        demog[c] = pd.to_numeric(demog[c], errors="coerce").astype("Int64")

# floats
for c in ["age_avg_total", "age_avg_men", "age_avg_women"]:
    if c in demog.columns:
        demog[c] = pd.to_numeric(demog[c], errors="coerce")

# 3) (Optional) reorder columns
cols = [
    "okres_code", "okres_name", "obec_code", "obec_name",
    "pop_total", "pop_men", "pop_women",
    "age_avg_total", "age_avg_men", "age_avg_women"
]
demog = demog[[c for c in cols if c in demog.columns]]

In [47]:
print(len(demog))
demog.head()

6258


Unnamed: 0,okres_code,okres_name,obec_code,obec_name,pop_total,pop_men,pop_women,age_avg_total,age_avg_men,age_avg_women
0,CZ0100,,554782,Praha,1397880,679162,718718,41.864895,40.356198,43.290559
2,CZ0201,Benešov,529303,Benešov,17043,8058,8985,43.878631,42.236287,45.35153
3,CZ0201,Benešov,532568,Bernartice,223,102,121,45.172646,46.029412,44.450413
4,CZ0201,Benešov,530743,Bílkovice,225,113,112,45.78,45.597345,45.964286
5,CZ0201,Benešov,532380,Blažejovice,102,50,52,49.843137,47.78,51.826923


In [48]:
data = pd.merge(df_fin, demog, how = 'left', left_on = 'OBEC', right_on = 'obec_code')
data.to_csv(os.path.join(folder, "prepared_data.csv"), index=False)