https://docs.google.com/document/d/172QhFesRsvPLJEQvqQCWtUB5BjUT4UbkL4ZvTHsMuvg/edit?tab=t.0

# The election results dataset

https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/2VZ5ZC

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import re


import sqlite3 
import warnings
warnings.filterwarnings('ignore')

## Pulling the data from the db into a pandas df

In [2]:
# Location of the ParlGov SQLite snapshot and the view to load from it.
db_path = "/work/Connor Folder/parlgov-stable.db"
query = "SELECT * FROM view_election;"

# Read the whole 'view_election' view into a DataFrame. The try/finally
# guarantees the connection is released even when the query fails (for
# example when the path is wrong: sqlite3.connect() then creates an empty
# database file and the SELECT raises because the view does not exist).
conn = sqlite3.connect(db_path)
try:
    election_df = pd.read_sql(query, conn)
finally:
    conn.close()

# Display the first rows of the resulting DataFrame
election_df.head()

Unnamed: 0,country_name_short,country_name,election_type,election_date,vote_share,seats,seats_total,party_name_short,party_name,party_name_english,left_right,country_id,election_id,previous_parliament_election_id,previous_cabinet_id,party_id
0,AUS,Australia,parliament,1901-03-30,44.4,32.0,75,PP,Protectionist Party,Protectionist Party,7.4,33,731,,,1898
1,AUS,Australia,parliament,1901-03-30,34.2,26.0,75,FTP,Free Trade Party,Free Trade Party,6.0,33,731,,,1938
2,AUS,Australia,parliament,1901-03-30,19.4,15.0,75,ALP,Australian Labor Party,Australian Labor Party,3.8833,33,731,,,1253
3,AUS,Australia,parliament,1901-03-30,1.4,1.0,75,none,no party affiliation,no party affiliation,,33,731,,,1396
4,AUS,Australia,parliament,1901-03-30,0.6,1.0,75,one-seat,one seat,one seat,,33,731,,,2299


## Exploring the countries in the table

In [3]:
# Collect every country_name once, in order of first appearance in election_df
distinct_countries = election_df['country_name'].unique().tolist()
print(f" The number of countries in the DF: {len(distinct_countries)}")
print(f"\nThe list of countries:\n{distinct_countries}")


 The number of countries in the DF: 37

The list of countries:
['Australia', 'Austria', 'Belgium', 'Bulgaria', 'Canada', 'Switzerland', 'Cyprus', 'Czech Republic', 'Germany', 'Denmark', 'Spain', 'Estonia', 'Finland', 'France', 'United Kingdom', 'Greece', 'Croatia', 'Hungary', 'Ireland', 'Iceland', 'Israel', 'Italy', 'Japan', 'Lithuania', 'Luxembourg', 'Latvia', 'Malta', 'Netherlands', 'Norway', 'New Zealand', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Sweden', 'Turkey']


# Cleaning the DF with Functions

<hr>

### Filtering to only desired countries

In [4]:
#Create filter function
def filterCountries(df, filter, colName):
    """Return the rows of ``df`` whose ``colName`` value appears in ``filter``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from; it is not modified.
    filter : list-like
        Values to keep (the name shadows the builtin, but it is part of the
        function's calling interface).
    colName : str
        Column whose values are matched against ``filter``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    keep_row = df[colName].isin(filter)
    return df[keep_row]

### Removing NaN years (no data collected in these years)

This leaves a nice year range, 1946-2019 (73 years) down from 122 years

In [5]:
# Remove columns (years) where there are NaNs in the data
def removeNullYears(df):
    """Return ``df`` without any column that contains a missing value.

    A column is dropped as soon as a single entry in it is NaN/None; the
    input frame itself is left unchanged.
    """
    return df.dropna(axis='columns', how='any')

In [6]:
# Alternative to removing null years, set them to 'False'
def falseNullYears(df):
    """Return a copy of ``df`` in which every missing value is replaced by ``False``."""
    return df.fillna(value=False)

### Tracking the majority government each year

In [7]:
# def majority_government_each_year(df):
#     # Filter for only parliament elections
#     df = df[df['election_type'] == 'parliament']

#     # Sort by country and election date
#     df = df.sort_values(by=['country_name', 'election_date'])
    
#     results = {}

#     # Process each country separately
#     for country, group in df.groupby('country_name'):
#         group['election_date'] = pd.to_datetime(group['election_date'])  # Ensure dates are in datetime format
#         group = group.sort_values('election_date')

#         # Determine the majority party for each election
#         group['majority_party'] = group.groupby('election_id')['vote_share'].transform(max) == group['vote_share']
#         majority_parties = group[group['majority_party']].drop_duplicates('election_id')

#         # Map each year to the majority party
#         years = pd.date_range(start=group['election_date'].min(), end=group['election_date'].max(), freq='Y').year
#         party_map = {}
#         for i, row in majority_parties.iterrows():
#             start_year = row['election_date'].year if isinstance(row['election_date'], pd.Timestamp) else pd.to_datetime(row['election_date']).year
#             next_election = majority_parties['election_date'][majority_parties['election_date'] > row['election_date']].min()
#             end_year = next_election.year if pd.notna(next_election) else group['election_date'].max().year
#             for year in range(start_year, end_year + 1):
#                 party_map[year] = row['party_name_english']

#         # Adjust results for this country
#         results[country] = party_map

#     # Convert the results to a DataFrame
#     result_df = pd.DataFrame(results).T
#     result_df.index.name = 'country'
#     return result_df

In [8]:
def majority_government_each_year(df):
    """Build a country x election-year table of the top-vote-share party.

    Only rows with ``election_type == 'parliament'`` are used. For every
    election (``election_id``) the party with the highest ``vote_share`` is
    selected; note this is the largest party, which need not hold an absolute
    majority. When several rows tie for the highest share, the first one in
    (country, date) sort order is kept. When a country has more than one
    election in the same calendar year, the later election overwrites the
    earlier one for that year.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``election_type``, ``election_date``,
        ``country_name``, ``election_id``, ``vote_share`` and
        ``party_name_english``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Index named ``'country'``, one column per election year (ascending),
        values are ``party_name_english``; NaN where the country held no
        parliamentary election in that year.
    """
    # Work on an explicit copy: assigning a column onto a filtered slice is
    # chained assignment and must not be relied on.
    parliament = df.loc[df['election_type'] == 'parliament'].copy()

    # Ensure election_date is in datetime format so .year is available
    parliament['election_date'] = pd.to_datetime(parliament['election_date'])

    # Sort by country and election date
    parliament = parliament.sort_values(by=['country_name', 'election_date'])

    results = {}

    for country, group in parliament.groupby('country_name'):
        # Highest vote share within each election, broadcast back to the rows.
        # The string alias 'max' is used instead of the builtin callable,
        # whose use in transform() is deprecated.
        top_share = group.groupby('election_id')['vote_share'].transform('max')
        winners = group[group['vote_share'] == top_share].drop_duplicates('election_id')

        # Map election years to the winning party
        results[country] = {
            election_date.year: party
            for election_date, party in zip(
                winners['election_date'], winners['party_name_english']
            )
        }

    # One row per country, one column per year
    result_df = pd.DataFrame.from_dict(results, orient='index')
    result_df.index.name = 'country'

    result_df = result_df.sort_index(axis=1, ascending=True)

    return result_df

### Tracking when a majority government is overthrown (as a bool)

In [9]:
# Function to detect government changes
def calculate_government_changes(df):
    """Flag, column by column, where a value differs from the previous known one.

    Each column is scanned from top to bottom:

    * a missing entry stays ``NaN`` and is ignored for later comparisons;
    * the first non-missing entry yields ``False``;
    * every later non-missing entry yields ``True`` if it differs from the
      most recent non-missing entry above it, otherwise ``False``.

    Returns the result of ``df.apply`` over these per-column flag lists.
    """
    def check_changes(series):
        flags = []
        last_seen = None  # most recent non-missing value in this column

        for current in series:
            if pd.isna(current):
                # Gaps are preserved and do not reset the comparison value
                flags.append(np.nan)
                continue

            changed = False if last_seen is None else last_seen != current
            flags.append(changed)
            last_seen = current

        return flags

    return df.apply(check_changes)

<hr>

# Applying the funcs

Line 8 of the next cell, `removeNullYears`, keeps only the years for which every country has data. This returns less data, but the result can be trusted.

Line 9 of the next cell, `falseNullYears`, keeps every year (1900-2023) and replaces NaNs with `False`. This returns far more data, but the result should not be trusted.

(TL;DR: only one of these should be used at a time; I recommend `removeNullYears`.)

In [10]:
majority_govs_df = election_df.copy()
# Optional country filter (disabled, so every country is kept):
# G7_countries = ['Japan', 'Germany', 'France', 'United Kingdom', 'Italy', 'Canada'] 
# majority_govs_df = filterCountries(majority_govs_df, G7_countries, 'country_name')
# Build the country x election-year table of top-vote-share parties.
majority_govs_df = majority_government_each_year(majority_govs_df)
# Optional NaN handling (both disabled, so years without an election stay NaN); enable at most one:
#majority_govs_df = removeNullYears(majority_govs_df)
# majority_govs_df = falseNullYears(majority_govs_df)
# Preview with years as rows and countries as columns.
majority_govs_df.T.head(20)

country,Australia,Denmark,Norway,Belgium,France,New Zealand,Sweden,Switzerland,Canada,Finland,...,Latvia,Romania,Slovakia,Slovenia,Poland,Cyprus,Lithuania,Bulgaria,Croatia,Estonia
1900,,,Liberal Party of Norway,Catholic Party,,,,,Liberal Party of Canada,,...,,,,,,,,,,
1901,Protectionist Party,Liberal Party,,,,,,,,,...,,,,,,,,,,
1902,,,,Catholic Party,Left Republican,New Zealand Liberal Party,,Radical Democratic Party,,,...,,,,,,,,,,
1903,Free Trade Party,Liberal Party,Conservative Party,,,,,,,,...,,,,,,,,,,
1904,,,,Catholic Party,,,,,Liberal Party of Canada,,...,,,,,,,,,,
1905,,,,,,New Zealand Liberal Party,,Radical Democratic Party,,,...,,,,,,,,,,
1906,Free Trade Party,Liberal Party,Liberal Party of Norway,Catholic Party,Conservatives,,,,,,...,,,,,,,,,,
1908,,,,Catholic Party,,New Zealand Liberal Party,,Radical Democratic Party,Liberal Party of Canada,,...,,,,,,,,,,
1909,,Social Democrats,Conservative Party,,,,,,,,...,,,,,,,,,,
1910,Australian Labor Party,Liberal Party,,Catholic Party,Republican Socialist Party,,,,,,...,,,,,,,,,,


In [11]:
# Work on a transposed copy: years become rows, countries become columns
df = majority_govs_df.copy().T

# Convert the party names into per-country change flags
df_main = calculate_government_changes(df)

df_main

country,Australia,Denmark,Norway,Belgium,France,New Zealand,Sweden,Switzerland,Canada,Finland,...,Latvia,Romania,Slovakia,Slovenia,Poland,Cyprus,Lithuania,Bulgaria,Croatia,Estonia
1900,,,False,False,,,,,False,,...,,,,,,,,,,
1901,False,False,,,,,,,,,...,,,,,,,,,,
1902,,,,False,False,False,,False,,,...,,,,,,,,,,
1903,True,False,True,,,,,,,,...,,,,,,,,,,
1904,,,,False,,,,,False,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019,False,False,,False,,,,False,True,True,...,,,,,False,,,,,False
2020,,,,,,True,,,,,...,,False,True,,,,False,,False,
2021,,,False,,,,,,False,,...,,,,,,False,,True,,
2022,False,False,,,False,,False,,,,...,True,,,True,,,,True,,


## Adding USA Presidential Elections

In [12]:
# Function to get the winner's party for a given year
def get_winner_party(year, timeout=30):
    """Scrape the winning party of the US presidential election for ``year``.

    Fetches ``https://www.270towin.com/{year}-election`` and looks, in the
    first ``<table class="table">`` on the page, for the first row that has
    more than three cells and a check mark in its first cell. The stripped
    text of that row's fourth cell is returned as the party.

    Parameters
    ----------
    year : int
        Year used to build the page URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a single stalled connection would block the
        caller indefinitely.

    Returns
    -------
    str or float
        The party text, or ``np.nan`` when the response status is not 200,
        no matching table exists, or no row carries the check mark.
    """
    url = f"https://www.270towin.com/{year}-election"
    response = requests.get(url, timeout=timeout)

    # Page not found (or any other non-OK answer): no data for this year
    if response.status_code != 200:
        return np.nan

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table with election results
    table = soup.find('table', {'class': 'table'})
    if table is None:
        return np.nan

    # The winner's row is marked by a check mark in its first cell
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) > 3 and '✓' in cols[0].text:
            return cols[3].text.strip()

    return np.nan  # Table present but no winner row found

# Scrape one [year, party] record for every year from 1900 through 2023
data = []
for year in range(1900, 2024):
    winner_party = get_winner_party(year)
    data.append([year, winner_party])

# Assemble the records into a frame indexed by year, in ascending order
df = (
    pd.DataFrame(data, columns=["Year", "Winner_Party"])
    .sort_values(by="Year")
    .reset_index(drop=True)
    .set_index("Year")
)

# Convert the party labels to change flags; df itself is left untouched
df_usa = calculate_government_changes(df.copy())

In [13]:
# Label the single scraped column with the country name, then attach it to
# the main table by aligning on the year index
df_usa = df_usa.rename({"Winner_Party": "USA"}, axis="columns")
df_main = pd.concat((df_main, df_usa), axis="columns")

## Adding Other Countries via Manual Imputation

In [14]:
# Hand-entered outcomes per country:
#   column name -> (years flagged True, years flagged False)
# The dict order is the order in which new columns are appended to df_main.
# NOTE(review): "Columbia" presumably means Colombia; the spelling is kept
# because it is the column name written to the output.
# NOTE(review): 'Lithuania' and 'Latvia' are reset and refilled here. If
# df_main already holds columns with those names, their values are replaced
# by the manual ones - confirm this is intended.
# NOTE(review): 'Czechia' is added as a separate column; check whether it
# duplicates an existing 'Czech Republic' column.
manual_changes = {
    "Costa_Rica": ([], []),  # column is created but no years are entered yet
    "Columbia": ([1998, 2002, 2010, 2018, 2022], [1990, 1994, 2006, 2014]),
    "Czechia": ([1990, 1992, 1998, 2006, 2010, 2017, 2021], [1996, 2002, 2013]),
    "Chile": ([1999, 2005, 2009, 2013, 2017, 2021], [1989, 1993]),
    "Lithuania": ([1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020], []),
    "Latvia": ([1993, 1999, 2007, 2011, 2019, 2023], [1996, 2003, 2015]),
    "Mexico": ([1988, 1994, 2000, 2006, 2012, 2018], []),
}

for column, (changed_years, unchanged_years) in manual_changes.items():
    # Start from an all-NaN *object* column: writing True/False into a float
    # NaN column is an incompatible-dtype assignment that pandas deprecates.
    df_main[column] = pd.Series(np.nan, index=df_main.index, dtype=object)

    # Every listed year must already be a row label of df_main, otherwise
    # .loc raises KeyError.
    if changed_years:
        df_main.loc[changed_years, column] = True
    if unchanged_years:
        df_main.loc[unchanged_years, column] = False

In [15]:
# Persist the combined change-flag table next to the notebook
output_path = "ElectionChangesFinal.csv"
df_main.to_csv(output_path)

In [19]:
# Preview the first 20 rows of the combined change-flag table.
df_main.head(20)

Unnamed: 0,Australia,Denmark,Norway,Belgium,France,New Zealand,Sweden,Switzerland,Canada,Finland,...,Lithuania,Bulgaria,Croatia,Estonia,USA,Costa_Rica,Columbia,Czechia,Chile,Mexico
1900,,,False,False,,,,,False,,...,,,,,False,,,,,
1901,False,False,,,,,,,,,...,,,,,,,,,,
1902,,,,False,False,False,,False,,,...,,,,,,,,,,
1903,True,False,True,,,,,,,,...,,,,,,,,,,
1904,,,,False,,,,,False,,...,,,,,False,,,,,
1905,,,,,,False,,False,,,...,,,,,,,,,,
1906,False,False,True,False,True,,,,,,...,,,,,,,,,,
1908,,,,False,,False,,False,False,,...,,,,,False,,,,,
1909,,True,True,,,,,,,,...,,,,,,,,,,
1910,True,True,,False,True,,,,,,...,,,,,,,,,,


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=63ad4e1b-19bb-4dd7-a997-1fa3d2fd82a1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>