# Polity5

In [38]:
import pandas as pd
# Load the Polity5 workbook. The sheet is in wide format: one column per year.
polity_df = pd.read_excel('../datasets/raw/democracy/polity/POLITY5-PRC.xlsx', sheet_name='Data')

# Rename the identifier columns and drop the columns Attribute 1 / Attribute 2 / Attribute 3.
# Non-inplace calls keep the cell a straight pipeline from the raw sheet.
polity_df = (
    polity_df
    .rename(columns={'Economy ISO3': 'country_iso', 'Economy Name': 'country_name'})
    .drop(columns=['Attribute 1', 'Attribute 2', 'Attribute 3'])
)

# Strip the dataset prefix so the indicator IDs can serve as short column names later on.
polity_df['Indicator ID'] = polity_df['Indicator ID'].str.replace(r'^POLITY5\.PRC\.', '', regex=True)

# Detect the year columns from the sheet itself (labels made of exactly four digits)
# instead of hard-coding 1776-2018: a refreshed workbook with more years is picked up
# automatically, and a workbook with fewer years no longer raises a KeyError in melt().
years = [col for col in polity_df.columns if str(col).isdigit() and len(str(col)) == 4]

# Set the years as identifiers instead of columns
polity_df_long = polity_df.melt(
    id_vars=['country_iso', 'country_name', 'Indicator ID'],
    value_vars=years,
    var_name='Year',
    value_name='Value'
)

# The year labels come from column headers, so make them numeric.
polity_df_long['Year'] = pd.to_numeric(polity_df_long['Year'])

# Set the Indicator as the column: one row per (country, year), one column per indicator.
# NOTE: pivot_table aggregates with the mean by default, so a (country, year, indicator)
# combination that appeared more than once would be averaged rather than rejected.
polity_df_final = polity_df_long.pivot_table(
    index=['country_iso', 'country_name', 'Year'],
    columns='Indicator ID',
    values='Value'
).reset_index()

polity_df_final.to_csv('../datasets/processed/democracy/polity/polity5.csv', index=False)

polity_df_final.head()

Indicator ID,country_iso,country_name,Year,autoc,democ,durable,exconst,exrec,parcomp,parreg,polcomp,polity,polity2,xconst,xrcomp,xropen,xrreg
0,AFG,Afghanistan,1800,7.0,1.0,,1.0,1.0,3.0,3.0,6.0,-6.0,-6.0,1.0,1.0,1.0,3.0
1,AFG,Afghanistan,1801,7.0,1.0,,1.0,1.0,3.0,3.0,6.0,-6.0,-6.0,1.0,1.0,1.0,3.0
2,AFG,Afghanistan,1802,7.0,1.0,,1.0,1.0,3.0,3.0,6.0,-6.0,-6.0,1.0,1.0,1.0,3.0
3,AFG,Afghanistan,1803,7.0,1.0,,1.0,1.0,3.0,3.0,6.0,-6.0,-6.0,1.0,1.0,1.0,3.0
4,AFG,Afghanistan,1804,7.0,1.0,,1.0,1.0,3.0,3.0,6.0,-6.0,-6.0,1.0,1.0,1.0,3.0


In [39]:
# Distinct country names present in the reshaped Polity5 table, in order of first appearance.
countries = polity_df_final.loc[:, "country_name"].unique()

# Build the one-line summary first, then emit it.
polity_summary = f"Nr of countries: {len(countries)} - Countries: {', '.join(countries)}"
print(polity_summary)


Nr of countries: 166 - Countries: Afghanistan, Angola, Albania, United Arab Emirates, Argentina, Armenia, Australia, Austria, Azerbaijan, Burundi, Belgium, Benin, Burkina Faso, Bangladesh, Bulgaria, Bahrain, Bosnia and Herzegovina, Belarus, Bolivia, Brazil, Bhutan, Botswana, Central African Republic, Canada, Switzerland, Chile, China, Cote d'Ivoire, Cameroon, Congo, Rep., Colombia, Comoros, Cabo Verde, Costa Rica, Cuba, Cyprus, Czechia, Germany, Djibouti, Denmark, Dominican Republic, Algeria, Ecuador, Egypt, Arab Rep., Eritrea, Spain, Estonia, Ethiopia, Finland, Fiji, France, Gabon, United Kingdom, Georgia, Ghana, Guinea, Gambia, The, Guinea-Bissau, Equatorial Guinea, Greece, Guatemala, Guyana, Honduras, Croatia, Haiti, Hungary, Indonesia, India, Ireland, Iran, Islamic Rep., Iraq, Israel, Italy, Jamaica, Jordan, Japan, Kazakhstan, Kenya, Kyrgyz Republic, Cambodia, Korea, Rep., Kuwait, Lao PDR, Lebanon, Liberia, Libya, Sri Lanka, Lesotho, Lithuania, Luxembourg, Latvia, Morocco, Moldova,

# Freedom in the World

In [40]:
# Load the Freedom in the World workbook (header=1: the column names sit on the second
# row of the sheet) and clean it in a single chain in which every step returns a new frame.
# Filtering and dropping are chained without inplace=True, so no column is dropped
# in place on a filtered slice (which pandas can flag with SettingWithCopyWarning).
freedom_df = (
    pd.read_excel('../datasets/raw/democracy/freedom-world/All_data_FIW_2013-2024.xlsx', sheet_name='FIW13-25', header=1)
    .rename(columns={'Country/Territory': 'country', 'Edition': 'year'})
    # Keep only the rows flagged 'c' (presumably countries as opposed to territories -
    # TODO confirm against the workbook's documentation).
    .loc[lambda frame: frame['C/T'] == 'c']
    # 'C/T' is constant after the filter; the other columns are not used downstream.
    .drop(columns=['Region', 'Add Q', 'Add A', 'C/T'])
)

# Normalise the remaining headers to snake_case: trimmed, lower-case, spaces -> underscores.
freedom_df.columns = freedom_df.columns.str.strip().str.lower().str.replace(' ', '_')

freedom_df.to_csv('../datasets/processed/democracy/freedom-world/freedom-world.csv', index=False)
freedom_df.head()

Unnamed: 0,country,year,status,pr_rating,cl_rating,a1,a2,a3,a,b1,...,f3,f4,f,g1,g2,g3,g4,g,cl,total
1,Afghanistan,2025,NF,7,7,0,0,0,0,0,...,0,0,0,0,1,0,1,2,5,6
2,Albania,2025,PF,3,3,3,3,3,9,3,...,2,3,10,3,2,2,2,9,40,68
3,Algeria,2025,NF,6,5,1,1,1,3,1,...,2,2,6,2,2,2,1,7,21,31
4,Andorra,2025,F,1,1,4,4,4,12,4,...,4,3,15,4,4,3,4,15,55,93
5,Angola,2025,NF,6,5,0,2,1,3,1,...,1,2,5,1,1,1,0,3,18,28


In [41]:
# Distinct country names kept in the cleaned Freedom in the World table.
countries = freedom_df.loc[:, "country"].unique()

country_summary = f"Nr of countries: {len(countries)} - Countries: {', '.join(countries)}"
print(country_summary)

# Span of the 'year' column (renamed from 'Edition' when the table was cleaned).
year_summary = f"From years {freedom_df['year'].min()} to {freedom_df['year'].max()}"
print(year_summary)


Nr of countries: 195 - Countries: Afghanistan, Albania, Algeria, Andorra, Angola, Antigua and Barbuda, Argentina, Armenia, Australia, Austria, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Bhutan, Bolivia, Bosnia and Herzegovina, Botswana, Brazil, Brunei, Bulgaria, Burkina Faso, Burundi, Cabo Verde, Cambodia, Cameroon, Canada, Central African Republic, Chad, Chile, China, Colombia, Comoros, Congo (Brazzaville), Congo (Kinshasa), Costa Rica, Cote d'Ivoire, Croatia, Cuba, Cyprus, Czech Republic, Denmark, Djibouti, Dominica, Dominican Republic, Ecuador, Egypt, El Salvador, Equatorial Guinea, Eritrea, Estonia, Eswatini, Ethiopia, Fiji, Finland, France, Gabon, Georgia, Germany, Ghana, Greece, Grenada, Guatemala, Guinea, Guinea-Bissau, Guyana, Haiti, Honduras, Hungary, Iceland, India, Indonesia, Iran, Iraq, Ireland, Israel, Italy, Jamaica, Japan, Jordan, Kazakhstan, Kenya, Kiribati, Kosovo, Kuwait, Kyrgyzstan, Laos, Latvia, Lebanon, Lesotho, Liberia, Li

# Autocratic Regime Data

In [42]:
# Case list of the GWF autocratic regimes workbook: one row per regime case.
autocratic_cases_df = pd.read_excel(
    '../datasets/raw/democracy/autocratic/GWF Autocratic Regimes 1.2/GWF Autocratic Regimes.xlsx',
    sheet_name='Autocratic Regimes Case List',
    header=0,
)

# Parse both regime dates as day/month/year (pandas assumes month/day/year by default).
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
for date_col in ('gwf_startdate', 'gwf_enddate'):
    autocratic_cases_df[date_col] = pd.to_datetime(autocratic_cases_df[date_col], dayfirst=True, errors='coerce')

# The year columns may be read as floats (gwf_endyr is, because of missing values).
# Missing years are replaced by the sentinel -1 (for example the end year of a regime
# that is still ongoing), after which the column can be stored as plain integers.
for year_col in ('gwf_startyr', 'gwf_endyr'):
    autocratic_cases_df[year_col] = autocratic_cases_df[year_col].fillna(-1).astype(int)

autocratic_cases_df.to_csv('../datasets/processed/democracy/autocratic/autocratic-regime-case-list.csv', index=False)
autocratic_cases_df.head()

Unnamed: 0,cowcode,gwf_country,gwf_casename,gwf_startdate,gwf_enddate,gwf_startyr,gwf_endyr,gwf_subsreg,gwf_howend,gwf_violent,gwf_regimetype
0,700,Afghanistan,Afghanistan 29-73,1929-10-10,1973-07-17,1930,1973,2,5,2,monarchy
1,700,Afghanistan,Afghanistan 73-78,1973-07-17,1978-04-27,1974,1978,2,5,4,personal
2,700,Afghanistan,Afghanistan 78-92,1978-04-27,1992-04-16,1979,1992,3,6,4,party-personal
3,700,Afghanistan,Afghanistan 96-01,1996-09-27,2001-11-13,1997,2001,3,7,4,party-based
4,700,Afghanistan,Afghanistan 09-NA,2009-08-20,2010-12-31,2010,-1,0,0,0,personal


In [43]:
# TSCS data sheet of the same workbook: the autocratic regimes laid out over the years
# (one row per country and year).
autocratic_years_df = pd.read_excel(
    '../datasets/raw/democracy/autocratic/GWF Autocratic Regimes 1.2/GWF Autocratic Regimes.xlsx',
    sheet_name='TSCS data',
    header=0,
)

# Parse both regime dates as day/month/year (pandas assumes month/day/year by default).
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
for date_col in ('gwf_startdate', 'gwf_enddate'):
    autocratic_years_df[date_col] = pd.to_datetime(autocratic_years_df[date_col], dayfirst=True, errors='coerce')

autocratic_years_df.to_csv('../datasets/processed/democracy/autocratic/autocratic-regime-country-years.csv', index=False)
autocratic_years_df.head()

Unnamed: 0,cowcode,year,gwf_country,gwf_casename,gwf_startdate,gwf_enddate,gwf_spell,gwf_duration,gwf_fail,gwf_fail_subsregime,gwf_fail_type,gwf_fail_violent,gwf_regimetype,gwf_party,gwf_personal,gwf_military,gwf_monarch
0,40,1953,Cuba,Cuba 52-59,1952-10-03,1959-01-01,7,1,0,0,0,0,personal,0,1,0,0
1,40,1954,Cuba,Cuba 52-59,1952-10-03,1959-01-01,7,2,0,0,0,0,personal,0,1,0,0
2,40,1955,Cuba,Cuba 52-59,1952-10-03,1959-01-01,7,3,0,0,0,0,personal,0,1,0,0
3,40,1956,Cuba,Cuba 52-59,1952-10-03,1959-01-01,7,4,0,0,0,0,personal,0,1,0,0
4,40,1957,Cuba,Cuba 52-59,1952-10-03,1959-01-01,7,5,0,0,0,0,personal,0,1,0,0


In [44]:
# --- Case list summary ---
countries = autocratic_cases_df["gwf_country"].unique()

# Missing years in the case list are stored as the sentinel -1 (fillna(-1) when the
# sheet was cleaned). Exclude the sentinel so that it can never be reported as the
# first or last year of the covered period.
MISSING_YEAR = -1
case_start_years = autocratic_cases_df.loc[autocratic_cases_df['gwf_startyr'] != MISSING_YEAR, 'gwf_startyr']
case_end_years = autocratic_cases_df.loc[autocratic_cases_df['gwf_endyr'] != MISSING_YEAR, 'gwf_endyr']

print(f"Autocratic Regimes - Nr of countries: {len(countries)} - Countries: {', '.join(countries)}")
print(f"Autocratic Regimes - From years {case_start_years.min()} to {case_end_years.max()}")

# --- Country-year (TSCS) summary ---
# The 'year' column of this table is used as loaded, so no sentinel handling is needed.
countries = autocratic_years_df["gwf_country"].unique()

print(f"Autocratic Years - Nr of countries: {len(countries)} - Countries: {', '.join(countries)}")
print(f"Autocratic Years - From years {autocratic_years_df['year'].min()} to {autocratic_years_df['year'].max()}")


Autocratic Regimes - Nr of countries: 120 - Countries: Afghanistan, Albania, Algeria, Angola, Argentina, Armenia, Azerbaijan, Bangladesh, Belarus, Benin, Bolivia, Botswana, Brazil, Bulgaria, Burkina Faso, Burundi, Cambodia, Cameroon, Cen African Rep, Chad, Chile, China, Colombia, Congo-Brz, Congo/Zaire, Costa Rica, Cuba, Czechoslovakia, Dominican Rep, Ecuador, Egypt, El Salvador, Eritrea, Ethiopia, Gabon, Gambia, Georgia, Germany East, Ghana, Greece, Guatemala, Guinea, Guinea Bissau, Haiti, Honduras, Hungary, Indonesia, Iran, Iraq, Ivory Coast, Jordan, Kazakhstan, Kenya, Korea North, Korea South, Kuwait, Kyrgyzstan, Laos, Lesotho, Liberia, Libya, Madagascar, Malawi, Malaysia, Mali, Mauritania, Mexico, Mongolia, Morocco, Mozambique, Myanmar, Namibia, Nepal, Nicaragua, Niger, Nigeria, Oman, Pakistan, Panama, Paraguay, Peru, Philippines, Poland, Portugal, Romania, Russia, Rwanda, Saudi Arabia, Senegal, Serbia, Sierra Leone, Singapore, Somalia, South Africa, Soviet Union, Spain, Sri Lanka,