In [72]:
import re
import unicodedata

import country_converter as coco
import pandas as pd
import pycountry

# Load datasets

I'm loading all the tabular datasets to retrieve the countries available in each dataset and find the differences.

In [73]:
# --- Democracy datasets ---
dichotomous_df = pd.read_csv(
    '../datasets/processed/democracy/dichotomous/dichotomous-democracy-snapshot.csv',
    header=0,
)
freedom_world_df = pd.read_csv(
    '../datasets/processed/democracy/freedom-world/freedom-world-snapshot.csv',
    header=0,
)
lied_df = pd.read_csv(
    '../datasets/processed/democracy/lied/lied-snapshot.csv',
    header=0,
)
polity_df = pd.read_csv(
    '../datasets/processed/democracy/polity/polity5-snapshot.csv',
    header=0,
)

# --- Religion datasets ---
rdi_df = pd.read_csv(
    '../datasets/processed/religion/pew-research-center-religion-diversity/religious-diversity-index-extended.csv',
    header=0,
)
national_religion_df = pd.read_csv(
    '../datasets/processed/religion/world-religion-project/national-religion-dataset-snapshot.csv',
    header=0,
)

# --- Women's rights datasets ---
georgetown_df = pd.read_csv(
    '../datasets/processed/women/georgetown/women-peace-and-security-index.csv',
    header=0,
)
world_bank_df = pd.read_csv(
    '../datasets/processed/women/world-bank/women-business-and-the-law-snapshot.csv',
    header=0,
)

Now I'll get the countries available in each dataset.
To simplify the comparison I'll transform the country names to lowercase, remove special characters, remove extra spaces, and trim the spaces.



In [74]:
def normalize_country_name(country):
    """Return a comparison key for a country name.

    The value is converted to ``str``, lowercased, stripped of accents,
    stripped of every character that is not ``a-z``, ``0-9`` or whitespace,
    and finally has its whitespace collapsed to single spaces and trimmed.

    Args:
        country: The raw country name (any value; it is passed through ``str``).

    Returns:
        The normalized, ASCII-only name.
    """
    text = str(country).lower()
    # Decompose accented letters into base letter + combining mark so the
    # regex below drops only the mark ("côte" -> "cote") instead of deleting
    # the whole letter ("côte" -> "cte").
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return ' '.join(text.split())

# Democracy datasets: one set of normalized country names per dataset.
dichotomous_countries = {normalize_country_name(name) for name in dichotomous_df['country'].unique()}
freedom_world_countries = {normalize_country_name(name) for name in freedom_world_df['country'].unique()}
lied_countries = {normalize_country_name(name) for name in lied_df['countryn'].unique()}
polity_countries = {normalize_country_name(name) for name in polity_df['country_name'].unique()}

# Religion datasets.
rdi_countries = {normalize_country_name(name) for name in rdi_df['country'].unique()}
national_religion_countries = {normalize_country_name(name) for name in national_religion_df['country'].unique()}

# Women's rights datasets.
georgetown_countries = {normalize_country_name(name) for name in georgetown_df['country'].unique()}
world_bank_countries = {normalize_country_name(name) for name in world_bank_df['country'].unique()}

# Report the size and content of every set.
print(f"dichotomous_countries: {len(dichotomous_countries)} - {dichotomous_countries}")
print(f"freedom_world_countries: {len(freedom_world_countries)} - {freedom_world_countries}")
print(f"lied_countries: {len(lied_countries)} - {lied_countries}")
print(f"polity_countries: {len(polity_countries)} - {polity_countries}")
print(f"rdi_countries: {len(rdi_countries)} - {rdi_countries}")
print(f"national_religion_countries: {len(national_religion_countries)} - {national_religion_countries}")
print(f"georgetown_countries: {len(georgetown_countries)} - {georgetown_countries}")
print(f"world_bank_countries: {len(world_bank_countries)} - {world_bank_countries}")



dichotomous_countries: 191 - {'china', 'mali', 'congo rep', 'kyrgyzstan', 'maldives', 'croatia', 'comoros', 'bulgaria', 'chad', 'iceland', 'paraguay', 'cambodia', 'zambia', 'switzerland', 'central african rep', 'jamaica', 'san marino', 'uruguay', 'nicaragua', 'estonia', 'finland', 'myanmar', 'palau', 'norway', 'liechstenstein', 'hungary', 'slovenia', 'denmark', 'singapore', 'pakistan', 'malta', 'bolivia', 'tonga', 'suriname', 'rwanda', 'guyana', 'dominican republic', 'malawi', 'turkmenistan', 'austria', 'uganda', 'trinidad tobago', 'peru', 'vanuatu', 'slovakia', 'new zealand', 'gabon', 'portugal', 'ghana', 'congo dem rep', 'united kingdom', 'libya', 'guinea', 'tunisia', 'bangladesh', 'djibouti', 'madagascar', 'samoa western', 'mozambique', 'st vincent gren', 'cameroon', 'india', 'cape verde', 'papua new guinea', 'gambia', 'east timor', 'albania', 'benin', 'lebanon', 'saudi arabia', 'korea south', 'thailand', 'kazakhstan', 'bahrain', 'micronesia fed', 'sierra leone', 'barbados', 'philip

Now I'll get the countries available in all the datasets.

In [75]:
# Keep only the names that appear in every one of the eight datasets.
all_countries = set.intersection(
    dichotomous_countries,
    freedom_world_countries,
    lied_countries,
    polity_countries,
    rdi_countries,
    national_religion_countries,
    georgetown_countries,
    world_bank_countries,
)

print(f"all_countries: {len(all_countries)} - {all_countries}")

all_countries: 129 - {'china', 'mali', 'comoros', 'croatia', 'bulgaria', 'chad', 'paraguay', 'cambodia', 'zambia', 'switzerland', 'jamaica', 'estonia', 'uruguay', 'nicaragua', 'finland', 'norway', 'hungary', 'slovenia', 'denmark', 'singapore', 'pakistan', 'bolivia', 'suriname', 'rwanda', 'guyana', 'dominican republic', 'malawi', 'austria', 'uganda', 'peru', 'new zealand', 'gabon', 'portugal', 'ghana', 'united kingdom', 'tunisia', 'guinea', 'bangladesh', 'djibouti', 'madagascar', 'mozambique', 'cameroon', 'india', 'papua new guinea', 'albania', 'benin', 'lebanon', 'saudi arabia', 'thailand', 'kazakhstan', 'bahrain', 'sierra leone', 'philippines', 'liberia', 'algeria', 'niger', 'uzbekistan', 'kenya', 'bhutan', 'georgia', 'mauritius', 'jordan', 'japan', 'morocco', 'luxembourg', 'ukraine', 'tajikistan', 'moldova', 'zimbabwe', 'netherlands', 'kuwait', 'sweden', 'belgium', 'belarus', 'malaysia', 'guatemala', 'south africa', 'indonesia', 'namibia', 'burundi', 'lithuania', 'nepal', 'latvia', '

Many countries were left out. For each dataset, I'll check which of its countries did not make it into the intersection.

In [76]:
# For each dataset, list the names it contains that are absent from the
# all-datasets intersection (i.e. at least one other dataset lacks them).

# dichotomous
dichotomous_missing = dichotomous_countries.difference(all_countries)
print(f"dichotomous_missing: {len(dichotomous_missing)} - {dichotomous_missing}")

# freedom_world
freedom_world_missing = freedom_world_countries.difference(all_countries)
print(f"freedom_world_missing: {len(freedom_world_missing)} - {freedom_world_missing}")

# lied
lied_missing = lied_countries.difference(all_countries)
print(f"lied_missing: {len(lied_missing)} - {lied_missing}")

# polity
polity_missing = polity_countries.difference(all_countries)
print(f"polity_missing: {len(polity_missing)} - {polity_missing}")

# rdi
rdi_missing = rdi_countries.difference(all_countries)
print(f"rdi_missing: {len(rdi_missing)} - {rdi_missing}")

# national_religion
national_religion_missing = national_religion_countries.difference(all_countries)
print(f"national_religion_missing: {len(national_religion_missing)} - {national_religion_missing}")

# georgetown
georgetown_missing = georgetown_countries.difference(all_countries)
print(f"georgetown_missing: {len(georgetown_missing)} - {georgetown_missing}")

# world_bank
world_bank_missing = world_bank_countries.difference(all_countries)
print(f"world_bank_missing: {len(world_bank_missing)} - {world_bank_missing}")

dichotomous_missing: 62 - {'egypt', 'st lucia', 'maldives', 'kyrgyzstan', 'congo rep', 'grenada', 'monaco', 'seychelles', 'cuba', 'samoa western', 'venezuela', 'iceland', 'andorra', 'st vincent gren', 'bahamas', 'eritrea', 'cape verde', 'gambia', 'central african rep', 'east timor', 'turkey', 'myanmar', 'san marino', 'korea south', 'vietnam', 'palau', 'laos', 'united states of america', 'liechstenstein', 'nauru', 'micronesia fed', 'st kitts nevis', 'belize', 'malta', 'taiwan', 'russia', 'antigua', 'tonga', 'barbados', 'syria', 'cote divoire', 'marshall islands', 'swaziland', 'yemen', 'turkmenistan', 'sudan', 'czech republic', 'trinidad tobago', 'kiribati', 'vanuatu', 'sao tome principe', 'slovakia', 'iran', 'macedonia', 'korea north', 'dominica', 'congo dem rep', 'libya', 'guineabissau', 'bosnia', 'brunei', 'tuvalu'}
freedom_world_missing: 66 - {'liechtenstein', 'egypt', 'st lucia', 'maldives', 'kyrgyzstan', 'grenada', 'cabo verde', 'monaco', 'seychelles', 'cuba', 'st kitts and nevis',

# Normalize country ISO 3

## Dichotomous

I'll try to find the ISO Alpha 3 code for each country on each dataset.

In [77]:
# Single converter instance, reused by the later cells of this notebook.
cc = coco.CountryConverter()

# Convert the dataset's own abbreviations to ISO3 codes.
dichotomous_country_codes = cc.pandas_convert(
    series=dichotomous_df['abbreviation'],
    to='ISO3',
    not_found=None,
)

print(f"dichotomous_country_codes: {len(dichotomous_country_codes)} - {dichotomous_country_codes}")

dichotomous_country_codes: 191 - 0      USA
1      CAN
2      BHS
3      CUB
4      HTI
      ... 
186    NRU
187    MHL
188    PLW
189    FSM
190    WSM
Name: abbreviation, Length: 191, dtype: object


I'll clean up the values that aren't found.

In [78]:
# Add the "iso3" column only when it is not there yet, so re-running the cell
# against an already-updated CSV does nothing.
if 'iso3' not in dichotomous_df.columns:
    # Drop the row whose abbreviation is YUG
    # (presumably the former Yugoslavia, which has no ISO3 match here — confirm).
    dichotomous_df = dichotomous_df[dichotomous_df['abbreviation'] != 'YUG']

    # Rewrite the abbreviations the converter does not recognise.
    # The replacement is limited to the "abbreviation" column: a frame-wide
    # DataFrame.replace would also rewrite identical strings in other columns
    # such as "abbreviation_undp".
    abbreviation_fixes = {
        'ROM': 'ROU',
        'SWD': 'SWE',
        'ZAR': 'COD',
        'ETF': 'ETH',
        'TMP': 'TLS',
    }
    dichotomous_df = dichotomous_df.assign(
        abbreviation=dichotomous_df['abbreviation'].replace(abbreviation_fixes)
    )

    dichotomous_df.insert(0, 'iso3', cc.pandas_convert(series=dichotomous_df['abbreviation'], to='ISO3', not_found=None))

    # Persist the updated frame back to the snapshot it was loaded from.
    dichotomous_df.to_csv('../datasets/processed/democracy/dichotomous/dichotomous-democracy-snapshot.csv', index=False)

dichotomous_df.head()

Unnamed: 0,iso3,country,ccode,abbreviation,abbreviation_undp,democracy,democracy_trans,democracy_breakdowns,democracy_duration,democracy_omitteddata,democracy_femalesuffrage
0,USA,UNITED STATES OF AMERICA,2,USA,USA,1.0,0,0,206,1,1
1,CAN,CANADA,20,CAN,CAN,1.0,0,0,139,1,1
2,BHS,BAHAMAS,31,BHS,BHS,1.0,0,0,33,1,1
3,CUB,CUBA,40,CUB,CUB,0.0,0,2,53,0,0
4,HTI,HAITI,41,HTI,HAI,0.0,0,0,186,0,0


## Freedom World

In [79]:
# Add the "iso3" column (converted from the country name) only when it is
# missing, then persist the updated frame to the snapshot CSV.
if 'iso3' not in freedom_world_df.columns:
    freedom_world_df.insert(0, 'iso3', cc.pandas_convert(series=freedom_world_df['country'], to='ISO3', not_found=None))

    freedom_world_df.to_csv('../datasets/processed/democracy/freedom-world/freedom-world-snapshot.csv', index=False)

# Kept at the top level of the cell: an expression nested inside the `if`
# block is never rendered as the cell's output.
freedom_world_df.head()

## Lied


In [80]:
# Prepend an "iso3" column derived from the "countryn" names, unless the
# frame already carries one.
if 'iso3' not in lied_df.columns:
    lied_df.insert(
        loc=0,
        column='iso3',
        value=cc.pandas_convert(series=lied_df['countryn'], to='ISO3', not_found=None),
    )

# The snapshot is written on every run, whether or not the column was just added.
lied_df.to_csv(
    '../datasets/processed/democracy/lied/lied-snapshot.csv',
    index=False,
)

lied_df.head()

Unnamed: 0,iso3,countryn,cow,vdem,male_suffrage,female_suffrage,executive_elections,legislative_elections,multi-party_legislative_elections,competitive_elections,...,political_liberties,lexical_index_plus,democratic_transition,transition_type,democratic_breakdown,breakdown_type,turnover_period,turnover_event,two_turnover_period,sovereign
0,USA,United States,2,20,1,1,1,1,1,1,...,1,7,0,0,0,0,1,0,1,1
1,CAN,Canada,20,66,1,1,1,1,1,1,...,1,7,0,0,0,0,1,0,1,1
2,BHS,Bahamas,31,145,1,1,1,1,1,1,...,1,7,0,0,0,0,1,1,1,1
3,CUB,Cuba,40,155,1,1,1,1,0,0,...,0,1,0,0,0,0,0,0,0,1
4,HTI,Haiti,41,26,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Polity

In [81]:
# Prepend an "iso3" column derived from the "country_iso" codes, unless the
# frame already carries one.
if 'iso3' not in polity_df.columns:
    polity_df.insert(
        loc=0,
        column='iso3',
        value=cc.pandas_convert(series=polity_df['country_iso'], to='ISO3', not_found=None),
    )

# The snapshot is written on every run, whether or not the column was just added.
polity_df.to_csv(
    '../datasets/processed/democracy/polity/polity5-snapshot.csv',
    index=False,
)

polity_df.head()

Unnamed: 0,iso3,country_iso,country_name,autoc,democ,durable,exconst,exrec,parcomp,parreg,polcomp,polity,polity2,xconst,xrcomp,xropen,xrreg
0,AFG,AFG,Afghanistan,2,1,4.0,4,3,0,1,4,-1,-1.0,4,1,4,2
1,AGO,AGO,Angola,4,2,21.0,3,3,4,3,8,-2,-2.0,3,1,4,2
2,ALB,ALB,Albania,0,9,21.0,7,8,4,2,9,9,9.0,7,3,4,3
3,ARE,ARE,United Arab Emirates,8,0,47.0,3,2,1,4,1,-8,-8.0,3,1,2,3
4,ARG,ARG,Argentina,0,9,35.0,7,8,4,2,9,9,9.0,7,3,4,3


## Religious Diversity Index

In [82]:
# Add the "iso3" column (converted from the country name) only when it is
# missing, then persist the updated frame to the extended RDI CSV.
if 'iso3' not in rdi_df.columns:
    rdi_df.insert(0, 'iso3', cc.pandas_convert(series=rdi_df['country'], to='ISO3', not_found=None))

    rdi_df.to_csv('../datasets/processed/religion/pew-research-center-religion-diversity/religious-diversity-index-extended.csv', index=False)

# Kept at the top level of the cell: an expression nested inside the `if`
# block is never rendered as the cell's output.
rdi_df.head()


Channel Islands not found in regex
Netherlands Antilles not found in regex


In [83]:
# Remove the two entries the converter reported as not found, in one pass.
rdi_df = rdi_df[~rdi_df['country'].isin(['Channel Islands', 'Netherlands Antilles'])]

# Persist the filtered frame back to the extended RDI CSV.
rdi_df.to_csv(
    '../datasets/processed/religion/pew-research-center-religion-diversity/religious-diversity-index-extended.csv',
    index=False,
)

rdi_df.head()


Unnamed: 0,iso3,country,rdi,christian,muslim,unaffiliated,hindu,buddhist,folk,other,...,population,dominant_religion,maj_christian,maj_muslim,maj_unaffiliated,maj_hindu,maj_buddhist,maj_folk,maj_other,maj_jewish
0,AFG,Afghanistan,0.1,0.001,0.997,0.0,0.0,0.0,0.0,0.0,...,31410000,muslim,0,1,0,0,0,0,0,0
1,ALB,Albania,3.7,0.18,0.803,0.014,0.0,0.0,0.0,0.002,...,3200000,muslim,0,1,0,0,0,0,0,0
2,DZA,Algeria,0.5,0.002,0.979,0.018,0.0,0.0,0.0,0.0,...,35470000,muslim,0,1,0,0,0,0,0,0
3,ASM,American Samoa,0.4,0.983,0.0,0.007,0.0,0.003,0.004,0.003,...,70000,christian,1,0,0,0,0,0,0,0
4,AND,Andorra,2.2,0.895,0.008,0.088,0.005,0.0,0.0,0.001,...,80000,christian,1,0,0,0,0,0,0,0


## National Religion Dataset

In [84]:
# The dataset ships with its own "iso3" column; run it through the converter
# to check the codes are recognised.
cc.pandas_convert(
    series=national_religion_df['iso3'],
    to='ISO3',
    not_found=None,
)

0      USA
1      CAN
2      BHS
3      CUB
4      HTI
      ... 
187    NRU
188    MHL
189    PLW
190    FSM
191    WSM
Name: iso3, Length: 192, dtype: object

## Georgetown - Women Peace and Security Index

In [85]:
# Add the "iso3" column (converted from the country name) only when it is
# missing, then persist the updated frame to the index CSV.
if 'iso3' not in georgetown_df.columns:
    georgetown_df.insert(0, 'iso3', cc.pandas_convert(series=georgetown_df['country'], to='ISO3', not_found=None))

    georgetown_df.to_csv('../datasets/processed/women/georgetown/women-peace-and-security-index.csv', index=False)

# Kept at the top level of the cell: an expression nested inside the `if`
# block is never rendered as the cell's output.
georgetown_df.head()
    

## World Bank - Women Business and the Law

In [86]:
# Add the "iso3" column (converted from the dataset's "ISO Code" column) only
# when it is missing, then persist the updated frame to the snapshot CSV.
if 'iso3' not in world_bank_df.columns:
    world_bank_df.insert(0, 'iso3', cc.pandas_convert(series=world_bank_df['ISO Code'], to='ISO3', not_found=None))

    world_bank_df.to_csv('../datasets/processed/women/world-bank/women-business-and-the-law-snapshot.csv', index=False)

# Kept at the top level of the cell: an expression nested inside the `if`
# block is never rendered as the cell's output.
world_bank_df.head()

KSV not found in ISO3


In [87]:
# Kosovo: the dataset's "KSV" code was reported as not found, so assign "XKX"
# to the "iso3" column of the matching rows.
world_bank_df.loc[world_bank_df['ISO Code'].eq('KSV'), 'iso3'] = 'XKX'

# Persist the corrected frame back to the snapshot CSV.
world_bank_df.to_csv(
    '../datasets/processed/women/world-bank/women-business-and-the-law-snapshot.csv',
    index=False,
)

world_bank_df.head()


Unnamed: 0,iso3,country,ISO Code,wbl_index,mobility,woman_choose_residence,woman_travel_outside_home,woman_apply_passport,woman_travel_abroad,workplace,...,equal_property_rights,equal_inheritance_children,equal_inheritance_spouses,equal_asset_admin_marriage,value_nonmonetary_contributions,pension,equal_pension_age_full,equal_pension_age_partial,equal_retirement_age,pension_credit_childcare
0,AFG,Afghanistan,AFG,31.875,25,False,False,True,False,50,...,True,False,False,True,False,25,False,False,True,False
1,ALB,Albania,ALB,91.25,100,True,True,True,True,100,...,True,True,True,True,True,50,False,False,True,True
2,DZA,Algeria,DZA,57.5,75,True,True,False,True,75,...,True,False,False,True,False,25,False,False,False,True
3,AGO,Angola,AGO,79.375,100,True,True,True,True,100,...,True,True,True,True,True,25,False,True,False,False
4,ATG,Antigua and Barbuda,ATG,68.75,75,False,True,True,True,50,...,True,True,True,True,False,75,True,True,True,False


# Merge Countries using ISO3

In [88]:
# Democracy datasets: distinct ISO3 codes per dataset.
dichotomous_countries = set(pd.unique(dichotomous_df['iso3']))
freedom_world_countries = set(pd.unique(freedom_world_df['iso3']))
lied_countries = set(pd.unique(lied_df['iso3']))
polity_countries = set(pd.unique(polity_df['iso3']))

# Religion datasets.
rdi_countries = set(pd.unique(rdi_df['iso3']))
national_religion_countries = set(pd.unique(national_religion_df['iso3']))

# Women's rights datasets.
georgetown_countries = set(pd.unique(georgetown_df['iso3']))
world_bank_countries = set(pd.unique(world_bank_df['iso3']))

# ISO3 codes shared by every dataset.
all_countries = set.intersection(
    dichotomous_countries,
    freedom_world_countries,
    lied_countries,
    polity_countries,
    rdi_countries,
    national_religion_countries,
    georgetown_countries,
    world_bank_countries,
)

print(f"all_countries: {len(all_countries)} - {all_countries}")




all_countries: 154 - {'URY', 'MNG', 'COG', 'AZE', 'LBR', 'SEN', 'LKA', 'GBR', 'OMN', 'KEN', 'HTI', 'KGZ', 'SLV', 'BHR', 'KOR', 'BDI', 'CHE', 'JAM', 'VEN', 'VNM', 'ISR', 'MEX', 'ZAF', 'MUS', 'ALB', 'CYP', 'TJK', 'IRN', 'CPV', 'AUT', 'BRA', 'JPN', 'ECU', 'SVN', 'KWT', 'NGA', 'RUS', 'LVA', 'HRV', 'SGP', 'LBN', 'DJI', 'GTM', 'CZE', 'HND', 'MWI', 'NZL', 'SWE', 'UKR', 'IRL', 'BGD', 'SLE', 'MOZ', 'NAM', 'IDN', 'NOR', 'NLD', 'ARM', 'ETH', 'TLS', 'BEN', 'EST', 'ARG', 'PNG', 'TGO', 'SLB', 'POL', 'ESP', 'SOM', 'COM', 'FJI', 'CAF', 'MKD', 'AUS', 'KHM', 'NER', 'ZWE', 'TUN', 'PER', 'EGY', 'GUY', 'BEL', 'CIV', 'SAU', 'MLI', 'PHL', 'BFA', 'MAR', 'FRA', 'NPL', 'BTN', 'ZMB', 'PAK', 'AFG', 'ITA', 'BGR', 'PRT', 'BWA', 'IND', 'GNQ', 'GNB', 'GEO', 'PAN', 'THA', 'HUN', 'TUR', 'GHA', 'GAB', 'COL', 'LTU', 'TCD', 'TZA', 'LSO', 'DZA', 'LAO', 'BLR', 'MMR', 'RWA', 'CRI', 'USA', 'GMB', 'BOL', 'GIN', 'AGO', 'MDG', 'TTO', 'IRQ', 'LUX', 'CAN', 'PRY', 'SUR', 'SYR', 'DNK', 'KAZ', 'TWN', 'MDA', 'SWZ', 'JOR', 'QAT', 'UZB'

In [89]:
# For each dataset, list the ISO3 codes it contains that are absent from the
# all-datasets intersection (i.e. at least one other dataset lacks them).

# dichotomous
dichotomous_missing = dichotomous_countries.difference(all_countries)
print(f"dichotomous_missing: {len(dichotomous_missing)} - {dichotomous_missing}")

# freedom_world
freedom_world_missing = freedom_world_countries.difference(all_countries)
print(f"freedom_world_missing: {len(freedom_world_missing)} - {freedom_world_missing}")

# lied
lied_missing = lied_countries.difference(all_countries)
print(f"lied_missing: {len(lied_missing)} - {lied_missing}")

# polity
polity_missing = polity_countries.difference(all_countries)
print(f"polity_missing: {len(polity_missing)} - {polity_missing}")

# rdi
rdi_missing = rdi_countries.difference(all_countries)
print(f"rdi_missing: {len(rdi_missing)} - {rdi_missing}")

# national_religion
national_religion_missing = national_religion_countries.difference(all_countries)
print(f"national_religion_missing: {len(national_religion_missing)} - {national_religion_missing}")

# georgetown
georgetown_missing = georgetown_countries.difference(all_countries)
print(f"georgetown_missing: {len(georgetown_missing)} - {georgetown_missing}")

# world_bank
world_bank_missing = world_bank_countries.difference(all_countries)
print(f"world_bank_missing: {len(world_bank_missing)} - {world_bank_missing}")

dichotomous_missing: 37 - {'BIH', 'ISL', 'KIR', 'TUV', 'BRB', 'TKM', 'BLZ', 'PLW', 'SDN', 'BHS', 'LBY', 'GRD', 'STP', 'LCA', 'SYC', 'FSM', 'VUT', 'NRU', 'ERI', 'WSM', 'COD', 'CUB', 'AND', 'BRN', 'MLT', 'VCT', 'TON', 'ATG', 'KNA', 'PRK', 'LIE', 'YEM', 'MCO', 'DMA', 'SMR', 'MHL', 'MDV'}
freedom_world_missing: 41 - {'BIH', 'ISL', 'KIR', 'TUV', 'BRB', 'TKM', 'BLZ', 'SRB', 'PLW', 'SSD', 'SDN', 'BHS', 'LBY', 'GRD', 'STP', 'LCA', 'SYC', 'XKX', 'FSM', 'VUT', 'NRU', 'ERI', 'MHL', 'WSM', 'COD', 'CUB', 'AND', 'BRN', 'MLT', 'VCT', 'ATG', 'KNA', 'PRK', 'LIE', 'YEM', 'MNE', 'MCO', 'DMA', 'SMR', 'TON', 'MDV'}
lied_missing: 42 - {'BIH', 'ISL', 'KIR', 'TUV', 'BRB', 'PSE', 'TKM', 'BLZ', 'SRB', 'PLW', 'SSD', 'SDN', 'BHS', 'LBY', 'GRD', 'STP', 'LCA', 'SYC', 'XKX', 'FSM', 'VUT', 'NRU', 'ERI', 'WSM', 'COD', 'CUB', 'AND', 'BRN', 'MLT', 'VCT', 'TON', 'ATG', 'KNA', 'PRK', 'LIE', 'YEM', 'MNE', 'MCO', 'DMA', 'SMR', 'MHL', 'MDV'}
polity_missing: 7 - {'TKM', 'SRB', 'PRK', 'ERI', 'MNE', 'XKX', 'CUB'}
rdi_missing: 7

In [94]:
# Which dataset has the fewest countries outside the intersection?
missing_counts = dict(
    dichotomous=len(dichotomous_missing),
    freedom_world=len(freedom_world_missing),
    lied=len(lied_missing),
    polity=len(polity_missing),
    rdi=len(rdi_missing),
    national_religion=len(national_religion_missing),
    georgetown=len(georgetown_missing),
    world_bank=len(world_bank_missing),
)

print(missing_counts)

{'dichotomous': 37, 'freedom_world': 41, 'lied': 42, 'polity': 7, 'rdi': 76, 'national_religion': 38, 'georgetown': 23, 'world_bank': 36}


Polity seems to be the dataset with the fewest countries, and it rules out many important ones.

The scope of this research will exclude those countries for now.

When comparing specific datasets, I'll use all the countries available in both datasets.

In [97]:
# Translate every ISO3 code of the intersection into a readable country name.
country_names = []
for iso_code in sorted(all_countries):
    country = pycountry.countries.get(alpha_3=iso_code)
    # Fall back to the raw code when pycountry has no entry for it, so the
    # printed count always equals len(all_countries) instead of silently
    # shrinking.
    country_names.append(country.name if country else iso_code)
print(f"List of {len(country_names)} countries present in all datasets:")
# Some names contain commas themselves (e.g. "Bolivia, Plurinational State
# of"), so separate entries with "; " to keep the list unambiguous.
print("; ".join(country_names))

List of 154 countries present in all datasets:
Afghanistan, Angola, Albania, United Arab Emirates, Argentina, Armenia, Australia, Austria, Azerbaijan, Burundi, Belgium, Benin, Burkina Faso, Bangladesh, Bulgaria, Bahrain, Belarus, Bolivia, Plurinational State of, Brazil, Bhutan, Botswana, Central African Republic, Canada, Switzerland, Chile, China, Côte d'Ivoire, Cameroon, Congo, Colombia, Comoros, Cabo Verde, Costa Rica, Cyprus, Czechia, Germany, Djibouti, Denmark, Dominican Republic, Algeria, Ecuador, Egypt, Spain, Estonia, Ethiopia, Finland, Fiji, France, Gabon, United Kingdom, Georgia, Ghana, Guinea, Gambia, Guinea-Bissau, Equatorial Guinea, Greece, Guatemala, Guyana, Honduras, Croatia, Haiti, Hungary, Indonesia, India, Ireland, Iran, Islamic Republic of, Iraq, Israel, Italy, Jamaica, Jordan, Japan, Kazakhstan, Kenya, Kyrgyzstan, Cambodia, Korea, Republic of, Kuwait, Lao People's Democratic Republic, Lebanon, Liberia, Sri Lanka, Lesotho, Lithuania, Luxembourg, Latvia, Morocco, Moldo