# Similar Countries
This notebook is about trying to find countries that are somewhat similar to Australia

In [16]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn import preprocessing

In [17]:
base_data = pd.read_csv("./dataset/cleaned.csv")

base_data.drop(["Indicator Code","Country Code"],inplace=True,axis=1)
    
# FILTER BY INDICATOR NAME
INDICATORS_WE_NO_LONGER_WANT_DUE_TO_MANY_COUNTRIES_NAN = [
    "Births attended by skilled health staff (% of total)",
    "Coverage of social insurance programs (% of population)",
    "Adequacy of unemployment benefits and ALMP (% of total welfare of beneficiary households)",
    "Gini index (World Bank estimate)",
    "Physicians (per 1,000 people)",
]
INDICATOR_WE_NO_LONGER_WANT_DUE_TO_AUSTRALIA_NAN = [
    "Population living in slums (% of urban population)",
    "Literacy rate, adult total (% of people ages 15 and above)",
    "Nurses and midwives (per 1,000 people)",

]
INDICATORS_WE_WANT = [
    "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)",
    "Current health expenditure per capita (current US$)",
    "Current health expenditure (% of GDP)",
    "Current health expenditure per capita (current US$)",
    "GDP (current US$)",
    "GDP per capita (current US$)",
    "Hospital beds (per 1,000 people)",
    "Life expectancy at birth, total (years)",
    "People using at least basic drinking water services (% of population)",
    "People using at least basic sanitation services (% of population)",
    "People with basic handwashing facilities including soap and water (% of population)",
    "Population ages 65 and above (% of total population)",
    "Population density (people per sq. km of land area)",
    "Smoking prevalence, total (ages 15+)",
    "Urban population (% of total population)"
]

base_data = base_data[base_data["Indicator Name"].isin(INDICATORS_WE_WANT)]


df = pd.pivot_table(base_data,index=["Country Name"], columns="Indicator Name", values="2018")
non_filtered_data = pd.pivot_table(base_data,index=["Country Name"], columns="Indicator Name", values="2018",dropna=False)

# DISPLAY DATA COLUMNS WE WANT BUT NOT IN DF
#print(non_filtered_data.columns[~non_filtered_data.columns.isin(df.columns)])



# COLUMN INDEX SETTER

# MERGE RESPIRATORY DEATH
df["location"] = df.index
respiratory_df = pd.read_csv("./dataset/respiratory.csv")[["location","val"]]
df = df.merge(respiratory_df,how="left").set_index("location",drop=False)
df.rename({"val":"Respiratory Death per 100k","location":"Country Name"},axis="columns",inplace=True)

print(respiratory_df[respiratory_df["location"]=="United States"])
merger = [
    ["beds.csv","Hospital beds (per 1,000 people)"],
    ["health.csv","Current health expenditure per capita (current US$)"],
    ["smoking.csv","Smoking prevalence, total (ages 15+)"]
]
for filename,column_name in merger:
    file_df = pd.read_csv(f"./dataset/{filename}")[["Country Name",column_name]]
    file_df[column_name]=file_df[column_name].apply(lambda value: float(value) if value!=".." else np.nan)
    df = df.reset_index(drop=True).merge(file_df,how="left").set_index("Country Name",drop=False)

df.drop("Country Name",axis="columns",inplace=True)

         location        val
75  United States  24.607882


In [18]:
df

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,1.948438e+10,524.162881,64.486000,2.584927,56.937760,25.495000,61.257492,0.400000,67.122650,
Albania,1.514702e+10,5284.380184,78.458000,13.744736,104.612263,60.319000,17.185137,2.600000,,28.700000
Algeria,1.737580e+11,4114.715061,76.693000,6.362497,17.730075,72.629000,14.936240,,258.494293,15.600000
American Samoa,6.360000e+08,11466.690706,,,277.325000,87.153000,22.482450,,,
Andorra,3.218316e+09,41793.055258,,,163.842553,88.062000,25.029809,,4040.786621,33.500000
...,...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,1.461590e+10,3198.866644,73.895000,3.133306,758.984551,76.164000,,,,
World,8.635707e+13,11374.846763,72.563274,8.873129,59.617881,55.270426,,2.704488,1061.146745,20.485071
"Yemen, Rep.",2.759126e+10,968.159048,66.096000,2.876270,53.977853,36.642000,,0.700000,,18.400000
Zambia,2.700524e+10,1556.334482,63.510000,2.099678,23.341479,43.521000,87.477320,,67.648666,13.800000


In [19]:
df.loc["Australia"]
df.loc["United States"]

GDP (current US$)                                       2.052905e+13
GDP per capita (current US$)                            6.284002e+04
Life expectancy at birth, total (years)                 7.853902e+01
Population ages 65 and above (% of total population)    1.580765e+01
Population density (people per sq. km of land area)     3.571362e+01
Urban population (% of total population)                8.225600e+01
Respiratory Death per 100k                              2.460788e+01
Hospital beds (per 1,000 people)                        2.900000e+00
Current health expenditure per capita (current US$)     1.024614e+04
Smoking prevalence, total (ages 15+)                    2.180000e+01
Name: United States, dtype: float64

In [20]:
df.dtypes

GDP (current US$)                                       float64
GDP per capita (current US$)                            float64
Life expectancy at birth, total (years)                 float64
Population ages 65 and above (% of total population)    float64
Population density (people per sq. km of land area)     float64
Urban population (% of total population)                float64
Respiratory Death per 100k                              float64
Hospital beds (per 1,000 people)                        float64
Current health expenditure per capita (current US$)     float64
Smoking prevalence, total (ages 15+)                    float64
dtype: object

In [21]:
for column in list(df.columns):
    print(column)
    
    print(df[column].isna().sum())

GDP (current US$)
22
GDP per capita (current US$)
22
Life expectancy at birth, total (years)
19
Population ages 65 and above (% of total population)
24
Population density (people per sq. km of land area)
8
Urban population (% of total population)
3
Respiratory Death per 100k
88
Hospital beds (per 1,000 people)
103
Current health expenditure per capita (current US$)
32
Smoking prevalence, total (ages 15+)
77


In [22]:
#df.fillna(df.mean(),inplace=True)
CRITICAL_INDICATORS = [
    "GDP (current US$)",
    "GDP per capita (current US$)",
    "Life expectancy at birth, total (years)",
    "Population ages 65 and above (% of total population)",
    "Population density (people per sq. km of land area)",
    "Respiratory Death per 100k"
]

df = df[~df[CRITICAL_INDICATORS].isna().any(axis=1)]

In [23]:
print(df.shape)
df.head()

(158, 10)


Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,19484380000.0,524.162881,64.486,2.584927,56.93776,25.495,61.257492,0.4,67.12265,
Albania,15147020000.0,5284.380184,78.458,13.744736,104.612263,60.319,17.185137,2.6,,28.7
Algeria,173758000000.0,4114.715061,76.693,6.362497,17.730075,72.629,14.93624,,258.494293,15.6
Angola,101353200000.0,3289.646664,60.782,2.216374,24.713052,65.514,85.07669,,114.459641,
Antigua and Barbuda,1610574000.0,16726.980808,76.885,8.799826,218.831818,24.599,33.742485,2.1,673.85968,


In [24]:
#standardised_df = df.apply(zscore)
#standardised_df.head()
australia = df.loc["Australia"]
australia_standard = np.abs(df-australia)
australia_standard

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,1.414420e+12,56871.756585,18.26278,13.071548,53.689889,60.517,43.708583,3.4,5264.695221,
Albania,1.418757e+12,52111.539282,4.29078,1.911739,101.364392,25.693,0.363771,1.2,,14.0
Algeria,1.260146e+12,53281.204405,6.05578,9.293979,14.482204,13.383,2.612668,,5073.323578,0.9
Angola,1.332551e+12,54106.272802,21.96678,13.440102,21.465181,20.498,67.527781,,5217.358231,
Antigua and Barbuda,1.432294e+12,40668.938658,5.86378,6.856650,215.583947,61.413,16.193577,1.7,4657.958191,
...,...,...,...,...,...,...,...,...,...,...
Uruguay,1.374307e+12,40117.949356,4.97878,0.841956,16.460157,9.322,30.291577,0.8,3740.284668,2.1
Uzbekistan,1.383512e+12,55866.836603,11.17578,11.237337,74.222980,35.534,23.957852,0.5,5232.993294,2.1
Vanuatu,1.432998e+12,54300.217310,12.42578,12.016322,20.761973,60.738,35.567989,,5226.151344,3.9
Zambia,1.406899e+12,55839.584985,19.23878,13.556797,20.093608,42.491,69.928411,,5264.169205,0.9


In [25]:
australia_ranked = australia_standard.rank(axis=0)
australia_ranked

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,96.0,153.0,129.0,144.0,58.0,141.0,105.0,100.0,124.0,
Albania,103.0,88.0,38.0,19.0,94.0,79.0,3.0,37.0,,100.0
Algeria,42.0,100.0,56.0,80.0,25.0,51.0,21.0,,88.0,9.5
Angola,49.0,106.0,146.0,154.0,34.0,72.0,131.0,,109.0,
Antigua and Barbuda,143.0,42.0,51.0,60.0,125.0,143.0,71.0,48.0,51.0,
...,...,...,...,...,...,...,...,...,...,...
Uruguay,67.0,40.0,43.0,11.0,29.0,42.0,94.0,25.0,27.0,21.5
Uzbekistan,73.0,128.0,99.0,104.0,74.0,106.0,85.0,15.5,114.0,20.0
Vanuatu,150.0,112.0,108.0,115.0,33.0,142.0,95.0,,111.0,31.0
Zambia,85.0,126.0,137.0,155.0,32.0,115.0,132.0,,123.0,8.0


In [26]:
australia_ranked_with_ave = australia_ranked
australia_ranked_with_ave["average_rank"] = australia_ranked_with_ave.apply(lambda row: np.nanmean(row), axis=1)
australia_ranked_with_ave

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)",average_rank
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,96.0,153.0,129.0,144.0,58.0,141.0,105.0,100.0,124.0,,116.666667
Albania,103.0,88.0,38.0,19.0,94.0,79.0,3.0,37.0,,100.0,62.333333
Algeria,42.0,100.0,56.0,80.0,25.0,51.0,21.0,,88.0,9.5,52.500000
Angola,49.0,106.0,146.0,154.0,34.0,72.0,131.0,,109.0,,100.125000
Antigua and Barbuda,143.0,42.0,51.0,60.0,125.0,143.0,71.0,48.0,51.0,,81.555556
...,...,...,...,...,...,...,...,...,...,...,...
Uruguay,67.0,40.0,43.0,11.0,29.0,42.0,94.0,25.0,27.0,21.5,39.950000
Uzbekistan,73.0,128.0,99.0,104.0,74.0,106.0,85.0,15.5,114.0,20.0,81.850000
Vanuatu,150.0,112.0,108.0,115.0,33.0,142.0,95.0,,111.0,31.0,99.666667
Zambia,85.0,126.0,137.0,155.0,32.0,115.0,132.0,,123.0,8.0,101.444444


In [27]:
country_names = pd.Series(np.unique(base_data["Country Name"]))

excluded_country = country_names[~country_names.isin(australia_ranked_with_ave.index)]

for country in excluded_country:
    print(country)

American Samoa
Andorra
Arab World
Aruba
Bahamas, The
Bermuda
Bolivia
British Virgin Islands
Caribbean small states
Cayman Islands
Central Europe and the Baltics
Channel Islands
Congo, Dem. Rep.
Congo, Rep.
Cote d'Ivoire
Curacao
Czech Republic
Dominica
Early-demographic dividend
East Asia & Pacific
East Asia & Pacific (IDA & IBRD countries)
East Asia & Pacific (excluding high income)
Egypt, Arab Rep.
Eritrea
Euro area
Europe & Central Asia
Europe & Central Asia (IDA & IBRD countries)
Europe & Central Asia (excluding high income)
European Union
Faroe Islands
Fragile and conflict affected situations
French Polynesia
Gambia, The
Gibraltar
Greenland
Heavily indebted poor countries (HIPC)
High income
Hong Kong SAR, China
IBRD only
IDA & IBRD total
IDA blend
IDA only
IDA total
Iran, Islamic Rep.
Isle of Man
Korea, Dem. People’s Rep.
Korea, Rep.
Kosovo
Kyrgyz Republic
Lao PDR
Late-demographic dividend
Latin America & Caribbean
Latin America & Caribbean (excluding high income)
Latin America & t

In [28]:
australia_ranked_with_ave.sort_values(by="average_rank",inplace=True)
australia_ranked_with_ave

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)",average_rank
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Australia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.5,1.100000
Canada,5.0,11.0,13.0,16.0,8.0,17.0,42.0,33.5,9.0,4.0,15.850000
New Zealand,39.0,15.0,14.0,2.0,27.0,3.0,19.0,42.0,15.0,14.0,19.000000
Sweden,14.0,2.0,6.0,44.0,35.0,6.0,58.0,33.5,8.0,33.0,23.950000
Iceland,87.0,16.0,5.0,12.0,2.0,34.0,62.0,15.5,10.0,1.5,24.500000
...,...,...,...,...,...,...,...,...,...,...,...
Central African Republic,140.0,155.0,158.0,138.0,12.0,120.0,158.0,85.5,151.0,,124.166667
Sierra Leone,132.0,151.0,155.0,131.0,95.0,118.0,149.0,,126.0,80.0,126.333333
Chad,118.0,147.0,156.0,147.0,15.0,149.0,156.0,,147.0,,129.375000
Guinea-Bissau,146.0,144.0,153.0,139.0,65.0,116.0,142.0,,134.0,,129.875000


In [29]:
australia_ranked_with_ave.head(20)

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)",average_rank
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Australia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.5,1.1
Canada,5.0,11.0,13.0,16.0,8.0,17.0,42.0,33.5,9.0,4.0,15.85
New Zealand,39.0,15.0,14.0,2.0,27.0,3.0,19.0,42.0,15.0,14.0,19.0
Sweden,14.0,2.0,6.0,44.0,35.0,6.0,58.0,33.5,8.0,33.0,23.95
Iceland,87.0,16.0,5.0,12.0,2.0,34.0,62.0,15.5,10.0,1.5,24.5
Norway,19.0,24.0,2.0,14.0,17.0,16.0,83.0,14.0,19.0,41.0,24.9
Finland,32.0,7.0,18.0,56.0,26.0,5.0,25.0,51.0,13.0,46.0,27.9
Denmark,27.0,3.0,25.0,38.0,111.0,9.0,88.0,19.0,7.0,34.0,36.1
Saudi Arabia,9.0,34.0,71.0,122.0,20.0,10.0,4.0,48.0,38.0,9.5,36.55
Israel,24.0,17.0,4.0,31.0,145.0,29.0,10.0,19.0,17.0,83.0,37.9


In [31]:
australia_ranked_with_ave.head(21).to_csv("./intermediary_files/top_20_countries_and_Aus.csv")