# Similar Countries
This notebook is about trying to find countries that are somewhat similar to Australia

In [8]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn import preprocessing

In [9]:
base_data = pd.read_csv("cleaned.csv")

base_data.drop(["Indicator Code","Country Code"],inplace=True,axis=1)
    
# FILTER BY INDICATOR NAME
INDICATORS_WE_NO_LONGER_WANT_DUE_TO_MANY_COUNTRIES_NAN = [
    "Births attended by skilled health staff (% of total)",
    "Coverage of social insurance programs (% of population)",
    "Adequacy of unemployment benefits and ALMP (% of total welfare of beneficiary households)",
    "Gini index (World Bank estimate)",
    "Physicians (per 1,000 people)",
]
INDICATOR_WE_NO_LONGER_WANT_DUE_TO_AUSTRALIA_NAN = [
    "Population living in slums (% of urban population)",
    "Literacy rate, adult total (% of people ages 15 and above)",
    "Nurses and midwives (per 1,000 people)",

]
INDICATORS_WE_WANT = [
    "Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)",
    "Current health expenditure per capita (current US$)",
    "Current health expenditure (% of GDP)",
    "Current health expenditure per capita (current US$)",
    "GDP (current US$)",
    "GDP per capita (current US$)",
    "Hospital beds (per 1,000 people)",
    "Life expectancy at birth, total (years)",
    "People using at least basic drinking water services (% of population)",
    "People using at least basic sanitation services (% of population)",
    "People with basic handwashing facilities including soap and water (% of population)",
    "Population ages 65 and above (% of total population)",
    "Population density (people per sq. km of land area)",
    "Smoking prevalence, total (ages 15+)",
    "Urban population (% of total population)"
]

base_data = base_data[base_data["Indicator Name"].isin(INDICATORS_WE_WANT)]


df = pd.pivot_table(base_data,index=["Country Name"], columns="Indicator Name", values="2018")
non_filtered_data = pd.pivot_table(base_data,index=["Country Name"], columns="Indicator Name", values="2018",dropna=False)

# DISPLAY DATA COLUMNS WE WANT BUT NOT IN DF
#print(non_filtered_data.columns[~non_filtered_data.columns.isin(df.columns)])



# COLUMN INDEX SETTER

# MERGE RESPIRATORY DEATH
df["location"] = df.index
respiratory_df = pd.read_csv("respiratory.csv")[["location","val"]]
df = df.merge(respiratory_df,how="left").set_index("location",drop=False)
df.rename({"val":"Respiratory Death per 100k","location":"Country Name"},axis="columns",inplace=True)

print(df.columns)
merger = [
    ["beds.csv","Hospital beds (per 1,000 people)"],
    ["health.csv","Current health expenditure per capita (current US$)"],
    ["smoking.csv","Smoking prevalence, total (ages 15+)"]
]
for filename,column_name in merger:
    file_df = pd.read_csv(filename)[["Country Name",column_name]]
    file_df[column_name]=file_df[column_name].apply(lambda value: float(value) if value!=".." else np.nan)
    df = df.reset_index(drop=True).merge(file_df,how="left").set_index("Country Name",drop=False)

df.drop("Country Name",axis="columns",inplace=True)

Index(['GDP (current US$)', 'GDP per capita (current US$)',
       'Life expectancy at birth, total (years)',
       'Population ages 65 and above (% of total population)',
       'Population density (people per sq. km of land area)',
       'Urban population (% of total population)', 'Country Name',
       'Respiratory Death per 100k'],
      dtype='object')


In [10]:
df

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,1.948438e+10,524.162881,64.486000,2.584927,56.937760,25.495000,61.257492,0.400000,67.122650,
Albania,1.514702e+10,5284.380184,78.458000,13.744736,104.612263,60.319000,17.185137,2.600000,,28.700000
Algeria,1.737580e+11,4114.715061,76.693000,6.362497,17.730075,72.629000,14.936240,,258.494293,15.600000
American Samoa,6.360000e+08,11466.690706,,,277.325000,87.153000,22.482450,,,
Andorra,3.218316e+09,41793.055258,,,163.842553,88.062000,25.029809,,4040.786621,33.500000
...,...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,1.461590e+10,3198.866644,73.895000,3.133306,758.984551,76.164000,,,,
World,8.635707e+13,11374.846763,72.563274,8.873129,59.617881,55.270426,,2.704488,1061.146745,20.485071
"Yemen, Rep.",2.759126e+10,968.159048,66.096000,2.876270,53.977853,36.642000,,0.700000,,18.400000
Zambia,2.700524e+10,1556.334482,63.510000,2.099678,23.341479,43.521000,87.477320,,67.648666,13.800000


In [11]:
df.loc["Australia"]

GDP (current US$)                                       1.433904e+12
GDP per capita (current US$)                            5.739592e+04
Life expectancy at birth, total (years)                 8.274878e+01
Population ages 65 and above (% of total population)    1.565648e+01
Population density (people per sq. km of land area)     3.247871e+00
Urban population (% of total population)                8.601200e+01
Respiratory Death per 100k                              1.754891e+01
Hospital beds (per 1,000 people)                        3.800000e+00
Current health expenditure per capita (current US$)     5.331818e+03
Smoking prevalence, total (ages 15+)                    1.470000e+01
Name: Australia, dtype: float64

In [12]:
df.dtypes

GDP (current US$)                                       float64
GDP per capita (current US$)                            float64
Life expectancy at birth, total (years)                 float64
Population ages 65 and above (% of total population)    float64
Population density (people per sq. km of land area)     float64
Urban population (% of total population)                float64
Respiratory Death per 100k                              float64
Hospital beds (per 1,000 people)                        float64
Current health expenditure per capita (current US$)     float64
Smoking prevalence, total (ages 15+)                    float64
dtype: object

In [13]:
for column in list(df.columns):
    print(column)
    
    print(df[column].isna().sum())

GDP (current US$)
22
GDP per capita (current US$)
22
Life expectancy at birth, total (years)
19
Population ages 65 and above (% of total population)
24
Population density (people per sq. km of land area)
8
Urban population (% of total population)
3
Respiratory Death per 100k
89
Hospital beds (per 1,000 people)
103
Current health expenditure per capita (current US$)
32
Smoking prevalence, total (ages 15+)
77


In [14]:
#df.fillna(df.mean(),inplace=True)
CRITICAL_INDICATORS = [
    "GDP (current US$)",
    "GDP per capita (current US$)",
    "Life expectancy at birth, total (years)",
    "Population ages 65 and above (% of total population)",
    "Population density (people per sq. km of land area)",
    "Respiratory Death per 100k"
]

df = df[~df[CRITICAL_INDICATORS].isna().any(axis=1)]

In [15]:
print(df.shape)
df.head()

(157, 10)


Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,19484380000.0,524.162881,64.486,2.584927,56.93776,25.495,61.257492,0.4,67.12265,
Albania,15147020000.0,5284.380184,78.458,13.744736,104.612263,60.319,17.185137,2.6,,28.7
Algeria,173758000000.0,4114.715061,76.693,6.362497,17.730075,72.629,14.93624,,258.494293,15.6
Angola,101353200000.0,3289.646664,60.782,2.216374,24.713052,65.514,85.07669,,114.459641,
Antigua and Barbuda,1610574000.0,16726.980808,76.885,8.799826,218.831818,24.599,33.742485,2.1,673.85968,


In [16]:
standardised_df = df.apply(zscore)
standardised_df.head()

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)"
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,-0.285125,-0.714792,-1.090284,-0.99462,-0.232453,-1.472951,0.211383,,,
Albania,-0.288494,-0.481942,0.765381,0.730034,-0.162139,0.035241,-0.834787,,,
Algeria,-0.165277,-0.539157,0.530966,-0.410829,-0.290279,0.568374,-0.888171,,,
Angola,-0.221525,-0.579516,-1.582223,-1.051577,-0.27998,0.260231,0.776793,,,
Antigua and Barbuda,-0.29901,0.077783,0.556466,-0.03416,0.006321,-1.511756,-0.441756,,,


In [22]:
standardised_df["total_abs_score"] = standardised_df.apply(lambda row: np.sum(np.abs(row)), axis=1)

In [23]:
standardised_df.head(100)

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)",total_abs_score,distance to australia
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Seychelles,-0.299029,0.061339,0.019431,-0.220426,-0.006185,-0.121884,0.644285,,,,8.899884,-6.154724
Kazakhstan,-0.160941,-0.260440,0.060409,-0.251751,-0.306444,-0.089966,-0.601544,,,,9.258799,-5.795809
Trinidad and Tobago,-0.281766,0.097492,0.090956,0.264898,0.083156,-0.273769,-0.750260,,,,9.369601,-5.685008
Grenada,-0.299353,-0.227505,-0.041326,0.092886,0.167045,-1.006210,-0.151169,,,,9.512797,-5.541811
El Salvador,-0.279972,-0.541459,0.053237,-0.113399,0.140608,0.542129,-0.339554,,,,9.537661,-5.516947
...,...,...,...,...,...,...,...,...,...,...,...,...
Greece,-0.130800,0.253749,1.207622,1.952541,-0.193623,0.846807,0.129523,,,,12.241969,-2.812640
Angola,-0.221525,-0.579516,-1.582223,-1.051577,-0.279980,0.260231,0.776793,,,,12.279149,-2.775459
Guam,-0.295662,1.006478,0.951452,0.128122,0.136326,1.527711,-0.731308,,,,12.304362,-2.750246
Cameroon,-0.270201,-0.665371,-1.829388,-0.972374,-0.237753,-0.135613,0.774644,,,,12.412649,-2.641959


In [24]:
country_names = pd.Series(np.unique(base_data["Country Name"]))

excluded_country = country_names[~country_names.isin(standardised_df.index)]

for country in excluded_country:
    print(country)

American Samoa
Andorra
Arab World
Aruba
Bahamas, The
Bermuda
Bolivia
British Virgin Islands
Caribbean small states
Cayman Islands
Central Europe and the Baltics
Channel Islands
Congo, Dem. Rep.
Congo, Rep.
Cote d'Ivoire
Curacao
Czech Republic
Dominica
Early-demographic dividend
East Asia & Pacific
East Asia & Pacific (IDA & IBRD countries)
East Asia & Pacific (excluding high income)
Egypt, Arab Rep.
Eritrea
Euro area
Europe & Central Asia
Europe & Central Asia (IDA & IBRD countries)
Europe & Central Asia (excluding high income)
European Union
Faroe Islands
Fragile and conflict affected situations
French Polynesia
Gambia, The
Gibraltar
Greenland
Heavily indebted poor countries (HIPC)
High income
Hong Kong SAR, China
IBRD only
IDA & IBRD total
IDA blend
IDA only
IDA total
Iran, Islamic Rep.
Isle of Man
Korea, Dem. People’s Rep.
Korea, Rep.
Kosovo
Kyrgyz Republic
Lao PDR
Late-demographic dividend
Latin America & Caribbean
Latin America & Caribbean (excluding high income)
Latin America & t

In [27]:
australia_z_score = standardised_df.loc["Australia","total_abs_score"]
standardised_df["distance to australia"] = standardised_df["total_abs_score"] - australia_z_score

standardised_df.sort_values(by="distance to australia",inplace=True)
standardised_df

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)",total_abs_score,distance to australia
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Seychelles,-0.299029,0.061339,0.019431,-0.220426,-0.006185,-0.121884,0.644285,,,,8.899884,-6.154724
Kazakhstan,-0.160941,-0.260440,0.060409,-0.251751,-0.306444,-0.089966,-0.601544,,,,9.258799,-5.795809
Trinidad and Tobago,-0.281766,0.097492,0.090956,0.264898,0.083156,-0.273769,-0.750260,,,,9.369601,-5.685008
Grenada,-0.299353,-0.227505,-0.041326,0.092886,0.167045,-1.006210,-0.151169,,,,9.512797,-5.541811
El Salvador,-0.279972,-0.541459,0.053237,-0.113399,0.140608,0.542129,-0.339554,,,,9.537661,-5.516947
...,...,...,...,...,...,...,...,...,...,...,...,...
Lesotho,-0.298260,-0.680662,-2.522141,-0.636677,-0.214017,-1.357836,4.021779,,,,21.666812,6.612204
Central African Republic,-0.298536,-0.717150,-2.641673,-0.957399,-0.305382,-0.785681,5.426688,,,,25.870222,10.815614
Japan,3.548882,1.175084,1.529451,2.867596,0.195462,1.390681,0.944280,,,,27.427005,12.372397
China,10.493931,-0.252414,0.532427,0.293631,-0.097633,-0.015301,-0.872081,,,,30.144947,15.090338


In [26]:
standardised_df.tail(5)

Unnamed: 0_level_0,GDP (current US$),GDP per capita (current US$),"Life expectancy at birth, total (years)",Population ages 65 and above (% of total population),Population density (people per sq. km of land area),Urban population (% of total population),Respiratory Death per 100k,"Hospital beds (per 1,000 people)",Current health expenditure per capita (current US$),"Smoking prevalence, total (ages 15+)",total_abs_score,distance to australia
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Lesotho,-0.29826,-0.680662,-2.522141,-0.636677,-0.214017,-1.357836,4.021779,,,,21.666812,6.612204
Central African Republic,-0.298536,-0.71715,-2.641673,-0.957399,-0.305382,-0.785681,5.426688,,,,25.870222,10.815614
Japan,3.548882,1.175084,1.529451,2.867596,0.195462,1.390681,0.94428,,,,27.427005,12.372397
China,10.493931,-0.252414,0.532427,0.293631,-0.097633,-0.015301,-0.872081,,,,30.144947,15.090338
Singapore,-0.010327,2.497248,1.388054,0.37747,11.413274,1.753783,0.182881,,,,45.341808,30.2872
