In [236]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial import distance

from sklearn import preprocessing 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, confusion_matrix

# Economic Freedom Index Data

- https://www.heritage.org/index/
- The Interactive Heatmap from the EFI website seems to set a cutoff for economic prosperity at about 60.0. 
- I will be using that for my classifications

In [237]:
df = pd.read_csv('economic_freedom_index2019_data.csv', encoding = 'latin-1')
df
# maybe gdp per world rank?
# IIRC there's also a happiness ranking so maybe correlate those 2 datasets?

Unnamed: 0,CountryID,Country Name,WEBNAME,Region,World Rank,Region Rank,2019 Score,Property Rights,Judical Effectiveness,Government Integrity,...,Country,Population (Millions),"GDP (Billions, PPP)",GDP Growth Rate (%),5 Year GDP Growth Rate (%),GDP per Capita (PPP),Unemployment (%),Inflation (%),FDI Inflow (Millions),Public Debt (% of GDP)
0,1,Afghanistan,Afghanistan,Asia-Pacific,152.0,39.0,51.5,19.6,29.6,25.2,...,Afghanistan,35.5,$69.6,2.5,2.9,"$1,958",8.8,5.0,53.9,7.3
1,2,Albania,Albania,Europe,52.0,27.0,66.5,54.8,30.6,40.4,...,Albania,2.9,$36.0,3.9,2.5,"$12,507",13.9,2.0,1119.1,71.2
2,3,Algeria,Algeria,Middle East and North Africa,171.0,14.0,46.2,31.6,36.2,28.9,...,Algeria,41.5,$632.9,2.0,3.1,"$15,237",10.0,5.6,1203.0,25.8
3,4,Angola,Angola,Sub-Saharan Africa,156.0,33.0,50.6,35.9,26.6,20.5,...,Angola,28.2,$190.3,0.7,2.9,"$6,753",8.2,31.7,-2254.5,65.3
4,5,Argentina,Argentina,Americas,148.0,26.0,52.2,47.8,44.5,33.5,...,Argentina,44.1,$920.2,2.9,0.7,"$20,876",8.7,25.7,11857.0,52.6
5,6,Armenia,Armenia,Europe,47.0,24.0,67.7,57.2,46.3,38.6,...,Armenia,3.0,$28.3,7.5,3.6,"$9,456",18.2,0.9,245.7,53.5
6,7,Australia,Australia,Asia-Pacific,5.0,4.0,80.9,79.1,86.5,79.9,...,Australia,24.8,"$1,246.5",2.3,2.4,"$50,334",5.6,2.0,46368.0,41.6
7,8,Austria,Austria,Europe,31.0,16.0,72.0,84.2,71.3,77.4,...,Austria,8.8,$439.6,2.9,1.3,"$49,869",5.5,2.2,9629.6,78.8
8,9,Azerbaijan,Azerbaijan,Asia-Pacific,60.0,13.0,65.4,59.1,53.1,44.7,...,Azerbaijan,9.8,$171.8,0.1,1.2,"$17,492",5.0,13.0,2867.0,54.7
9,10,Bahamas,Bahamas,Americas,76.0,15.0,62.9,42.2,46.9,43.7,...,Bahamas,0.4,$11.6,1.3,-0.7,"$31,139",12.6,1.4,927.7,57.2


In [238]:
# WEBNAME is the same as Country Name with slight differences; so is Country
# CountryID is basically like index
df = df.drop(["WEBNAME", "CountryID", "Country Name"], axis = 1)

In [239]:
df.head()

Unnamed: 0,Region,World Rank,Region Rank,2019 Score,Property Rights,Judical Effectiveness,Government Integrity,Tax Burden,Gov't Spending,Fiscal Health,...,Country,Population (Millions),"GDP (Billions, PPP)",GDP Growth Rate (%),5 Year GDP Growth Rate (%),GDP per Capita (PPP),Unemployment (%),Inflation (%),FDI Inflow (Millions),Public Debt (% of GDP)
0,Asia-Pacific,152.0,39.0,51.5,19.6,29.6,25.2,91.7,80.3,99.3,...,Afghanistan,35.5,$69.6,2.5,2.9,"$1,958",8.8,5.0,53.9,7.3
1,Europe,52.0,27.0,66.5,54.8,30.6,40.4,86.3,73.9,80.6,...,Albania,2.9,$36.0,3.9,2.5,"$12,507",13.9,2.0,1119.1,71.2
2,Middle East and North Africa,171.0,14.0,46.2,31.6,36.2,28.9,76.4,48.7,18.7,...,Algeria,41.5,$632.9,2.0,3.1,"$15,237",10.0,5.6,1203.0,25.8
3,Sub-Saharan Africa,156.0,33.0,50.6,35.9,26.6,20.5,83.9,80.7,58.2,...,Angola,28.2,$190.3,0.7,2.9,"$6,753",8.2,31.7,-2254.5,65.3
4,Americas,148.0,26.0,52.2,47.8,44.5,33.5,69.3,49.5,33.0,...,Argentina,44.1,$920.2,2.9,0.7,"$20,876",8.7,25.7,11857.0,52.6


In [240]:
df.rename(columns={ 
    "2019 Score": "EFI Score 2019",
    "World Rank": "World EFI Rank",
    "Region Rank": "Regional EFI Rank",
    "Gov\'t Spending": "Govt Spending",
    "Gov't Expenditure % of GDP ": "Govt Expenditure % of GDP",
    "Investment Freedom ": "Investment Freedom"
}, inplace=True)

In [241]:
# Let's have our rankings in a separate dataframe 
rankings = df[['World EFI Rank','EFI Score 2019', 'Regional EFI Rank', 'Country', 'Region']]
rankings.head()

Unnamed: 0,World EFI Rank,EFI Score 2019,Regional EFI Rank,Country,Region
0,152.0,51.5,39.0,Afghanistan,Asia-Pacific
1,52.0,66.5,27.0,Albania,Europe
2,171.0,46.2,14.0,Algeria,Middle East and North Africa
3,156.0,50.6,33.0,Angola,Sub-Saharan Africa
4,148.0,52.2,26.0,Argentina,Americas


In [242]:
conditions = [
    (rankings["EFI Score 2019"] >= 60),
    (rankings["EFI Score 2019"] < 60)
]
choices = [1, 0]
rankings["Freedom"] = np.select(conditions, choices, default="None")
rankings.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,World EFI Rank,EFI Score 2019,Regional EFI Rank,Country,Region,Freedom
0,152.0,51.5,39.0,Afghanistan,Asia-Pacific,0
1,52.0,66.5,27.0,Albania,Europe,1
2,171.0,46.2,14.0,Algeria,Middle East and North Africa,0
3,156.0,50.6,33.0,Angola,Sub-Saharan Africa,0
4,148.0,52.2,26.0,Argentina,Americas,0


In [243]:
conditions = [
    (df["EFI Score 2019"] >= 60),
    (df["EFI Score 2019"] < 60)
]
choices = [1, 0]
df["Freedom"] = np.select(conditions, choices, default="None")

In [244]:
# Remove labels and just leave the features
data = df.drop(['World EFI Rank','EFI Score 2019', 'Regional EFI Rank', 'Freedom', 'Country', 'Region'], axis = 1)
data.head()

Unnamed: 0,Property Rights,Judical Effectiveness,Government Integrity,Tax Burden,Govt Spending,Fiscal Health,Business Freedom,Labor Freedom,Monetary Freedom,Trade Freedom,...,Govt Expenditure % of GDP,Population (Millions),"GDP (Billions, PPP)",GDP Growth Rate (%),5 Year GDP Growth Rate (%),GDP per Capita (PPP),Unemployment (%),Inflation (%),FDI Inflow (Millions),Public Debt (% of GDP)
0,19.6,29.6,25.2,91.7,80.3,99.3,49.2,60.4,76.7,66.0,...,25.6,35.5,$69.6,2.5,2.9,"$1,958",8.8,5.0,53.9,7.3
1,54.8,30.6,40.4,86.3,73.9,80.6,69.3,52.7,81.5,87.8,...,29.5,2.9,$36.0,3.9,2.5,"$12,507",13.9,2.0,1119.1,71.2
2,31.6,36.2,28.9,76.4,48.7,18.7,61.6,49.9,74.9,67.4,...,41.4,41.5,$632.9,2.0,3.1,"$15,237",10.0,5.6,1203.0,25.8
3,35.9,26.6,20.5,83.9,80.7,58.2,55.7,58.8,55.4,61.2,...,25.3,28.2,$190.3,0.7,2.9,"$6,753",8.2,31.7,-2254.5,65.3
4,47.8,44.5,33.5,69.3,49.5,33.0,56.4,46.9,60.2,70.0,...,41.0,44.1,$920.2,2.9,0.7,"$20,876",8.7,25.7,11857.0,52.6


In [245]:
data.columns.unique()

Index(['Property Rights', 'Judical Effectiveness', 'Government Integrity',
       'Tax Burden', 'Govt Spending', 'Fiscal Health', 'Business Freedom',
       'Labor Freedom', 'Monetary Freedom', 'Trade Freedom',
       'Investment Freedom', 'Financial Freedom', 'Tariff Rate (%)',
       'Income Tax Rate (%)', 'Corporate Tax Rate (%)', 'Tax Burden % of GDP',
       'Govt Expenditure % of GDP', 'Population (Millions)',
       'GDP (Billions, PPP)', 'GDP Growth Rate (%)',
       '5 Year GDP Growth Rate (%)', 'GDP per Capita (PPP)',
       'Unemployment (%)', 'Inflation (%)', 'FDI Inflow (Millions)',
       'Public Debt (% of GDP)'],
      dtype='object')

In [266]:
data.replace({ 
    '40.0 (2015 est.)': '40.0',
    '38,000 ppl.': 0.038,
    '6.1 CHF (2014 )': '6.1',
    '1,246.5 ': '1246.5'
    data['Property Rights'] = data['Property Rights'].fillna(0).astype(float)
data['Judical Effectiveness'] = data['Judical Effectiveness'].fillna(0).astype(float)
data['Government Integrity'] = data['Government Integrity'].fillna(0).astype(float)
data['Tax Burden'] = data['Tax Burden'].fillna(0).astype(float)
data['Govt Spending'] = data['Govt Spending'].fillna(0).astype(float)
data['Fiscal Health'] = data['Fiscal Health'].fillna(0).astype(float)
data['Business Freedom'] = data['Business Freedom'].fillna(0).astype(float)
data['Labor Freedom'] = data['Labor Freedom'].fillna(0).astype(float)
data['Monetary Freedom'] = data['Monetary Freedom'].fillna(0).astype(float)
data['Trade Freedom'] = data['Trade Freedom'].fillna(0).astype(float)
data['Investment Freedom'] = data['Investment Freedom'].fillna(0).astype(float)
data['Financial Freedom'] = data['Financial Freedom'].fillna(0).astype(float)
data['Financial Freedom'] = data['Financial Freedom'].fillna(0).astype(float)
data['Tariff Rate (%)'] = data['Tariff Rate (%)'].fillna(0).astype(float)
data['Income Tax Rate (%)'] = data['Income Tax Rate (%)'].fillna(0).astype(float)
data['Corporate Tax Rate (%)'] = data['Corporate Tax Rate (%)'].fillna(0).astype(float)
data['Tax Burden % of GDP'] = data['Tax Burden % of GDP'].fillna(0).astype(float)
data['Govt Expenditure % of GDP'] = data['Govt Expenditure % of GDP'].fillna(0).astype(float)
data['Population (Millions)'] = data['Population (Millions)'].fillna(0) # cannot convert to float
data['GDP (Billions, PPP)'] = data['GDP (Billions, PPP)'].str.strip('$').str.strip(' ').fillna(0).astype(float)
data['GDP (Billions, PPP)'].unique()


    }, inplace=True
)

SyntaxError: invalid syntax (<ipython-input-266-c08db43b809b>, line 6)

In [264]:
data['Property Rights'] = data['Property Rights'].fillna(0).astype(float)
data['Judical Effectiveness'] = data['Judical Effectiveness'].fillna(0).astype(float)
data['Government Integrity'] = data['Government Integrity'].fillna(0).astype(float)
data['Tax Burden'] = data['Tax Burden'].fillna(0).astype(float)
data['Govt Spending'] = data['Govt Spending'].fillna(0).astype(float)
data['Fiscal Health'] = data['Fiscal Health'].fillna(0).astype(float)
data['Business Freedom'] = data['Business Freedom'].fillna(0).astype(float)
data['Labor Freedom'] = data['Labor Freedom'].fillna(0).astype(float)
data['Monetary Freedom'] = data['Monetary Freedom'].fillna(0).astype(float)
data['Trade Freedom'] = data['Trade Freedom'].fillna(0).astype(float)
data['Investment Freedom'] = data['Investment Freedom'].fillna(0).astype(float)
data['Financial Freedom'] = data['Financial Freedom'].fillna(0).astype(float)
data['Financial Freedom'] = data['Financial Freedom'].fillna(0).astype(float)
data['Tariff Rate (%)'] = data['Tariff Rate (%)'].fillna(0).astype(float)
data['Income Tax Rate (%)'] = data['Income Tax Rate (%)'].fillna(0).astype(float)
data['Corporate Tax Rate (%)'] = data['Corporate Tax Rate (%)'].fillna(0).astype(float)
data['Tax Burden % of GDP'] = data['Tax Burden % of GDP'].fillna(0).astype(float)
data['Govt Expenditure % of GDP'] = data['Govt Expenditure % of GDP'].fillna(0).astype(float)
data['Population (Millions)'] = data['Population (Millions)'].fillna(0) # cannot convert to float




array(['69.6', '36.0', '632.9', '190.3', '920.2', '28.3', '1246.5',
       '439.6', '171.8', '11.6', '70.4', '687.1', '5.2', '178.9', '528.5',
       '3.2', '25.3', '7.0', '83.6', '44.6', '38.9', '3,240.3', '33.5',
       '153.1', '35.8', '328.7', '8.0', '64.3', '88.9', '1,769.3', '3.7',
       '3.4', '28.6', '451.1', '23,159.1', '714.0', '1.3', '68.5', '28.9',
       '83.9', '96.9', '101.3', '148.0', '31.6', '375.7', '286.8', '3.6',
       '0.8', '172.4', '192.6', '1,201.2', '57.0', '30.4', '9.4', '41.6',
       '11.3', '200.2', '8.7', '244.0', '2,835.8', '36.7', '39.7',
       '4,170.8', '133.7', '298.7', '137.8', '26.5', '3.1', '6.3', '19.9',
       '46.2', '454.9', '289.0', '17.6', '9,459.0', '3,242.8', '1,644.7',
       '658.8', '357.2', '316.5', '2,310.9', '26.1', '5,428.8', '89.1',
       '477.6', '163.1', '0.2', '40.0', '2,029.0', '19.6', '291.5',
       '23.0', '49.2', '53.9', '87.7', '6.1', '64.4', '91.2', '62.7',
       '71.8', '31.0', '22.4', '930.8', '6.9', '41.0', '19.3',

In [257]:
data['GDP (Billions, PPP)'].unique()

array(['69.6 ', '36.0 ', '632.9 ', '190.3 ', '920.2 ', '28.3 ', '1246.5',
       '439.6 ', '171.8 ', '11.6 ', '70.4 ', '687.1 ', '5.2 ', '178.9 ',
       '528.5 ', '3.2 ', '25.3 ', '7.0 ', '83.6 ', '44.6 ', '38.9 ',
       '3,240.3 ', '33.5 ', '153.1 ', '35.8 ', '328.7 ', '8.0 ', '64.3 ',
       '88.9 ', '1,769.3 ', '3.7 ', '3.4 ', '28.6 ', '451.1 ',
       '23,159.1 ', '714.0 ', '1.3 ', '68.5 ', '28.9 ', '83.9 ', '96.9 ',
       '101.3 ', '148.0 ', '31.6 ', '375.7 ', '286.8 ', '3.6 ', '0.8 ',
       '172.4 ', '192.6 ', '1,201.2 ', '57.0 ', '30.4 ', '9.4 ', '41.6 ',
       '11.3 ', '200.2 ', '8.7 ', '244.0 ', '2,835.8 ', '36.7 ', '39.7 ',
       '4,170.8 ', '133.7 ', '298.7 ', '137.8 ', '26.5 ', '3.1 ', '6.3 ',
       '19.9 ', '46.2 ', '454.9 ', '289.0 ', '17.6 ', '9,459.0 ',
       '3,242.8 ', '1,644.7 ', '658.8 ', '357.2 ', '316.5 ', '2,310.9 ',
       '26.1 ', '5,428.8 ', '89.1 ', '477.6 ', '163.1 ', '0.2 ', '40.0',
       '2,029.0 ', '19.6 ', '291.5 ', '23.0 ', '49.2 ', '53.9 ', '8

In [None]:
data['GDP (Billions, PPP)'] = data['GDP (Billions, PPP)'].str.strip('$').str.strip(' ')
data['GDP (Billions, PPP)'].unique()