In [None]:

%run ../../load_magic/storage.py
%run ../../load_magic/dataframes.py
%pprint
%who

In [None]:

# From https://www.heritage.org/index/ranking
# Read the locally saved copy of that page and keep the first HTML table found in it.
file_name = r'../data/html/capitalism_by_country.html'
capitalism_df = pd.read_html(file_name)[0]
# Headers are assigned by position, so the table must have exactly these four columns
# in this order (pandas raises on a length mismatch).
capitalism_df.columns = ['Rank', 'Country', 'Overall Capitalism', 'Change']
# Preview ten random rows, transposed; the sample is unseeded, so it differs between runs.
capitalism_df.sample(10).T

In [None]:

# From https://en.wikipedia.org/wiki/List_of_countries_by_income_equality
# Read the locally saved copy of that page and keep the first HTML table found in it.
# The table's own headers are kept as-is; later cells rely on its 'Country' column.
file_name = r'../data/html/income_inequality_by_country.html'
income_inequality_df = pd.read_html(file_name)[0]
# Preview ten random rows, transposed; the sample is unseeded, so it differs between runs.
income_inequality_df.sample(10).T

In [None]:

# Snapshot of the country names in the capitalism table as a plain list.
# It is not refreshed automatically when capitalism_df is edited later.
capitalism_country_list = capitalism_df['Country'].tolist()
len(capitalism_country_list)

In [None]:

# Snapshot of the country names in the income-inequality table as a plain list.
# It is not refreshed automatically when income_inequality_df is edited later.
income_inequality_country_list = income_inequality_df['Country'].tolist()
len(income_inequality_country_list)

In [None]:

# Country names spelled identically in both tables (before any spelling fixes).
common_country_list = list(
    set(capitalism_country_list) & set(income_inequality_country_list)
)
len(common_country_list)

In [None]:

%run ../../load_magic/lists.py
# Compare the two name lists and show the closest non-identical pairs, which are
# the likeliest spelling variants of the same country.
typos_df = check_for_typos(capitalism_country_list, income_inequality_country_list)
match_series = typos_df['max_similarity'].lt(1.0)
typos_df.loc[match_series].sort_values('max_similarity', ascending=False).head()

In [None]:

# Replace what appears to be a mis-encoded spelling in the capitalism table with
# an ASCII name.
# The boolean-mask assignment updates every matching row and is a no-op when nothing
# matches, so this cell can be re-run safely (indexing the first match raised IndexError
# once the rename had already been applied).
match_series = (capitalism_df['Country'] == 'SÃ£o TomÃ© and PrÃ\xadncipe')
capitalism_df.loc[match_series, 'Country'] = 'Sao Tome and Principe'

In [None]:

# Spell 'St. Lucia' as 'Saint Lucia' in the income-inequality table.
# The boolean-mask assignment updates every matching row and is a no-op when nothing
# matches, so this cell can be re-run safely (indexing the first match raised IndexError
# once the rename had already been applied).
match_series = (income_inequality_df['Country'] == 'St. Lucia')
income_inequality_df.loc[match_series, 'Country'] = 'Saint Lucia'

In [None]:

# Spell 'DR Congo' as 'Democratic Republic of Congo' in the income-inequality table.
# The boolean-mask assignment updates every matching row and is a no-op when nothing
# matches, so this cell can be re-run safely (indexing the first match raised IndexError
# once the rename had already been applied).
match_series = (income_inequality_df['Country'] == 'DR Congo')
income_inequality_df.loc[match_series, 'Country'] = 'Democratic Republic of Congo'

In [None]:

# Spell 'Congo, Republic of the' as 'Republic of Congo' in the income-inequality table.
# The boolean-mask assignment updates every matching row and is a no-op when nothing
# matches, so this cell can be re-run safely (indexing the first match raised IndexError
# once the rename had already been applied).
match_series = (income_inequality_df['Country'] == 'Congo, Republic of the')
income_inequality_df.loc[match_series, 'Country'] = 'Republic of Congo'

In [None]:

# Spell 'North Macedonia' as 'Macedonia' in the income-inequality table.
# The boolean-mask assignment updates every matching row and is a no-op when nothing
# matches, so this cell can be re-run safely (indexing the first match raised IndexError
# once the rename had already been applied).
match_series = (income_inequality_df['Country'] == 'North Macedonia')
income_inequality_df.loc[match_series, 'Country'] = 'Macedonia'

In [None]:

# Spell 'Kyrgyz Republic' as 'Kyrgyzstan' in the capitalism table.
# The boolean-mask assignment updates every matching row and is a no-op when nothing
# matches, so this cell can be re-run safely (indexing the first match raised IndexError
# once the rename had already been applied).
match_series = (capitalism_df['Country'] == 'Kyrgyz Republic')
capitalism_df.loc[match_series, 'Country'] = 'Kyrgyzstan'

In [None]:

# Show how the capitalism table currently spells Kyrgyzstan. Read the live column rather
# than capitalism_country_list, which is a snapshot that can predate the renames.
[cn for cn in capitalism_df['Country'] if 'kyrg' in str(cn).lower()]

In [None]:

# Show how the income-inequality table currently spells Kyrgyzstan. Read the live column
# rather than income_inequality_country_list, which is a snapshot that can predate the
# renames.
[cn for cn in income_inequality_df['Country'] if 'kyrg' in str(cn).lower()]

In [None]:

# Refresh both name lists from the current tables, then list every name that appears
# in exactly one of them.
capitalism_country_list = capitalism_df['Country'].tolist()
income_inequality_country_list = income_inequality_df['Country'].tolist()
uncommon_country_list = list(
    set(capitalism_country_list) ^ set(income_inequality_country_list)
)
# str() lets non-string entries (e.g. NaN) sort alongside the names.
sorted(map(str, uncommon_country_list))

In [None]:

# Refresh both name lists, then repeat the typo check with the argument order swapped.
capitalism_country_list = capitalism_df['Country'].tolist()
income_inequality_country_list = income_inequality_df['Country'].tolist()
typos_df = check_for_typos(income_inequality_country_list, capitalism_country_list)
match_series = typos_df['max_similarity'].lt(1.0)
typos_df.loc[match_series].sort_values('max_similarity', ascending=False).head()

In [None]:

# Keep only the countries present in both tables; any overlapping column names are
# disambiguated with the two suffixes.
merge_df = capitalism_df.merge(
    income_inequality_df,
    how='inner',
    on='Country',
    suffixes=('_capitalism', '_income_inequality'),
)
merge_df.sample(10).T

In [None]:

# Explanatory (x) and response (y) variables for the scatterplot
xname = 'Overall Capitalism'
yname = 'World Bank Gini %'

# Keep only the rows where the country and both variables are present
columns_list = ['Country', xname, yname]
df = merge_df.loc[:, columns_list].dropna()
xdata, ydata = df[xname], df[yname]

In [None]:

# First order (linear) scatterplot
import matplotlib.pyplot as plt
import seaborn as sns
import random
%matplotlib inline

# Regression scatterplot of the response against the explanatory variable.
fig1_fig = plt.figure(figsize=(12,8))
merge_axes_subplot = sns.regplot(x=xname, y=yname, scatter=True, data=df)
xlabel_str = 'Overall Capitalism (explanatory variable)'
ylabel_str = 'World Bank Gini % (response variable)'
xlabel_text = plt.xlabel(xlabel_str)
ylabel_text = plt.ylabel(ylabel_str)

# Shared look for every call-out: a yellow rounded box joined to its point by an arrow.
kwargs = {
    'textcoords': 'offset points',
    'ha': 'left',
    'va': 'bottom',
    'bbox': {'boxstyle': 'round,pad=0.5', 'fc': 'yellow', 'alpha': 0.5},
    'arrowprops': {'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
}

# Extremes on each axis; the points sitting at them get a call-out.
least_capitalist, most_capitalist = xdata.min(), xdata.max()
least_unequal, most_unequal = ydata.min(), ydata.max()

for label, x, y in zip(df['Country'], xdata, ydata):
    # The first matching rule wins, so each country gets at most one call-out.
    if x == least_capitalist:
        caption, offset = '{} (least capitalist)', (40, 10)
    elif x == most_capitalist:
        caption, offset = '{} (most capitalist)', (-120, 220)
    elif y == most_unequal:
        caption, offset = '{} (most unequal)', (45, 0)
    elif y == least_unequal:
        caption, offset = '{} (least unequal)', (-200, 0)
    elif label == 'United States':
        caption, offset = '{} (most evil)', (-75, 25)
    else:
        continue
    annotation = plt.annotate(caption.format(label), xy=(x, y), xytext=offset, **kwargs)

title_obj = fig1_fig.suptitle('"Wealth inequality is huge in the capitalist societies"',
                              fontsize=24)

In [None]:

import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from statsmodels.regression.quantile_regression import QuantReg

# Hand the merged table to store_objects under the keyword capitalism_gini_df.
# NOTE(review): store_objects is not defined in this notebook; presumably it comes from
# ../../load_magic/storage.py (run in the first cell) — confirm where and how it saves.
# At this point merge_df still carries its original (pre-rename) column headers.
store_objects(capitalism_gini_df=merge_df)


# Least Absolute Deviation

The LAD model is a special case of quantile regression where q=0.5

In [None]:

# Give the merged table formula-friendly snake_case headers. The names are assigned by
# position, so merge_df must have exactly these twelve columns in this order
# (NOTE(review): confirm against the merge output if either source table changes).
# 'world_bank_gini_year' previously contained a space, unlike its siblings, which made it
# unusable as a plain name in a model formula.
merge_df.columns = ['capitalism_rank', 'country_name', 'freedom_score',
                    'freedom_change_from_2018', 'un_10_percent_ratio', 'un_20_percent_ratio',
                    'world_bank_gini_index', 'world_bank_gini_year', 'cia_10_percent_ratio',
                    'cia_percent_ratio_year', 'cia_gini_percent', 'cia_gini_year']
explanatory_variable = 'freedom_score'
response_variable = 'world_bank_gini_index'

# Quantile regression of the Gini index on the freedom score; q=.5 (the median) is the
# Least Absolute Deviation fit.
inequality_qr = smf.quantreg('{} ~ {}'.format(response_variable, explanatory_variable), merge_df)
inequality_rrw = inequality_qr.fit(q=.5)
print(inequality_rrw.summary())


# Visualizing the results

We estimate the quantile regression model for many quantiles between .05 and .95, and compare the best-fit line from each of these models to the Ordinary Least Squares results.

# Prepare data for plotting

For convenience, we place the quantile regression results in a Pandas DataFrame, and the OLS results in a dictionary.

In [None]:

# Ten evenly spaced quantiles: .05, .15, ..., .95
quantiles_ndarray = np.arange(start=.05, stop=.96, step=.1)
def fit_model(q):
    """Fit the notebook's quantile regression at quantile ``q`` and summarize it.

    Relies on two notebook-level names: ``inequality_qr`` (the model whose
    ``fit(q=...)`` is called) and ``explanatory_variable`` (the name of the
    regressor whose coefficient is reported).

    Returns a dict with the keys ``quantile``, ``intercept``, ``slope``,
    ``lower_bound`` and ``upper_bound``, where the two bounds are the first
    and second entries of the slope's confidence-interval row.
    """
    fitted = inequality_qr.fit(q=q)
    intercept = fitted.params['Intercept']
    slope = fitted.params[explanatory_variable]
    slope_interval = fitted.conf_int().loc[explanatory_variable].tolist()

    return {
        'quantile': q,
        'intercept': intercept,
        'slope': slope,
        'lower_bound': slope_interval[0],
        'upper_bound': slope_interval[1],
    }
            
    
# One summary dict per quantile, in the same order as quantiles_ndarray
models_list = list(map(fit_model, quantiles_ndarray))

In [None]:

# Quantile-regression summaries as a table, one row per quantile
summary_columns = ['quantile', 'intercept', 'slope', 'lower_bound', 'upper_bound']
models_df = pd.DataFrame(models_list, columns=summary_columns)

# Ordinary least squares fit of the same formula, summarized with the same keys
ols_formula = '{} ~ {}'.format(response_variable, explanatory_variable)
rrw = smf.ols(ols_formula, merge_df).fit()
conf_int_list = rrw.conf_int().loc[explanatory_variable].tolist()
ols_dict = {
    'intercept': rrw.params['Intercept'],
    'slope': rrw.params[explanatory_variable],
    'lower_bound': conf_int_list[0],
    'upper_bound': conf_int_list[1],
}

print(ols_dict)
models_df


# First plot

This plot compares best fit lines for 10 quantile regression models to the least squares fit. We see that:

1. Wealth inequality decreases with freedom
2. The least squares estimates fit low inequality observations slightly poorly (i.e. the OLS line passes over low freedom countries)

In [None]:

# Show the rows holding the smallest and the largest value of the explanatory variable
explanatory_values = merge_df[explanatory_variable]
match_series = (
    (explanatory_values == explanatory_values.min())
    | (explanatory_values == explanatory_values.max())
)
merge_df.loc[match_series].T

In [None]:

# Two x positions (the observed minimum and maximum) are enough to draw each straight line
x = np.array([merge_df[explanatory_variable].min(), merge_df[explanatory_variable].max()])


def get_y(intercept, slope):
    """Return the fitted line's y values at the two x end points."""
    return intercept + slope * x


fig, ax = plt.subplots(figsize=(8, 6))

# One dotted line per fitted quantile
for i in range(len(models_df)):
    intercept_i, slope_i, quantile_i = models_df.loc[i, ['intercept', 'slope', 'quantile']]
    y = get_y(intercept_i, slope_i)
    ax.plot(x, y, linestyle='dotted', label='{:.2} Quantile'.format(quantile_i))

# The OLS line in solid red, then the raw observations
y = get_y(ols_dict['intercept'], ols_dict['slope'])
ax.plot(x, y, color='red', label='OLS')
ax.scatter(merge_df[explanatory_variable], merge_df[response_variable], alpha=.2, label='Countries')

legend = ax.legend()
xlabel_text = ax.set_xlabel('Freedom Index (explanatory variable)', fontsize=16)
ylabel_text = ax.set_ylabel('Gini Index (response variable)', fontsize=16)


# Second plot

The dotted black lines form a 95% point-wise confidence band around the 10 quantile regression estimates (solid black line). The red lines represent the OLS regression results along with their 95% confidence interval.

Only for the low-inequality observations do the quantile regression point estimates lie outside the OLS confidence interval, which suggests that the effect of freedom on inequality is mostly constant across the distribution.

In [None]:

n = models_df.shape[0]
quantile_axis = models_df['quantile']

# Quantile-regression slope (solid black) with its confidence bounds (dotted black)
dotted_black = {'linestyle': 'dotted', 'color': 'black'}
p1 = plt.plot(quantile_axis, models_df['slope'], color='black', label='Quantile Regression')
p2 = plt.plot(quantile_axis, models_df['upper_bound'], label='Upper Bound', **dotted_black)
p3 = plt.plot(quantile_axis, models_df['lower_bound'], label='Lower Bound', **dotted_black)

# OLS slope (solid red) and its confidence bounds (dotted red), drawn as horizontal lines
dotted_red = {'linestyle': 'dotted', 'color': 'red'}
slope = ols_dict['slope']
lower_bound = ols_dict['lower_bound']
upper_bound = ols_dict['upper_bound']
p4 = plt.plot(quantile_axis, [slope] * n, color='red', label='OLS')
p5 = plt.plot(quantile_axis, [lower_bound] * n, **dotted_red)
p6 = plt.plot(quantile_axis, [upper_bound] * n, **dotted_red)

plt.ylabel(r'$\beta_{{{}}}$'.format(explanatory_variable))
plt.xlabel('Quantiles of the conditional wealth inequality distribution')
legend_obj = plt.legend(loc='lower left')

In [None]:
%%javascript
// Copy the value of the page body's data-notebook-path attribute into the Python
// kernel as the variable `notebook_path`, by executing an assignment statement there.
var kernel = IPython.notebook.kernel;
var pathAttribute = document.body.attributes['data-notebook-path'];
var command = "notebook_path = " + "'" + pathAttribute.value + "'";
kernel.execute(command);

In [None]:
# %load ../../load_magic/nbviewer.py

from IPython.display import HTML

# Build the nbviewer link: everything after the first component of notebook_path is
# appended to the repository's nbviewer root.
nbviewer_root = 'https://nbviewer.jupyter.org/github/dbabbitt/notebooks/blob/master/'
path_parts = notebook_path.split('/')[1:]
notebook_viewer_url = nbviewer_root + '/'.join(path_parts)
html_str = 'Click <a href="{}" target="_blank">here</a> to view notebook in nbviewer.'
HTML(html_str.format(notebook_viewer_url))