In [78]:
import pandas as pd
import plotly.express as px
import os

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Explore Data
- After imputation

In [79]:
# Read the imputed dataset from disk (the file name suggests a KNN-imputed wide table).
df_imputed = pd.read_csv('input/imputed/df_wide_knn_imputed.csv')

# Indicator codes used as targets: share held by the top decile and by the bottom decile.
target_top = "SI.DST.10TH.10"
target_bottom = "SI.DST.FRST.10"

# Columns that must not be used as features: identifiers, the two targets, and
# every other column whose code starts with "SI.DST". The targets end up listed
# twice (explicitly and via the prefix match); that is harmless because the list
# is only used for membership tests.
exclude_columns = ["Country Name", "Country Code", "Year", target_top, target_bottom]
wealth_share_columns = list(filter(lambda name: name.startswith("SI.DST"), df_imputed.columns))
exclude_columns.extend(wealth_share_columns)

# NOTE(review): 'SI.POV.GINI' is not excluded here although a later cell treats
# it as a candidate target - confirm that keeping it as a feature is intended.
feature_columns = []
for name in df_imputed.columns:
    if name not in exclude_columns:
        feature_columns.append(name)

X = df_imputed.loc[:, feature_columns]

print(
    f"Data Shape: {X.shape[0]:,} x {X.shape[1]:,}",
    f"Data elements: {X.size:,}",
    sep="\n",
)

Data Shape: 16,960 x 1,486
Data elements: 25,202,560


In [80]:
# Keep only the rows in which both target columns are present, then take the
# same feature columns as before so the two shapes can be compared.
df_dropna = df_imputed.dropna(axis=0, how='any', subset=[target_top, target_bottom])
X_dropna = df_dropna.loc[:, feature_columns]

print(
    f"Data Shape after dropping NAs in target cols: {X_dropna.shape[0]:,} rows x {X_dropna.shape[1]:,} cols",
    f"Data elements: {X_dropna.size:,}",
    sep="\n",
)

Data Shape after dropping NAs in target cols: 2,110 rows x 1,486 cols
Data elements: 3,135,460


In [76]:
# Different targets to consider

# Rows with both target shares present; .copy() so the derived columns below are
# written to an independent frame rather than to a slice of df_imputed.
df_imputed_dropna = df_imputed.dropna(subset=[target_top, target_bottom]).copy()

# Absolute gap between the top-decile and bottom-decile shares.
df_imputed_dropna['wealth_share'] = df_imputed_dropna[target_top] - df_imputed_dropna[target_bottom]

# Ratio of the top-decile share to the bottom-decile share. A zero denominator
# produces +/-inf; those are mapped to NaN and then filled with 0.
# NaN (not None) is used as the replacement value: how Series.replace treats an
# explicit None differs between pandas versions and can change the dtype.
# NOTE(review): filling with 0 makes an undefined/infinite ratio look like the
# smallest possible ratio - confirm this is the intended encoding.
df_imputed_dropna['wealth_share_ratio'] = (
    (df_imputed_dropna[target_top] / df_imputed_dropna[target_bottom])
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)

# Short alias for the GINI indicator column.
df_imputed_dropna['GINI'] = df_imputed_dropna['SI.POV.GINI']

df_imputed_dropna

Unnamed: 0,Country Name,Country Code,Year,AG.CON.FERT.PT.ZS,AG.CON.FERT.ZS,AG.LND.AGRI.K2,AG.LND.AGRI.ZS,AG.LND.ARBL.HA,AG.LND.ARBL.HA.PC,AG.LND.ARBL.ZS,...,per_si_allsi.ben_q1_tot,per_si_allsi.cov_pop_tot,per_si_allsi.cov_q1_tot,per_si_allsi.cov_q2_tot,per_si_allsi.cov_q3_tot,per_si_allsi.cov_q4_tot,per_si_allsi.cov_q5_tot,wealth_share,wealth_share_ratio,GINI
228,Albania,ALB,1996-12-31,216.666667,11.265165,11310.0,41.277372,577000.0,0.182132,21.058394,...,,,,,,,,16.8,5.307692,27.0
234,Albania,ALB,2002-12-31,,97.584775,11400.0,41.605839,578000.0,0.189445,21.094891,...,,,,,,,,21.8,7.228571,31.7
237,Albania,ALB,2005-12-31,,112.375316,10770.0,39.306569,538000.0,0.178649,19.635036,...,,,,,,,,20.8,6.942857,30.6
240,Albania,ALB,2008-12-31,,77.309738,11810.0,43.102190,610000.0,0.206968,22.262774,...,12.996090,45.938808,48.437339,46.191035,50.211532,43.711123,41.142729,20.7,6.594595,30.0
244,Albania,ALB,2012-12-31,,93.061557,12013.0,43.843066,619100.0,0.213453,22.594891,...,12.227686,36.255063,38.229705,37.608028,34.303977,36.513230,34.619776,19.2,6.189189,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16882,Zambia,ZMB,2010-12-31,,29.173529,234360.0,31.525848,3400000.0,0.243455,4.573642,...,1.856550,1.052573,0.172614,0.289908,1.099195,1.369165,2.331667,39.4,25.625000,52.0
16887,Zambia,ZMB,2015-12-31,,57.421535,237390.0,31.933440,3700000.0,0.225622,4.977199,...,0.037083,0.923653,0.198393,0.354823,0.550346,1.647529,1.865826,42.2,36.166667,55.8
16894,Zambia,ZMB,2022-12-31,,64.562639,,32.067959,,,5.111718,...,,,,,,,,37.6,26.066667,51.5
16947,Zimbabwe,ZWE,2011-12-31,398.225000,26.548333,164000.0,42.263348,4200000.0,0.308927,10.856921,...,1.047808,2.655040,0.446569,1.460003,1.907218,3.979413,5.480463,31.3,13.520000,43.2


In [81]:
# Readable aliases for the two target indicators and the GINI column.
df_imputed_dropna['top10pct_wealth_share'] = df_imputed_dropna[target_top]
df_imputed_dropna['bottom10pct_wealth_share'] = df_imputed_dropna[target_bottom]
df_imputed_dropna['GINI'] = df_imputed_dropna['SI.POV.GINI']

# Derived candidate targets: absolute gap and ratio between the two shares.
df_imputed_dropna['wealth_share'] = df_imputed_dropna[target_top] - df_imputed_dropna[target_bottom]

# The parentheses matter: the inf -> NaN -> 0 clean-up has to be applied to the
# ratio itself. Without them the method chain binds to the denominator only and
# any +/-inf produced by the division is left in the column.
# NaN (not None) is the replacement value because Series.replace handles an
# explicit None differently across pandas versions.
df_imputed_dropna['wealth_share_ratio'] = (
    (df_imputed_dropna[target_top] / df_imputed_dropna[target_bottom])
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)

df_targets = df_imputed_dropna[['top10pct_wealth_share', 'bottom10pct_wealth_share', 'SI.POV.GINI', 'wealth_share', 'wealth_share_ratio']]

# Calculate the correlation matrix
corr_matrix = df_targets.corr()

# Create a heatmap using plotly
fig = px.imshow(
    corr_matrix,
    text_auto=True,  # Show correlation values
    color_continuous_scale='RdBu_r',  # Red-Blue color scale (reversed)
    zmin=-1, zmax=1,  # Force scale to be from -1 to 1
    title='Correlation Matrix of Wealth Distribution Indicators'
)

fig.update_layout(
    width=700,
    height=600,
    coloraxis_colorbar=dict(
        # Nested title form: the flat `titleside` attribute is no longer
        # accepted by recent plotly releases, while title.text / title.side
        # work in both old and new versions.
        title=dict(text="Correlation", side="right")
    ),
    title_x=0.5,
    title_font=dict(size=16)
)

fig.show()

> We could potentially use the GINI index alone as a single target value. It correlates strongly with both the top 10pct and the bottom 10pct wealth share, and it captures the relationship between them better than the wealth share ratio does.

In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) for each candidate target column.
df_targets.describe()

Unnamed: 0,top10pct_wealth_share,bottom10pct_wealth_share,SI.POV.GINI,wealth_share,wealth_share_ratio
count,2110.0,2110.0,2110.0,2110.0,2110.0
mean,29.363981,2.580758,37.717014,26.783223,15.885794
std,6.794104,0.968146,8.828371,7.608026,16.506742
min,17.9,0.2,20.7,13.8,3.622642
25%,24.3,1.9,31.2,21.125,7.444519
50%,27.15,2.7,35.5,24.4,10.333333
75%,33.2,3.3,43.275,31.2,16.875
max,61.5,5.3,65.8,59.9,216.0


In [56]:
# Top-decile share against bottom-decile share, one marker per row, coloured by GINI.
fig = px.scatter(
    data_frame=df_imputed_dropna,
    x=target_top,
    y=target_bottom,
    color='GINI',
    color_continuous_scale=px.colors.sequential.Viridis,
    hover_name='Country Name',
    hover_data=['Country Code', 'Year'],
    labels={target_top: "Top 10% Wealth Share", target_bottom: "Bottom 10% Wealth Share", 'GINI': "GINI Index"},
    title="Top 10% vs Bottom 10% Wealth Share",
)

# Small markers, since every retained row is drawn.
fig.update_traces(marker={'size': 5})

# Centre the title and set axis / legend captions and font sizes.
fig.update_layout(
    title_x=0.5,
    title_y=0.95,
    title_font={'size': 20},
    xaxis_title="Top 10% Wealth Share",
    yaxis_title="Bottom 10% Wealth Share",
    legend_title_text='GINI Index',
    legend_title_font={'size': 14},
    font={'size': 12},
)
fig.show()

# Persist an interactive copy next to the other plots.
os.makedirs('output/plots', exist_ok=True)
fig.write_html('output/plots/top_10_vs_bottom_10_wealth_share.html')

In [60]:
# Absolute share gap against share ratio, coloured by GINI.
fig = px.scatter(
    data_frame=df_imputed_dropna,
    x='wealth_share',
    y='wealth_share_ratio',
    color='GINI',
    hover_name='Country Name',
    hover_data=['Country Code', 'Year'],
    labels={'wealth_share': 'Wealth Share', 'wealth_share_ratio': 'Wealth Share Ratio'},
    title='Wealth Share Gap vs. Wealth Share Ratio',
)

# Slightly transparent markers with a thin dark outline so overlapping points stay visible.
marker_style = {
    'size': 5,
    'opacity': 0.8,
    'line': {'width': 1, 'color': 'DarkSlateGrey'},
}
fig.update_traces(marker=marker_style)

fig.update_layout(
    title_x=0.5,
    title_y=0.95,
    title_font={'size': 20},
    xaxis_title='Wealth Share Gap',
    yaxis_title='Wealth Share Ratio',
    legend_title='GINI',
    font={'size': 12},
)
fig.show()

# t-SNE
- t-SNE is fit on the full feature matrix (all ~25M elements); rows without both target values are then filtered out so the plots display cleanly

In [62]:
# Extract feature matrix (all rows, including those without target values)
X = df_imputed[feature_columns]

# Standardize features so each column contributes on a comparable scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply t-SNE.
# The iteration count is left at the library default (1000, the value that was
# previously passed explicitly): the keyword for it was renamed from `n_iter` to
# `max_iter` in newer scikit-learn releases, so omitting it keeps this cell
# runnable on both old and new versions with the same setting.
# NOTE(review): the frame preview earlier in the notebook shows missing values in
# some feature columns; TSNE does not accept NaN input - confirm the feature
# matrix is complete before running this cell.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_scaled)

# Create DataFrame with t-SNE results. Reusing df_imputed's index makes the
# column assignments below align row-for-row by construction instead of relying
# on both frames happening to share a default index.
tsne_df = pd.DataFrame(
    data=X_tsne,
    columns=['t-SNE Component 1', 't-SNE Component 2'],
    index=df_imputed.index,
)

# Add metadata and target variables
tsne_df['Country Name'] = df_imputed['Country Name']
tsne_df['Country Code'] = df_imputed['Country Code']
tsne_df['Year'] = df_imputed['Year']
tsne_df[target_top] = df_imputed[target_top]
tsne_df[target_bottom] = df_imputed[target_bottom]

# Ratio of top to bottom share; +/-inf (zero denominator) -> NaN -> 0.
# NaN (not None) is the replacement value because Series.replace handles an
# explicit None differently across pandas versions.
tsne_df['wealth_share_ratio'] = (
    (df_imputed[target_top] / df_imputed[target_bottom])
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)
tsne_df['GINI'] = df_imputed['SI.POV.GINI']

# Remove rows with NAs in target columns; .copy() so later cells can add or
# overwrite columns on tsne_df without chained-assignment warnings.
tsne_df = tsne_df.dropna(subset=[target_top, target_bottom]).copy()

## Create t-SNE Visualization for Top 10% Income Share

In [13]:
# Make sure the folder for the t-SNE HTML exports exists.
os.makedirs('output/tsne', exist_ok=True)

# Embedding of every retained row, coloured by the top-decile share.
fig = px.scatter(
    data_frame=tsne_df,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_top,
    hover_data=['Country Name', 'Country Code', 'Year'],
    opacity=0.7,
    color_continuous_scale='viridis',
    labels={
        target_top: 'Top 10% Income Share',
        't-SNE Component 1': 't-SNE Component 1',
        't-SNE Component 2': 't-SNE Component 2',
    },
    title='t-SNE of Income Distribution Indicators (Top 10% Income Share)',
)
fig.update_traces(marker={'size': 8})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Top 10% Income Share'},
)

# Export the interactive figure (plotly.js loaded from the CDN), then render it inline.
fig.write_html('output/tsne/tsne_plot_top10_imputed.html', include_plotlyjs='cdn')
fig.show()

## Create t-SNE Visualization for Bottom 10% Income Share

In [14]:
# Embedding of every retained row, coloured by the bottom-decile share.
fig = px.scatter(
    data_frame=tsne_df,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_bottom,
    hover_data=['Country Name', 'Country Code', 'Year'],
    opacity=0.7,
    color_continuous_scale='viridis',
    labels={
        target_bottom: 'Bottom 10% Income Share',
        't-SNE Component 1': 't-SNE Component 1',
        't-SNE Component 2': 't-SNE Component 2',
    },
    title='t-SNE of Income Distribution Indicators (Bottom 10% Income Share)',
)
fig.update_traces(marker={'size': 8})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Bottom 10% Income Share'},
)

# Export the interactive figure (plotly.js loaded from the CDN), then render it inline.
fig.write_html('output/tsne/tsne_plot_bottom10_imputed.html', include_plotlyjs='cdn')
fig.show()

## Last Record by Country - Top 10% Income Share

In [15]:
# Get the most recent record for each country
tsne_df['Year'] = pd.to_datetime(tsne_df['Year'])
last_records = tsne_df.sort_values('Year').groupby('Country Code').last().reset_index()

# Visualization for top 10% income share (last record by country)
fig = px.scatter(
    last_records,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_top,
    color_continuous_scale='viridis',
    opacity=0.7,
    hover_data=['Country Name', 'Country Code', 'Year'],
    title='t-SNE of Income Distribution Indicators - Most Recent Data by Country (Top 10% Income Share)',
    labels={
        't-SNE Component 1': 't-SNE Component 1',
        't-SNE Component 2': 't-SNE Component 2',
        target_top: 'Top 10% Income Share'
    }
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(
    coloraxis_colorbar=dict(title='Top 10% Income Share'),
    height=700,
    width=900
)

# Save and display the figure
fig.write_html('output/tsne/tsne_plot_top10_last_record_by_country.html', include_plotlyjs='cdn')
fig.show()

## Last Record by Country - Bottom 10% Income Share

In [16]:
# One point per country (its most recent record), coloured by the bottom-decile share.
fig = px.scatter(
    data_frame=last_records,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_bottom,
    hover_data=['Country Name', 'Country Code', 'Year'],
    opacity=0.7,
    color_continuous_scale='viridis',
    labels={
        target_bottom: 'Bottom 10% Income Share',
        't-SNE Component 1': 't-SNE Component 1',
        't-SNE Component 2': 't-SNE Component 2',
    },
    title='t-SNE of Income Distribution Indicators - Most Recent Data by Country (Bottom 10% Income Share)',
)
fig.update_traces(marker={'size': 10})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Bottom 10% Income Share'},
)

# Export the interactive figure (plotly.js loaded from the CDN), then render it inline.
fig.write_html('output/tsne/tsne_plot_bottom10_last_record_by_country.html', include_plotlyjs='cdn')
fig.show()

# t-SNE GINI Index

In [64]:
# Embedding of every retained row, coloured by the GINI column.
fig = px.scatter(
    data_frame=tsne_df,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color='GINI',
    hover_data=['Country Name', 'Country Code', 'Year'],
    opacity=0.7,
    color_continuous_scale='viridis',
    labels={
        'GINI': 'GINI Index',
        't-SNE Component 1': 't-SNE Component 1',
        't-SNE Component 2': 't-SNE Component 2',
    },
    title='t-SNE of Income Distribution Indicators (GINI Index)',
)
fig.update_traces(marker={'size': 8})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'GINI Index'},
)

# Export the interactive figure (plotly.js loaded from the CDN), then render it inline.
fig.write_html('output/tsne/tsne_plot_GINI_imputed.html', include_plotlyjs='cdn')
fig.show()
