# Income Inequality Analysis with Imputed Data

Recreating the t-SNE analysis for income inequality data with imputed values.

## 1. Import Libraries

In [None]:
import pandas as pd
import plotly.express as px
import os

from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

## 2. Load and Prepare Data

In [None]:
# Read the raw indicator export from disk
df = pd.read_csv('input/WDICSV.csv')

# Identifier columns; every other column is melted into (Year, Value) pairs
id_columns = ["Country Name", "Country Code", "Indicator Name", "Indicator Code"]
df_long = df.melt(id_vars=id_columns, var_name="Year", value_name="Value")

# Reshape to one row per (country, year) with one column per indicator code,
# then turn the row index back into ordinary columns
df_wide = df_long.pivot_table(
    values="Value",
    index=["Country Name", "Country Code", "Year"],
    columns="Indicator Code",
).reset_index()

## 3. Feature Selection and Target Variables

In [None]:
# Indicator codes of the two income-share targets
target_top = "SI.DST.10TH.10"     # Top 10% income share
target_bottom = "SI.DST.FRST.10"  # Bottom 10% income share

# Everything that is neither an identifier nor a target is used as a feature
exclude_columns = ["Country Name", "Country Code", "Year"] + [target_top, target_bottom]
feature_columns = [name for name in df_wide.columns if name not in exclude_columns]

## 4. Data Imputation

In [None]:
# Impute on a copy so df_wide keeps its original missing values
df_imputed = df_wide.copy()
target_cols = [target_top, target_bottom]

# Replace missing feature values with the per-column mean
feature_imputer = SimpleImputer(strategy='mean')
features_filled = feature_imputer.fit_transform(df_imputed[feature_columns])
df_imputed[feature_columns] = features_filled

# The two target columns get their own mean imputer
target_imputer = SimpleImputer(strategy='mean')
targets_filled = target_imputer.fit_transform(df_imputed[target_cols])
df_imputed[target_cols] = targets_filled

# Sanity check: count of NaN cells left anywhere in the imputed table
remaining_nans = df_imputed.isna().sum().sum()
print(f"NaN values in imputed dataset: {remaining_nans}")

## 5. Scale Features and Apply t-SNE

In [None]:
# Feature matrix: the imputed feature columns only (identifiers and targets excluded)
X = df_imputed[feature_columns]

# Standardize features before computing the embedding
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply t-SNE.
# The iteration count is deliberately left at the library default (1000):
# the `n_iter` keyword was renamed to `max_iter` in scikit-learn 1.5 and later
# removed, so passing either name explicitly breaks on some versions, while the
# default of 1000 is the same under both names.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_scaled)

# Build the result table on df_imputed's index so the metadata columns added
# below line up row-for-row with the embedding
tsne_df = pd.DataFrame(
    data=X_tsne,
    columns=['t-SNE Component 1', 't-SNE Component 2'],
    index=df_imputed.index,
)

# Add metadata and target variables
for meta_column in ['Country Name', 'Country Code', 'Year', target_top, target_bottom]:
    tsne_df[meta_column] = df_imputed[meta_column]

# Display the first few rows
tsne_df.head()

## 6. Create t-SNE Visualization for Top 10% Income Share

In [None]:
# Make sure the folder for the exported HTML plots is present
os.makedirs('output/tsne', exist_ok=True)

# Full embedding, coloured by the top 10% income share
top10_labels = {
    't-SNE Component 1': 't-SNE Component 1',
    't-SNE Component 2': 't-SNE Component 2',
    target_top: 'Top 10% Income Share',
}
fig = px.scatter(
    tsne_df,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_top,
    labels=top10_labels,
    hover_data=['Country Name', 'Country Code', 'Year'],
    title='t-SNE of Income Distribution Indicators (Top 10% Income Share)',
    opacity=0.7,
    color_continuous_scale='viridis',
)
fig.update_traces(marker={'size': 8})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Top 10% Income Share'},
)

# Export to a standalone HTML file (plotly.js loaded from the CDN), then render
fig.write_html('output/tsne/tsne_plot_top10_imputed.html', include_plotlyjs='cdn')
fig.show()

## 7. Create t-SNE Visualization for Bottom 10% Income Share

In [None]:
# Full embedding, coloured by the bottom 10% income share
bottom10_labels = {
    't-SNE Component 1': 't-SNE Component 1',
    't-SNE Component 2': 't-SNE Component 2',
    target_bottom: 'Bottom 10% Income Share',
}
fig = px.scatter(
    tsne_df,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_bottom,
    labels=bottom10_labels,
    hover_data=['Country Name', 'Country Code', 'Year'],
    title='t-SNE of Income Distribution Indicators (Bottom 10% Income Share)',
    opacity=0.7,
    color_continuous_scale='viridis',
)
fig.update_traces(marker={'size': 8})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Bottom 10% Income Share'},
)

# Export to a standalone HTML file (plotly.js loaded from the CDN), then render
fig.write_html('output/tsne/tsne_plot_bottom10_imputed.html', include_plotlyjs='cdn')
fig.show()

## 8. Last Record by Country - Top 10% Income Share

In [None]:
# Convert Year to datetime in place, order the rows chronologically, and keep
# the final entry of each country code
tsne_df['Year'] = pd.to_datetime(tsne_df['Year'])
chronological = tsne_df.sort_values('Year')
last_records = chronological.groupby('Country Code').last().reset_index()

# Latest record per country, coloured by the top 10% income share
last_top10_labels = {
    't-SNE Component 1': 't-SNE Component 1',
    't-SNE Component 2': 't-SNE Component 2',
    target_top: 'Top 10% Income Share',
}
fig = px.scatter(
    last_records,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_top,
    labels=last_top10_labels,
    hover_data=['Country Name', 'Country Code', 'Year'],
    title='t-SNE of Income Distribution Indicators - Most Recent Data by Country (Top 10% Income Share)',
    opacity=0.7,
    color_continuous_scale='viridis',
)
fig.update_traces(marker={'size': 10})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Top 10% Income Share'},
)

# Export to a standalone HTML file (plotly.js loaded from the CDN), then render
fig.write_html('output/tsne/tsne_plot_top10_last_record_by_country.html', include_plotlyjs='cdn')
fig.show()

## 9. Last Record by Country - Bottom 10% Income Share

In [None]:
# Latest record per country, coloured by the bottom 10% income share
last_bottom10_labels = {
    't-SNE Component 1': 't-SNE Component 1',
    't-SNE Component 2': 't-SNE Component 2',
    target_bottom: 'Bottom 10% Income Share',
}
fig = px.scatter(
    last_records,
    x='t-SNE Component 1',
    y='t-SNE Component 2',
    color=target_bottom,
    labels=last_bottom10_labels,
    hover_data=['Country Name', 'Country Code', 'Year'],
    title='t-SNE of Income Distribution Indicators - Most Recent Data by Country (Bottom 10% Income Share)',
    opacity=0.7,
    color_continuous_scale='viridis',
)
fig.update_traces(marker={'size': 10})
fig.update_layout(
    width=900,
    height=700,
    coloraxis_colorbar={'title': 'Bottom 10% Income Share'},
)

# Export to a standalone HTML file (plotly.js loaded from the CDN), then render
fig.write_html('output/tsne/tsne_plot_bottom10_last_record_by_country.html', include_plotlyjs='cdn')
fig.show()

## 10. Most Recent Year's Data Visualizations

In [None]:
# Make sure the output folder exists even when this cell is run on its own
os.makedirs('output/tsne', exist_ok=True)

# Select the latest calendar year present in the embedding table.
# NOTE: this is simply the maximum year; no check is made on how much
# observed (non-imputed) data that year actually contains.
# The year is derived through pd.to_datetime so the cell works whether 'Year'
# still holds the original year labels or was already converted to datetime
# by an earlier cell.
row_years = pd.to_datetime(tsne_df['Year']).dt.year
recent_year = row_years.max()
recent_data = tsne_df[row_years == recent_year]

# One plot per target: (colour column, display label, output file)
recent_plot_specs = [
    (target_top, 'Top 10% Income Share',
     'output/tsne/tsne_plot_recent_top10_imputed.html'),
    (target_bottom, 'Bottom 10% Income Share',
     'output/tsne/tsne_plot_recent_bottom10_imputed.html'),
]

for recent_target, recent_label, recent_path in recent_plot_specs:
    # Recent-year points of the embedding, coloured by the current target
    fig = px.scatter(
        recent_data,
        x='t-SNE Component 1',
        y='t-SNE Component 2',
        color=recent_target,
        color_continuous_scale='viridis',
        opacity=0.7,
        hover_data=['Country Name', 'Country Code'],
        title=f't-SNE of Income Distribution Indicators - {recent_year} Data ({recent_label})',
        labels={
            't-SNE Component 1': 't-SNE Component 1',
            't-SNE Component 2': 't-SNE Component 2',
            recent_target: recent_label
        }
    )
    fig.update_traces(marker=dict(size=10))
    fig.update_layout(
        coloraxis_colorbar=dict(title=recent_label),
        height=700,
        width=900
    )

    # Save and display the figure
    fig.write_html(recent_path, include_plotlyjs='cdn')
    fig.show()