# World Happiness Report Data Explorer
## Interactive Analysis with NumPy

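This notebook provides an interactive exploration of the World Happiness Report 2020 dataset using NumPy for data manipulation and analysis. The data are read from `WHR20_DataForFigure2.1.csv`, which the loading cell opens by relative path, so the file must be in the notebook's working directory.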

## 1. Setup and Data Loading

In [None]:
import numpy as np
import csv
from typing import Dict, List, Tuple

# Record the NumPy version in the output so results can be reproduced later.
print(f"NumPy version: {np.__version__}")

In [None]:
# Load data from CSV
def load_happiness_data(csv_file: str):
    """Load World Happiness Report data into NumPy arrays.

    The first two CSV columns are kept as text (country name and regional
    indicator); every remaining column is parsed as a float.

    Args:
        csv_file: Path to the CSV file. Its first row must be the header row.

    Returns:
        A 5-tuple ``(data, headers, country_names, regional_indicators,
        column_map)``:

        * ``data`` -- 2-D float array with one row per kept CSV row and one
          column per numeric CSV column. Cells that are empty, non-numeric,
          or absent because a row is shorter than the others are ``np.nan``.
        * ``headers`` -- list of header strings, in file order.
        * ``country_names`` / ``regional_indicators`` -- first / second field
          of every kept row.
        * ``column_map`` -- header name -> column position in the CSV
          (subtract 2 to obtain the column position in ``data``).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file contains no header row.
    """
    def _to_float(text):
        # Empty or non-numeric cells become NaN instead of aborting the load.
        try:
            return float(text)
        except ValueError:
            return np.nan

    # newline='' is what the csv module expects for correct handling of
    # line endings; 'utf-8-sig' reads plain UTF-8 too but drops a leading BOM
    # that would otherwise be glued onto the first header name.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.reader(f)
        headers = next(reader, None)
        if headers is None:
            raise ValueError(f"'{csv_file}' is empty: no header row found")

        # Create column mapping
        column_map = {header: i for i, header in enumerate(headers)}

        # Read data
        country_names = []
        regional_indicators = []
        rows = []

        for row in reader:
            # Skip blank lines and rows that carry no numeric fields at all.
            if len(row) <= 2:
                continue
            country_names.append(row[0])
            regional_indicators.append(row[1])
            rows.append([_to_float(val) for val in row[2:]])

    # Pad short rows with NaN so the result is always a rectangular float
    # array (ragged input would otherwise make np.array fail).
    n_header_numeric = max(len(headers) - 2, 0)
    width = max([n_header_numeric] + [len(r) for r in rows])
    padded = [r + [np.nan] * (width - len(r)) for r in rows]

    # The reshape only matters when there are no data rows: it keeps the
    # array 2-D so that data.shape[1] is still valid.
    data = np.array(padded, dtype=float).reshape(len(padded), width)

    return data, headers, country_names, regional_indicators, column_map

# Read the dataset once; the later cells all work from these five objects.
(data, headers, countries, regions, col_map) = load_happiness_data(
    'WHR20_DataForFigure2.1.csv'
)

# Quick sanity report: row count, feature count, a peek at the names, shape.
load_report = [
    f"Loaded {len(countries)} countries with {data.shape[1]} features",
    f"First 5 countries: {countries[:5]}",
    f"Data shape: {data.shape}",
]
print("\n\n".join(load_report))

## 2. Data Overview

In [None]:
# List every CSV column together with its position in the file
print("Available columns:")
print("=" * 60)
for position, column_title in enumerate(headers):
    print("{:2d}. {}".format(position, column_title))

In [None]:
# Show each distinct region and how many countries belong to it
unique_regions = list(set(regions))
print(f"\nNumber of unique regions: {len(unique_regions)}")
print("\nRegions:")
print("=" * 60)
ordered_regions = sorted(unique_regions)
for number, region_name in enumerate(ordered_regions, start=1):
    members = sum(1 for label in regions if label == region_name)
    print(f"{number:2d}. {region_name:40s} ({members} countries)")

## 3. Basic Statistical Analysis

In [None]:
# Helper function to get column index
def get_col_idx(column_name: str) -> int:
    """Return the column position of ``column_name`` inside ``data``.

    ``col_map`` stores positions in the raw CSV. The first two CSV columns
    are kept as text and are not part of the numeric ``data`` array, hence
    the offset of 2.

    Args:
        column_name: Exact header text of a numeric column.

    Returns:
        Zero-based column index into ``data``.

    Raises:
        KeyError: If the header is unknown, or if it names one of the two
            leading text columns (which have no column in ``data``).
    """
    if column_name not in col_map:
        raise KeyError(
            f"Unknown column {column_name!r}; available columns: {list(col_map)}"
        )

    idx = col_map[column_name] - 2
    if idx < 0:
        # A negative index would silently select a column counted from the
        # END of ``data`` and produce plausible-looking but wrong numbers.
        raise KeyError(
            f"Column {column_name!r} is a text column and is not part of the numeric data"
        )
    return idx

# Descriptive statistics for the Ladder score (the happiness score)
ladder_idx = get_col_idx('Ladder score')
ladder_scores = data[:, ladder_idx]

print("HAPPINESS SCORE (Ladder Score) STATISTICS")
print("=" * 60)

# NaN-aware reductions so countries without a score are ignored.
lowest_score = np.nanmin(ladder_scores)
highest_score = np.nanmax(ladder_scores)
summary_rows = [
    ("Mean:", np.nanmean(ladder_scores)),
    ("Median:", np.nanmedian(ladder_scores)),
    ("Std Dev:", np.nanstd(ladder_scores)),
    ("Min:", lowest_score),
    ("Max:", highest_score),
    ("Range:", highest_score - lowest_score),
]
for label, statistic in summary_rows:
    # Labels are padded to 12 characters so the numbers line up.
    print(f"{label:<12}{statistic:.4f}")

# Percentiles
print("\nPercentiles:")
for pct in (25, 50, 75, 90, 95):
    print(f"  {pct:2d}th percentile: {np.nanpercentile(ladder_scores, pct):.4f}")

## 4. Country Rankings

In [None]:
# Top 15 Happiest Countries
# np.argsort places NaN at the end, so simply reversing it would list any
# country WITHOUT a score first. Rank only the countries that have a score;
# when no score is missing this yields exactly the same order as
# np.argsort(ladder_scores)[::-1].
scored_indices = np.flatnonzero(~np.isnan(ladder_scores))
descending_order = np.argsort(ladder_scores[scored_indices])[::-1]
sorted_indices = scored_indices[descending_order]  # best score first

print("TOP 15 HAPPIEST COUNTRIES")
print("=" * 70)
print(f"{'Rank':<6} {'Country':<30} {'Region':<25} {'Score':<8}")
print("-" * 70)

for rank, idx in enumerate(sorted_indices[:15], 1):
    country = countries[idx]
    region = regions[idx]
    score = ladder_scores[idx]
    # Region is cut to 23 characters so it fits its 25-character column.
    print(f"{rank:<6} {country:<30} {region[:23]:<25} {score:.3f}")

In [None]:
# Bottom 15 Countries
print("BOTTOM 15 COUNTRIES BY HAPPINESS SCORE")
print("=" * 70)
print("{:<6} {:<30} {:<25} {:<8}".format('Rank', 'Country', 'Region', 'Score'))
print("-" * 70)

# Walk the ranking from its tail, so position 1 is the last entry of
# sorted_indices.
tail_first = sorted_indices[::-1][:15]
for position, country_idx in enumerate(tail_first, start=1):
    print(
        f"{position:<6} {countries[country_idx]:<30} "
        f"{regions[country_idx][:23]:<25} {ladder_scores[country_idx]:.3f}"
    )

## 5. Regional Analysis

In [None]:
# Compare happiness scores across regions
def analyze_by_region(column_name: str):
    """Summarise one numeric column separately for every region.

    Args:
        column_name: Header of the numeric column to summarise.

    Returns:
        A list of dicts, one per region in alphabetical order, with the keys
        ``'region'``, ``'mean'``, ``'median'``, ``'std'`` and ``'count'``.
        NaN entries are excluded from the statistics and from ``'count'``;
        a region with no non-NaN value is left out of the list.
    """
    column_values = data[:, get_col_idx(column_name)]

    summaries = []
    for region_name in sorted(set(regions)):
        # Boolean selector: True for the rows belonging to this region.
        in_region = np.fromiter(
            (label == region_name for label in regions),
            dtype=bool,
            count=len(regions),
        )
        observed = column_values[in_region]
        observed = observed[np.logical_not(np.isnan(observed))]

        if len(observed) == 0:
            continue

        summaries.append(dict(
            region=region_name,
            mean=np.mean(observed),
            median=np.median(observed),
            std=np.std(observed),
            count=len(observed),
        ))

    return summaries

# Regions ordered from the highest to the lowest mean ladder score
regional_happiness = sorted(
    analyze_by_region('Ladder score'),
    key=lambda entry: entry['mean'],
    reverse=True,
)

print("HAPPINESS SCORE BY REGION")
print("=" * 80)
print(f"{'Region':<35} {'Mean':<8} {'Median':<8} {'Std Dev':<8} {'Count':<8}")
print("-" * 80)

row_template = "{region:<35} {mean:7.3f}  {median:7.3f}  {std:7.3f}  {count:7d}"
for entry in regional_happiness:
    print(row_template.format(
        region=entry['region'][:33],  # cut to fit the 35-character column
        mean=entry['mean'],
        median=entry['median'],
        std=entry['std'],
        count=entry['count'],
    ))

## 6. Factor Analysis

In [None]:
# The six explanatory factors examined in this and the following sections
factors = [
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption'
]

print("KEY HAPPINESS FACTORS - STATISTICS")
print("=" * 80)
print(f"{'Factor':<35} {'Mean':<10} {'Std Dev':<10} {'Min':<10} {'Max':<10}")
print("-" * 80)

for factor_name in factors:
    column = data[:, get_col_idx(factor_name)]
    present = column[np.logical_not(np.isnan(column))]  # drop missing entries

    cells = [present.mean(), present.std(), present.min(), present.max()]
    print(f"{factor_name[:33]:<35} " + "  ".join(f"{cell:9.4f}" for cell in cells))

## 7. Correlation Analysis

In [None]:
# Calculate correlations with happiness score
ladder_idx = get_col_idx('Ladder score')
ladder_data = data[:, ladder_idx]

print("CORRELATION WITH HAPPINESS SCORE")
print("=" * 60)
print(f"{'Factor':<40} {'Correlation':<15}")
print("-" * 60)

correlations = []  # list of (factor name, Pearson r); reused by the summary cell
for factor in factors:
    col_idx = get_col_idx(factor)
    factor_data = data[:, col_idx]

    # Pairwise deletion: keep only rows where BOTH values are present.
    valid_mask = ~(np.isnan(ladder_data) | np.isnan(factor_data))

    # A correlation needs at least two paired observations.
    if np.sum(valid_mask) >= 2:
        corr = np.corrcoef(ladder_data[valid_mask], factor_data[valid_mask])[0, 1]
        correlations.append((factor, corr))

# Sort by correlation strength. An undefined (NaN) correlation, e.g. from a
# constant column, goes to the end so it cannot land in the "strongest" slots.
correlations.sort(key=lambda x: (bool(np.isfinite(x[1])), abs(x[1])), reverse=True)

for factor, corr in correlations:
    # int(nan) raises ValueError, so draw no bar for an undefined correlation.
    bar_length = int(abs(corr) * 30) if np.isfinite(corr) else 0
    # U+2588 FULL BLOCK, written as an escape so a wrong file encoding
    # cannot turn the glyph into mojibake again.
    bar = '\u2588' * bar_length
    print(f"{factor[:38]:<40} {corr:6.3f}  {bar}")

## 8. Correlation Matrix for Key Factors

In [None]:
# Create correlation matrix
selected_factors = [
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices'
]

# Extract data for selected factors: one column per factor. A distinct name
# is used so the 1-D `factor_data` of the previous section is not rebound to
# an array of a different shape.
factor_indices = [get_col_idx(f) for f in selected_factors]
matrix_data = data[:, factor_indices]

# Listwise deletion: drop every row that lacks any of the selected factors.
valid_rows = ~np.any(np.isnan(matrix_data), axis=1)
clean_data = matrix_data[valid_rows]

# np.corrcoef treats each ROW as one variable, hence the transpose.
corr_matrix = np.corrcoef(clean_data.T)

print("CORRELATION MATRIX")
print("=" * 100)

# Row labels are "N. " (3 characters) plus a 28-character name = 31 columns.
# The header label must use the same width, otherwise the column numbers sit
# one character to the left of the values beneath them.
label_width = 31

# Print header
print(f"{'Factor':<{label_width}}", end="")
for i in range(len(selected_factors)):
    print(f"  {i+1:5}", end="")
print()
print("-" * 100)

# Print matrix
for i, factor in enumerate(selected_factors):
    print(f"{i+1}. {factor[:27]:<28}", end="")
    for j in range(len(selected_factors)):
        print(f" {corr_matrix[i, j]:6.3f}", end="")
    print()

print("\nNote: Values close to 1 indicate strong positive correlation")
print("      Values close to -1 indicate strong negative correlation")
print("      Values close to 0 indicate weak or no correlation")

## 9. Country-Specific Analysis

In [None]:
# Analyze specific countries
def get_country_profile(country_name: str):
    """Print a happiness profile for one country.

    The profile shows the country's region, its rank and score on the ladder
    score, and its value and percentile for six key metrics.

    Args:
        country_name: Country name exactly as it appears in ``countries``.

    Returns:
        None. The profile is printed. If the country is not in the dataset a
        one-line message is printed instead.
    """
    # Only the lookup is expected to fail with ValueError. Keeping the try
    # this narrow stops unrelated errors further down from being reported
    # as "country not found".
    try:
        idx = countries.index(country_name)
    except ValueError:
        print(f"Country '{country_name}' not found in dataset.")
        return

    print(f"\nCOUNTRY PROFILE: {country_name.upper()}")
    print("=" * 70)
    print(f"Region: {regions[idx]}")
    print()

    # Get rank
    ladder_idx = get_col_idx('Ladder score')
    all_scores = data[:, ladder_idx]
    country_score = all_scores[idx]
    # Rank among countries that actually have a score.
    n_ranked = int(np.sum(~np.isnan(all_scores)))

    if np.isnan(country_score):
        # Every comparison against NaN is False, which would otherwise make
        # a country without a score look like rank #1.
        print("Happiness Rank: n/a (no ladder score recorded)")
        print("Happiness Score: n/a")
    else:
        rank = int(np.sum(all_scores > country_score)) + 1
        percentile = (1 - rank / n_ranked) * 100
        print(f"Happiness Rank: #{rank} out of {n_ranked} ({percentile:.1f}th percentile)")
        print(f"Happiness Score: {country_score:.3f}")
    print()

    # Key metrics
    print("Key Metrics:")
    print("-" * 70)
    key_metrics = [
        'Logged GDP per capita',
        'Social support',
        'Healthy life expectancy',
        'Freedom to make life choices',
        'Generosity',
        'Perceptions of corruption'
    ]

    for metric in key_metrics:
        col_idx = get_col_idx(metric)
        all_values = data[:, col_idx]
        value = all_values[idx]

        if np.isnan(value):
            # A missing value has no percentile; do not report it as 0.0.
            print(f"  {metric:<40} {'n/a':>8}")
            continue

        # Percentile = share of countries with a strictly lower value.
        clean_values = all_values[~np.isnan(all_values)]
        perc = (np.sum(clean_values < value) / len(clean_values)) * 100

        print(f"  {metric:<40} {value:8.3f}  ({perc:5.1f}th percentile)")

# Print a profile for a handful of example countries
sample_countries = ['Finland', 'United States', 'India', 'China', 'Brazil']

for profiled_country in sample_countries:
    get_country_profile(profiled_country)

## 10. Custom Analysis Functions

In [None]:
# Find countries similar to a given country
def find_similar_countries(country_name: str, n: int = 5, standardize: bool = True):
    """Print the countries whose happiness profile is closest to a given one.

    Similarity is the Euclidean distance over five factors: ladder score,
    logged GDP per capita, social support, healthy life expectancy and
    freedom to make life choices.

    Args:
        country_name: Country name exactly as it appears in ``countries``.
        n: Maximum number of similar countries to print. Values below 1
            print no countries.
        standardize: When True (default) every factor is converted to
            z-scores before distances are computed, so a factor with a large
            numeric range cannot dominate one with a small range. Pass False
            to measure distances on the raw values.

    Returns:
        None. Results are printed. Countries with a missing value in any of
        the five factors are never listed.
    """
    # Only the lookup is expected to fail with ValueError; keep the try narrow
    # so unrelated errors are not reported as "not found".
    try:
        idx = countries.index(country_name)
    except ValueError:
        print(f"Country '{country_name}' not found.")
        return

    key_factors = ['Ladder score', 'Logged GDP per capita', 'Social support',
                   'Healthy life expectancy', 'Freedom to make life choices']
    key_indices = [get_col_idx(f) for f in key_factors]
    profiles = data[:, key_indices]  # one row per country, one column per factor

    if standardize:
        center = np.nanmean(profiles, axis=0)
        spread = np.nanstd(profiles, axis=0)
        # A constant (or all-NaN) column has no usable spread; dividing by 1
        # avoids a division by zero and leaves that column contributing 0.
        spread[~(spread > 0)] = 1.0
        profiles = (profiles - center) / spread

    target = profiles[idx]

    print(f"\nCOUNTRIES MOST SIMILAR TO {country_name.upper()}")
    print("=" * 70)

    if np.any(np.isnan(target)):
        print(f"Cannot compare: '{country_name}' has missing values in the comparison factors.")
        return

    # Candidates: every OTHER country with a complete set of factors.
    complete = ~np.any(np.isnan(profiles), axis=1)
    complete[idx] = False
    candidates = np.flatnonzero(complete)

    distances = np.linalg.norm(profiles[candidates] - target, axis=1)
    # A stable sort keeps dataset order among equally distant countries.
    nearest = candidates[np.argsort(distances, kind='stable')[:max(n, 0)]]

    ladder_idx = get_col_idx('Ladder score')
    for i, country_idx in enumerate(nearest, 1):
        similar_country = countries[country_idx]
        score = data[country_idx, ladder_idx]  # raw score, not the z-score
        region = regions[country_idx]
        print(f"{i}. {similar_country:<30} Score: {score:.3f}  Region: {region}")

# Try the similarity search on two example countries, seven matches each
for example_country in ('India', 'United States'):
    find_similar_countries(example_country, 7)

## 11. Advanced Queries

In [None]:
# Find countries that excel in specific areas
def find_top_performers(metric: str, region: str = None, n: int = 10):
    """Print the countries with the highest values of one metric.

    Args:
        metric: Header of the numeric column to rank by.
        region: If given (and non-empty), only countries whose regional
            indicator equals this string are considered.
        n: Maximum number of countries to print. Values below 1 print no
            countries.

    Returns:
        None. The ranking is printed. Countries with a missing value for the
        metric are never listed. The heading states the number of countries
        actually shown, which is smaller than ``n`` when fewer qualify.
    """
    col_idx = get_col_idx(metric)
    values = data[:, col_idx]

    # Eligible rows: metric present and, if requested, region matches.
    eligible = ~np.isnan(values)
    if region:
        eligible &= np.array([r == region for r in regions], dtype=bool)
    valid_indices = np.flatnonzero(eligible)

    # Sort descending and keep at most n (never a negative slice bound).
    order = np.argsort(values[valid_indices])[::-1][:max(n, 0)]
    top_indices = valid_indices[order]

    region_str = f" in {region}" if region else ""

    if len(top_indices) == 0:
        # Say so explicitly instead of printing a heading with nothing under
        # it (typical cause: a misspelled region name).
        print(f"\nNo countries with '{metric}' data found{region_str}.")
        return

    print(f"\nTOP {len(top_indices)} COUNTRIES BY {metric.upper()}{region_str}")
    print("=" * 70)

    for rank, country_idx in enumerate(top_indices, 1):
        country = countries[country_idx]
        value = values[country_idx]
        country_region = regions[country_idx]
        print(f"{rank:2d}. {country:<30} {value:8.3f}  ({country_region})")

# Example queries: two global rankings and one restricted to a single region
example_queries = (
    dict(metric='Generosity', n=10),
    dict(metric='Social support', region='South Asia', n=5),
    dict(metric='Freedom to make life choices', n=10),
)
for query in example_queries:
    find_top_performers(**query)

## 12. Data Quality Check

In [None]:
# Check for missing values
print("DATA QUALITY REPORT")
print("=" * 70)
print(f"Total countries: {len(countries)}")
print(f"Total features: {data.shape[1]}")
print()

print("Missing Values by Column:")
print("-" * 70)

# Pair each numeric header with its data column; zip stops at the shorter of
# the two, so a header/data width mismatch cannot raise an IndexError.
any_missing = False
for header, col_data in zip(headers[2:], data.T):
    missing_count = int(np.sum(np.isnan(col_data)))

    if missing_count > 0:
        any_missing = True
        missing_pct = (missing_count / len(col_data)) * 100
        print(f"{header:<45} {missing_count:3d} ({missing_pct:5.1f}%)")

if not any_missing:
    # Make a clean result explicit instead of leaving the section blank.
    print("No missing values found.")

# Check for outliers
# Single source of truth for the cut-off used in the heading and the test.
Z_THRESHOLD = 2.5

print(f"\n\nPotential Outliers (Z-score > {Z_THRESHOLD}):")
print("-" * 70)

key_columns = ['Ladder score', 'Logged GDP per capita', 'Social support']

any_outliers = False
for col_name in key_columns:
    col_idx = get_col_idx(col_name)
    values = data[:, col_idx]

    mean = np.nanmean(values)
    std = np.nanstd(values)
    if not std > 0:
        # Constant or all-NaN column: z-scores are undefined (division by zero).
        continue
    z_scores = np.abs((values - mean) / std)

    # NaN z-scores compare False, so missing values are never flagged.
    outlier_indices = np.where(z_scores > Z_THRESHOLD)[0]

    if len(outlier_indices) > 0:
        any_outliers = True
        print(f"\n{col_name}:")
        for idx in outlier_indices:
            print(f"  {countries[idx]:<30} Value: {values[idx]:7.3f}, Z-score: {z_scores[idx]:.2f}")

if not any_outliers:
    print("No outliers found.")

## 13. Summary and Insights

In [None]:
# Final recap: global figures, regional extremes, strongest correlations
print("WORLD HAPPINESS REPORT 2020 - KEY INSIGHTS")
print("=" * 80)

# Global statistics
ladder_idx = get_col_idx('Ladder score')
ladder_scores = data[:, ladder_idx]

print("\n1. GLOBAL HAPPINESS OVERVIEW")
print("-" * 80)
print(f"   Average happiness score: {np.nanmean(ladder_scores):.3f}")

happiest_pos = np.nanargmax(ladder_scores)
happiest_score = np.nanmax(ladder_scores)
print(f"   Happiest country: {countries[happiest_pos]} ({happiest_score:.3f})")

unhappiest_pos = np.nanargmin(ladder_scores)
unhappiest_score = np.nanmin(ladder_scores)
print(f"   Least happy country: {countries[unhappiest_pos]} ({unhappiest_score:.3f})")

print(f"   Happiness gap: {happiest_score - unhappiest_score:.3f}")

# Regional insights
print("\n2. REGIONAL PATTERNS")
print("-" * 80)
regional_stats = sorted(
    analyze_by_region('Ladder score'),
    key=lambda entry: entry['mean'],
    reverse=True,
)
top_region = regional_stats[0]
bottom_region = regional_stats[-1]
print(f"   Happiest region: {top_region['region']} (avg: {top_region['mean']:.3f})")
print(f"   Least happy region: {bottom_region['region']} (avg: {bottom_region['mean']:.3f})")

# Factor insights: `correlations` comes from the correlation analysis cell
# and is already sorted there.
print("\n3. KEY DRIVERS OF HAPPINESS")
print("-" * 80)
print("   Strongest correlations with happiness:")
for driver_name, driver_corr in correlations[:3]:
    print(f"   - {driver_name}: r = {driver_corr:.3f}")

print("\n" + "=" * 80)

## Conclusion

This notebook demonstrates comprehensive data analysis using NumPy for:
- Data loading and preprocessing
- Statistical analysis
- Regional comparisons
- Correlation analysis
- Country-specific profiling
- Custom queries and insights

Feel free to modify the code cells above to explore other aspects of the data!