## 1. Imports and Configuration

In [1]:
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd

# Project paths
PROJECT_ROOT = Path.cwd()
RAW_DATA_DIR = PROJECT_ROOT / "raw_data"
PROCESSED_FILENAME = "happiness_combined_2015_2019.csv"
PROCESSED_DATA_PATH = PROJECT_ROOT / PROCESSED_FILENAME

# Columns to keep in final output
OUTPUT_COLUMNS = [
    "country",
    "region",
    "continent",
    "year",
    "happiness_score",
    "gdp_per_capita",
    "social_support",
    "healthy_life_expectancy",
    "freedom",
    "generosity",
    "corruption_perception",
]

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data directory: {RAW_DATA_DIR}")
print(f"Output file: {PROCESSED_DATA_PATH}")


Project root: /Users/elar.saks/Desktop/world_happiness_report_data_visualization
Raw data directory: /Users/elar.saks/Desktop/world_happiness_report_data_visualization/raw_data
Output file: /Users/elar.saks/Desktop/world_happiness_report_data_visualization/happiness_combined_2015_2019.csv


## 2. Column Normalization Mappings

Each year's dataset has different column names. These mappings standardize them to a consistent schema.

In [2]:
COLUMN_NORMALIZATION: Dict[int, Dict[str, str]] = {
    2015: {
        "Country": "country",
        "Region": "region",
        "Happiness Rank": "happiness_rank",
        "Happiness Score": "happiness_score",
        "Economy (GDP per Capita)": "gdp_per_capita",
        "Family": "social_support",
        "Health (Life Expectancy)": "healthy_life_expectancy",
        "Freedom": "freedom",
        "Generosity": "generosity",
        "Trust (Government Corruption)": "corruption_perception",
        "Standard Error": "standard_error",
        "Dystopia Residual": "dystopia_residual",
    },
    2016: {
        "Country": "country",
        "Region": "region",
        "Happiness Rank": "happiness_rank",
        "Happiness Score": "happiness_score",
        "Economy (GDP per Capita)": "gdp_per_capita",
        "Family": "social_support",
        "Health (Life Expectancy)": "healthy_life_expectancy",
        "Freedom": "freedom",
        "Generosity": "generosity",
        "Trust (Government Corruption)": "corruption_perception",
        "Lower Confidence Interval": "whisker_low",
        "Upper Confidence Interval": "whisker_high",
        "Dystopia Residual": "dystopia_residual",
    },
    2017: {
        "Country": "country",
        "Happiness.Rank": "happiness_rank",
        "Happiness.Score": "happiness_score",
        "Economy..GDP.per.Capita.": "gdp_per_capita",
        "Family": "social_support",
        "Health..Life.Expectancy.": "healthy_life_expectancy",
        "Freedom": "freedom",
        "Generosity": "generosity",
        "Trust..Government.Corruption.": "corruption_perception",
        "Whisker.high": "whisker_high",
        "Whisker.low": "whisker_low",
        "Dystopia.Residual": "dystopia_residual",
    },
    2018: {
        "Country or region": "country",
        "Overall rank": "happiness_rank",
        "Score": "happiness_score",
        "GDP per capita": "gdp_per_capita",
        "Social support": "social_support",
        "Healthy life expectancy": "healthy_life_expectancy",
        "Freedom to make life choices": "freedom",
        "Generosity": "generosity",
        "Perceptions of corruption": "corruption_perception",
    },
    2019: {
        "Country or region": "country",
        "Overall rank": "happiness_rank",
        "Score": "happiness_score",
        "GDP per capita": "gdp_per_capita",
        "Social support": "social_support",
        "Healthy life expectancy": "healthy_life_expectancy",
        "Freedom to make life choices": "freedom",
        "Generosity": "generosity",
        "Perceptions of corruption": "corruption_perception",
    },
}

print(f"Column mappings defined for years: {list(COLUMN_NORMALIZATION.keys())}")


Column mappings defined for years: [2015, 2016, 2017, 2018, 2019]


## 3. Geographic Mappings

### 3.1 Country Name Aliases
Standardizes country name variations across datasets.

In [3]:
COUNTRY_ALIASES: Dict[str, str] = {
    "Trinidad & Tobago": "Trinidad and Tobago",
    "Taiwan Province of China": "Taiwan",
    "Hong Kong S.A.R., China": "Hong Kong",
    "North Macedonia": "Macedonia",
    "Bolivia (Plurinational State of)": "Bolivia",
    "Congo, Dem. Rep.": "Congo (Kinshasa)",
    "Democratic Republic of the Congo": "Congo (Kinshasa)",
    "Congo, Rep.": "Congo (Brazzaville)",
    "Republic of the Congo": "Congo (Brazzaville)",
    "Eswatini": "Swaziland",
    "Ivory Coast": "Ivory Coast",
    "Czechia": "Czech Republic",
    "United States of America": "United States",
    "Russia": "Russian Federation",
    "Slovak Republic": "Slovakia",
}

print(f"Defined {len(COUNTRY_ALIASES)} country name aliases")


Defined 15 country name aliases


### 3.2 Region to Continent Mapping
Maps UN geoscheme-style regions to the 7-continent model.

In [4]:
REGION_TO_CONTINENT: Dict[str, str] = {
    "Eastern Europe": "Europe",
    "Western Europe": "Europe",
    "Northern Europe": "Europe",
    "Southern Europe": "Europe",
    "Northern Africa": "Africa",
    "Sub-Saharan Africa": "Africa",
    "Western Asia / Middle East": "Asia",
    "Central Asia": "Asia",
    "South Asia": "Asia",
    "East Asia": "Asia",
    "Southeast Asia": "Asia",
    "North America": "North America",
    "Central America": "North America",
    "Caribbean": "North America",
    "South America": "South America",
    "Oceania": "Oceania",
    "Antarctica": "Antarctica",
}

print(f"Defined {len(REGION_TO_CONTINENT)} region-to-continent mappings")
print(f"Continents: {sorted(set(REGION_TO_CONTINENT.values()))}")


Defined 17 region-to-continent mappings
Continents: ['Africa', 'Antarctica', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']


### 3.3 Country to Region Mapping
Maps each country to its UN geoscheme-style region.

In [5]:
COUNTRY_TO_REGION: Dict[str, str] = {
    "Afghanistan": "South Asia",
    "Albania": "Southern Europe",
    "Algeria": "Northern Africa",
    "Angola": "Sub-Saharan Africa",
    "Argentina": "South America",
    "Armenia": "Western Asia / Middle East",
    "Australia": "Oceania",
    "Austria": "Western Europe",
    "Azerbaijan": "Western Asia / Middle East",
    "Bahrain": "Western Asia / Middle East",
    "Bangladesh": "South Asia",
    "Belarus": "Eastern Europe",
    "Belgium": "Western Europe",
    "Belize": "Central America",
    "Benin": "Sub-Saharan Africa",
    "Bhutan": "South Asia",
    "Bolivia": "South America",
    "Bosnia and Herzegovina": "Southern Europe",
    "Botswana": "Sub-Saharan Africa",
    "Brazil": "South America",
    "Bulgaria": "Eastern Europe",
    "Burkina Faso": "Sub-Saharan Africa",
    "Burundi": "Sub-Saharan Africa",
    "Cambodia": "Southeast Asia",
    "Cameroon": "Sub-Saharan Africa",
    "Canada": "North America",
    "Central African Republic": "Sub-Saharan Africa",
    "Chad": "Sub-Saharan Africa",
    "Chile": "South America",
    "China": "East Asia",
    "Colombia": "South America",
    "Comoros": "Sub-Saharan Africa",
    "Congo (Brazzaville)": "Sub-Saharan Africa",
    "Congo (Kinshasa)": "Sub-Saharan Africa",
    "Costa Rica": "Central America",
    "Croatia": "Southern Europe",
    "Cyprus": "Western Asia / Middle East",
    "Czech Republic": "Eastern Europe",
    "Denmark": "Northern Europe",
    "Djibouti": "Sub-Saharan Africa",
    "Dominican Republic": "Caribbean",
    "Ecuador": "South America",
    "Egypt": "Northern Africa",
    "El Salvador": "Central America",
    "Estonia": "Northern Europe",
    "Ethiopia": "Sub-Saharan Africa",
    "Finland": "Northern Europe",
    "France": "Western Europe",
    "Gabon": "Sub-Saharan Africa",
    "Gambia": "Sub-Saharan Africa",
    "Georgia": "Western Asia / Middle East",
    "Germany": "Western Europe",
    "Ghana": "Sub-Saharan Africa",
    "Greece": "Southern Europe",
    "Guatemala": "Central America",
    "Guinea": "Sub-Saharan Africa",
    "Haiti": "Caribbean",
    "Honduras": "Central America",
    "Hong Kong": "East Asia",
    "Hungary": "Eastern Europe",
    "Iceland": "Northern Europe",
    "India": "South Asia",
    "Indonesia": "Southeast Asia",
    "Iran": "Western Asia / Middle East",
    "Iraq": "Western Asia / Middle East",
    "Ireland": "Northern Europe",
    "Israel": "Western Asia / Middle East",
    "Italy": "Southern Europe",
    "Ivory Coast": "Sub-Saharan Africa",
    "Jamaica": "Caribbean",
    "Japan": "East Asia",
    "Jordan": "Western Asia / Middle East",
    "Kazakhstan": "Central Asia",
    "Kenya": "Sub-Saharan Africa",
    "Kosovo": "Southern Europe",
    "Kuwait": "Western Asia / Middle East",
    "Kyrgyzstan": "Central Asia",
    "Laos": "Southeast Asia",
    "Latvia": "Northern Europe",
    "Lebanon": "Western Asia / Middle East",
    "Lesotho": "Sub-Saharan Africa",
    "Liberia": "Sub-Saharan Africa",
    "Libya": "Northern Africa",
    "Lithuania": "Northern Europe",
    "Luxembourg": "Western Europe",
    "Macedonia": "Southern Europe",
    "Madagascar": "Sub-Saharan Africa",
    "Malawi": "Sub-Saharan Africa",
    "Malaysia": "Southeast Asia",
    "Mali": "Sub-Saharan Africa",
    "Malta": "Southern Europe",
    "Mauritania": "Sub-Saharan Africa",
    "Mauritius": "Sub-Saharan Africa",
    "Mexico": "Central America",
    "Moldova": "Eastern Europe",
    "Mongolia": "East Asia",
    "Montenegro": "Southern Europe",
    "Morocco": "Northern Africa",
    "Mozambique": "Sub-Saharan Africa",
    "Myanmar": "Southeast Asia",
    "Namibia": "Sub-Saharan Africa",
    "Nepal": "South Asia",
    "Netherlands": "Western Europe",
    "New Zealand": "Oceania",
    "Nicaragua": "Central America",
    "Niger": "Sub-Saharan Africa",
    "Nigeria": "Sub-Saharan Africa",
    "North Cyprus": "Western Asia / Middle East",
    "Northern Cyprus": "Western Asia / Middle East",
    "Norway": "Northern Europe",
    "Oman": "Western Asia / Middle East",
    "Pakistan": "South Asia",
    "Palestinian Territories": "Western Asia / Middle East",
    "Panama": "Central America",
    "Paraguay": "South America",
    "Peru": "South America",
    "Philippines": "Southeast Asia",
    "Poland": "Eastern Europe",
    "Portugal": "Southern Europe",
    "Puerto Rico": "Caribbean",
    "Qatar": "Western Asia / Middle East",
    "Romania": "Eastern Europe",
    "Russian Federation": "Eastern Europe",
    "Rwanda": "Sub-Saharan Africa",
    "Saudi Arabia": "Western Asia / Middle East",
    "Senegal": "Sub-Saharan Africa",
    "Serbia": "Southern Europe",
    "Sierra Leone": "Sub-Saharan Africa",
    "Singapore": "Southeast Asia",
    "Slovakia": "Eastern Europe",
    "Slovenia": "Southern Europe",
    "Somalia": "Sub-Saharan Africa",
    "Somaliland Region": "Sub-Saharan Africa",
    "Somaliland region": "Sub-Saharan Africa",
    "South Africa": "Sub-Saharan Africa",
    "South Korea": "East Asia",
    "South Sudan": "Sub-Saharan Africa",
    "Spain": "Southern Europe",
    "Sri Lanka": "South Asia",
    "Sudan": "Northern Africa",
    "Suriname": "South America",
    "Swaziland": "Sub-Saharan Africa",
    "Sweden": "Northern Europe",
    "Switzerland": "Western Europe",
    "Syria": "Western Asia / Middle East",
    "Taiwan": "East Asia",
    "Tajikistan": "Central Asia",
    "Tanzania": "Sub-Saharan Africa",
    "Thailand": "Southeast Asia",
    "Togo": "Sub-Saharan Africa",
    "Trinidad and Tobago": "Caribbean",
    "Tunisia": "Northern Africa",
    "Turkey": "Western Asia / Middle East",
    "Turkmenistan": "Central Asia",
    "Uganda": "Sub-Saharan Africa",
    "Ukraine": "Eastern Europe",
    "United Arab Emirates": "Western Asia / Middle East",
    "United Kingdom": "Northern Europe",
    "United States": "North America",
    "Uruguay": "South America",
    "Uzbekistan": "Central Asia",
    "Venezuela": "South America",
    "Vietnam": "Southeast Asia",
    "Yemen": "Western Asia / Middle East",
    "Zambia": "Sub-Saharan Africa",
    "Zimbabwe": "Sub-Saharan Africa",
}

print(f"Defined {len(COUNTRY_TO_REGION)} country-to-region mappings")


Defined 166 country-to-region mappings


## 4. Helper Functions

In [6]:
def normalize_columns(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Rename columns to standardized names for a given year."""
    mapping = COLUMN_NORMALIZATION.get(year, {})
    df = df.rename(columns=mapping)
    df.columns = [col.strip() for col in df.columns]
    if "country" not in df.columns:
        raise ValueError(f"'country' column missing for {year}")
    return df


def clean_country_names(df: pd.DataFrame) -> pd.DataFrame:
    """Apply country name aliases and strip whitespace."""
    df["country"] = df["country"].replace(COUNTRY_ALIASES).str.strip()
    return df


def convert_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Convert numeric columns to proper types."""
    numeric_cols = [
        "happiness_score",
        "gdp_per_capita",
        "social_support",
        "healthy_life_expectancy",
        "freedom",
        "generosity",
        "corruption_perception",
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    return df


def process_year(year: int, filepath: Path) -> pd.DataFrame:
    """Load and process a single year's data."""
    df = pd.read_csv(filepath)
    df = normalize_columns(df, year)
    df = clean_country_names(df)
    df["year"] = year
    df = convert_numeric_columns(df)
    return df


print("Helper functions defined ✓")


Helper functions defined ✓


## 5. Load and Combine Data

In [7]:
# Define year files
year_files = {
    2015: RAW_DATA_DIR / "2015.csv",
    2016: RAW_DATA_DIR / "2016.csv",
    2017: RAW_DATA_DIR / "2017.csv",
    2018: RAW_DATA_DIR / "2018.csv",
    2019: RAW_DATA_DIR / "2019.csv",
}

# Check for missing files
missing = [year for year, path in year_files.items() if not path.exists()]
if missing:
    raise FileNotFoundError(f"Missing raw data files for years: {missing}")

print("All raw data files found ✓")
for year, path in year_files.items():
    print(f"  {year}: {path.name}")


All raw data files found ✓
  2015: 2015.csv
  2016: 2016.csv
  2017: 2017.csv
  2018: 2018.csv
  2019: 2019.csv


In [8]:
# Process each year and combine
frames = []
for year, filepath in year_files.items():
    df_year = process_year(year, filepath)
    frames.append(df_year)
    print(f"{year}: {len(df_year)} countries")

# Combine all years
df = pd.concat(frames, ignore_index=True, sort=False)
print(f"\nCombined: {len(df)} total records")


2015: 158 countries
2016: 157 countries
2017: 155 countries
2018: 156 countries
2019: 156 countries

Combined: 782 total records


## 6. Add Region and Continent

In [9]:
# Map countries to regions
df["region"] = df["country"].map(COUNTRY_TO_REGION)

# Check for unmapped countries
unmapped = df[df["region"].isna()]["country"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped countries (add to COUNTRY_TO_REGION): {sorted(unmapped)}")

print(f"All countries mapped to regions ✓")
print(f"Unique regions: {df['region'].nunique()}")


All countries mapped to regions ✓
Unique regions: 16


In [10]:
# Map regions to continents
df["continent"] = df["region"].map(REGION_TO_CONTINENT)

print(f"All regions mapped to continents ✓")
print(f"Unique continents: {df['continent'].nunique()}")
print(f"\nContinent distribution:")
print(df["continent"].value_counts())


All regions mapped to continents ✓
Unique continents: 6

Continent distribution:
continent
Asia             230
Africa           221
Europe           200
North America     69
South America     52
Oceania           10
Name: count, dtype: int64


## 7. Select and Order Columns

In [11]:
# Keep only the columns we need, in the specified order
df = df[[col for col in OUTPUT_COLUMNS if col in df.columns]]

print(f"Final columns ({len(df.columns)}):")
for col in df.columns:
    print(f"  - {col}")


Final columns (11):
  - country
  - region
  - continent
  - year
  - happiness_score
  - gdp_per_capita
  - social_support
  - healthy_life_expectancy
  - freedom
  - generosity
  - corruption_perception


## 8. Validate Data

In [12]:
def validate_data(df: pd.DataFrame) -> bool:
    """Run sanity checks on the processed dataset."""
    issues = []
    
    # Check for required columns
    required_cols = ["country", "year", "happiness_score"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        issues.append(f"Missing required columns: {missing_cols}")
    
    # Check for duplicate country-year combinations
    duplicates = df.duplicated(subset=["country", "year"], keep=False)
    if duplicates.any():
        dup_count = duplicates.sum()
        issues.append(f"Found {dup_count} duplicate country-year rows")
    
    # Check for null happiness scores
    null_scores = df["happiness_score"].isna().sum()
    if null_scores > 0:
        issues.append(f"Found {null_scores} rows with null happiness_score")
    
    # Check happiness score range
    min_score = df["happiness_score"].min()
    max_score = df["happiness_score"].max()
    if min_score < 0 or max_score > 10:
        issues.append(f"Happiness scores outside expected range [0-10]: min={min_score}, max={max_score}")
    
    # Report results
    if issues:
        print("❌ Validation FAILED:")
        for issue in issues:
            print(f"  - {issue}")
        return False
    else:
        print("✓ Validation PASSED")
        return True


# Run validation
validate_data(df)

# Print summary
print(f"\n--- Dataset Summary ---")
print(f"Total records: {len(df)}")
print(f"Unique countries: {df['country'].nunique()}")
print(f"Unique regions: {df['region'].nunique()}")
print(f"Unique continents: {df['continent'].nunique()}")
print(f"Years covered: {sorted(df['year'].unique())}")
print(f"Happiness score range: {df['happiness_score'].min():.3f} - {df['happiness_score'].max():.3f}")


✓ Validation PASSED

--- Dataset Summary ---
Total records: 782
Unique countries: 166
Unique regions: 16
Unique continents: 6
Years covered: [np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019)]
Happiness score range: 2.693 - 7.769


## 9. Preview Data

In [13]:
# Show sample data
df.head(10)


Unnamed: 0,country,region,continent,year,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,generosity,corruption_perception
0,Switzerland,Western Europe,Europe,2015,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978
1,Iceland,Northern Europe,Europe,2015,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145
2,Denmark,Northern Europe,Europe,2015,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357
3,Norway,Northern Europe,Europe,2015,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503
4,Canada,North America,North America,2015,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957
5,Finland,Northern Europe,Europe,2015,7.406,1.29025,1.31826,0.88911,0.64169,0.23351,0.41372
6,Netherlands,Western Europe,Europe,2015,7.378,1.32944,1.28017,0.89284,0.61576,0.4761,0.31814
7,Sweden,Northern Europe,Europe,2015,7.364,1.33171,1.28907,0.91087,0.6598,0.36262,0.43844
8,New Zealand,Oceania,Oceania,2015,7.286,1.25018,1.31967,0.90837,0.63938,0.47501,0.42922
9,Australia,Oceania,Oceania,2015,7.284,1.33358,1.30923,0.93156,0.65124,0.43562,0.35637


In [14]:
# Show data types and info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   country                  782 non-null    object 
 1   region                   782 non-null    object 
 2   continent                782 non-null    object 
 3   year                     782 non-null    int64  
 4   happiness_score          782 non-null    float64
 5   gdp_per_capita           782 non-null    float64
 6   social_support           782 non-null    float64
 7   healthy_life_expectancy  782 non-null    float64
 8   freedom                  782 non-null    float64
 9   generosity               782 non-null    float64
 10  corruption_perception    781 non-null    float64
dtypes: float64(7), int64(1), object(3)
memory usage: 67.3+ KB


In [15]:
# Show descriptive statistics
df.describe()


Unnamed: 0,year,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom,generosity,corruption_perception
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,781.0
mean,2016.993606,5.379018,0.916047,1.078392,0.612416,0.411091,0.218576,0.125436
std,1.417364,1.127456,0.40734,0.329548,0.248309,0.15288,0.122321,0.105816
min,2015.0,2.693,0.0,0.0,0.0,0.0,0.0,0.0
25%,2016.0,4.50975,0.6065,0.869363,0.440183,0.309768,0.13,0.054
50%,2017.0,5.322,0.982205,1.124735,0.64731,0.431,0.201982,0.091
75%,2018.0,6.1895,1.236187,1.32725,0.808,0.531,0.278832,0.15603
max,2019.0,7.769,2.096,1.644,1.141,0.724,0.838075,0.55191


## 10. Save Processed Data

In [16]:
# Save to CSV
df.to_csv(PROCESSED_DATA_PATH, index=False)

print(f"✓ Saved processed data to: {PROCESSED_DATA_PATH}")
print(f"  File size: {PROCESSED_DATA_PATH.stat().st_size / 1024:.1f} KB")
print(f"  Rows: {len(df)}")
print(f"  Columns: {len(df.columns)}")


✓ Saved processed data to: /Users/elar.saks/Desktop/world_happiness_report_data_visualization/happiness_combined_2015_2019.csv
  File size: 77.2 KB
  Rows: 782
  Columns: 11


---

## Summary

This notebook processed the World Happiness Report data (2015-2019) by:

1. **Loading** 5 yearly CSV files with inconsistent schemas
2. **Normalizing** column names to a consistent format
3. **Standardizing** country names using aliases
4. **Enriching** with UN geoscheme regions and 7-continent classification
5. **Validating** data quality (no duplicates, no missing scores, valid ranges)
6. **Saving** the combined dataset to CSV

The output file `happiness_combined_2015_2019.csv` is ready for analysis.