# 6.1 WHR Quality Check & Cleaning
### This script contains the following points:
#### 01. Import libraries and data
#### 02. Consistency Checks
#### 03. Cleaning & Wrangling Data (rename columns, create columns, missing or incomplete data)
#### 04. Merge data: df_past and df_present
#### 05. Export data file

# 01 Import Libraries & Data

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import scipy

# Create data path

In [2]:
#create data path
# Root folder of the project; every input and output file below is resolved
# relative to it with os.path.join.
# NOTE(review): this is an absolute path on one specific Windows user profile,
# so anyone else running the notebook has to edit it first.
path = r'C:\Users\fa_an\OneDrive\CareerFoundry\Tasks\Data Analytics Immersion\Tasks 6.1-6.7'

# Import files

In [3]:
# Load the 2024 World Happiness Report file (WHR2024.csv) from the original-data folder
present_csv = os.path.join(path, '02 Data', 'Original Data', 'WHR2024.csv')
df_present = pd.read_csv(present_csv)

In [4]:
# Load the combined 2015-2023 World Happiness Report file (WHR2015_2023.csv)
past_csv = os.path.join(path, '02 Data', 'Original Data', 'WHR2015_2023.csv')
df_past = pd.read_csv(past_csv)

# Set no max for column display

In [5]:
# Remove the column display limit so wide frames are shown in full
pd.options.display.max_columns = None

# 02 Consistency Checks on df_past and df_present

## df_past checks

In [6]:
df_past.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1362 entries, 0 to 1361
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          1362 non-null   object 
 1   Year             1362 non-null   int64  
 2   Region           1362 non-null   object 
 3   Happiness Rank   1362 non-null   int64  
 4   Happiness Score  1362 non-null   float64
 5   GDP              1362 non-null   float64
 6   Social Support   1362 non-null   float64
 7   Life Expectancy  1361 non-null   float64
 8   Freedom          1362 non-null   float64
 9   Generosity       1362 non-null   float64
 10  Corruption       1362 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 117.2+ KB


In [7]:
df_past.head()

Unnamed: 0,Country,Year,Region,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
0,Afghanistan,2023,Southern Asia,137,1.859,0.645,0.0,0.087,0.0,0.093,0.059
1,Afghanistan,2017,Southern Asia,141,3.794,0.401,0.582,0.181,0.106,0.312,0.061
2,Afghanistan,2018,Southern Asia,145,3.632,0.332,0.537,0.255,0.085,0.191,0.036
3,Afghanistan,2022,Southern Asia,146,2.404,0.758,0.0,0.289,0.0,0.089,0.005
4,Afghanistan,2021,Southern Asia,149,2.523,0.37,0.0,0.126,0.0,0.122,0.01


In [8]:
df_past.tail()

Unnamed: 0,Country,Year,Region,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
1357,Zimbabwe,2018,Sub-Saharan Africa,144,3.692,0.357,1.094,0.248,0.406,0.132,0.099
1358,Zimbabwe,2022,Sub-Saharan Africa,144,2.995,0.947,0.69,0.27,0.329,0.106,0.105
1359,Zimbabwe,2019,Sub-Saharan Africa,146,3.663,0.366,1.114,0.433,0.361,0.151,0.089
1360,Zimbabwe,2021,Sub-Saharan Africa,148,3.145,0.457,0.649,0.243,0.359,0.157,0.075
1361,Zimbabwe,2020,Sub-Saharan Africa,151,3.299,0.426,1.048,0.375,0.377,0.151,0.081


In [9]:
df_past.shape

(1362, 11)

In [10]:
df_past.describe()

Unnamed: 0,Year,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
count,1362.0,1362.0,1362.0,1362.0,1362.0,1361.0,1362.0,1362.0,1362.0
mean,2018.903084,76.596182,5.440554,1.018017,1.045228,0.583435,0.45115,0.196381,0.13221
std,2.56398,44.108242,1.118747,0.452866,0.331536,0.244886,0.156494,0.113493,0.112682
min,2015.0,1.0,1.859,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,38.25,4.59775,0.69625,0.832,0.402,0.356,0.115,0.057
50%,2019.0,76.5,5.431,1.04,1.083,0.612,0.468,0.183,0.097
75%,2021.0,114.0,6.25725,1.338,1.301,0.777,0.56975,0.253,0.166
max,2023.0,158.0,7.842,2.209,1.644,1.141,0.772,0.838,0.587


## df_present checks

In [11]:
df_present.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 11 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                143 non-null    object 
 1   Ladder score                                143 non-null    float64
 2   upperwhisker                                143 non-null    float64
 3   lowerwhisker                                143 non-null    float64
 4   Explained by: Log GDP per capita            140 non-null    float64
 5   Explained by: Social support                140 non-null    float64
 6   Explained by: Healthy life expectancy       140 non-null    float64
 7   Explained by: Freedom to make life choices  140 non-null    float64
 8   Explained by: Generosity                    140 non-null    float64
 9   Explained by: Perceptions of corruption     140 non-null    float64
 10  Dystopia + res

In [12]:
df_present.head()

Unnamed: 0,Country name,Ladder score,upperwhisker,lowerwhisker,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,7.741,7.815,7.667,1.844,1.572,0.695,0.859,0.142,0.546,2.082
1,Denmark,7.583,7.665,7.5,1.908,1.52,0.699,0.823,0.204,0.548,1.881
2,Iceland,7.525,7.618,7.433,1.881,1.617,0.718,0.819,0.258,0.182,2.05
3,Sweden,7.344,7.422,7.267,1.878,1.501,0.724,0.838,0.221,0.524,1.658
4,Israel,7.341,7.405,7.277,1.803,1.513,0.74,0.641,0.153,0.193,2.298


In [13]:
df_present.tail()

Unnamed: 0,Country name,Ladder score,upperwhisker,lowerwhisker,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
138,Congo (Kinshasa),3.295,3.462,3.128,0.534,0.665,0.262,0.473,0.189,0.072,1.102
139,Sierra Leone,3.245,3.366,3.124,0.654,0.566,0.253,0.469,0.181,0.053,1.068
140,Lesotho,3.186,3.469,2.904,0.771,0.851,0.0,0.523,0.082,0.085,0.875
141,Lebanon,2.707,2.797,2.616,1.377,0.577,0.556,0.173,0.068,0.029,-0.073
142,Afghanistan,1.721,1.775,1.667,0.628,0.0,0.242,0.0,0.091,0.088,0.672


In [14]:
df_present.shape

(143, 11)

In [15]:
df_present.describe()

Unnamed: 0,Ladder score,upperwhisker,lowerwhisker,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
count,143.0,143.0,143.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0
mean,5.52758,5.641175,5.413972,1.378807,1.134329,0.520886,0.620621,0.146271,0.154121,1.575914
std,1.170717,1.155008,1.187133,0.425098,0.333317,0.164923,0.162492,0.073441,0.126238,0.537459
min,1.721,1.775,1.667,0.0,0.0,0.0,0.0,0.0,0.0,-0.073
25%,4.726,4.8455,4.606,1.07775,0.92175,0.398,0.5275,0.091,0.06875,1.30825
50%,5.785,5.895,5.674,1.4315,1.2375,0.5495,0.641,0.1365,0.1205,1.6445
75%,6.416,6.5075,6.319,1.7415,1.38325,0.6485,0.736,0.1925,0.19375,1.88175
max,7.741,7.815,7.667,2.141,1.617,0.857,0.863,0.401,0.575,2.998


# 03 Cleaning & Wrangling Data

## Check for mixed data type in df_past

In [16]:
#check for mixed data type in df_past
# A column counts as "mixed" when any cell's Python type differs from the
# type of the first cell in that column.
mixed_cols_past = []
for col in df_past.columns.tolist():
    cell_types = df_past[col].map(type)
    weird = cell_types != cell_types.iloc[0]
    if weird.any():
        mixed_cols_past.append(col)
        print(col)

# Report success only when nothing was flagged. The previous for/else printed
# this message on every run, because a for loop's else clause executes
# whenever the loop finishes without hitting a break.
if not mixed_cols_past:
    print('Everything is a-ok!')

Everything is a-ok!


## Check for missing values in df_past

In [17]:
#check for missing values in df_past
df_past.isnull().sum()

Country            0
Year               0
Region             0
Happiness Rank     0
Happiness Score    0
GDP                0
Social Support     0
Life Expectancy    1
Freedom            0
Generosity         0
Corruption         0
dtype: int64

## Replace missing data in Life Expectancy column

In [18]:
# Replace the missing value in the 'Life Expectancy' column with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in recent pandas 2.x releases and leaves df_past unchanged once
# Copy-on-Write is enabled.
# NOTE(review): 0 is also the observed minimum of this column, so the imputed
# row cannot be told apart from a real 0 afterwards - confirm 0 is the
# intended fill value.
df_past['Life Expectancy'] = df_past['Life Expectancy'].fillna(0)

In [19]:
#repeat check for missing values in df_past
df_past.isnull().sum()

Country            0
Year               0
Region             0
Happiness Rank     0
Happiness Score    0
GDP                0
Social Support     0
Life Expectancy    0
Freedom            0
Generosity         0
Corruption         0
dtype: int64

## Check for duplicates in df_past

In [20]:
# Collect every row of df_past that is an exact repeat of an earlier row
is_repeat_past = df_past.duplicated()
df_dups_past = df_past.loc[is_repeat_past]

In [21]:
df_dups_past

Unnamed: 0,Country,Year,Region,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption


## Check for mixed data type in df_present

In [22]:
#df_past: no duplicates or mixed-type data found, and the one missing value has been filled.

In [23]:
#check for mixed data type in df_present
# A column counts as "mixed" when any cell's Python type differs from the
# type of the first cell in that column.
mixed_cols_present = []
for col in df_present.columns.tolist():
    cell_types = df_present[col].map(type)
    weird = cell_types != cell_types.iloc[0]
    if weird.any():
        mixed_cols_present.append(col)
        print(col)

# Report success only when nothing was flagged. The previous for/else printed
# this message on every run, because a for loop's else clause executes
# whenever the loop finishes without hitting a break.
if not mixed_cols_present:
    print('Everything is a-ok!')

Everything is a-ok!


## Check for missing values in df_present

In [24]:
#check for missing values in df_present
df_present.isnull().sum()

Country name                                  0
Ladder score                                  0
upperwhisker                                  0
lowerwhisker                                  0
Explained by: Log GDP per capita              3
Explained by: Social support                  3
Explained by: Healthy life expectancy         3
Explained by: Freedom to make life choices    3
Explained by: Generosity                      3
Explained by: Perceptions of corruption       3
Dystopia + residual                           3
dtype: int64

## Replace all missing values with 0

In [25]:
# Fill every missing cell in df_present with 0
df_present = df_present.fillna(value=0)

In [26]:
# repeat check for missing values in df_present
df_present.isnull().sum()


Country name                                  0
Ladder score                                  0
upperwhisker                                  0
lowerwhisker                                  0
Explained by: Log GDP per capita              0
Explained by: Social support                  0
Explained by: Healthy life expectancy         0
Explained by: Freedom to make life choices    0
Explained by: Generosity                      0
Explained by: Perceptions of corruption       0
Dystopia + residual                           0
dtype: int64

## Check for duplicates in df_present

In [27]:
# Collect every row of df_present that is an exact repeat of an earlier row
is_repeat_present = df_present.duplicated()
df_dups_present = df_present.loc[is_repeat_present]

In [28]:
df_dups_present

Unnamed: 0,Country name,Ladder score,upperwhisker,lowerwhisker,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual


In [29]:
#no duplicates or mixed type data. Missing values are also ok now!
#Need to rename and create columns in df_present to match df_past.

# Data Wrangling

## Create a Year column

In [30]:
# Add a constant 'Year' column: every row of this file belongs to 2024
df_present = df_present.assign(Year=2024)

## Create a Happiness Rank column

In [31]:
# Create a 'Happiness Rank' column

# Sort by 'Ladder score' in DESCENDING order so the highest score comes first.
# kind='stable' keeps rows with equal scores in their existing order, which
# makes the ranks reproducible (the default sort gives no such guarantee).
df_present.sort_values(by='Ladder score', ascending=False, kind='stable', inplace=True)

# Number the sorted rows 1..n: rank 1 is the highest 'Ladder score'
df_present['Happiness Rank'] = range(1, len(df_present) + 1)

## Create a Region column

In [32]:
# Add an empty 'Region' column as a placeholder; it is populated later
df_present = df_present.assign(Region=None)

## Check for missing values in df_present after adding columns

In [33]:
#check for missing values in df_present
df_present.isnull().sum()

Country name                                    0
Ladder score                                    0
upperwhisker                                    0
lowerwhisker                                    0
Explained by: Log GDP per capita                0
Explained by: Social support                    0
Explained by: Healthy life expectancy           0
Explained by: Freedom to make life choices      0
Explained by: Generosity                        0
Explained by: Perceptions of corruption         0
Dystopia + residual                             0
Year                                            0
Happiness Rank                                  0
Region                                        143
dtype: int64

## Rename column Country name

In [34]:
# 'Country name' -> 'Country'
df_present = df_present.rename(columns={'Country name': 'Country'})

## Rename column Ladder Score

In [35]:
# 'Ladder score' -> 'Happiness Score'
df_present = df_present.rename(columns={'Ladder score': 'Happiness Score'})

## Rename column Explained by: Log GDP per capita

In [36]:
# 'Explained by: Log GDP per capita' -> 'GDP'
df_present = df_present.rename(columns={'Explained by: Log GDP per capita': 'GDP'})

## Rename column Explained by: Social support

In [37]:
# 'Explained by: Social support' -> 'Social Support'
df_present = df_present.rename(columns={'Explained by: Social support': 'Social Support'})

## Rename column Explained by: Healthy life expectancy

In [38]:
# 'Explained by: Healthy life expectancy' -> 'Life Expectancy'
df_present = df_present.rename(columns={'Explained by: Healthy life expectancy': 'Life Expectancy'})

## Rename column Explained by: Freedom to make life choices

In [39]:
# 'Explained by: Freedom to make life choices' -> 'Freedom'
df_present = df_present.rename(columns={'Explained by: Freedom to make life choices': 'Freedom'})

## Rename column Explained by: Generosity

In [40]:
# 'Explained by: Generosity' -> 'Generosity'
df_present = df_present.rename(columns={'Explained by: Generosity': 'Generosity'})

## Rename column Explained by: Perceptions of corruption

In [41]:
# 'Explained by: Perceptions of corruption' -> 'Corruption'
df_present = df_present.rename(columns={'Explained by: Perceptions of corruption': 'Corruption'})

## Drop columns upperwhisker, lowerwhisker, Dystopia + residual

In [42]:
# Remove the three columns that are not carried into the reordered/merged
# frame: the two whisker bounds and 'Dystopia + residual'
columns_to_drop = ['upperwhisker', 'lowerwhisker', 'Dystopia + residual']
df_present = df_present.drop(columns=columns_to_drop)

## Check updates with header for df_past and df_present

In [43]:
df_past.head()

Unnamed: 0,Country,Year,Region,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
0,Afghanistan,2023,Southern Asia,137,1.859,0.645,0.0,0.087,0.0,0.093,0.059
1,Afghanistan,2017,Southern Asia,141,3.794,0.401,0.582,0.181,0.106,0.312,0.061
2,Afghanistan,2018,Southern Asia,145,3.632,0.332,0.537,0.255,0.085,0.191,0.036
3,Afghanistan,2022,Southern Asia,146,2.404,0.758,0.0,0.289,0.0,0.089,0.005
4,Afghanistan,2021,Southern Asia,149,2.523,0.37,0.0,0.126,0.0,0.122,0.01


In [44]:
df_present.head()

Unnamed: 0,Country,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption,Year,Happiness Rank,Region
0,Finland,7.741,1.844,1.572,0.695,0.859,0.142,0.546,2024,1,
1,Denmark,7.583,1.908,1.52,0.699,0.823,0.204,0.548,2024,2,
2,Iceland,7.525,1.881,1.617,0.718,0.819,0.258,0.182,2024,3,
3,Sweden,7.344,1.878,1.501,0.724,0.838,0.221,0.524,2024,4,
4,Israel,7.341,1.803,1.513,0.74,0.641,0.153,0.193,2024,5,


## Reorder columns in df_present to match df_past

In [45]:
# Reorder the columns so df_present has the same layout as df_past.
# .copy() turns the selection into an independent DataFrame; without it,
# later column assignments on df_present (such as filling 'Region') trigger
# pandas' SettingWithCopyWarning.
column_order = ['Country', 'Year', 'Region', 'Happiness Rank', 'Happiness Score',
                'GDP', 'Social Support', 'Life Expectancy', 'Freedom',
                'Generosity', 'Corruption']
df_present = df_present[column_order].copy()

## Check df_present head

In [46]:
df_present.head()

Unnamed: 0,Country,Year,Region,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
0,Finland,2024,,1,7.741,1.844,1.572,0.695,0.859,0.142,0.546
1,Denmark,2024,,2,7.583,1.908,1.52,0.699,0.823,0.204,0.548
2,Iceland,2024,,3,7.525,1.881,1.617,0.718,0.819,0.258,0.182
3,Sweden,2024,,4,7.344,1.878,1.501,0.724,0.838,0.221,0.524
4,Israel,2024,,5,7.341,1.803,1.513,0.74,0.641,0.153,0.193


## Create a mapping dictionary from country to region from df_past and use in df_present to populate regions

In [47]:
# Build a Country -> Region lookup from df_past. A country that appears in
# several rows keeps the region of its last row.
country_region_mapping = dict(zip(df_past['Country'], df_past['Region']))

# Look up each df_present country in the dictionary; countries that are not
# in the lookup end up with a missing Region
region_by_country = df_present['Country'].map(country_region_mapping)
df_present['Region'] = region_by_country

## Check for missing values in Regions to make sure mapping dictionary worked

In [48]:
#check for missing values in df_present - Region
df_present.isnull().sum()

Country            0
Year               0
Region             6
Happiness Rank     0
Happiness Score    0
GDP                0
Social Support     0
Life Expectancy    0
Freedom            0
Generosity         0
Corruption         0
dtype: int64

## Filter missing rows for countries without region

In [49]:
# Keep only the df_present rows whose 'Region' is still missing
region_is_missing = df_present['Region'].isna()
missing_region = df_present.loc[region_is_missing]

# Show those rows so the unmatched country names can be inspected
print(missing_region)

                       Country  Year Region  Happiness Rank  Happiness Score  \
17                     Czechia  2024    NaN              18            6.822   
30    Taiwan Province of China  2024    NaN              31            6.503   
83             North Macedonia  2024    NaN              84            5.369   
85   Hong Kong S.A.R. of China  2024    NaN              86            5.316   
97                     Turkiye  2024    NaN              98            4.975   
102         State of Palestine  2024    NaN             103            4.879   

       GDP  Social Support  Life Expectancy  Freedom  Generosity  Corruption  
17   1.783           1.511            0.638    0.787       0.177       0.068  
30   1.842           1.346            0.650    0.649       0.068       0.202  
83   1.475           1.277            0.569    0.580       0.194       0.015  
85   1.909           1.184            0.857    0.485       0.147       0.402  
97   1.702           1.175            0.631 

## Update country names to match df_past

In [50]:
# 2024 spelling -> spelling used in df_past, for the six countries whose
# region lookup failed
replace_values = dict([
    ('Czechia', 'Czech Republic'),
    ('Taiwan Province of China', 'Taiwan'),
    ('North Macedonia', 'Macedonia'),
    ('Hong Kong S.A.R. of China', 'Hong Kong'),
    ('Turkiye', 'Turkey'),
    ('State of Palestine', 'Palestinian Territories'),
])

# Swap the listed names in 'Country'; all other names are left untouched
renamed_countries = df_present['Country'].replace(to_replace=replace_values)
df_present['Country'] = renamed_countries

## Remap the regions to the updated countries

In [51]:
# Run the Country -> Region lookup again now that the country names have
# been aligned with df_past
df_present = df_present.assign(Region=df_present['Country'].map(country_region_mapping))

## Recheck for missing Regions in df_present

In [52]:
#recheck for missing values in df_present - Region
df_present.isnull().sum()

Country            0
Year               0
Region             0
Happiness Rank     0
Happiness Score    0
GDP                0
Social Support     0
Life Expectancy    0
Freedom            0
Generosity         0
Corruption         0
dtype: int64

# 04 Merge df_past and df_present as df_all

In [53]:
# Stack the 2015-2023 rows and the 2024 rows into one frame with a fresh
# 0..n-1 index
frames_to_stack = [df_past, df_present]
df_all = pd.concat(frames_to_stack, axis=0, ignore_index=True)

## Check head and info on merged dataframe

In [54]:
df_all.head()

Unnamed: 0,Country,Year,Region,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
0,Afghanistan,2023,Southern Asia,137,1.859,0.645,0.0,0.087,0.0,0.093,0.059
1,Afghanistan,2017,Southern Asia,141,3.794,0.401,0.582,0.181,0.106,0.312,0.061
2,Afghanistan,2018,Southern Asia,145,3.632,0.332,0.537,0.255,0.085,0.191,0.036
3,Afghanistan,2022,Southern Asia,146,2.404,0.758,0.0,0.289,0.0,0.089,0.005
4,Afghanistan,2021,Southern Asia,149,2.523,0.37,0.0,0.126,0.0,0.122,0.01


In [55]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1505 entries, 0 to 1504
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          1505 non-null   object 
 1   Year             1505 non-null   int64  
 2   Region           1505 non-null   object 
 3   Happiness Rank   1505 non-null   int64  
 4   Happiness Score  1505 non-null   float64
 5   GDP              1505 non-null   float64
 6   Social Support   1505 non-null   float64
 7   Life Expectancy  1505 non-null   float64
 8   Freedom          1505 non-null   float64
 9   Generosity       1505 non-null   float64
 10  Corruption       1505 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 129.5+ KB


## Descriptive Statistics for merged dataframe

In [56]:
df_all.describe()

Unnamed: 0,Year,Happiness Rank,Happiness Score,GDP,Social Support,Life Expectancy,Freedom,Generosity,Corruption
count,1505.0,1505.0,1505.0,1505.0,1505.0,1505.0,1505.0,1505.0,1505.0
mean,2019.387375,76.159468,5.448823,1.04955,1.051433,0.576066,0.466015,0.191328,0.133985
std,2.860818,43.867863,1.123676,0.464197,0.335578,0.240741,0.165712,0.111533,0.114189
min,2015.0,1.0,1.721,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,38.0,4.603,0.737,0.841,0.401,0.367,0.112,0.057
50%,2019.0,76.0,5.472,1.069,1.092,0.6,0.482,0.177,0.099
75%,2022.0,114.0,6.269,1.373,1.318,0.76,0.585,0.248,0.172
max,2024.0,158.0,7.842,2.209,1.644,1.141,0.863,0.838,0.587


## Get the median for columns in df_all

In [57]:
# Median of each numeric column of the merged frame
median_cols = ['Year', 'Happiness Rank', 'Happiness Score', 'GDP', 'Social Support',
               'Life Expectancy', 'Freedom', 'Generosity', 'Corruption']
median_values = df_all.loc[:, median_cols].median()

# Print the resulting Series (one median per column)
print(median_values)

Year               2019.000
Happiness Rank       76.000
Happiness Score       5.472
GDP                   1.069
Social Support        1.092
Life Expectancy       0.600
Freedom               0.482
Generosity            0.177
Corruption            0.099
dtype: float64


## Get the mode for all columns in df_all

In [58]:
# Most frequent value(s) of each numeric column of the merged frame; a column
# with several equally frequent values fills more than one row of the result
mode_cols = ['Year', 'Happiness Rank', 'Happiness Score', 'GDP', 'Social Support',
             'Life Expectancy', 'Freedom', 'Generosity', 'Corruption']
mode_values = df_all.loc[:, mode_cols].mode()

# Print the resulting frame of modes
print(mode_values)

     Year  Happiness Rank  Happiness Score  GDP  Social Support  \
0  2015.0            34.0            2.905  0.0             0.0   
1     NaN            57.0            4.308  NaN             NaN   
2     NaN            82.0            5.890  NaN             NaN   
3     NaN             NaN            6.125  NaN             NaN   
4     NaN             NaN            6.455  NaN             NaN   

   Life Expectancy  Freedom  Generosity  Corruption  
0              0.0      0.0       0.000       0.064  
1              NaN      NaN       0.153         NaN  
2              NaN      NaN         NaN         NaN  
3              NaN      NaN         NaN         NaN  
4              NaN      NaN         NaN         NaN  


## Check for missing values in df_all

In [59]:
#check for missing values in df_all (Region should now be populated for all 143 countries in the 2024 data)
df_all.isnull().sum()

Country            0
Year               0
Region             0
Happiness Rank     0
Happiness Score    0
GDP                0
Social Support     0
Life Expectancy    0
Freedom            0
Generosity         0
Corruption         0
dtype: int64

# 05 Export file

In [60]:
#Write the merged frame to the prepared-data folder in pickle format as "WHR_all.pkl".
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'WHR_all.pkl')
df_all.to_pickle(export_file)