## Creating SQLite Database

In [1]:
# Import libraries
import pandas as pd
import sqlite3

In [2]:
# Install the ipython-sql library
#!pip install ipython-sql

In [3]:
# To guarantee compatibility between jupysql / ipython-sql and newer prettytable releases
#!pip install "prettytable<3.10"

In [4]:
# Load the raw dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv("WorldLifeExpectancy.csv")
# df.head()

In [5]:
# Create a connection to the SQLite database file using the sqlite3 library
# (sqlite3.connect creates the file if it does not exist yet)
cnn = sqlite3.connect('World_Life_Expectancy.db')

In [6]:
# Write the DataFrame to the World_Life_Expectancy table of the SQLite database.
# if_exists='replace' drops and recreates the table, so re-running the notebook
# (Restart & Run All) rebuilds it from the CSV instead of raising
# "ValueError: Table 'World_Life_Expectancy' already exists".
# The DataFrame index is still written as the "index" column (pandas default).
df.to_sql('World_Life_Expectancy', cnn, if_exists='replace')

2941

In [7]:
# Load the sql extension into IPython (it provides the %sql / %%sql magics used below)
%load_ext sql

In [8]:
# Point the Jupyter SQL extension at the SQLite database file
%sql sqlite:///World_Life_Expectancy.db

# Set a table style that is compatible with the installed prettytable version
%config SqlMagic.style = 'PLAIN_COLUMNS'

# Tell the SQL magic to return query results as pandas DataFrames directly
%config SqlMagic.autopandas = True

In [9]:
%%sql

-- Preview the imported table to confirm that the data was loaded
SELECT * 
FROM World_Life_Expectancy

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,index,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,percentage expenditure,Measles,BMI,under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,thinness 1-19 years,thinness 5-9 years,Schooling,Row_ID
0,0,Afghanistan,2022,Developing,65.0,263,62,71.3,1154,19.1,83,6,65,0.1,584,17.2,17.3,10.1,1
1,1,Afghanistan,2021,Developing,59.9,271,64,73.5,492,18.6,86,58,62,0.1,613,17.5,17.5,10.0,2
2,2,Afghanistan,2020,Developing,59.9,268,66,73.2,430,18.1,89,62,64,0.1,632,17.7,17.7,9.9,3
3,3,Afghanistan,2019,Developing,59.5,272,69,78.2,2787,17.6,93,67,67,0.1,670,17.9,18.0,9.8,4
4,4,Afghanistan,2018,Developing,,275,71,7.1,3013,17.2,97,68,68,0.1,64,18.2,18.2,9.5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2936,2936,Zimbabwe,2011,Developing,44.3,723,27,0.0,31,27.1,42,67,65,33.6,454,9.4,9.4,9.2,2937
2937,2937,Zimbabwe,2010,Developing,44.5,715,26,0.0,998,26.7,41,7,68,36.7,453,9.8,9.9,9.5,2938
2938,2938,Zimbabwe,2009,Developing,44.8,73,25,0.0,304,26.3,40,73,71,39.8,57,1.2,1.3,10.0,2939
2939,2939,Zimbabwe,2008,Developing,45.3,686,25,0.0,529,25.9,39,76,75,42.1,549,1.6,1.7,9.8,2940


## Data Cleaning

The aim of data cleaning is to enhance the quality and reliability of a dataset by detecting and correcting errors or inconsistencies. The overall goal is to prepare the data so it can be trusted and effectively used for analysis, modeling, or decision-making. Examples include handling duplicate records, missing values, or invalid entries such as zeros where they are not appropriate.

In [11]:
%%sql

-- Total number of records currently stored in the table

SELECT
    COUNT(1) AS total_rows
FROM World_Life_Expectancy AS wle

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,total_rows
0,2941


In [12]:
%%sql

-- Standardize column names: drop the stray leading/trailing spaces and replace inner
-- spaces, hyphens and slashes with underscores, so that the columns can be referenced
-- without quoting in the queries that follow.
-- The quoted source names must match the stored column names exactly, spaces included.
-- NOTE(review): every new name is capitalised except "hiv_aids" - consider aligning the casing.
-- NOTE(review): these statements look non-repeatable (the old names are gone after the
-- first run) - confirm the table is rebuilt from the CSV before this cell is re-run.

ALTER TABLE World_Life_Expectancy RENAME COLUMN "Life expectancy " TO "Life_expectancy";
ALTER TABLE World_Life_Expectancy RENAME COLUMN "Adult Mortality" TO "Adult_mortality";
ALTER TABLE World_Life_Expectancy RENAME COLUMN "infant deaths" TO "Infant_deaths";
ALTER TABLE World_Life_Expectancy RENAME COLUMN "percentage expenditure" TO "Percentage_expenditure";
ALTER TABLE World_Life_Expectancy RENAME COLUMN " BMI " TO "BMI";
ALTER TABLE World_Life_Expectancy RENAME COLUMN "under-five deaths " TO "Under_five_deaths";
ALTER TABLE World_Life_Expectancy RENAME COLUMN " HIV/AIDS" TO "hiv_aids";
ALTER TABLE World_Life_Expectancy RENAME COLUMN " thinness  1-19 years" TO "Thinness_1_19_years";
ALTER TABLE World_Life_Expectancy RENAME COLUMN " thinness 5-9 years" TO "Thinness_5_9_years";

 * sqlite:///World_Life_Expectancy.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


In [13]:
%%sql

-- Each (Country, Year) pair should occur only once; list the pairs that occur more often

SELECT
    Country,
    Year,
    CONCAT(Country, Year),
    COUNT(CONCAT(Country, Year))
FROM World_Life_Expectancy
GROUP BY Country, Year
HAVING COUNT(*) > 1

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,Year,"CONCAT(Country, Year)","COUNT(CONCAT(Country, Year))"
0,Ireland,2022,Ireland2022,2
1,Senegal,2009,Senegal2009,2
2,Zimbabwe,2019,Zimbabwe2019,2


In [14]:
%%sql

-- Identify the Row_IDs of the redundant copies for every (Country, Year) pair that
-- appears more than once, so that they can be removed from the data.
-- Rows are numbered per (Country, Year) by ascending Row_ID, which makes the numbering
-- deterministic: the copy with the lowest Row_ID gets Row_Num = 1 (the one to keep) and
-- every later copy is returned here. Ordering by the partition key itself, as before,
-- left the choice of "first" row undefined.

SELECT *
FROM(
SELECT Row_ID, 
CONCAT(Country, Year),
ROW_NUMBER() OVER(PARTITION BY Country, Year ORDER BY Row_ID) as Row_Num
FROM World_Life_Expectancy
) AS Row_Table
WHERE Row_Num > 1 

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Row_ID,"CONCAT(Country, Year)",Row_Num
0,1252,Ireland2022,2
1,2265,Senegal2009,2
2,2929,Zimbabwe2019,2


In [15]:
%%sql

-- Remove the redundant copies of duplicated (Country, Year) records.
-- Within each (Country, Year) group the rows are numbered by ascending Row_ID, so the
-- row with the lowest Row_ID is always the one kept and every later copy is deleted.
-- (Ordering by the partition key itself left the surviving row undefined.)

DELETE FROM World_Life_Expectancy
WHERE Row_ID IN (
	SELECT Row_ID
	FROM (
		SELECT Row_ID,
		       ROW_NUMBER() OVER (PARTITION BY Country, Year ORDER BY Row_ID) AS Row_Num
		FROM World_Life_Expectancy
	) AS Row_Table
	WHERE Row_Num > 1
)

 * sqlite:///World_Life_Expectancy.db
3 rows affected.


In [16]:
%%sql

-- Find the rows whose Status is missing (stored as NULL)

SELECT * 
FROM World_Life_Expectancy
WHERE Status IS NULL

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,index,Country,Year,Status,Life_expectancy,Adult_mortality,Infant_deaths,Percentage_expenditure,Measles,BMI,Under_five_deaths,Polio,Diphtheria,hiv_aids,GDP,Thinness_1_19_years,Thinness_5_9_years,Schooling,Row_ID
0,8,Afghanistan,2014,,57.5,295,82,10.9,1141,15.2,113,63,63,0.1,370,19.0,19.1,8.4,9
1,17,Albania,2021,,77.5,8,0,428.7,0,57.2,1,98,98,0.1,4576,1.2,1.3,14.2,18
2,988,Georgia,2012,,73.9,128,1,9.4,1356,48.6,1,82,82,0.1,154,2.8,2.9,12.2,989
3,990,Georgia,2010,,72.7,132,1,70.5,216,47.5,2,74,75,0.1,928,2.9,3.0,11.8,991
4,2797,United States of America,2021,,79.1,14,23,0.0,667,69.1,27,93,95,0.1,0,0.8,0.6,0.0,2798
5,2846,Vanuatu,2020,,71.6,135,0,447.5,0,51.7,0,65,64,0.1,3167,1.5,1.4,10.8,2847
6,2914,Zambia,2016,,57.4,368,30,143.9,26,2.2,47,93,94,9.1,1139,6.7,6.6,11.6,2915
7,2918,Zambia,2012,,49.3,554,34,121.9,45,18.4,55,84,82,17.0,691,7.1,7.0,10.7,2919


In [17]:
%%sql

-- List the distinct Status labels that are actually populated
-- (the comparison filters out both NULLs and empty strings)

SELECT DISTINCT Status
FROM World_Life_Expectancy
WHERE Status != ''

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Status
0,Developing
1,Developed


In [18]:
%%sql

-- Fill in missing Status values from the other records of the same country:
-- a country labelled 'Developing' (or 'Developed') in at least one row gets that
-- label on its rows where Status is NULL.

UPDATE World_Life_Expectancy
SET Status = 'Developing'
WHERE Status IS NULL
  AND EXISTS (
      SELECT 1
      FROM World_Life_Expectancy AS known
      WHERE known.Country = World_Life_Expectancy.Country
        AND known.Status = 'Developing'
  )
;

UPDATE World_Life_Expectancy
SET Status = 'Developed'
WHERE Status IS NULL
  AND EXISTS (
      SELECT 1
      FROM World_Life_Expectancy AS known
      WHERE known.Country = World_Life_Expectancy.Country
        AND known.Status = 'Developed'
  )

 * sqlite:///World_Life_Expectancy.db
7 rows affected.
1 rows affected.


In [19]:
%%sql

-- Find the rows whose Life_expectancy is missing (stored as NULL)

SELECT * 
FROM World_Life_Expectancy
WHERE Life_expectancy IS NULL

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,index,Country,Year,Status,Life_expectancy,Adult_mortality,Infant_deaths,Percentage_expenditure,Measles,BMI,Under_five_deaths,Polio,Diphtheria,hiv_aids,GDP,Thinness_1_19_years,Thinness_5_9_years,Schooling,Row_ID
0,4,Afghanistan,2018,Developing,,275,71,7.1,3013,17.2,97,68,68,0.1,64,18.2,18.2,9.5,5
1,20,Albania,2018,Developing,,88,0,437.1,28,55.1,1,99,99,0.1,4437,1.4,1.5,13.3,21


In [20]:
%%sql

-- Inspect every record of the two countries that have a missing Life_expectancy value.
-- For Afghanistan and Albania, life expectancy increases gradually over the years, so the
-- NULL value can be replaced with the mean of the values from the previous and next year.

SELECT * 
FROM World_Life_Expectancy
WHERE Country IN ('Afghanistan', 'Albania')

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,index,Country,Year,Status,Life_expectancy,Adult_mortality,Infant_deaths,Percentage_expenditure,Measles,BMI,Under_five_deaths,Polio,Diphtheria,hiv_aids,GDP,Thinness_1_19_years,Thinness_5_9_years,Schooling,Row_ID
0,0,Afghanistan,2022,Developing,65.0,263,62,71.3,1154,19.1,83,6,65,0.1,584,17.2,17.3,10.1,1
1,1,Afghanistan,2021,Developing,59.9,271,64,73.5,492,18.6,86,58,62,0.1,613,17.5,17.5,10.0,2
2,2,Afghanistan,2020,Developing,59.9,268,66,73.2,430,18.1,89,62,64,0.1,632,17.7,17.7,9.9,3
3,3,Afghanistan,2019,Developing,59.5,272,69,78.2,2787,17.6,93,67,67,0.1,670,17.9,18.0,9.8,4
4,4,Afghanistan,2018,Developing,,275,71,7.1,3013,17.2,97,68,68,0.1,64,18.2,18.2,9.5,5
5,5,Afghanistan,2017,Developing,58.8,279,74,79.7,1989,16.7,102,66,66,0.1,553,18.4,18.4,9.2,6
6,6,Afghanistan,2016,Developing,58.6,281,77,56.8,2861,16.2,106,63,63,0.1,446,18.6,18.7,8.9,7
7,7,Afghanistan,2015,Developing,58.1,287,80,25.9,1599,15.7,110,64,64,0.1,373,18.8,18.9,8.7,8
8,8,Afghanistan,2014,Developing,57.5,295,82,10.9,1141,15.2,113,63,63,0.1,370,19.0,19.1,8.4,9
9,9,Afghanistan,2013,Developing,57.3,295,84,17.2,1990,14.7,116,58,58,0.1,273,19.2,19.3,8.1,10


In [21]:
%%sql

-- Preview the imputed value for every row with a missing Life_expectancy (not only
-- Afghanistan): the table is joined to itself twice on Country, so that
--   t1 = the row with the NULL value,
--   t2 = the same country's row for the following year (t1.Year = t2.Year - 1),
--   t3 = the same country's row for the previous year  (t1.Year = t3.Year + 1).
-- The last column is the mean of the two neighbouring values, rounded to one decimal.
-- Rows without both neighbouring years are dropped by the inner joins.

SELECT t1.Country, t1.Year,  t1.Life_expectancy, 
	t2.Country, t2.Year,  t2.Life_expectancy,
    t3.Country, t3.Year,  t3.Life_expectancy,
    ROUND((t2.Life_expectancy + t3.Life_expectancy)/2,1)
FROM World_Life_Expectancy t1
JOIN World_Life_Expectancy t2
	ON t1.Country = t2.Country
    AND t1.Year = t2.Year - 1
JOIN World_Life_Expectancy t3
	ON t1.Country = t3.Country
    AND t1.Year = t3.Year + 1
WHERE t1.Life_expectancy IS NULL     

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,Year,Life_expectancy,Country.1,Year.1,Life_expectancy.1,Country.2,Year.2,Life_expectancy.2,"ROUND((t2.Life_expectancy + t3.Life_expectancy)/2,1)"
0,Afghanistan,2018,,Afghanistan,2019,59.5,Afghanistan,2017,58.8,59.1
1,Albania,2018,,Albania,2019,76.9,Albania,2017,76.2,76.6


In [22]:
%%sql

-- Update the table with the new averaged values.
-- For every row w whose Life_expectancy is NULL, look up the same country's rows for
-- the previous year (p: Year = w.Year - 1) and the following year (n: Year = w.Year + 1)
-- and store the mean of their two values, rounded to one decimal place.
-- The EXISTS guard restricts the update to rows for which both neighbouring years exist.
-- NOTE(review): the guard does not check that the neighbours' values are non-NULL; if
-- either one is NULL the computed mean is NULL as well and the row simply stays NULL.

UPDATE World_Life_Expectancy AS w
SET Life_expectancy = (
  SELECT ROUND((p.Life_expectancy + n.Life_expectancy)/2.0, 1)
  FROM World_Life_Expectancy AS p
  JOIN World_Life_Expectancy AS n
    ON p.Country = n.Country
   AND p.Year   = w.Year - 1
   AND n.Year   = w.Year + 1
  WHERE p.Country = w.Country
)
WHERE w.Life_expectancy IS NULL
  AND EXISTS (
    SELECT 1
    FROM World_Life_Expectancy AS p
    JOIN World_Life_Expectancy AS n
      ON p.Country = n.Country
     AND p.Year   = w.Year - 1
     AND n.Year   = w.Year + 1
    WHERE p.Country = w.Country
)

 * sqlite:///World_Life_Expectancy.db
2 rows affected.


In [23]:
%%sql

-- Potential anomalies from summary statistics: Life_expectancy
-- The minimum and maximum expose impossible values (e.g. 0) at either end of the range.

SELECT
  MIN(Life_expectancy) AS min_val,
  MAX(Life_expectancy) AS max_val
FROM World_Life_Expectancy

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,min_val,max_val
0,0.0,89.0


In [24]:
%%sql

-- A Life_expectancy value of 0 is impossible, so it is treated as missing (NULL).
-- NOTE(review): the neighbour-year imputation runs in an earlier cell, so the NULLs
-- created here are not filled in afterwards.

UPDATE World_Life_Expectancy
SET Life_expectancy = NULL
WHERE Life_expectancy = 0

 * sqlite:///World_Life_Expectancy.db
10 rows affected.


In [25]:
%%sql

-- Potential anomalies from summary statistics: Adult_mortality
-- The minimum and maximum expose implausible values at either end of the range.

SELECT
  MIN(Adult_mortality) AS min_val,
  MAX(Adult_mortality) AS max_val
FROM World_Life_Expectancy

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,min_val,max_val
0,0,723


In [26]:
%%sql

-- An Adult_mortality of 0 is implausible, so zeros are replaced with NULL (missing) here.
-- Very high values are left untouched because they may be real (e.g., during crises).

UPDATE World_Life_Expectancy
SET Adult_mortality = NULL
WHERE Adult_mortality = 0

 * sqlite:///World_Life_Expectancy.db
10 rows affected.


In [27]:
%%sql

-- Potential anomalies from summary statistics: Percentage_expenditure
-- Minimum, maximum and mean together show whether the values stay within a percentage range.

SELECT
  MIN(Percentage_expenditure) AS min_val,
  MAX(Percentage_expenditure) AS max_val,
  AVG(Percentage_expenditure) AS avg_val
FROM World_Life_Expectancy

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,min_val,max_val,avg_val
0,0.0,19479.9,738.251259


In [28]:
%%sql

-- Percentage_expenditure is expected to hold a percentage, so values above 100 are suspicious
-- (data entry errors, scaling issues, or exceptional cases).
-- For each country, count how many records exceed 100 and which share of the country's
-- records that is; only countries with at least one such record are listed.
-- A handful of high values hints at entry errors, a large share at a units/scale problem.

WITH per_country AS (
    SELECT
        Country,
        COUNT(*) AS total_count,
        SUM(CASE WHEN Percentage_expenditure > 100 THEN 1 ELSE 0 END) AS above_100
    FROM World_Life_Expectancy
    GROUP BY Country
)
SELECT
    Country,
    total_count,
    above_100,
    above_100 * 100.0 / total_count AS percentage_above_100
FROM per_country
WHERE above_100 > 0

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,total_count,above_100,percentage_above_100
0,Albania,16,8,50.00
1,Algeria,16,11,68.75
2,Angola,16,6,37.50
3,Antigua and Barbuda,16,14,87.50
4,Argentina,16,13,81.25
...,...,...,...,...
125,United Arab Emirates,16,15,93.75
126,Uruguay,16,12,75.00
127,Uzbekistan,16,2,12.50
128,Vanuatu,16,12,75.00


In [29]:
%%sql

-- Inspect one country with a high share of Percentage_expenditure values above 100
-- (United Arab Emirates: 15 of its 16 records)
SELECT *
FROM World_Life_Expectancy
WHERE Country = 'United Arab Emirates' 

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,index,Country,Year,Status,Life_expectancy,Adult_mortality,Infant_deaths,Percentage_expenditure,Measles,BMI,Under_five_deaths,Polio,Diphtheria,hiv_aids,GDP,Thinness_1_19_years,Thinness_5_9_years,Schooling,Row_ID
0,2748,United Arab Emirates,2022,Developing,77.1,75,1,0.0,347,64.2,1,99,99,0.1,3912,5.3,5.1,13.3,2749
1,2749,United Arab Emirates,2021,Developing,76.9,77,1,3862.7,344,62.4,1,99,99,0.1,44450,5.2,5.0,13.3,2750
2,2750,United Arab Emirates,2020,Developing,76.7,78,1,377.2,0,6.5,1,98,98,0.1,4336,5.2,5.0,13.3,2751
3,2751,United Arab Emirates,2019,Developing,76.5,8,1,3663.8,132,58.6,1,96,96,0.1,42112,5.1,5.0,13.3,2752
4,2752,United Arab Emirates,2018,Developing,76.3,81,1,387.8,0,57.1,1,95,95,0.1,4462,5.1,4.9,13.3,2753
5,2753,United Arab Emirates,2017,Developing,76.2,82,1,308.1,87,55.8,1,94,94,0.1,3549,5.1,4.9,13.3,2754
6,2754,United Arab Emirates,2016,Developing,76.0,84,1,292.4,0,55.1,1,94,93,0.1,3373,5.1,4.9,13.2,2755
7,2755,United Arab Emirates,2015,Developing,75.8,85,1,4003.9,55,54.8,1,94,92,0.1,45759,5.1,4.9,13.1,2756
8,2756,United Arab Emirates,2014,Developing,75.6,87,1,3759.5,0,55.1,1,94,92,0.1,42673,5.1,4.9,12.9,2757
9,2757,United Arab Emirates,2013,Developing,75.4,89,1,3749.9,0,55.8,1,94,92,0.1,42372,5.1,4.9,12.8,2758


Many countries show values greater than 100, and in some cases even in the thousands.
Since the column name is percentage expenditure, anything over 100% already looks suspicious since a country cannot realistically spend more than 100% of its GDP or income on health. This suggests either:

- Data entry errors, or

- Wrong units (e.g., per capita health expenditure relative to GDP, not true percentages).

There is no additional information in the original documentation to confirm what the column actually measures. For this reason, I prefer not to use this column in the subsequent analysis.

In [31]:
%%sql

-- Potential anomalies from summary statistics: BMI
-- The minimum and maximum expose implausible values at either end of the range.

SELECT
  MIN(BMI) AS min_val,
  MAX(BMI) AS max_val
FROM World_Life_Expectancy

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,min_val,max_val
0,0.0,87.3


In [32]:
%%sql

-- Inspect the BMI series of a single country (Belgium) to check the values for plausibility
SELECT Country, Year, BMI
FROM World_Life_Expectancy
WHERE Country = 'Belgium'

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,Year,BMI
0,Belgium,2022,63.7
1,Belgium,2021,63.4
2,Belgium,2020,63.0
3,Belgium,2019,62.6
4,Belgium,2018,62.3
5,Belgium,2017,61.9
6,Belgium,2016,61.6
7,Belgium,2015,61.3
8,Belgium,2014,6.9
9,Belgium,2013,6.6


A BMI value of 0 is impossible, so it should be treated as missing (NULL). Extreme values such as the maximum of 87.3 would not make sense in almost any real-life context either. The values recorded for Belgium are also inconsistent: the series jumps from 6.9 in 2014 to 61.3 in 2015, and the values do not agree with the official figures found online. For these reasons, BMI is not a reliable column to use for the analysis.

## Data Analysis

Data analysis is the process of examining raw data to find meaningful patterns, trends, and insights. With it, we can track changes over time, compare groups, detect anomalies, and make data-driven decisions.

In [35]:
%%sql

-- Five countries whose lowest recorded life expectancy is the smallest (NULL readings are skipped)

SELECT
    Country,
    MIN(Life_expectancy) AS Min_Life_Expectancy
FROM World_Life_Expectancy
WHERE Life_expectancy IS NOT NULL
GROUP BY Country
ORDER BY Min_Life_Expectancy
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,Min_Life_Expectancy
0,Haiti,36.3
1,Sierra Leone,39.0
2,Malawi,43.1
3,Zambia,43.8
4,Zimbabwe,44.3


In [36]:
%%sql

-- Five countries whose highest recorded life expectancy is the largest (NULL readings are skipped).
-- Several countries can share the same maximum; the query does not define an order among such ties.

SELECT
    Country,
    MAX(Life_expectancy) AS Max_Life_Expectancy
FROM World_Life_Expectancy
WHERE Life_expectancy IS NOT NULL
GROUP BY Country
ORDER BY Max_Life_Expectancy DESC
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,Max_Life_Expectancy
0,Sweden,89.0
1,Spain,89.0
2,Portugal,89.0
3,Norway,89.0
4,New Zealand,89.0


In [37]:
%%sql

-- Top 5 countries by mean life expectancy across all recorded years (rounded to 2 decimals)

SELECT
    Country,
    ROUND(AVG(Life_expectancy), 2) AS avg_life_exp
FROM World_Life_Expectancy
WHERE Life_expectancy IS NOT NULL
GROUP BY Country
ORDER BY ROUND(AVG(Life_expectancy), 2) DESC
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,avg_life_exp
0,Japan,82.54
1,Sweden,82.52
2,Iceland,82.44
3,Switzerland,82.33
4,France,82.22


In [38]:
%%sql

-- Bottom 5 countries by mean life expectancy across all recorded years (rounded to 2 decimals)

SELECT
    Country,
    ROUND(AVG(Life_expectancy), 2) AS avg_life_exp
FROM World_Life_Expectancy
WHERE Life_expectancy IS NOT NULL
GROUP BY Country
ORDER BY ROUND(AVG(Life_expectancy), 2)
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,avg_life_exp
0,Sierra Leone,46.11
1,Central African Republic,48.51
2,Lesotho,48.78
3,Angola,49.02
4,Malawi,49.89


In [39]:
%%sql

-- Five countries with the widest spread between their highest and lowest recorded life expectancy.
-- The spread is labelled "improvement"; the query itself does not check which value came first in time.

SELECT
    Country,
    MIN(Life_expectancy) AS min_life,
    MAX(Life_expectancy) AS max_life,
    ROUND(MAX(Life_expectancy) - MIN(Life_expectancy), 2) AS improvement
FROM World_Life_Expectancy
WHERE Life_expectancy IS NOT NULL
GROUP BY Country
ORDER BY ROUND(MAX(Life_expectancy) - MIN(Life_expectancy), 2) DESC
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,min_life,max_life,improvement
0,Haiti,36.3,65.0,28.7
1,Zimbabwe,44.3,67.0,22.7
2,Eritrea,45.3,67.0,21.7
3,Uganda,46.6,67.0,20.4
4,Rwanda,48.3,68.0,19.7


In [40]:
%%sql

-- Five countries with the narrowest spread between their highest and lowest recorded life expectancy.
-- The spread is labelled "improvement"; the query itself does not check which value came first in time.

SELECT
    Country,
    MIN(Life_expectancy) AS min_life,
    MAX(Life_expectancy) AS max_life,
    ROUND(MAX(Life_expectancy) - MIN(Life_expectancy), 2) AS improvement
FROM World_Life_Expectancy
WHERE Life_expectancy IS NOT NULL
GROUP BY Country
ORDER BY ROUND(MAX(Life_expectancy) - MIN(Life_expectancy), 2)
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,min_life,max_life,improvement
0,Guyana,65.0,66.3,1.3
1,Seychelles,71.8,73.2,1.4
2,Kuwait,73.2,74.7,1.5
3,Philippines,66.8,68.5,1.7
4,Venezuela (Bolivarian Republic of),72.4,74.1,1.7


In [41]:
%%sql

-- Top 5 countries with the biggest improvement over the last 5-year window:
-- life expectancy in the latest year present in the table minus the value five years earlier.
-- Both endpoints must be non-NULL, otherwise the difference is NULL and cannot be ranked.
-- The difference is rounded so that floating-point noise does not break genuine ties,
-- which are then resolved alphabetically by Country.

SELECT 
  w.Country,
  w.Year AS latest_year,
  w.Life_expectancy AS lifeexp_latest,
  p.Life_expectancy AS lifeexp_5y_ago,
  ROUND(w.Life_expectancy - p.Life_expectancy, 2) AS improvement_5y
FROM World_Life_Expectancy AS w
JOIN World_Life_Expectancy AS p
  ON p.Country = w.Country
 AND p.Year = w.Year - 5
WHERE w.Year = (SELECT MAX(Year) FROM World_Life_Expectancy)
  AND w.Life_expectancy IS NOT NULL
  AND p.Life_expectancy IS NOT NULL
ORDER BY improvement_5y DESC, w.Country
LIMIT 5

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,latest_year,lifeexp_latest,lifeexp_5y_ago,improvement_5y
0,Haiti,2022,63.5,36.3,27.2
1,Zimbabwe,2022,67.0,52.4,14.6
2,Egypt,2022,79.0,70.0,9.0
3,Slovenia,2022,88.0,79.5,8.5
4,Bolivia (Plurinational State of),2022,77.0,68.7,8.3


In [42]:
%%sql

-- GDP vs Life Expectancy: per-country averages of both measures, for a side-by-side comparison
-- (this lists the averages; it does not compute a correlation coefficient).
-- ORDER BY makes the LIMIT deterministic: without it, which 30 countries are returned is unspecified.

SELECT Country,
       AVG(GDP) AS avg_gdp,
       AVG(Life_expectancy) AS avg_life_expectancy
FROM World_Life_Expectancy
GROUP BY Country
ORDER BY Country
LIMIT 30

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Country,avg_gdp,avg_life_expectancy
0,Afghanistan,340.125,58.1875
1,Albania,2119.8125,75.15625
2,Algeria,2847.8125,73.61875
3,Angola,1975.1875,49.01875
4,Antigua and Barbuda,9759.25,75.05625
5,Argentina,6998.5625,75.15625
6,Armenia,2000.0,73.4
7,Australia,34637.5625,81.8125
8,Austria,33827.5,81.48125
9,Azerbaijan,3302.9375,70.73125


In [44]:
%%sql

-- Split the records at the median GDP and compare the mean life expectancy of the two halves.
-- Records with GDP at or above the median count as "high GDP", records below it as "low GDP".

WITH ranked AS (
  SELECT GDP AS val,
         ROW_NUMBER() OVER (ORDER BY GDP) AS pos,
         COUNT(*) OVER () AS cnt
  FROM World_Life_Expectancy
  WHERE GDP IS NOT NULL
),
mid_point AS (
  -- odd count: both bounds are the single middle position; even count: the two middle positions
  SELECT AVG(val) AS median_val
  FROM ranked
  WHERE pos BETWEEN (cnt + 1) / 2 AND (cnt + 2) / 2
)

SELECT
  AVG(CASE WHEN GDP >= median_val THEN Life_expectancy END) AS High_GDP_Life_Expectancy,
  AVG(CASE WHEN GDP <  median_val THEN Life_expectancy END) AS Low_GDP_Life_Expectancy
FROM World_Life_Expectancy
CROSS JOIN mid_point;

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,High_GDP_Life_Expectancy,Low_GDP_Life_Expectancy
0,73.713565,64.717796


There is a clear overall trend that higher GDP is associated with longer life expectancy, though there are outliers caused by health crises, missing data, or unique national circumstances. 

In [46]:
%%sql

-- Compare Developed vs Developing countries: mean GDP and mean life expectancy
-- over all records of each Status group

SELECT Status,
       AVG(GDP) AS avg_gdp,
       AVG(Life_expectancy) AS avg_life_expectancy
FROM World_Life_Expectancy
GROUP BY Status

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,Status,avg_gdp,avg_life_expectancy
0,Developed,19296.779297,79.197852
1,Developing,3608.112531,67.111424


Developed countries cluster with higher GDP and Life Expectancy.

In [99]:
%%sql

-- Split the records at the median adult mortality and compare the mean life expectancy of the two halves.
-- Records at or above the median count as "high adult mortality", records below it as "low adult mortality".

WITH ranked AS (
  SELECT Adult_mortality AS val,
         ROW_NUMBER() OVER (ORDER BY Adult_mortality) AS pos,
         COUNT(*) OVER () AS cnt
  FROM World_Life_Expectancy
  WHERE Adult_mortality IS NOT NULL
),
mid_point AS (
  -- odd count: both bounds are the single middle position; even count: the two middle positions
  SELECT AVG(val) AS median_val
  FROM ranked
  WHERE pos BETWEEN (cnt + 1) / 2 AND (cnt + 2) / 2
)

SELECT
  AVG(CASE WHEN Adult_mortality >= median_val THEN Life_expectancy END) AS High_Adult_Mortality_Life_Expectancy,
  AVG(CASE WHEN Adult_mortality <  median_val THEN Life_expectancy END) AS Low_Adult_Mortality_Life_Expectancy
FROM World_Life_Expectancy
CROSS JOIN mid_point;

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,High_Adult_Mortality_Life_Expectancy,Low_Adult_Mortality_Life_Expectancy
0,64.195374,74.295816


There is a strong negative relationship between adult mortality and life expectancy. This means that when adult mortality is higher, life expectancy tends to be lower.

In [52]:
%%sql

-- Split the records at the median of Schooling and compare the mean life expectancy of the two halves.
-- Records at or above the median count as "high schooling", records below it as "low schooling".

WITH ranked AS (
  SELECT Schooling AS val,
         ROW_NUMBER() OVER (ORDER BY Schooling) AS pos,
         COUNT(*) OVER () AS cnt
  FROM World_Life_Expectancy
  WHERE Schooling IS NOT NULL
),
mid_point AS (
  -- odd count: both bounds are the single middle position; even count: the two middle positions
  SELECT AVG(val) AS median_val
  FROM ranked
  WHERE pos BETWEEN (cnt + 1) / 2 AND (cnt + 2) / 2
)

SELECT
  AVG(CASE WHEN Schooling >= median_val THEN Life_expectancy END) AS High_Schooling_Life_Expectancy,
  AVG(CASE WHEN Schooling <  median_val THEN Life_expectancy END) AS Low_Schooling_Life_Expectancy
FROM World_Life_Expectancy
CROSS JOIN mid_point;

 * sqlite:///World_Life_Expectancy.db
Done.


Unnamed: 0,High_Schooling_Life_Expectancy,Low_Schooling_Life_Expectancy
0,75.268541,62.928382


There is a strong positive relationship between years of schooling and life expectancy: more years of schooling are associated with longer lives.